wnm 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wnm might be problematic.
- wnm/__init__.py +1 -1
- wnm/__main__.py +184 -1133
- wnm/actions.py +45 -0
- wnm/common.py +21 -0
- wnm/config.py +653 -1
- wnm/decision_engine.py +388 -0
- wnm/executor.py +1292 -0
- wnm/firewall/__init__.py +13 -0
- wnm/firewall/base.py +71 -0
- wnm/firewall/factory.py +95 -0
- wnm/firewall/null_firewall.py +71 -0
- wnm/firewall/ufw_manager.py +118 -0
- wnm/migration.py +42 -0
- wnm/models.py +305 -126
- wnm/process_managers/__init__.py +23 -0
- wnm/process_managers/base.py +203 -0
- wnm/process_managers/docker_manager.py +371 -0
- wnm/process_managers/factory.py +83 -0
- wnm/process_managers/launchd_manager.py +592 -0
- wnm/process_managers/setsid_manager.py +340 -0
- wnm/process_managers/systemd_manager.py +529 -0
- wnm/reports.py +286 -0
- wnm/utils.py +403 -0
- wnm-0.0.11.dist-info/METADATA +316 -0
- wnm-0.0.11.dist-info/RECORD +28 -0
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/WHEEL +1 -1
- wnm-0.0.9.dist-info/METADATA +0 -95
- wnm-0.0.9.dist-info/RECORD +0 -9
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/entry_points.txt +0 -0
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/top_level.txt +0 -0
wnm/executor.py
ADDED
@@ -0,0 +1,1292 @@
"""Action executor for performing node lifecycle operations.

This module contains the ActionExecutor class which takes planned actions
from the DecisionEngine and executes them using ProcessManager abstractions.
"""

import logging
import os
import shutil
import subprocess
import time
from typing import Any, Dict, List, Optional

from packaging.version import Version
from sqlalchemy import func, insert, select, text
from sqlalchemy.orm import scoped_session

from wnm.actions import Action, ActionType
from wnm.common import (
    DEAD,
    DISABLED,
    METRICS_PORT_BASE,
    PORT_MULTIPLIER,
    REMOVING,
    RESTARTING,
    RUNNING,
    STOPPED,
    UPGRADING,
)
from wnm.config import LOG_DIR
from wnm.models import Machine, Node
from wnm.process_managers.factory import get_default_manager_type, get_process_manager
from wnm.utils import (
    get_antnode_version,
    parse_service_names,
    update_nodes,
)


class ActionExecutor:
    """Executes planned actions on nodes.

    The ActionExecutor takes Action objects from the DecisionEngine and
    performs the actual operations by calling utility functions and
    managing database state.
    """

    def __init__(self, session_factory: scoped_session):
        """Initialize the action executor.

        Args:
            session_factory: SQLAlchemy session factory for database operations
        """
        self.S = session_factory

    def _get_process_manager(self, node: Node):
        """Get the appropriate process manager for a node.

        Args:
            node: Node database record

        Returns:
            ProcessManager instance for the node's manager type
        """
        # Get manager type from node, or use machine config default
        manager_type = getattr(node, "manager_type", None)
        return get_process_manager(manager_type)

    def _set_node_status(self, node_id: int, status: str) -> bool:
        """Update node status in database.

        Args:
            node_id: ID of the node
            status: New status to set

        Returns:
            True if status was updated successfully
        """
        try:
            with self.S() as session:
                session.query(Node).filter(Node.id == node_id).update(
                    {"status": status, "timestamp": int(time.time())}
                )
                session.commit()
            return True
        except Exception as e:
            logging.error(f"Failed to set node status for {node_id}: {e}")
            return False

    def _upgrade_node_binary(self, node: Node, new_version: str) -> bool:
        """Upgrade a node's binary and restart it.

        Args:
            node: Node to upgrade
            new_version: Version string for the new binary

        Returns:
            True if upgrade succeeded
        """
        # Source binary location
        source_binary = os.path.expanduser("~/.local/bin/antnode")

        # Copy new binary to node directory
        try:
            shutil.copy2(source_binary, node.binary)
            os.chmod(node.binary, 0o755)
            logging.info(f"Copied new binary from {source_binary} to {node.binary}")
        except (OSError, shutil.Error) as err:
            logging.error(f"Failed to copy binary for upgrade: {err}")
            return False

        # Restart the node with new binary
        manager = self._get_process_manager(node)
        if not manager.restart_node(node):
            logging.error(f"Failed to restart node {node.id} during upgrade")
            return False

        # Update status to UPGRADING
        with self.S() as session:
            session.query(Node).filter(Node.id == node.id).update(
                {
                    "status": UPGRADING,
                    "timestamp": int(time.time()),
                    "version": new_version,
                }
            )
            session.commit()

        return True

    def execute(
        self,
        actions: List[Action],
        machine_config: Dict[str, Any],
        metrics: Dict[str, Any],
        dry_run: bool = False,
    ) -> Dict[str, Any]:
        """Execute a list of actions.

        Args:
            actions: List of Action objects to execute
            machine_config: Machine configuration dictionary
            metrics: Current system metrics
            dry_run: If True, log actions without executing them

        Returns:
            Dictionary with execution status and results
        """
        if not actions:
            return {"status": "no-actions", "results": []}

        results = []

        for action in actions:
            logging.info(
                f"Executing: {action.type.value} (priority={action.priority}, reason={action.reason})"
            )

            try:
                result = self._execute_action(action, machine_config, metrics, dry_run)
                results.append(result)
            except Exception as e:
                logging.error(f"Failed to execute {action.type.value}: {e}")
                results.append(
                    {"action": action.type.value, "success": False, "error": str(e)}
                )

        # Return status from the first (highest priority) action
        if results:
            return results[0]
        return {"status": "no-results"}

    def _execute_action(
        self,
        action: Action,
        machine_config: Dict[str, Any],
        metrics: Dict[str, Any],
        dry_run: bool,
    ) -> Dict[str, Any]:
        """Execute a single action.

        Args:
            action: The action to execute
            machine_config: Machine configuration
            metrics: Current metrics
            dry_run: If True, log without executing

        Returns:
            Dictionary with execution result
        """
        if action.type == ActionType.RESURVEY_NODES:
            return self._execute_resurvey(machine_config, dry_run)

        elif action.type == ActionType.REMOVE_NODE:
            return self._execute_remove_node(action, dry_run)

        elif action.type == ActionType.STOP_NODE:
            return self._execute_stop_node(machine_config, dry_run)

        elif action.type == ActionType.UPGRADE_NODE:
            return self._execute_upgrade_node(metrics, dry_run)

        elif action.type == ActionType.START_NODE:
            return self._execute_start_node(metrics, dry_run)

        elif action.type == ActionType.ADD_NODE:
            return self._execute_add_node(machine_config, metrics, dry_run)

        elif action.type == ActionType.SURVEY_NODES:
            return self._execute_survey(dry_run)

        else:
            logging.warning(f"Unknown action type: {action.type}")
            return {"status": "unknown-action", "action": action.type.value}

    def _execute_resurvey(
        self, machine_config: Dict[str, Any], dry_run: bool
    ) -> Dict[str, Any]:
        """Execute node resurvey after system reboot."""
        if dry_run:
            logging.warning("DRYRUN: System rebooted, survey nodes")
        else:
            update_nodes(self.S)
            # Update the last stopped time
            with self.S() as session:
                session.query(Machine).filter(Machine.id == 1).update(
                    {"last_stopped_at": int(time.time())}
                )
                session.commit()

        return {"status": "system-rebooted"}

    def _execute_remove_node(self, action: Action, dry_run: bool) -> Dict[str, Any]:
        """Execute node removal.

        If reason contains 'dead', remove all dead nodes.
        Otherwise, remove youngest stopped or running node based on reason.
        """
        if "dead" in action.reason.lower():
            # Remove all dead nodes
            if dry_run:
                logging.warning("DRYRUN: Remove Dead Nodes")
            else:
                with self.S() as session:
                    broken = session.execute(
                        select(Node)
                        .where(Node.status == DEAD)
                        .order_by(Node.timestamp.asc())
                    ).all()

                for row in broken:
                    node = row[0]
                    logging.info(f"Removing dead node {node.id}")
                    manager = self._get_process_manager(node)
                    manager.remove_node(node)
                    # Delete from database immediately (no delay for dead nodes)
                    with self.S() as session:
                        session.delete(node)
                        session.commit()

            return {"status": "removed-dead-nodes"}

        elif "stopped" in action.reason.lower():
            # Remove youngest stopped node
            with self.S() as session:
                youngest = session.execute(
                    select(Node).where(Node.status == STOPPED).order_by(Node.age.desc())
                ).first()

            if youngest:
                if dry_run:
                    logging.warning("DRYRUN: Remove youngest stopped node")
                else:
                    node = youngest[0]
                    manager = self._get_process_manager(node)
                    manager.remove_node(node)
                    # Delete from database immediately (no delay for stopped nodes)
                    with self.S() as session:
                        session.delete(node)
                        session.commit()
                return {"status": "removed-stopped-node"}
            else:
                return {"status": "no-stopped-nodes-to-remove"}

        else:
            # Remove youngest running node (with delay)
            with self.S() as session:
                youngest = session.execute(
                    select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
                ).first()

            if youngest:
                if dry_run:
                    logging.warning("DRYRUN: Remove youngest running node")
                else:
                    node = youngest[0]
                    manager = self._get_process_manager(node)
                    manager.stop_node(node)
                    # Mark as REMOVING (will be deleted later after delay)
                    self._set_node_status(node.id, REMOVING)
                return {"status": "removed-running-node"}
            else:
                return {"status": "no-running-nodes-to-remove"}

    def _execute_stop_node(
        self, machine_config: Dict[str, Any], dry_run: bool
    ) -> Dict[str, Any]:
        """Execute node stop (to reduce resource usage)."""
        with self.S() as session:
            youngest = session.execute(
                select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
            ).first()

        if youngest:
            if dry_run:
                logging.warning("DRYRUN: Stopping youngest node")
            else:
                node = youngest[0]
                manager = self._get_process_manager(node)
                manager.stop_node(node)
                self._set_node_status(node.id, STOPPED)
                # Update the last stopped time
                with self.S() as session:
                    session.query(Machine).filter(Machine.id == 1).update(
                        {"last_stopped_at": int(time.time())}
                    )
                    session.commit()
            return {"status": "stopped-node"}
        else:
            return {"status": "no-nodes-to-stop"}

    def _execute_upgrade_node(
        self, metrics: Dict[str, Any], dry_run: bool
    ) -> Dict[str, Any]:
        """Execute node upgrade (oldest running node with outdated version)."""
        with self.S() as session:
            oldest = session.execute(
                select(Node)
                .where(Node.status == RUNNING)
                .where(Node.version != metrics["antnode_version"])
                .order_by(Node.age.asc())
            ).first()

        if oldest:
            if dry_run:
                logging.warning("DRYRUN: Upgrade oldest node")
            else:
                node = oldest[0]
                # If we don't have a version number from metadata, grab from binary
                if not node.version:
                    node.version = get_antnode_version(node.binary)

                # Perform the upgrade (copies binary, restarts, sets UPGRADING status)
                if not self._upgrade_node_binary(node, metrics["antnode_version"]):
                    return {"status": "upgrade-failed"}

            return {"status": "upgrading-node"}
        else:
            return {"status": "no-nodes-to-upgrade"}

    def _execute_start_node(
        self, metrics: Dict[str, Any], dry_run: bool
    ) -> Dict[str, Any]:
        """Execute starting a stopped node (may upgrade first if needed)."""
        with self.S() as session:
            oldest = session.execute(
                select(Node).where(Node.status == STOPPED).order_by(Node.age.asc())
            ).first()

        if oldest:
            node = oldest[0]
            # If we don't have a version number from metadata, grab from binary
            if not node.version:
                node.version = get_antnode_version(node.binary)

            # If the stopped version is old, upgrade it (which also starts it)
            if Version(metrics["antnode_version"]) > Version(node.version):
                if dry_run:
                    logging.warning("DRYRUN: Upgrade and start stopped node")
                else:
                    # Perform the upgrade (copies binary, restarts, sets UPGRADING status)
                    if not self._upgrade_node_binary(node, metrics["antnode_version"]):
                        return {"status": "failed-upgrade"}
                return {"status": "upgrading-stopped-node"}
            else:
                if dry_run:
                    logging.warning("DRYRUN: Start stopped node")
                    return {"status": "starting-node"}
                else:
                    manager = self._get_process_manager(node)
                    if manager.start_node(node):
                        self._set_node_status(node.id, RESTARTING)
                        return {"status": "started-node"}
                    else:
                        return {"status": "failed-start-node"}
        else:
            return {"status": "no-stopped-nodes"}

    def _execute_add_node(
        self, machine_config: Dict[str, Any], metrics: Dict[str, Any], dry_run: bool
    ) -> Dict[str, Any]:
        """Execute adding a new node."""
        if dry_run:
            logging.warning("DRYRUN: Add a node")
            return {"status": "add-node"}

        # Find next available node ID (look for holes first)
        # First check if node 1 exists
        with self.S() as session:
            node_1_exists = session.execute(
                select(Node.id).where(Node.id == 1)
            ).first()

        if not node_1_exists:
            # Node 1 is available, use it
            node_id = 1
        else:
            # Look for holes in the sequence
            sql = text(
                "select n1.id + 1 as id from node n1 "
                + "left join node n2 on n2.id = n1.id + 1 "
                + "where n2.id is null "
                + "and n1.id <> (select max(id) from node) "
                + "order by n1.id;"
            )
            with self.S() as session:
                result = session.execute(sql).first()

            if result:
                node_id = result[0]
            else:
                # No holes, use max + 1
                with self.S() as session:
                    result = session.execute(
                        select(Node.id).order_by(Node.id.desc())
                    ).first()
                    node_id = result[0] + 1 if result else 1

        # Determine the appropriate manager type for this system
        manager_type = get_default_manager_type()

        # Create node object
        node = Node(
            id=node_id,
            node_name=f"{node_id:04}",
            service=f"antnode{node_id:04}.service",
            user=machine_config.get("user", "ant"),
            version=metrics["antnode_version"],
            root_dir=f"{machine_config['node_storage']}/antnode{node_id:04}",
            binary=f"{machine_config['node_storage']}/antnode{node_id:04}/antnode",
            port=machine_config["port_start"] * PORT_MULTIPLIER + node_id,
            metrics_port=METRICS_PORT_BASE + node_id,
            network="evm-arbitrum-one",
            wallet=machine_config["rewards_address"],
            peer_id="",
            status=STOPPED,
            timestamp=int(time.time()),
            records=0,
            uptime=0,
            shunned=0,
            age=int(time.time()),
            host=machine_config["host"],
            method=manager_type,
            layout="1",
            environment=machine_config.get("environment", ""),
            manager_type=manager_type,
        )

        # Insert into database
        with self.S() as session:
            session.add(node)
            session.commit()
            session.refresh(node)  # Get the persisted node

        # Create the node using process manager
        source_binary = os.path.expanduser("~/.local/bin/antnode")
        manager = self._get_process_manager(node)

        if not manager.create_node(node, source_binary):
            logging.error(f"Failed to create node {node.id}")
            return {"status": "failed-create-node"}

        # Update status to RESTARTING (node is starting up)
        self._set_node_status(node.id, RESTARTING)

        return {"status": "added-node"}

    def _execute_survey(self, dry_run: bool) -> Dict[str, Any]:
        """Execute node survey (idle monitoring)."""
        if dry_run:
            logging.warning("DRYRUN: Update nodes")
        else:
            update_nodes(self.S)
        return {"status": "idle"}

    def _parse_node_name(self, service_name: str) -> Optional[int]:
        """Parse node ID from service name like 'antnode0001'.

        Args:
            service_name: Node name (e.g., 'antnode0001')

        Returns:
            Node ID as integer, or None if parsing fails
        """
        import re
        match = re.match(r"antnode(\d+)", service_name)
        if match:
            return int(match.group(1))
        return None

    def _get_node_by_name(self, service_name: str) -> Optional[Node]:
        """Get node by service name.

        Args:
            service_name: Node name (e.g., 'antnode0001')

        Returns:
            Node object or None if not found
        """
        node_id = self._parse_node_name(service_name)
        if node_id is None:
            logging.error(f"Invalid node name format: {service_name}")
            return None

        with self.S() as session:
            result = session.execute(
                select(Node).where(Node.id == node_id)
            ).first()

            if result:
                return result[0]
            else:
                logging.error(f"Node not found: {service_name} (id={node_id})")
                return None

    def execute_forced_action(
        self,
        action_type: str,
        machine_config: Dict[str, Any],
        metrics: Dict[str, Any],
        service_name: Optional[str] = None,
        dry_run: bool = False,
        count: int = 1,
    ) -> Dict[str, Any]:
        """Execute a forced action bypassing the decision engine.

        Args:
            action_type: Type of action ('add', 'remove', 'upgrade', 'start', 'stop', 'disable', 'teardown')
            machine_config: Machine configuration
            metrics: Current system metrics
            service_name: Optional node name for targeted operations
            dry_run: If True, log without executing
            count: Number of nodes to affect (for add, remove, start, stop, upgrade actions)

        Returns:
            Dictionary with execution result
        """
        if action_type == "add":
            return self._force_add_node(machine_config, metrics, dry_run, count)
        elif action_type == "remove":
            return self._force_remove_node(service_name, dry_run, count)
        elif action_type == "upgrade":
            return self._force_upgrade_node(service_name, metrics, dry_run, count)
        elif action_type == "start":
            return self._force_start_node(service_name, metrics, dry_run, count)
        elif action_type == "stop":
            return self._force_stop_node(service_name, dry_run, count)
        elif action_type == "disable":
            return self._force_disable_node(service_name, dry_run)
        elif action_type == "teardown":
            return self._force_teardown_cluster(machine_config, dry_run)
        elif action_type == "survey":
            return self._force_survey_nodes(service_name, dry_run)
        else:
            return {"status": "error", "message": f"Unknown action type: {action_type}"}

    def _force_add_node(
        self, machine_config: Dict[str, Any], metrics: Dict[str, Any], dry_run: bool, count: int = 1
    ) -> Dict[str, Any]:
        """Force add new nodes.

        Args:
            machine_config: Machine configuration
            metrics: Current system metrics
            dry_run: If True, log without executing
            count: Number of nodes to add (default: 1)

        Returns:
            Dictionary with execution result
        """
        logging.info(f"Forced action: Adding {count} node(s)")

        if count < 1:
            return {"status": "error", "message": "count must be at least 1"}

        added_nodes = []
        failed_nodes = []

        # Track the start time to identify newly created nodes
        start_time = int(time.time())

        for i in range(count):
            result = self._execute_add_node(machine_config, metrics, dry_run)
            if result["status"] in ["added-node", "add-node"]:
                # Get the node that was just added (youngest by age >= start_time)
                if not dry_run:
                    with self.S() as session:
                        newest = session.execute(
                            select(Node).where(Node.age >= start_time).order_by(Node.age.desc())
                        ).first()
                        if newest:
                            added_nodes.append(newest[0].service.replace(".service", ""))
                else:
                    added_nodes.append(f"node-{i+1}")
            else:
                failed_nodes.append({"index": i+1, "error": result.get("status", "unknown error")})

        if count == 1:
            # Keep backward compatibility for single node
            return result

        return {
            "status": "added-nodes" if not dry_run else "add-nodes-dryrun",
            "added_count": len(added_nodes),
            "added_nodes": added_nodes if added_nodes else None,
            "failed_count": len(failed_nodes),
            "failed_nodes": failed_nodes if failed_nodes else None,
        }

    def _force_remove_node(
        self, service_name: Optional[str], dry_run: bool, count: int = 1
    ) -> Dict[str, Any]:
        """Force remove nodes (specific or youngest by age).

        Args:
            service_name: Optional comma-separated list of service names
            dry_run: If True, log without executing
            count: Number of nodes to remove when service_name is not specified (default: 1)

        Returns:
            Dictionary with execution result
        """
        # Parse comma-separated service names
        service_names = parse_service_names(service_name)

        if service_names:
            # Remove specific nodes
            removed_nodes = []
            failed_nodes = []

            for name in service_names:
                node = self._get_node_by_name(name)
                if not node:
                    failed_nodes.append({"service": name, "error": "not found"})
                    continue

                logging.info(f"Forced action: Removing node {name}")
                if dry_run:
                    logging.warning(f"DRYRUN: Remove node {name}")
                    removed_nodes.append(name)
                else:
                    try:
                        manager = self._get_process_manager(node)
                        manager.remove_node(node)
                        # Remove from database immediately
                        with self.S() as session:
                            session.delete(node)
                            session.commit()
                        removed_nodes.append(name)
                    except Exception as e:
                        logging.error(f"Failed to remove node {name}: {e}")
                        failed_nodes.append({"service": name, "error": str(e)})

            return {
                "status": "removed-nodes" if not dry_run else "remove-dryrun",
                "removed_count": len(removed_nodes),
                "removed_nodes": removed_nodes,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }
        else:
            # Remove youngest nodes (default behavior - highest age value)
            if count < 1:
                return {"status": "error", "message": "count must be at least 1"}

            logging.info(f"Forced action: Removing {count} youngest node(s)")

            # Get youngest nodes (highest age values)
            with self.S() as session:
                youngest_nodes = session.execute(
                    select(Node).order_by(Node.age.desc()).limit(count)
                ).all()

            if not youngest_nodes:
                return {"status": "error", "message": "No nodes to remove"}

            if len(youngest_nodes) < count:
                logging.warning(f"Only {len(youngest_nodes)} nodes available, removing all of them")

            removed_nodes = []
            failed_nodes = []

            for row in youngest_nodes:
                node = row[0]
                if dry_run:
                    logging.warning(f"DRYRUN: Remove youngest node {node.node_name}")
                    removed_nodes.append(node.service.replace(".service", ""))
                else:
                    try:
                        manager = self._get_process_manager(node)
                        manager.remove_node(node)
                        # Remove from database immediately
                        with self.S() as session:
                            session.delete(node)
                            session.commit()
                        removed_nodes.append(node.service.replace(".service", ""))
                    except Exception as e:
                        logging.error(f"Failed to remove node {node.node_name}: {e}")
                        failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})

            if count == 1 and len(removed_nodes) == 1:
                # Keep backward compatibility for single node
                # Extract node name from service name (e.g., "antnode0001" -> "0001")
                node_name = removed_nodes[0].replace("antnode", "")
                return {"status": "removed-node", "node": node_name}

            return {
                "status": "removed-nodes" if not dry_run else "remove-dryrun",
                "removed_count": len(removed_nodes),
                "removed_nodes": removed_nodes if removed_nodes else None,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }

    def _force_upgrade_node(
        self, service_name: Optional[str], metrics: Dict[str, Any], dry_run: bool, count: int = 1
    ) -> Dict[str, Any]:
        """Force upgrade nodes (specific or oldest running nodes by age).

        Args:
            service_name: Optional comma-separated list of service names
            metrics: Current system metrics
            dry_run: If True, log without executing
            count: Number of nodes to upgrade when service_name is not specified (default: 1)

        Returns:
            Dictionary with execution result
        """
        # Parse comma-separated service names
        service_names = parse_service_names(service_name)

        if service_names:
            # Upgrade specific nodes
            upgraded_nodes = []
            failed_nodes = []

            for name in service_names:
                node = self._get_node_by_name(name)
                if not node:
                    failed_nodes.append({"service": name, "error": "not found"})
                    continue

                logging.info(f"Forced action: Upgrading node {name}")
                if dry_run:
                    logging.warning(f"DRYRUN: Upgrade node {name}")
                    upgraded_nodes.append(name)
                else:
                    try:
                        if not self._upgrade_node_binary(node, metrics["antnode_version"]):
                            failed_nodes.append({"service": name, "error": "upgrade failed"})
                        else:
                            upgraded_nodes.append(name)
                    except Exception as e:
                        logging.error(f"Failed to upgrade node {name}: {e}")
                        failed_nodes.append({"service": name, "error": str(e)})

            return {
                "status": "upgraded-nodes" if not dry_run else "upgrade-dryrun",
                "upgraded_count": len(upgraded_nodes),
                "upgraded_nodes": upgraded_nodes,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }
        else:
            # Upgrade oldest running nodes (default behavior - lowest age values)
            if count < 1:
                return {"status": "error", "message": "count must be at least 1"}

            logging.info(f"Forced action: Upgrading {count} oldest running node(s)")

            # Get oldest running nodes (lowest age values)
            with self.S() as session:
                oldest_nodes = session.execute(
                    select(Node)
                    .where(Node.status == RUNNING)
                    .order_by(Node.age.asc())
                    .limit(count)
                ).all()

            if not oldest_nodes:
                return {"status": "error", "message": "No running nodes to upgrade"}

            if len(oldest_nodes) < count:
                logging.warning(f"Only {len(oldest_nodes)} running nodes available, upgrading all of them")

            upgraded_nodes = []
            failed_nodes = []

            for row in oldest_nodes:
                node = row[0]
                if dry_run:
                    logging.warning(f"DRYRUN: Upgrade oldest node {node.node_name}")
                    upgraded_nodes.append(node.service.replace(".service", ""))
                else:
                    try:
                        if not self._upgrade_node_binary(node, metrics["antnode_version"]):
                            failed_nodes.append({"service": node.service.replace(".service", ""), "error": "upgrade failed"})
                        else:
                            upgraded_nodes.append(node.service.replace(".service", ""))
                    except Exception as e:
                        logging.error(f"Failed to upgrade node {node.node_name}: {e}")
                        failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})

            if count == 1 and len(upgraded_nodes) == 1:
                # Keep backward compatibility for single node
                # Extract node name from service name (e.g., "antnode0001" -> "0001")
                node_name = upgraded_nodes[0].replace("antnode", "")
                return {"status": "upgraded-node", "node": node_name}

            return {
                "status": "upgraded-nodes" if not dry_run else "upgrade-dryrun",
                "upgraded_count": len(upgraded_nodes),
                "upgraded_nodes": upgraded_nodes if upgraded_nodes else None,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }

    def _force_stop_node(
        self, service_name: Optional[str], dry_run: bool, count: int = 1
    ) -> Dict[str, Any]:
        """Force stop nodes (specific or youngest running nodes by age).

        Args:
            service_name: Optional comma-separated list of service names
            dry_run: If True, log without executing
            count: Number of nodes to stop when service_name is not specified (default: 1)

        Returns:
            Dictionary with execution result
        """
        # Parse comma-separated service names
        service_names = parse_service_names(service_name)

        if service_names:
            # Stop specific nodes
            stopped_nodes = []
            failed_nodes = []

            for name in service_names:
                node = self._get_node_by_name(name)
                if not node:
                    failed_nodes.append({"service": name, "error": "not found"})
                    continue

                logging.info(f"Forced action: Stopping node {name}")
                if dry_run:
                    logging.warning(f"DRYRUN: Stop node {name}")
                    stopped_nodes.append(name)
                else:
                    try:
                        manager = self._get_process_manager(node)
                        manager.stop_node(node)
                        self._set_node_status(node.id, STOPPED)
                        stopped_nodes.append(name)
                    except Exception as e:
                        logging.error(f"Failed to stop node {name}: {e}")
                        failed_nodes.append({"service": name, "error": str(e)})

            return {
                "status": "stopped-nodes" if not dry_run else "stop-dryrun",
                "stopped_count": len(stopped_nodes),
                "stopped_nodes": stopped_nodes,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }
        else:
            # Stop youngest running nodes (default behavior - highest age values)
            if count < 1:
                return {"status": "error", "message": "count must be at least 1"}

            logging.info(f"Forced action: Stopping {count} youngest running node(s)")

            # Get youngest running nodes (highest age values)
            with self.S() as session:
                youngest_nodes = session.execute(
                    select(Node)
                    .where(Node.status == RUNNING)
                    .order_by(Node.age.desc())
                    .limit(count)
                ).all()

            if not youngest_nodes:
                return {"status": "error", "message": "No running nodes to stop"}

            if len(youngest_nodes) < count:
                logging.warning(f"Only {len(youngest_nodes)} running nodes available, stopping all of them")

            stopped_nodes = []
            failed_nodes = []

            for row in youngest_nodes:
                node = row[0]
                if dry_run:
                    logging.warning(f"DRYRUN: Stop youngest node {node.node_name}")
                    stopped_nodes.append(node.service.replace(".service", ""))
                else:
                    try:
                        manager = self._get_process_manager(node)
                        manager.stop_node(node)
                        self._set_node_status(node.id, STOPPED)
                        stopped_nodes.append(node.service.replace(".service", ""))
                    except Exception as e:
                        logging.error(f"Failed to stop node {node.node_name}: {e}")
                        failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})

            if count == 1 and len(stopped_nodes) == 1:
                # Keep backward compatibility for single node
                # Extract node name from service name (e.g., "antnode0001" -> "0001")
                node_name = stopped_nodes[0].replace("antnode", "")
                return {"status": "stopped-node", "node": node_name}

            return {
                "status": "stopped-nodes" if not dry_run else "stop-dryrun",
                "stopped_count": len(stopped_nodes),
                "stopped_nodes": stopped_nodes if stopped_nodes else None,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }

    def _force_start_node(
        self, service_name: Optional[str], metrics: Dict[str, Any], dry_run: bool, count: int = 1
    ) -> Dict[str, Any]:
        """Force start nodes (specific or oldest stopped nodes by age).

        Args:
            service_name: Optional comma-separated list of service names
            metrics: Current system metrics
            dry_run: If True, log without executing
            count: Number of nodes to start when service_name is not specified (default: 1)

        Returns:
            Dictionary with execution result
        """
        # Parse comma-separated service names
        service_names = parse_service_names(service_name)

        if service_names:
            # Start specific nodes
            started_nodes = []
            upgraded_nodes = []
            failed_nodes = []

            for name in service_names:
                node = self._get_node_by_name(name)
                if not node:
                    failed_nodes.append({"service": name, "error": "not found"})
                    continue

                if node.status == RUNNING:
                    failed_nodes.append({"service": name, "error": "already running"})
                    continue

                logging.info(f"Forced action: Starting node {name}")
                if dry_run:
                    logging.warning(f"DRYRUN: Start node {name}")
                    started_nodes.append(name)
                else:
                    try:
                        # Check if node needs upgrade
                        if not node.version:
                            node.version = get_antnode_version(node.binary)

                        # If the stopped version is old, upgrade it (which also starts it)
                        if Version(metrics["antnode_version"]) > Version(node.version):
                            if not self._upgrade_node_binary(node, metrics["antnode_version"]):
                                failed_nodes.append({"service": name, "error": "upgrade failed"})
                            else:
                                upgraded_nodes.append(name)
                        else:
                            manager = self._get_process_manager(node)
                            if manager.start_node(node):
                                self._set_node_status(node.id, RESTARTING)
                                started_nodes.append(name)
                            else:
                                failed_nodes.append({"service": name, "error": "start failed"})
                    except Exception as e:
                        logging.error(f"Failed to start node {name}: {e}")
                        failed_nodes.append({"service": name, "error": str(e)})

            return {
                "status": "started-nodes" if not dry_run else "start-dryrun",
                "started_count": len(started_nodes),
                "started_nodes": started_nodes,
                "upgraded_count": len(upgraded_nodes),
                "upgraded_nodes": upgraded_nodes if upgraded_nodes else None,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }
        else:
            # Start oldest stopped nodes (default behavior - lowest age values)
            if count < 1:
                return {"status": "error", "message": "count must be at least 1"}

            logging.info(f"Forced action: Starting {count} oldest stopped node(s)")

            # Get oldest stopped nodes (lowest age values)
            with self.S() as session:
                oldest_nodes = session.execute(
                    select(Node)
                    .where(Node.status == STOPPED)
                    .order_by(Node.age.asc())
                    .limit(count)
                ).all()

            if not oldest_nodes:
                return {"status": "error", "message": "No stopped nodes to start"}

            if len(oldest_nodes) < count:
                logging.warning(f"Only {len(oldest_nodes)} stopped nodes available, starting all of them")

            started_nodes = []
            upgraded_nodes = []
            failed_nodes = []

            for row in oldest_nodes:
                node = row[0]
                if dry_run:
                    logging.warning(f"DRYRUN: Start oldest stopped node {node.node_name}")
                    started_nodes.append(node.service.replace(".service", ""))
                else:
                    try:
                        # Check if node needs upgrade
                        if not node.version:
                            node.version = get_antnode_version(node.binary)

                        # If the stopped version is old, upgrade it (which also starts it)
                        if Version(metrics["antnode_version"]) > Version(node.version):
                            if not self._upgrade_node_binary(node, metrics["antnode_version"]):
                                failed_nodes.append({"service": node.service.replace(".service", ""), "error": "upgrade failed"})
                            else:
                                upgraded_nodes.append(node.service.replace(".service", ""))
                        else:
                            manager = self._get_process_manager(node)
                            if manager.start_node(node):
                                self._set_node_status(node.id, RESTARTING)
                                started_nodes.append(node.service.replace(".service", ""))
                            else:
                                failed_nodes.append({"service": node.service.replace(".service", ""), "error": "start failed"})
                    except Exception as e:
                        logging.error(f"Failed to start node {node.node_name}: {e}")
                        failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})

            if count == 1 and len(started_nodes) == 1:
                # Keep backward compatibility for single node
                # Extract node name from service name (e.g., "antnode0001" -> "0001")
                node_name = started_nodes[0].replace("antnode", "")
                return {"status": "started-node", "node": node_name}
            elif count == 1 and len(upgraded_nodes) == 1:
                # Keep backward compatibility for single node upgrade
                # Extract node name from service name (e.g., "antnode0001" -> "0001")
                node_name = upgraded_nodes[0].replace("antnode", "")
                return {"status": "upgrading-node", "node": node_name}

            return {
                "status": "started-nodes" if not dry_run else "start-dryrun",
                "started_count": len(started_nodes),
                "started_nodes": started_nodes if started_nodes else None,
                "upgraded_count": len(upgraded_nodes),
                "upgraded_nodes": upgraded_nodes if upgraded_nodes else None,
                "failed_count": len(failed_nodes),
                "failed_nodes": failed_nodes if failed_nodes else None,
            }

    def _force_disable_node(
        self, service_name: Optional[str], dry_run: bool
    ) -> Dict[str, Any]:
        """Force disable a specific node (service_name required)."""
        if not service_name:
            return {"status": "error", "message": "service_name required for disable action"}

        # Parse comma-separated service names
        service_names = parse_service_names(service_name)

        disabled_nodes = []
        failed_nodes = []

        for name in service_names:
            node = self._get_node_by_name(name)
            if not node:
                failed_nodes.append({"service": name, "error": "not found"})
                continue

            logging.info(f"Forced action: Disabling node {name}")
            if dry_run:
                logging.warning(f"DRYRUN: Disable node {name}")
                disabled_nodes.append(name)
            else:
                try:
                    # Stop the node if it's running
                    if node.status == RUNNING:
                        manager = self._get_process_manager(node)
                        manager.stop_node(node)
                    self._set_node_status(node.id, DISABLED)
                    disabled_nodes.append(name)
                except Exception as e:
                    logging.error(f"Failed to disable node {name}: {e}")
                    failed_nodes.append({"service": name, "error": str(e)})

        return {
            "status": "disabled-nodes" if not dry_run else "disable-dryrun",
            "disabled_count": len(disabled_nodes),
            "disabled_nodes": disabled_nodes,
            "failed_count": len(failed_nodes),
            "failed_nodes": failed_nodes if failed_nodes else None,
        }

    def _force_teardown_cluster(
        self, machine_config: Dict[str, Any], dry_run: bool
    ) -> Dict[str, Any]:
        """Force teardown the entire cluster."""
        logging.info("Forced action: Tearing down cluster")

        # Get all nodes
        with self.S() as session:
            all_nodes = session.execute(
                select(Node).order_by(Node.id.asc())
            ).all()

        if not all_nodes:
            return {"status": "no-nodes", "message": "No nodes to teardown"}

        # Get the process manager (use the first node's manager or default)
        if all_nodes:
            sample_node = all_nodes[0][0]
            manager = self._get_process_manager(sample_node)
        else:
            manager = get_process_manager()

        # Try manager-specific teardown first
        if hasattr(manager, 'teardown_cluster'):
            logging.info(f"Using {manager.__class__.__name__} teardown_cluster method")
            if dry_run:
                logging.warning("DRYRUN: Teardown cluster via manager")
            else:
                if manager.teardown_cluster():
                    # Remove all nodes from database
                    with self.S() as session:
                        session.query(Node).delete()
                        session.commit()
                    return {"status": "cluster-teardown", "method": "manager-specific"}

        # Fall back to removing each node individually (without delay)
        logging.info("Using default teardown (remove all nodes)")
        removed_count = 0
        for row in all_nodes:
            node = row[0]
            if dry_run:
                logging.warning(f"DRYRUN: Remove node {node.node_name}")
                removed_count += 1
            else:
                try:
                    manager = self._get_process_manager(node)
                    manager.remove_node(node)
                    with self.S() as session:
                        session.delete(node)
                        session.commit()
                    removed_count += 1
                    logging.info(f"Removed node {node.node_name}")
                except Exception as e:
                    logging.error(f"Failed to remove node {node.node_name}: {e}")

        return {
            "status": "cluster-teardown",
            "method": "individual-remove",
            "removed_count": removed_count,
        }

    def _survey_specific_nodes(self, service_names: List[str], dry_run: bool) -> Dict[str, Any]:
        """Survey specific nodes by service name.

        Args:
            service_names: List of service names to survey
            dry_run: If True, log without executing

        Returns:
            Dictionary with survey results
        """
        from wnm.utils import read_node_metrics, read_node_metadata, update_node_from_metrics

        surveyed_nodes = []
        failed_nodes = []

        for service_name in service_names:
            node = self._get_node_by_name(service_name)
            if not node:
                failed_nodes.append({"service": service_name, "error": "not found"})
                continue

            if node.status == DISABLED:
                failed_nodes.append({"service": service_name, "error": "disabled"})
                continue

            if dry_run:
                logging.warning(f"DRYRUN: Survey node {service_name}")
                surveyed_nodes.append(service_name)
            else:
                logging.info(f"Surveying node {service_name}")

                # Check metadata first
                node_metadata = read_node_metadata(node.host, node.metrics_port)

                # If metadata fails, fake metrics with 0's
                if node_metadata["status"] == STOPPED:
                    node_metrics = {
                        "status": STOPPED,
                        "uptime": 0,
                        "records": 0,
                        "shunned": 0,
                        "connected_peers": 0
                    }
                else:
                    # Metadata succeeded, now get metrics
                    node_metrics = read_node_metrics(node.host, node.metrics_port)

                # Skip update if node is stopped and already marked as stopped
                if node_metadata["status"] == STOPPED and node.status == STOPPED:
                    surveyed_nodes.append(service_name)
                    continue

                update_node_from_metrics(self.S, node.id, node_metrics, node_metadata)
                surveyed_nodes.append(service_name)

        return {
            "status": "survey-complete" if not dry_run else "survey-dryrun",
            "surveyed_count": len(surveyed_nodes),
            "surveyed_nodes": surveyed_nodes,
            "failed_count": len(failed_nodes),
            "failed_nodes": failed_nodes if failed_nodes else None,
        }

    def _force_survey_nodes(self, service_name: Optional[str] = None, dry_run: bool = False) -> Dict[str, Any]:
        """Force a survey of all nodes or specific nodes to update their status and metrics.

        Args:
            service_name: Optional comma-separated list of service names to survey
            dry_run: If True, log without executing

        Returns:
            Dictionary with survey results
        """
        # Parse service names if provided
        service_names = parse_service_names(service_name)

        if service_names:
            # Survey specific nodes
            logging.info(f"Forced action: Surveying {len(service_names)} specific nodes")
            return self._survey_specific_nodes(service_names, dry_run)
        else:
            # Survey all nodes
            logging.info("Forced action: Surveying all nodes")

            if dry_run:
                logging.warning("DRYRUN: Survey all nodes")
                # Get count of non-disabled nodes
                with self.S() as session:
                    from wnm.common import DISABLED
                    node_count = session.execute(
                        select(func.count(Node.id)).where(Node.status != DISABLED)
                    ).scalar()
                return {"status": "survey-dryrun", "node_count": node_count}

            # Update all nodes
            update_nodes(self.S)

            # Get updated count
            with self.S() as session:
                from wnm.common import DISABLED
                node_count = session.execute(
                    select(func.count(Node.id)).where(Node.status != DISABLED)
                ).scalar()

            return {"status": "survey-complete", "node_count": node_count}