wnm-0.0.12-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

wnm/executor.py ADDED
@@ -0,0 +1,1299 @@
1
+ """Action executor for performing node lifecycle operations.
2
+
3
+ This module contains the ActionExecutor class which takes planned actions
4
+ from the DecisionEngine and executes them using ProcessManager abstractions.
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import shutil
10
+ import subprocess
11
+ import time
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from packaging.version import Version
15
+ from sqlalchemy import func, insert, select, text
16
+ from sqlalchemy.orm import scoped_session
17
+
18
+ from wnm.actions import Action, ActionType
19
+ from wnm.common import (
20
+ DEAD,
21
+ DISABLED,
22
+ METRICS_PORT_BASE,
23
+ PORT_MULTIPLIER,
24
+ REMOVING,
25
+ RESTARTING,
26
+ RUNNING,
27
+ STOPPED,
28
+ UPGRADING,
29
+ )
30
+ from wnm.config import LOG_DIR
31
+ from wnm.models import Machine, Node
32
+ from wnm.process_managers.factory import get_default_manager_type, get_process_manager
33
+ from wnm.utils import (
34
+ get_antnode_version,
35
+ parse_service_names,
36
+ update_nodes,
37
+ )
38
+ from wnm.wallets import select_wallet_for_node
39
+
40
+
41
+ class ActionExecutor:
42
+ """Executes planned actions on nodes.
43
+
44
+ The ActionExecutor takes Action objects from the DecisionEngine and
45
+ performs the actual operations by calling utility functions and
46
+ managing database state.
47
+ """
48
+
49
+ def __init__(self, session_factory: scoped_session):
50
+ """Initialize the action executor.
51
+
52
+ Args:
53
+ session_factory: SQLAlchemy session factory for database operations
54
+ """
55
+ self.S = session_factory
56
+
57
+ def _get_process_manager(self, node: Node):
58
+ """Get the appropriate process manager for a node.
59
+
60
+ Args:
61
+ node: Node database record
62
+
63
+ Returns:
64
+ ProcessManager instance for the node's manager type
65
+ """
66
+ # Get manager type from node, or use machine config default
67
+ manager_type = getattr(node, "manager_type", None)
68
+ return get_process_manager(manager_type)
69
+
70
+ def _set_node_status(self, node_id: int, status: str) -> bool:
71
+ """Update node status in database.
72
+
73
+ Args:
74
+ node_id: ID of the node
75
+ status: New status to set
76
+
77
+ Returns:
78
+ True if status was updated successfully
79
+ """
80
+ try:
81
+ with self.S() as session:
82
+ session.query(Node).filter(Node.id == node_id).update(
83
+ {"status": status, "timestamp": int(time.time())}
84
+ )
85
+ session.commit()
86
+ return True
87
+ except Exception as e:
88
+ logging.error(f"Failed to set node status for {node_id}: {e}")
89
+ return False
90
+
91
+ def _upgrade_node_binary(self, node: Node, new_version: str) -> bool:
92
+ """Upgrade a node's binary and restart it.
93
+
94
+ Args:
95
+ node: Node to upgrade
96
+ new_version: Version string for the new binary
97
+
98
+ Returns:
99
+ True if upgrade succeeded
100
+ """
101
+ # Source binary location
102
+ source_binary = os.path.expanduser("~/.local/bin/antnode")
103
+
104
+ # Copy new binary to node directory
105
+ try:
106
+ shutil.copy2(source_binary, node.binary)
107
+ os.chmod(node.binary, 0o755)
108
+ logging.info(f"Copied new binary from {source_binary} to {node.binary}")
109
+ except (OSError, shutil.Error) as err:
110
+ logging.error(f"Failed to copy binary for upgrade: {err}")
111
+ return False
112
+
113
+ # Restart the node with new binary
114
+ manager = self._get_process_manager(node)
115
+ if not manager.restart_node(node):
116
+ logging.error(f"Failed to restart node {node.id} during upgrade")
117
+ return False
118
+
119
+ # Update status to UPGRADING
120
+ with self.S() as session:
121
+ session.query(Node).filter(Node.id == node.id).update(
122
+ {
123
+ "status": UPGRADING,
124
+ "timestamp": int(time.time()),
125
+ "version": new_version,
126
+ }
127
+ )
128
+ session.commit()
129
+
130
+ return True
131
+
132
+ def execute(
133
+ self,
134
+ actions: List[Action],
135
+ machine_config: Dict[str, Any],
136
+ metrics: Dict[str, Any],
137
+ dry_run: bool = False,
138
+ ) -> Dict[str, Any]:
139
+ """Execute a list of actions.
140
+
141
+ Args:
142
+ actions: List of Action objects to execute
143
+ machine_config: Machine configuration dictionary
144
+ metrics: Current system metrics
145
+ dry_run: If True, log actions without executing them
146
+
147
+ Returns:
148
+ Dictionary with execution status and results
149
+ """
150
+ if not actions:
151
+ return {"status": "no-actions", "results": []}
152
+
153
+ results = []
154
+
155
+ for action in actions:
156
+ logging.info(
157
+ f"Executing: {action.type.value} (priority={action.priority}, reason={action.reason})"
158
+ )
159
+
160
+ try:
161
+ result = self._execute_action(action, machine_config, metrics, dry_run)
162
+ results.append(result)
163
+ except Exception as e:
164
+ logging.error(f"Failed to execute {action.type.value}: {e}")
165
+ results.append(
166
+ {"action": action.type.value, "success": False, "error": str(e)}
167
+ )
168
+
169
+ # Return status from the first (highest priority) action
170
+ if results:
171
+ return results[0]
172
+ return {"status": "no-results"}
173
+
174
+ def _execute_action(
175
+ self,
176
+ action: Action,
177
+ machine_config: Dict[str, Any],
178
+ metrics: Dict[str, Any],
179
+ dry_run: bool,
180
+ ) -> Dict[str, Any]:
181
+ """Execute a single action.
182
+
183
+ Args:
184
+ action: The action to execute
185
+ machine_config: Machine configuration
186
+ metrics: Current metrics
187
+ dry_run: If True, log without executing
188
+
189
+ Returns:
190
+ Dictionary with execution result
191
+ """
192
+ if action.type == ActionType.RESURVEY_NODES:
193
+ return self._execute_resurvey(machine_config, dry_run)
194
+
195
+ elif action.type == ActionType.REMOVE_NODE:
196
+ return self._execute_remove_node(action, dry_run)
197
+
198
+ elif action.type == ActionType.STOP_NODE:
199
+ return self._execute_stop_node(machine_config, dry_run)
200
+
201
+ elif action.type == ActionType.UPGRADE_NODE:
202
+ return self._execute_upgrade_node(metrics, dry_run)
203
+
204
+ elif action.type == ActionType.START_NODE:
205
+ return self._execute_start_node(metrics, dry_run)
206
+
207
+ elif action.type == ActionType.ADD_NODE:
208
+ return self._execute_add_node(machine_config, metrics, dry_run)
209
+
210
+ elif action.type == ActionType.SURVEY_NODES:
211
+ return self._execute_survey(dry_run)
212
+
213
+ else:
214
+ logging.warning(f"Unknown action type: {action.type}")
215
+ return {"status": "unknown-action", "action": action.type.value}
216
+
217
+ def _execute_resurvey(
218
+ self, machine_config: Dict[str, Any], dry_run: bool
219
+ ) -> Dict[str, Any]:
220
+ """Execute node resurvey after system reboot."""
221
+ if dry_run:
222
+ logging.warning("DRYRUN: System rebooted, survey nodes")
223
+ else:
224
+ update_nodes(self.S)
225
+ # Update the last stopped time
226
+ with self.S() as session:
227
+ session.query(Machine).filter(Machine.id == 1).update(
228
+ {"last_stopped_at": int(time.time())}
229
+ )
230
+ session.commit()
231
+
232
+ return {"status": "system-rebooted"}
233
+
234
+ def _execute_remove_node(self, action: Action, dry_run: bool) -> Dict[str, Any]:
235
+ """Execute node removal.
236
+
237
+ If reason contains 'dead', remove all dead nodes.
238
+ Otherwise, remove youngest stopped or running node based on reason.
239
+ """
240
+ if "dead" in action.reason.lower():
241
+ # Remove all dead nodes
242
+ if dry_run:
243
+ logging.warning("DRYRUN: Remove Dead Nodes")
244
+ else:
245
+ with self.S() as session:
246
+ broken = session.execute(
247
+ select(Node)
248
+ .where(Node.status == DEAD)
249
+ .order_by(Node.timestamp.asc())
250
+ ).all()
251
+
252
+ for row in broken:
253
+ node = row[0]
254
+ logging.info(f"Removing dead node {node.id}")
255
+ manager = self._get_process_manager(node)
256
+ manager.remove_node(node)
257
+ # Delete from database immediately (no delay for dead nodes)
258
+ with self.S() as session:
259
+ session.delete(node)
260
+ session.commit()
261
+
262
+ return {"status": "removed-dead-nodes"}
263
+
264
+ elif "stopped" in action.reason.lower():
265
+ # Remove youngest stopped node
266
+ with self.S() as session:
267
+ youngest = session.execute(
268
+ select(Node).where(Node.status == STOPPED).order_by(Node.age.desc())
269
+ ).first()
270
+
271
+ if youngest:
272
+ if dry_run:
273
+ logging.warning("DRYRUN: Remove youngest stopped node")
274
+ else:
275
+ node = youngest[0]
276
+ manager = self._get_process_manager(node)
277
+ manager.remove_node(node)
278
+ # Delete from database immediately (no delay for stopped nodes)
279
+ with self.S() as session:
280
+ session.delete(node)
281
+ session.commit()
282
+ return {"status": "removed-stopped-node"}
283
+ else:
284
+ return {"status": "no-stopped-nodes-to-remove"}
285
+
286
+ else:
287
+ # Remove youngest running node (with delay)
288
+ with self.S() as session:
289
+ youngest = session.execute(
290
+ select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
291
+ ).first()
292
+
293
+ if youngest:
294
+ if dry_run:
295
+ logging.warning("DRYRUN: Remove youngest running node")
296
+ else:
297
+ node = youngest[0]
298
+ manager = self._get_process_manager(node)
299
+ manager.stop_node(node)
300
+ # Mark as REMOVING (will be deleted later after delay)
301
+ self._set_node_status(node.id, REMOVING)
302
+ return {"status": "removed-running-node"}
303
+ else:
304
+ return {"status": "no-running-nodes-to-remove"}
305
+
306
+ def _execute_stop_node(
307
+ self, machine_config: Dict[str, Any], dry_run: bool
308
+ ) -> Dict[str, Any]:
309
+ """Execute node stop (to reduce resource usage)."""
310
+ with self.S() as session:
311
+ youngest = session.execute(
312
+ select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
313
+ ).first()
314
+
315
+ if youngest:
316
+ if dry_run:
317
+ logging.warning("DRYRUN: Stopping youngest node")
318
+ else:
319
+ node = youngest[0]
320
+ manager = self._get_process_manager(node)
321
+ manager.stop_node(node)
322
+ self._set_node_status(node.id, STOPPED)
323
+ # Update the last stopped time
324
+ with self.S() as session:
325
+ session.query(Machine).filter(Machine.id == 1).update(
326
+ {"last_stopped_at": int(time.time())}
327
+ )
328
+ session.commit()
329
+ return {"status": "stopped-node"}
330
+ else:
331
+ return {"status": "no-nodes-to-stop"}
332
+
333
+ def _execute_upgrade_node(
334
+ self, metrics: Dict[str, Any], dry_run: bool
335
+ ) -> Dict[str, Any]:
336
+ """Execute node upgrade (oldest running node with outdated version)."""
337
+ with self.S() as session:
338
+ oldest = session.execute(
339
+ select(Node)
340
+ .where(Node.status == RUNNING)
341
+ .where(Node.version != metrics["antnode_version"])
342
+ .order_by(Node.age.asc())
343
+ ).first()
344
+
345
+ if oldest:
346
+ if dry_run:
347
+ logging.warning("DRYRUN: Upgrade oldest node")
348
+ else:
349
+ node = oldest[0]
350
+ # If we don't have a version number from metadata, grab from binary
351
+ if not node.version:
352
+ node.version = get_antnode_version(node.binary)
353
+
354
+ # Perform the upgrade (copies binary, restarts, sets UPGRADING status)
355
+ if not self._upgrade_node_binary(node, metrics["antnode_version"]):
356
+ return {"status": "upgrade-failed"}
357
+
358
+ return {"status": "upgrading-node"}
359
+ else:
360
+ return {"status": "no-nodes-to-upgrade"}
361
+
362
+ def _execute_start_node(
363
+ self, metrics: Dict[str, Any], dry_run: bool
364
+ ) -> Dict[str, Any]:
365
+ """Execute starting a stopped node (may upgrade first if needed)."""
366
+ with self.S() as session:
367
+ oldest = session.execute(
368
+ select(Node).where(Node.status == STOPPED).order_by(Node.age.asc())
369
+ ).first()
370
+
371
+ if oldest:
372
+ node = oldest[0]
373
+ # If we don't have a version number from metadata, grab from binary
374
+ if not node.version:
375
+ node.version = get_antnode_version(node.binary)
376
+
377
+ # If the stopped version is old, upgrade it (which also starts it)
378
+ if Version(metrics["antnode_version"]) > Version(node.version):
379
+ if dry_run:
380
+ logging.warning("DRYRUN: Upgrade and start stopped node")
381
+ else:
382
+ # Perform the upgrade (copies binary, restarts, sets UPGRADING status)
383
+ if not self._upgrade_node_binary(node, metrics["antnode_version"]):
384
+ return {"status": "failed-upgrade"}
385
+ return {"status": "upgrading-stopped-node"}
386
+ else:
387
+ if dry_run:
388
+ logging.warning("DRYRUN: Start stopped node")
389
+ return {"status": "starting-node"}
390
+ else:
391
+ manager = self._get_process_manager(node)
392
+ if manager.start_node(node):
393
+ self._set_node_status(node.id, RESTARTING)
394
+ return {"status": "started-node"}
395
+ else:
396
+ return {"status": "failed-start-node"}
397
+ else:
398
+ return {"status": "no-stopped-nodes"}
399
+
400
+ def _execute_add_node(
401
+ self, machine_config: Dict[str, Any], metrics: Dict[str, Any], dry_run: bool
402
+ ) -> Dict[str, Any]:
403
+ """Execute adding a new node."""
404
+ if dry_run:
405
+ logging.warning("DRYRUN: Add a node")
406
+ return {"status": "add-node"}
407
+
408
+ # Find next available node ID (look for holes first)
409
+ # First check if node 1 exists
410
+ with self.S() as session:
411
+ node_1_exists = session.execute(
412
+ select(Node.id).where(Node.id == 1)
413
+ ).first()
414
+
415
+ if not node_1_exists:
416
+ # Node 1 is available, use it
417
+ node_id = 1
418
+ else:
419
+ # Look for holes in the sequence
420
+ sql = text(
421
+ "select n1.id + 1 as id from node n1 "
422
+ + "left join node n2 on n2.id = n1.id + 1 "
423
+ + "where n2.id is null "
424
+ + "and n1.id <> (select max(id) from node) "
425
+ + "order by n1.id;"
426
+ )
427
+ with self.S() as session:
428
+ result = session.execute(sql).first()
429
+
430
+ if result:
431
+ node_id = result[0]
432
+ else:
433
+ # No holes, use max + 1
434
+ with self.S() as session:
435
+ result = session.execute(
436
+ select(Node.id).order_by(Node.id.desc())
437
+ ).first()
438
+ node_id = result[0] + 1 if result else 1
439
+
440
+ # Determine the appropriate manager type for this system
441
+ manager_type = get_default_manager_type()
442
+
443
+ # Select wallet for this node from weighted distribution
444
+ selected_wallet = select_wallet_for_node(
445
+ machine_config["rewards_address"],
446
+ machine_config["donate_address"]
447
+ )
448
+
449
+ # Create node object
450
+ node = Node(
451
+ id=node_id,
452
+ node_name=f"{node_id:04}",
453
+ service=f"antnode{node_id:04}.service",
454
+ user=machine_config.get("user", "ant"),
455
+ version=metrics["antnode_version"],
456
+ root_dir=f"{machine_config['node_storage']}/antnode{node_id:04}",
457
+ binary=f"{machine_config['node_storage']}/antnode{node_id:04}/antnode",
458
+ port=machine_config["port_start"] * PORT_MULTIPLIER + node_id,
459
+ metrics_port=METRICS_PORT_BASE + node_id,
460
+ network="evm-arbitrum-one",
461
+ wallet=selected_wallet,
462
+ peer_id="",
463
+ status=STOPPED,
464
+ timestamp=int(time.time()),
465
+ records=0,
466
+ uptime=0,
467
+ shunned=0,
468
+ age=int(time.time()),
469
+ host=machine_config["host"],
470
+ method=manager_type,
471
+ layout="1",
472
+ environment=machine_config.get("environment", ""),
473
+ manager_type=manager_type,
474
+ )
475
+
476
+ # Insert into database
477
+ with self.S() as session:
478
+ session.add(node)
479
+ session.commit()
480
+ session.refresh(node) # Get the persisted node
481
+
482
+ # Create the node using process manager
483
+ source_binary = os.path.expanduser("~/.local/bin/antnode")
484
+ manager = self._get_process_manager(node)
485
+
486
+ if not manager.create_node(node, source_binary):
487
+ logging.error(f"Failed to create node {node.id}")
488
+ return {"status": "failed-create-node"}
489
+
490
+ # Update status to RESTARTING (node is starting up)
491
+ self._set_node_status(node.id, RESTARTING)
492
+
493
+ return {"status": "added-node"}
494
+
495
+ def _execute_survey(self, dry_run: bool) -> Dict[str, Any]:
496
+ """Execute node survey (idle monitoring)."""
497
+ if dry_run:
498
+ logging.warning("DRYRUN: Update nodes")
499
+ else:
500
+ update_nodes(self.S)
501
+ return {"status": "idle"}
502
+
503
+ def _parse_node_name(self, service_name: str) -> Optional[int]:
504
+ """Parse node ID from service name like 'antnode0001'.
505
+
506
+ Args:
507
+ service_name: Node name (e.g., 'antnode0001')
508
+
509
+ Returns:
510
+ Node ID as integer, or None if parsing fails
511
+ """
512
+ import re
513
+ match = re.match(r"antnode(\d+)", service_name)
514
+ if match:
515
+ return int(match.group(1))
516
+ return None
517
+
518
+ def _get_node_by_name(self, service_name: str) -> Optional[Node]:
519
+ """Get node by service name.
520
+
521
+ Args:
522
+ service_name: Node name (e.g., 'antnode0001')
523
+
524
+ Returns:
525
+ Node object or None if not found
526
+ """
527
+ node_id = self._parse_node_name(service_name)
528
+ if node_id is None:
529
+ logging.error(f"Invalid node name format: {service_name}")
530
+ return None
531
+
532
+ with self.S() as session:
533
+ result = session.execute(
534
+ select(Node).where(Node.id == node_id)
535
+ ).first()
536
+
537
+ if result:
538
+ return result[0]
539
+ else:
540
+ logging.error(f"Node not found: {service_name} (id={node_id})")
541
+ return None
542
+
543
+ def execute_forced_action(
544
+ self,
545
+ action_type: str,
546
+ machine_config: Dict[str, Any],
547
+ metrics: Dict[str, Any],
548
+ service_name: Optional[str] = None,
549
+ dry_run: bool = False,
550
+ count: int = 1,
551
+ ) -> Dict[str, Any]:
552
+ """Execute a forced action bypassing the decision engine.
553
+
554
+ Args:
555
+             action_type: Type of action ('add', 'remove', 'upgrade', 'start', 'stop', 'disable', 'teardown', 'survey')
556
+ machine_config: Machine configuration
557
+ metrics: Current system metrics
558
+ service_name: Optional node name for targeted operations
559
+ dry_run: If True, log without executing
560
+ count: Number of nodes to affect (for add, remove, start, stop, upgrade actions)
561
+
562
+ Returns:
563
+ Dictionary with execution result
564
+ """
565
+ if action_type == "add":
566
+ return self._force_add_node(machine_config, metrics, dry_run, count)
567
+ elif action_type == "remove":
568
+ return self._force_remove_node(service_name, dry_run, count)
569
+ elif action_type == "upgrade":
570
+ return self._force_upgrade_node(service_name, metrics, dry_run, count)
571
+ elif action_type == "start":
572
+ return self._force_start_node(service_name, metrics, dry_run, count)
573
+ elif action_type == "stop":
574
+ return self._force_stop_node(service_name, dry_run, count)
575
+ elif action_type == "disable":
576
+ return self._force_disable_node(service_name, dry_run)
577
+ elif action_type == "teardown":
578
+ return self._force_teardown_cluster(machine_config, dry_run)
579
+ elif action_type == "survey":
580
+ return self._force_survey_nodes(service_name, dry_run)
581
+ else:
582
+ return {"status": "error", "message": f"Unknown action type: {action_type}"}
583
+
584
+ def _force_add_node(
585
+ self, machine_config: Dict[str, Any], metrics: Dict[str, Any], dry_run: bool, count: int = 1
586
+ ) -> Dict[str, Any]:
587
+ """Force add new nodes.
588
+
589
+ Args:
590
+ machine_config: Machine configuration
591
+ metrics: Current system metrics
592
+ dry_run: If True, log without executing
593
+ count: Number of nodes to add (default: 1)
594
+
595
+ Returns:
596
+ Dictionary with execution result
597
+ """
598
+ logging.info(f"Forced action: Adding {count} node(s)")
599
+
600
+ if count < 1:
601
+ return {"status": "error", "message": "count must be at least 1"}
602
+
603
+ added_nodes = []
604
+ failed_nodes = []
605
+
606
+ # Track the start time to identify newly created nodes
607
+ start_time = int(time.time())
608
+
609
+ for i in range(count):
610
+ result = self._execute_add_node(machine_config, metrics, dry_run)
611
+ if result["status"] in ["added-node", "add-node"]:
612
+ # Get the node that was just added (youngest by age >= start_time)
613
+ if not dry_run:
614
+ with self.S() as session:
615
+ newest = session.execute(
616
+ select(Node).where(Node.age >= start_time).order_by(Node.age.desc())
617
+ ).first()
618
+ if newest:
619
+ added_nodes.append(newest[0].service.replace(".service", ""))
620
+ else:
621
+ added_nodes.append(f"node-{i+1}")
622
+ else:
623
+ failed_nodes.append({"index": i+1, "error": result.get("status", "unknown error")})
624
+
625
+ if count == 1:
626
+ # Keep backward compatibility for single node
627
+ return result
628
+
629
+ return {
630
+ "status": "added-nodes" if not dry_run else "add-nodes-dryrun",
631
+ "added_count": len(added_nodes),
632
+ "added_nodes": added_nodes if added_nodes else None,
633
+ "failed_count": len(failed_nodes),
634
+ "failed_nodes": failed_nodes if failed_nodes else None,
635
+ }
636
+
637
+ def _force_remove_node(
638
+ self, service_name: Optional[str], dry_run: bool, count: int = 1
639
+ ) -> Dict[str, Any]:
640
+ """Force remove nodes (specific or youngest by age).
641
+
642
+ Args:
643
+ service_name: Optional comma-separated list of service names
644
+ dry_run: If True, log without executing
645
+ count: Number of nodes to remove when service_name is not specified (default: 1)
646
+
647
+ Returns:
648
+ Dictionary with execution result
649
+ """
650
+ # Parse comma-separated service names
651
+ service_names = parse_service_names(service_name)
652
+
653
+ if service_names:
654
+ # Remove specific nodes
655
+ removed_nodes = []
656
+ failed_nodes = []
657
+
658
+ for name in service_names:
659
+ node = self._get_node_by_name(name)
660
+ if not node:
661
+ failed_nodes.append({"service": name, "error": "not found"})
662
+ continue
663
+
664
+ logging.info(f"Forced action: Removing node {name}")
665
+ if dry_run:
666
+ logging.warning(f"DRYRUN: Remove node {name}")
667
+ removed_nodes.append(name)
668
+ else:
669
+ try:
670
+ manager = self._get_process_manager(node)
671
+ manager.remove_node(node)
672
+ # Remove from database immediately
673
+ with self.S() as session:
674
+ session.delete(node)
675
+ session.commit()
676
+ removed_nodes.append(name)
677
+ except Exception as e:
678
+ logging.error(f"Failed to remove node {name}: {e}")
679
+ failed_nodes.append({"service": name, "error": str(e)})
680
+
681
+ return {
682
+ "status": "removed-nodes" if not dry_run else "remove-dryrun",
683
+ "removed_count": len(removed_nodes),
684
+ "removed_nodes": removed_nodes,
685
+ "failed_count": len(failed_nodes),
686
+ "failed_nodes": failed_nodes if failed_nodes else None,
687
+ }
688
+ else:
689
+ # Remove youngest nodes (default behavior - highest age value)
690
+ if count < 1:
691
+ return {"status": "error", "message": "count must be at least 1"}
692
+
693
+ logging.info(f"Forced action: Removing {count} youngest node(s)")
694
+
695
+ # Get youngest nodes (highest age values)
696
+ with self.S() as session:
697
+ youngest_nodes = session.execute(
698
+ select(Node).order_by(Node.age.desc()).limit(count)
699
+ ).all()
700
+
701
+ if not youngest_nodes:
702
+ return {"status": "error", "message": "No nodes to remove"}
703
+
704
+ if len(youngest_nodes) < count:
705
+ logging.warning(f"Only {len(youngest_nodes)} nodes available, removing all of them")
706
+
707
+ removed_nodes = []
708
+ failed_nodes = []
709
+
710
+ for row in youngest_nodes:
711
+ node = row[0]
712
+ if dry_run:
713
+ logging.warning(f"DRYRUN: Remove youngest node {node.node_name}")
714
+ removed_nodes.append(node.service.replace(".service", ""))
715
+ else:
716
+ try:
717
+ manager = self._get_process_manager(node)
718
+ manager.remove_node(node)
719
+ # Remove from database immediately
720
+ with self.S() as session:
721
+ session.delete(node)
722
+ session.commit()
723
+ removed_nodes.append(node.service.replace(".service", ""))
724
+ except Exception as e:
725
+ logging.error(f"Failed to remove node {node.node_name}: {e}")
726
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})
727
+
728
+ if count == 1 and len(removed_nodes) == 1:
729
+ # Keep backward compatibility for single node
730
+ # Extract node name from service name (e.g., "antnode0001" -> "0001")
731
+ node_name = removed_nodes[0].replace("antnode", "")
732
+ return {"status": "removed-node", "node": node_name}
733
+
734
+ return {
735
+ "status": "removed-nodes" if not dry_run else "remove-dryrun",
736
+ "removed_count": len(removed_nodes),
737
+ "removed_nodes": removed_nodes if removed_nodes else None,
738
+ "failed_count": len(failed_nodes),
739
+ "failed_nodes": failed_nodes if failed_nodes else None,
740
+ }
741
+
742
+ def _force_upgrade_node(
743
+ self, service_name: Optional[str], metrics: Dict[str, Any], dry_run: bool, count: int = 1
744
+ ) -> Dict[str, Any]:
745
+ """Force upgrade nodes (specific or oldest running nodes by age).
746
+
747
+ Args:
748
+ service_name: Optional comma-separated list of service names
749
+ metrics: Current system metrics
750
+ dry_run: If True, log without executing
751
+ count: Number of nodes to upgrade when service_name is not specified (default: 1)
752
+
753
+ Returns:
754
+ Dictionary with execution result
755
+ """
756
+ # Parse comma-separated service names
757
+ service_names = parse_service_names(service_name)
758
+
759
+ if service_names:
760
+ # Upgrade specific nodes
761
+ upgraded_nodes = []
762
+ failed_nodes = []
763
+
764
+ for name in service_names:
765
+ node = self._get_node_by_name(name)
766
+ if not node:
767
+ failed_nodes.append({"service": name, "error": "not found"})
768
+ continue
769
+
770
+ logging.info(f"Forced action: Upgrading node {name}")
771
+ if dry_run:
772
+ logging.warning(f"DRYRUN: Upgrade node {name}")
773
+ upgraded_nodes.append(name)
774
+ else:
775
+ try:
776
+ if not self._upgrade_node_binary(node, metrics["antnode_version"]):
777
+ failed_nodes.append({"service": name, "error": "upgrade failed"})
778
+ else:
779
+ upgraded_nodes.append(name)
780
+ except Exception as e:
781
+ logging.error(f"Failed to upgrade node {name}: {e}")
782
+ failed_nodes.append({"service": name, "error": str(e)})
783
+
784
+ return {
785
+ "status": "upgraded-nodes" if not dry_run else "upgrade-dryrun",
786
+ "upgraded_count": len(upgraded_nodes),
787
+ "upgraded_nodes": upgraded_nodes,
788
+ "failed_count": len(failed_nodes),
789
+ "failed_nodes": failed_nodes if failed_nodes else None,
790
+ }
791
+ else:
792
+ # Upgrade oldest running nodes (default behavior - lowest age values)
793
+ if count < 1:
794
+ return {"status": "error", "message": "count must be at least 1"}
795
+
796
+ logging.info(f"Forced action: Upgrading {count} oldest running node(s)")
797
+
798
+ # Get oldest running nodes (lowest age values)
799
+ with self.S() as session:
800
+ oldest_nodes = session.execute(
801
+ select(Node)
802
+ .where(Node.status == RUNNING)
803
+ .order_by(Node.age.asc())
804
+ .limit(count)
805
+ ).all()
806
+
807
+ if not oldest_nodes:
808
+ return {"status": "error", "message": "No running nodes to upgrade"}
809
+
810
+ if len(oldest_nodes) < count:
811
+ logging.warning(f"Only {len(oldest_nodes)} running nodes available, upgrading all of them")
812
+
813
+ upgraded_nodes = []
814
+ failed_nodes = []
815
+
816
+ for row in oldest_nodes:
817
+ node = row[0]
818
+ if dry_run:
819
+ logging.warning(f"DRYRUN: Upgrade oldest node {node.node_name}")
820
+ upgraded_nodes.append(node.service.replace(".service", ""))
821
+ else:
822
+ try:
823
+ if not self._upgrade_node_binary(node, metrics["antnode_version"]):
824
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": "upgrade failed"})
825
+ else:
826
+ upgraded_nodes.append(node.service.replace(".service", ""))
827
+ except Exception as e:
828
+ logging.error(f"Failed to upgrade node {node.node_name}: {e}")
829
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})
830
+
831
+ if count == 1 and len(upgraded_nodes) == 1:
832
+ # Keep backward compatibility for single node
833
+ # Extract node name from service name (e.g., "antnode0001" -> "0001")
834
+ node_name = upgraded_nodes[0].replace("antnode", "")
835
+ return {"status": "upgraded-node", "node": node_name}
836
+
837
+ return {
838
+ "status": "upgraded-nodes" if not dry_run else "upgrade-dryrun",
839
+ "upgraded_count": len(upgraded_nodes),
840
+ "upgraded_nodes": upgraded_nodes if upgraded_nodes else None,
841
+ "failed_count": len(failed_nodes),
842
+ "failed_nodes": failed_nodes if failed_nodes else None,
843
+ }
844
+
845
+ def _force_stop_node(
846
+ self, service_name: Optional[str], dry_run: bool, count: int = 1
847
+ ) -> Dict[str, Any]:
848
+ """Force stop nodes (specific or youngest running nodes by age).
849
+
850
+ Args:
851
+ service_name: Optional comma-separated list of service names
852
+ dry_run: If True, log without executing
853
+ count: Number of nodes to stop when service_name is not specified (default: 1)
854
+
855
+ Returns:
856
+ Dictionary with execution result
857
+ """
858
+ # Parse comma-separated service names
859
+ service_names = parse_service_names(service_name)
860
+
861
+ if service_names:
862
+ # Stop specific nodes
863
+ stopped_nodes = []
864
+ failed_nodes = []
865
+
866
+ for name in service_names:
867
+ node = self._get_node_by_name(name)
868
+ if not node:
869
+ failed_nodes.append({"service": name, "error": "not found"})
870
+ continue
871
+
872
+ logging.info(f"Forced action: Stopping node {name}")
873
+ if dry_run:
874
+ logging.warning(f"DRYRUN: Stop node {name}")
875
+ stopped_nodes.append(name)
876
+ else:
877
+ try:
878
+ manager = self._get_process_manager(node)
879
+ manager.stop_node(node)
880
+ self._set_node_status(node.id, STOPPED)
881
+ stopped_nodes.append(name)
882
+ except Exception as e:
883
+ logging.error(f"Failed to stop node {name}: {e}")
884
+ failed_nodes.append({"service": name, "error": str(e)})
885
+
886
+ return {
887
+ "status": "stopped-nodes" if not dry_run else "stop-dryrun",
888
+ "stopped_count": len(stopped_nodes),
889
+ "stopped_nodes": stopped_nodes,
890
+ "failed_count": len(failed_nodes),
891
+ "failed_nodes": failed_nodes if failed_nodes else None,
892
+ }
893
+ else:
894
+ # Stop youngest running nodes (default behavior - highest age values)
895
+ if count < 1:
896
+ return {"status": "error", "message": "count must be at least 1"}
897
+
898
+ logging.info(f"Forced action: Stopping {count} youngest running node(s)")
899
+
900
+ # Get youngest running nodes (highest age values)
901
+ with self.S() as session:
902
+ youngest_nodes = session.execute(
903
+ select(Node)
904
+ .where(Node.status == RUNNING)
905
+ .order_by(Node.age.desc())
906
+ .limit(count)
907
+ ).all()
908
+
909
+ if not youngest_nodes:
910
+ return {"status": "error", "message": "No running nodes to stop"}
911
+
912
+ if len(youngest_nodes) < count:
913
+ logging.warning(f"Only {len(youngest_nodes)} running nodes available, stopping all of them")
914
+
915
+ stopped_nodes = []
916
+ failed_nodes = []
917
+
918
+ for row in youngest_nodes:
919
+ node = row[0]
920
+ if dry_run:
921
+ logging.warning(f"DRYRUN: Stop youngest node {node.node_name}")
922
+ stopped_nodes.append(node.service.replace(".service", ""))
923
+ else:
924
+ try:
925
+ manager = self._get_process_manager(node)
926
+ manager.stop_node(node)
927
+ self._set_node_status(node.id, STOPPED)
928
+ stopped_nodes.append(node.service.replace(".service", ""))
929
+ except Exception as e:
930
+ logging.error(f"Failed to stop node {node.node_name}: {e}")
931
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})
932
+
933
+ if count == 1 and len(stopped_nodes) == 1:
934
+ # Keep backward compatibility for single node
935
+ # Extract node name from service name (e.g., "antnode0001" -> "0001")
936
+ node_name = stopped_nodes[0].replace("antnode", "")
937
+ return {"status": "stopped-node", "node": node_name}
938
+
939
+ return {
940
+ "status": "stopped-nodes" if not dry_run else "stop-dryrun",
941
+ "stopped_count": len(stopped_nodes),
942
+ "stopped_nodes": stopped_nodes if stopped_nodes else None,
943
+ "failed_count": len(failed_nodes),
944
+ "failed_nodes": failed_nodes if failed_nodes else None,
945
+ }
946
+
947
+ def _force_start_node(
948
+ self, service_name: Optional[str], metrics: Dict[str, Any], dry_run: bool, count: int = 1
949
+ ) -> Dict[str, Any]:
950
+ """Force start nodes (specific or oldest stopped nodes by age).
951
+
952
+ Args:
953
+ service_name: Optional comma-separated list of service names
954
+ metrics: Current system metrics
955
+ dry_run: If True, log without executing
956
+ count: Number of nodes to start when service_name is not specified (default: 1)
957
+
958
+ Returns:
959
+ Dictionary with execution result
960
+ """
961
+ # Parse comma-separated service names
962
+ service_names = parse_service_names(service_name)
963
+
964
+ if service_names:
965
+ # Start specific nodes
966
+ started_nodes = []
967
+ upgraded_nodes = []
968
+ failed_nodes = []
969
+
970
+ for name in service_names:
971
+ node = self._get_node_by_name(name)
972
+ if not node:
973
+ failed_nodes.append({"service": name, "error": "not found"})
974
+ continue
975
+
976
+ if node.status == RUNNING:
977
+ failed_nodes.append({"service": name, "error": "already running"})
978
+ continue
979
+
980
+ logging.info(f"Forced action: Starting node {name}")
981
+ if dry_run:
982
+ logging.warning(f"DRYRUN: Start node {name}")
983
+ started_nodes.append(name)
984
+ else:
985
+ try:
986
+ # Check if node needs upgrade
987
+ if not node.version:
988
+ node.version = get_antnode_version(node.binary)
989
+
990
+ # If the stopped version is old, upgrade it (which also starts it)
991
+ if Version(metrics["antnode_version"]) > Version(node.version):
992
+ if not self._upgrade_node_binary(node, metrics["antnode_version"]):
993
+ failed_nodes.append({"service": name, "error": "upgrade failed"})
994
+ else:
995
+ upgraded_nodes.append(name)
996
+ else:
997
+ manager = self._get_process_manager(node)
998
+ if manager.start_node(node):
999
+ self._set_node_status(node.id, RESTARTING)
1000
+ started_nodes.append(name)
1001
+ else:
1002
+ failed_nodes.append({"service": name, "error": "start failed"})
1003
+ except Exception as e:
1004
+ logging.error(f"Failed to start node {name}: {e}")
1005
+ failed_nodes.append({"service": name, "error": str(e)})
1006
+
1007
+ return {
1008
+ "status": "started-nodes" if not dry_run else "start-dryrun",
1009
+ "started_count": len(started_nodes),
1010
+ "started_nodes": started_nodes,
1011
+ "upgraded_count": len(upgraded_nodes),
1012
+ "upgraded_nodes": upgraded_nodes if upgraded_nodes else None,
1013
+ "failed_count": len(failed_nodes),
1014
+ "failed_nodes": failed_nodes if failed_nodes else None,
1015
+ }
1016
+ else:
1017
+ # Start oldest stopped nodes (default behavior - lowest age values)
1018
+ if count < 1:
1019
+ return {"status": "error", "message": "count must be at least 1"}
1020
+
1021
+ logging.info(f"Forced action: Starting {count} oldest stopped node(s)")
1022
+
1023
+ # Get oldest stopped nodes (lowest age values)
1024
+ with self.S() as session:
1025
+ oldest_nodes = session.execute(
1026
+ select(Node)
1027
+ .where(Node.status == STOPPED)
1028
+ .order_by(Node.age.asc())
1029
+ .limit(count)
1030
+ ).all()
1031
+
1032
+ if not oldest_nodes:
1033
+ return {"status": "error", "message": "No stopped nodes to start"}
1034
+
1035
+ if len(oldest_nodes) < count:
1036
+ logging.warning(f"Only {len(oldest_nodes)} stopped nodes available, starting all of them")
1037
+
1038
+ started_nodes = []
1039
+ upgraded_nodes = []
1040
+ failed_nodes = []
1041
+
1042
+ for row in oldest_nodes:
1043
+ node = row[0]
1044
+ if dry_run:
1045
+ logging.warning(f"DRYRUN: Start oldest stopped node {node.node_name}")
1046
+ started_nodes.append(node.service.replace(".service", ""))
1047
+ else:
1048
+ try:
1049
+ # Check if node needs upgrade
1050
+ if not node.version:
1051
+ node.version = get_antnode_version(node.binary)
1052
+
1053
+ # If the stopped version is old, upgrade it (which also starts it)
1054
+ if Version(metrics["antnode_version"]) > Version(node.version):
1055
+ if not self._upgrade_node_binary(node, metrics["antnode_version"]):
1056
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": "upgrade failed"})
1057
+ else:
1058
+ upgraded_nodes.append(node.service.replace(".service", ""))
1059
+ else:
1060
+ manager = self._get_process_manager(node)
1061
+ if manager.start_node(node):
1062
+ self._set_node_status(node.id, RESTARTING)
1063
+ started_nodes.append(node.service.replace(".service", ""))
1064
+ else:
1065
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": "start failed"})
1066
+ except Exception as e:
1067
+ logging.error(f"Failed to start node {node.node_name}: {e}")
1068
+ failed_nodes.append({"service": node.service.replace(".service", ""), "error": str(e)})
1069
+
1070
+ if count == 1 and len(started_nodes) == 1:
1071
+ # Keep backward compatibility for single node
1072
+ # Extract node name from service name (e.g., "antnode0001" -> "0001")
1073
+ node_name = started_nodes[0].replace("antnode", "")
1074
+ return {"status": "started-node", "node": node_name}
1075
+ elif count == 1 and len(upgraded_nodes) == 1:
1076
+ # Keep backward compatibility for single node upgrade
1077
+ # Extract node name from service name (e.g., "antnode0001" -> "0001")
1078
+ node_name = upgraded_nodes[0].replace("antnode", "")
1079
+ return {"status": "upgrading-node", "node": node_name}
1080
+
1081
+ return {
1082
+ "status": "started-nodes" if not dry_run else "start-dryrun",
1083
+ "started_count": len(started_nodes),
1084
+ "started_nodes": started_nodes if started_nodes else None,
1085
+ "upgraded_count": len(upgraded_nodes),
1086
+ "upgraded_nodes": upgraded_nodes if upgraded_nodes else None,
1087
+ "failed_count": len(failed_nodes),
1088
+ "failed_nodes": failed_nodes if failed_nodes else None,
1089
+ }
1090
+
1091
+ def _force_disable_node(
1092
+ self, service_name: Optional[str], dry_run: bool
1093
+ ) -> Dict[str, Any]:
1094
+         """Force disable specific nodes (service_name required; accepts a comma-separated list)."""
1095
+ if not service_name:
1096
+ return {"status": "error", "message": "service_name required for disable action"}
1097
+
1098
+ # Parse comma-separated service names
1099
+ service_names = parse_service_names(service_name)
1100
+
1101
+ disabled_nodes = []
1102
+ failed_nodes = []
1103
+
1104
+ for name in service_names:
1105
+ node = self._get_node_by_name(name)
1106
+ if not node:
1107
+ failed_nodes.append({"service": name, "error": "not found"})
1108
+ continue
1109
+
1110
+ logging.info(f"Forced action: Disabling node {name}")
1111
+ if dry_run:
1112
+ logging.warning(f"DRYRUN: Disable node {name}")
1113
+ disabled_nodes.append(name)
1114
+ else:
1115
+ try:
1116
+ # Stop the node if it's running
1117
+ if node.status == RUNNING:
1118
+ manager = self._get_process_manager(node)
1119
+ manager.stop_node(node)
1120
+ self._set_node_status(node.id, DISABLED)
1121
+ disabled_nodes.append(name)
1122
+ except Exception as e:
1123
+ logging.error(f"Failed to disable node {name}: {e}")
1124
+ failed_nodes.append({"service": name, "error": str(e)})
1125
+
1126
+ return {
1127
+ "status": "disabled-nodes" if not dry_run else "disable-dryrun",
1128
+ "disabled_count": len(disabled_nodes),
1129
+ "disabled_nodes": disabled_nodes,
1130
+ "failed_count": len(failed_nodes),
1131
+ "failed_nodes": failed_nodes if failed_nodes else None,
1132
+ }
1133
+
1134
+ def _force_teardown_cluster(
1135
+ self, machine_config: Dict[str, Any], dry_run: bool
1136
+ ) -> Dict[str, Any]:
1137
+ """Force teardown the entire cluster."""
1138
+ logging.info("Forced action: Tearing down cluster")
1139
+
1140
+ # Get all nodes
1141
+ with self.S() as session:
1142
+ all_nodes = session.execute(
1143
+ select(Node).order_by(Node.id.asc())
1144
+ ).all()
1145
+
1146
+ if not all_nodes:
1147
+             return {"status": "no-nodes", "message": "No nodes to tear down"}
1148
+
1149
+ # Get the process manager (use the first node's manager or default)
1150
+ if all_nodes:
1151
+ sample_node = all_nodes[0][0]
1152
+ manager = self._get_process_manager(sample_node)
1153
+ else:
1154
+ manager = get_process_manager()
1155
+
1156
+ # Try manager-specific teardown first
1157
+ if hasattr(manager, 'teardown_cluster'):
1158
+ logging.info(f"Using {manager.__class__.__name__} teardown_cluster method")
1159
+ if dry_run:
1160
+ logging.warning("DRYRUN: Teardown cluster via manager")
1161
+ else:
1162
+ if manager.teardown_cluster():
1163
+ # Remove all nodes from database
1164
+ with self.S() as session:
1165
+ session.query(Node).delete()
1166
+ session.commit()
1167
+ return {"status": "cluster-teardown", "method": "manager-specific"}
1168
+
1169
+ # Fall back to removing each node individually (without delay)
1170
+ logging.info("Using default teardown (remove all nodes)")
1171
+ removed_count = 0
1172
+ for row in all_nodes:
1173
+ node = row[0]
1174
+ if dry_run:
1175
+ logging.warning(f"DRYRUN: Remove node {node.node_name}")
1176
+ removed_count += 1
1177
+ else:
1178
+ try:
1179
+ manager = self._get_process_manager(node)
1180
+ manager.remove_node(node)
1181
+ with self.S() as session:
1182
+ session.delete(node)
1183
+ session.commit()
1184
+ removed_count += 1
1185
+ logging.info(f"Removed node {node.node_name}")
1186
+ except Exception as e:
1187
+ logging.error(f"Failed to remove node {node.node_name}: {e}")
1188
+
1189
+ return {
1190
+ "status": "cluster-teardown",
1191
+ "method": "individual-remove",
1192
+ "removed_count": removed_count,
1193
+ }
1194
+
1195
+ def _survey_specific_nodes(self, service_names: List[str], dry_run: bool) -> Dict[str, Any]:
1196
+ """Survey specific nodes by service name.
1197
+
1198
+ Args:
1199
+ service_names: List of service names to survey
1200
+ dry_run: If True, log without executing
1201
+
1202
+ Returns:
1203
+ Dictionary with survey results
1204
+ """
1205
+ from wnm.utils import read_node_metrics, read_node_metadata, update_node_from_metrics
1206
+
1207
+ surveyed_nodes = []
1208
+ failed_nodes = []
1209
+
1210
+ for service_name in service_names:
1211
+ node = self._get_node_by_name(service_name)
1212
+ if not node:
1213
+ failed_nodes.append({"service": service_name, "error": "not found"})
1214
+ continue
1215
+
1216
+ if node.status == DISABLED:
1217
+ failed_nodes.append({"service": service_name, "error": "disabled"})
1218
+ continue
1219
+
1220
+ if dry_run:
1221
+ logging.warning(f"DRYRUN: Survey node {service_name}")
1222
+ surveyed_nodes.append(service_name)
1223
+ else:
1224
+ logging.info(f"Surveying node {service_name}")
1225
+
1226
+ # Check metadata first
1227
+ node_metadata = read_node_metadata(node.host, node.metrics_port)
1228
+
1229
+ # If metadata fails, fake metrics with 0's
1230
+ if node_metadata["status"] == STOPPED:
1231
+ node_metrics = {
1232
+ "status": STOPPED,
1233
+ "uptime": 0,
1234
+ "records": 0,
1235
+ "shunned": 0,
1236
+ "connected_peers": 0
1237
+ }
1238
+ else:
1239
+ # Metadata succeeded, now get metrics
1240
+ node_metrics = read_node_metrics(node.host, node.metrics_port)
1241
+
1242
+ # Skip update if node is stopped and already marked as stopped
1243
+ if node_metadata["status"] == STOPPED and node.status == STOPPED:
1244
+ surveyed_nodes.append(service_name)
1245
+ continue
1246
+
1247
+ update_node_from_metrics(self.S, node.id, node_metrics, node_metadata)
1248
+ surveyed_nodes.append(service_name)
1249
+
1250
+ return {
1251
+ "status": "survey-complete" if not dry_run else "survey-dryrun",
1252
+ "surveyed_count": len(surveyed_nodes),
1253
+ "surveyed_nodes": surveyed_nodes,
1254
+ "failed_count": len(failed_nodes),
1255
+ "failed_nodes": failed_nodes if failed_nodes else None,
1256
+ }
1257
+
1258
+ def _force_survey_nodes(self, service_name: Optional[str] = None, dry_run: bool = False) -> Dict[str, Any]:
1259
+ """Force a survey of all nodes or specific nodes to update their status and metrics.
1260
+
1261
+ Args:
1262
+ service_name: Optional comma-separated list of service names to survey
1263
+ dry_run: If True, log without executing
1264
+
1265
+ Returns:
1266
+ Dictionary with survey results
1267
+ """
1268
+ # Parse service names if provided
1269
+ service_names = parse_service_names(service_name)
1270
+
1271
+ if service_names:
1272
+ # Survey specific nodes
1273
+ logging.info(f"Forced action: Surveying {len(service_names)} specific nodes")
1274
+ return self._survey_specific_nodes(service_names, dry_run)
1275
+ else:
1276
+ # Survey all nodes
1277
+ logging.info("Forced action: Surveying all nodes")
1278
+
1279
+ if dry_run:
1280
+ logging.warning("DRYRUN: Survey all nodes")
1281
+ # Get count of non-disabled nodes
1282
+ with self.S() as session:
1283
+ from wnm.common import DISABLED
1284
+ node_count = session.execute(
1285
+ select(func.count(Node.id)).where(Node.status != DISABLED)
1286
+ ).scalar()
1287
+ return {"status": "survey-dryrun", "node_count": node_count}
1288
+
1289
+ # Update all nodes
1290
+ update_nodes(self.S)
1291
+
1292
+ # Get updated count
1293
+ with self.S() as session:
1294
+ from wnm.common import DISABLED
1295
+ node_count = session.execute(
1296
+ select(func.count(Node.id)).where(Node.status != DISABLED)
1297
+ ).scalar()
1298
+
1299
+ return {"status": "survey-complete", "node_count": node_count}
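
For readers evaluating this release, the sketch below shows how the executor is apparently meant to be driven: a scoped_session factory is handed to ActionExecutor, planned Action objects go through execute(), and one-off operations go through execute_forced_action(). The session wiring, the Action constructor arguments, and the machine_config / metrics values shown here are inferred from how executor.py reads them; they are illustrative assumptions, not verified package API.

    # Illustrative sketch only. Constructor arguments and config values are
    # assumptions inferred from how wnm/executor.py consumes them.
    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker

    from wnm.actions import Action, ActionType
    from wnm.executor import ActionExecutor

    engine = create_engine("sqlite:///wnm.db")            # hypothetical database URL
    S = scoped_session(sessionmaker(bind=engine))

    executor = ActionExecutor(S)

    machine_config = {                                     # keys the executor reads
        "rewards_address": "0x0000000000000000000000000000000000000000",  # placeholder
        "donate_address": "0x0000000000000000000000000000000000000000",   # placeholder
        "node_storage": "/var/antctl/services",            # hypothetical path
        "port_start": 55,                                  # hypothetical value
        "host": "127.0.0.1",
        "user": "ant",
        "environment": "",
    }
    metrics = {"antnode_version": "0.3.0"}                 # hypothetical version

    # Planned actions normally come from the DecisionEngine; built by hand here.
    actions = [Action(type=ActionType.ADD_NODE, priority=1, reason="capacity available")]
    print(executor.execute(actions, machine_config, metrics, dry_run=True))
    # -> {"status": "add-node"} in dry-run mode

    # Forced actions bypass the decision engine entirely.
    executor.execute_forced_action("stop", machine_config, metrics,
                                   service_name="antnode0001", dry_run=True)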
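
When adding a node, _execute_add_node fills gaps in the ID sequence before appending: ID 1 is used if it is free, otherwise the raw SQL self-join finds the first id whose successor is missing, and only then does it fall back to max(id) + 1. The same selection rule in plain Python, for clarity (an illustration, not code from the package):

    # Equivalent selection rule, illustration only.
    def next_node_id(existing_ids):
        if 1 not in existing_ids:
            return 1
        taken = sorted(existing_ids)
        for current in taken[:-1]:           # mirrors the self-join on n1.id + 1
            if current + 1 not in existing_ids:
                return current + 1           # first hole in the sequence
        return taken[-1] + 1                 # no holes: max(id) + 1

    assert next_node_id({1, 2, 4, 5}) == 3   # fills the hole at 3
    assert next_node_id({2, 3}) == 1         # node 1 is free, use it
    assert next_node_id({1, 2, 3}) == 4      # no holes, append at the end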
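
The new node's ports are derived deterministically from its ID: port = port_start * PORT_MULTIPLIER + node_id and metrics_port = METRICS_PORT_BASE + node_id. Both constants come from wnm.common and their actual values are not visible in this diff, so the arithmetic below uses made-up stand-ins purely to show the shape of the mapping.

    # PORT_MULTIPLIER and METRICS_PORT_BASE are stand-in values here;
    # the real constants live in wnm.common and are not shown in this diff.
    PORT_MULTIPLIER = 1000
    METRICS_PORT_BASE = 13000

    port_start = 55                                   # hypothetical machine_config["port_start"]
    node_id = 7

    port = port_start * PORT_MULTIPLIER + node_id     # 55007 with these stand-ins
    metrics_port = METRICS_PORT_BASE + node_id        # 13007 with these stand-ins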
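
Stopped nodes are only upgraded on start when the packaged binary is strictly newer; the comparison uses packaging.version.Version, so "0.3.10" correctly ranks above "0.3.9" where a plain string comparison would not. A minimal check of that behaviour:

    from packaging.version import Version

    # Semantic comparison, as used in _execute_start_node and _force_start_node.
    assert Version("0.3.10") > Version("0.3.9")        # string comparison would get this wrong
    assert not (Version("0.3.9") > Version("0.3.9"))   # equal versions: start without upgrading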