wnm 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wnm might be problematic.

wnm/__main__.py CHANGED
@@ -1,983 +1,236 @@
- import os, sys
- import re, json, requests, time
- import subprocess, logging
- from collections import Counter
- from packaging.version import Version
- from dotenv import load_dotenv
- import psutil, shutil
-
- from wnm.models import Base, Machine, Node
- from sqlalchemy import create_engine, select, insert, update, delete, text
- from sqlalchemy.orm import sessionmaker, scoped_session
+ import json
+ import logging
+ import os
+ import sys
+ import time
+
+ from sqlalchemy import insert, select
+
+ from wnm.config import (
+ LOCK_FILE,
+ S,
+ apply_config_updates,
+ config_updates,
+ machine_config,
+ options,
+ )
+ from wnm.decision_engine import DecisionEngine
+ from wnm.executor import ActionExecutor
+ from wnm.migration import survey_machine
+ from wnm.models import Node
+ from wnm.utils import (
+ get_antnode_version,
+ get_machine_metrics,
+ update_counters,
+ )

  logging.basicConfig(level=logging.INFO)
- #Info level logging for sqlalchemy is too verbose, only use when needed
- logging.getLogger('sqlalchemy.engine.Engine').disabled = True
-
- # import .env
- basedir = os.path.abspath(os.path.dirname(__file__))
- load_dotenv(os.path.join(basedir, '.env'))
-
- # simulate arg/yaml configuration
- config = {}
- config['db']='sqlite:///colony.db'
- config['DonateAddress'] = os.getenv('DonateAddress') or '0x00455d78f850b0358E8cea5be24d415E01E107CF'
- config['ANMHost'] = os.getenv('ANMHost') or '127.0.0.1'
- config['CrisisBytes'] = os.getenv('CrisisBytes') or 2 * 10 ** 9 # default 2gb/node
-
-
- # Setup Database engine
- engine = create_engine(config["db"], echo=True)
-
- # Generate ORM
- Base.metadata.create_all(engine)
-
- # Create a connection to the ORM
- session_factory = sessionmaker(bind=engine)
- S = scoped_session(session_factory)
-
-
- # if WNM_CONFIG or -c parameter are set, check for existing config
- # else:
-
- # Primary node for want of one
- QUEEN=1
-
- # Donation address
- DONATE=config["DonateAddress"]
- #Keep these as strings so they can be grepped in logs
- STOPPED="STOPPED" #0 Node is not responding to it's metrics port
- RUNNING="RUNNING" #1 Node is responding to it's metrics port
- UPGRADING="UPGRADING" #2 Upgrade in progress
- DISABLED="DISABLED" #-1 Do not start
- RESTARTING="RESTARTING" #3 re/starting a server intionally
- MIGRATING="MIGRATING" #4 Moving volumes in progress
- REMOVING="REMOVING" #5 Removing node in progress
- DEAD="DEAD" #-86 Broken node to cleanup
-
- ANM_HOST=config["ANMHost"]
- # Baseline bytes per node
- CRISIS_BYTES=config["CrisisBytes"]
+ # Info level logging for sqlalchemy is too verbose, only use when needed
+ logging.getLogger("sqlalchemy.engine.Engine").disabled = True
+

  # A storage place for ant node data
- Workers=[]
-
- # Detect ANM (but don't upgrade)
- if os.path.exists("/var/antctl/system"):
- # Is anm scheduled to run
- if os.path.exists("/etc/cron.d/anm"):
- # remove cron to disable old anm
- try:
- subprocess.run(['sudo','rm', '/etc/cron.d/anm'])
- except Exception as error:
- template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
- message = template.format(type(error).__name__, error.args)
- logging.info(message)
- sys.exit(1)
- os.remove("/etc/cron.d/anm")
- # Is anm sitll running? We'll wait
- if os.path.exists("/var/antctl/block"):
- logging.info("anm still running, waiting...")
- sys.exit(1)
+ Workers = []

- # Are we already running
- if os.path.exists("/var/antctl/wnm_active"):
- logging.info("wnm still running")
- sys.exit(1)
+ # Detect ANM

- # Get anm configuration
- def load_anm_config():
- anm_config = {}
-
- # Let's get the real count of CPU's available to this process
- anm_config["CpuCount"] = len(os.sched_getaffinity(0))
-
- # What can we save from /var/antctl/config
- if os.path.exists("/var/antctl/config"):
- load_dotenv("/var/antctl/config")
- anm_config["NodeCap"] = int(os.getenv('NodeCap') or 20)
- anm_config["CpuLessThan"] = int(os.getenv('CpuLessThan') or 50)
- anm_config["CpuRemove"] = int(os.getenv('CpuRemove') or 70)
- anm_config["MemLessThan"] = int(os.getenv('MemLessThan') or 70)
- anm_config["MemRemove"] = int(os.getenv('MemRemove') or 90)
- anm_config["HDLessThan"] = int(os.getenv('HDLessThan') or 70)
- anm_config["HDRemove"] = int(os.getenv('HDRemove') or 90)
- anm_config["DelayStart"] = int(os.getenv('DelayStart') or 5)
- anm_config["DelayUpgrade"] = int(os.getenv('DelayUpgrade') or 5)
- anm_config["DelayRestart"] = int(os.getenv('DelayRestart') or 10)
- anm_config["DelayRemove"] = int(os.getenv('DelayRemove') or 300)
- anm_config["NodeStorage"] = os.getenv('NodeStorage') or "/var/antctl/services"
- # Default to the faucet donation address
- try:
- anm_config["RewardsAddress"] = re.findall(r"--rewards-address ([\dA-Fa-fXx]+)",os.getenv('RewardsAddress'))[0]
- except:
- try:
- anm_config["RewardsAddress"] = re.findall(r"([\dA-Fa-fXx]+)",os.getenv("RewardsAddress"))[0]
- except:
- logging.warning("Unable to detect RewardsAddress")
- sys.exit(1)
- anm_config["DonateAddress"]=os.getenv("DonateAddress") or DONATE
- anm_config["MaxLoadAverageAllowed"]=float(os.getenv("MaxLoadAverageAllowed") or anm_config["CpuCount"])
- anm_config["DesiredLoadAverage"]=float(os.getenv("DesiredLoadAverage") or (anm_config["CpuCount"] * .6))

- try:
- with open('/usr/bin/anms.sh', 'r') as file:
- data = file.read()
- anm_config["PortStart"]=int(re.findall(r"ntpr\=(\d+)",data)[0])
- except:
- anm_config["PortStart"]=55
-
- anm_config["HDIOReadLessThan"] = float(os.getenv('HDIOReadLessThan') or 0.0)
- anm_config["HDIOReadRemove"] = float(os.getenv('HDIOReadRemove') or 0.0)
- anm_config["HDIOWriteLessThan"] = float(os.getenv('HDIOWriteLessThan') or 0.0)
- anm_config["HDIOWriteRemove"] = float(os.getenv('HDIOWriteRemove') or 0.0)
- anm_config["NetIOReadLessThan"] = float(os.getenv('NetIOReadLessThan') or 0.0)
- anm_config["NetIOReadRemove"] = float(os.getenv('NetIOReadRemove') or 0.0)
- anm_config["NetIOWriteLessThan"] = float(os.getenv('NetIOWriteLessThan') or 0.0)
- anm_config["NetIOWriteRemove"] = float(os.getenv('NetIOWriteRemove') or 0.0)
- # Timer for last stopped nodes
- anm_config["LastStoppedAt"]=0
-
-
- return anm_config
-
- # Read confirm from systemd service file
- def read_systemd_service(antnode):
- details={}
- try:
- with open('/etc/systemd/system/'+antnode, 'r') as file:
- data = file.read()
- details['id']=int(re.findall(r"antnode(\d+)",antnode)[0])
- details['binary']=re.findall(r"ExecStart=([^ ]+)",data)[0]
- details["user"]=re.findall(r"User=(\w+)",data)[0]
- details["root_dir"]=re.findall(r"--root-dir ([\w\/]+)",data)[0]
- details["port"]=int(re.findall(r"--port (\d+)",data)[0])
- details["metrics_port"]=int(re.findall(r"--metrics-server-port (\d+)",data)[0])
- details["wallet"]=re.findall(r"--rewards-address ([^ ]+)",data)[0]
- details["network"]=re.findall(r"--rewards-address [^ ]+ ([\w\-]+)",data)[0]
- except:
- pass
-
- return details
-
- # Read data from metadata endpoint
- def read_node_metadata(host,port):
- # Only return version number when we have one, to stop clobbering the binary check
- try:
- url = "http://{0}:{1}/metadata".format(host,port)
- response = requests.get(url)
- data=response.text
- except requests.exceptions.ConnectionError:
- logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
- return {"status": STOPPED, "peer_id":""}
- except Exception as error:
- template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
- message = template.format(type(error).__name__, error.args)
- logging.info(message)
- return {"status": STOPPED, "peer_id":""}
- # collect a dict to return
- card={}
- try:
- card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}',data)[0]
- except:
- logging.info('No version found')
- try:
- card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}',data)[0]
- except:
- card["peer_id"] = ""
- card["status"] = RUNNING if "version" in card else STOPPED
- return card
-
- # Read data from metrics port
- def read_node_metrics(host,port):
- metrics={}
- try:
- url = "http://{0}:{1}/metrics".format(host,port)
- response = requests.get(url)
- metrics["status"] = RUNNING
- metrics["uptime"] = int((re.findall(r'ant_node_uptime ([\d]+)',response.text) or [0])[0])
- metrics["records"] = int((re.findall(r'ant_networking_records_stored ([\d]+)',response.text) or [0])[0])
- metrics["shunned"] = int((re.findall(r'ant_networking_shunned_by_close_group ([\d]+)',response.text) or [0])[0])
- except requests.exceptions.ConnectionError:
- logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
- metrics["status"] = STOPPED
- metrics["uptime"] = 0
- metrics["records"] = 0
- metrics["shunned"] = 0
- except Exception as error:
- template = "in:RNM - An exception of type {0} occurred. Arguments:\n{1!r}"
- message = template.format(type(error).__name__, error.args)
- logging.info(message)
- metrics["status"] = STOPPED
- metrics["uptime"] = 0
- metrics["records"] = 0
- metrics["shunned"] = 0
- return metrics
-
- # Read antnode binary version
- def get_antnode_version(binary):
- try:
- data = subprocess.run([binary, '--version'], stdout=subprocess.PIPE).stdout.decode('utf-8')
- return re.findall(r'Autonomi Node v([\d\.]+)',data)[0]
- except Exception as error:
- template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
- message = template.format(type(error).__name__, error.args)
- logging.info(message)
- return 0
-
- # Determine how long this node has been around by looking at it's secret_key file
- def get_node_age(root_dir):
- try:
- return int(os.stat("{0}/secret-key".format(root_dir)).st_mtime)
- except:
- return 0
-
- # Survey nodes by reading metadata from metrics ports or binary --version
- def survey_anm_nodes(antnodes):
- # Build a list of node dictionaries to return
- details=[]
- # Iterate on nodes
- for node in antnodes:
- # Initialize a dict
- logging.debug("{0} surveying node {1} ".format(time.strftime("%Y-%m-%d %H:%M"),node))
- if not re.findall(r"antnode([\d]+).service",node):
- logging.info("can't decode "+str(node))
- continue
- card={"nodename":re.findall(r"antnode([\d]+).service",node)[0],
- "service": node,
- "timestamp": int(time.time()),
- "host": ANM_HOST or '127.0.0.1'
- }
- # Load what systemd has configured
- card.update(read_systemd_service(node))
- #print(json.dumps(card,indent=2))
- # Read metadata from metrics_port
- metadata = read_node_metadata(card["host"],card["metrics_port"])
- #print(json.dumps(metadata,indent=2))
- if isinstance(metadata,dict) and \
- "status" in metadata and \
- metadata["status"]==RUNNING:
- # soak up metadata
- card.update(metadata)
- # The ports up, so grab metrics too
- card.update(read_node_metrics(card["host"],card["metrics_port"]))
- # Else run binary to get version
- else:
- # If the root directory of the node is missing, it's a bad node
- if not os.path.isdir(card["root_dir"]):
- card["status"]=DEAD
- card["version"]=''
- else:
- card["status"]=STOPPED
- card["version"]=get_antnode_version(card["binary"])
- card["peer_id"]=''
- card["records"]=0
- card["uptime"]=0
- card["shunned"]=0
- card["age"]=get_node_age(card["root_dir"])
- # harcoded for anm
- card["host"]=ANM_HOST
- # Append the node dict to the detail list
- details.append(card)
-
- return details
-
- # Survey server instance
- def survey_machine():
- # Make a bucket
- antnodes=[]
- # For all service files
- for file in os.listdir("/etc/systemd/system"):
- # Find antnodes
- if re.match(r'antnode[\d]+\.service',file):
- antnodes.append(file)
- #if len(antnodes)>=5:
- # break
- # Iterate over defined nodes and get details
- # Ingests a list of service files and outputs a list of dictionaries
- return survey_anm_nodes(antnodes)
-
- # Read system status
- def get_machine_metrics(node_storage,remove_limit):
- metrics = {}
-
- with S() as session:
- db_nodes=session.execute(select(Node.status,Node.version)).all()
-
- # Get some initial stats for comparing after a few seconds
- # We start these counters AFTER reading the database
- start_time=time.time()
- start_disk_counters=psutil.disk_io_counters()
- start_net_counters=psutil.net_io_counters()
-
- metrics["TotalNodes"]=len(db_nodes)
- data = Counter(node[0] for node in db_nodes)
- metrics["RunningNodes"] = data[RUNNING]
- metrics["StoppedNodes"] = data[STOPPED]
- metrics["RestartingNodes"] = data[RESTARTING]
- metrics["UpgradingNodes"] = data[UPGRADING]
- metrics["MigratingNodes"] = data[MIGRATING]
- metrics["RemovingNodes"] = data[REMOVING]
- metrics["DeadNodes"] = data[DEAD]
- metrics["antnode"]=shutil.which("antnode")
- if not metrics["antnode"]:
- logging.warning("Unable to locate current antnode binary, exiting")
- sys.exit(1)
- metrics["AntNodeVersion"]=get_antnode_version(metrics["antnode"])
- metrics["NodesLatestV"]=sum(1 for node in db_nodes if node[1]==metrics["AntNodeVersion"]) or 0
- metrics["NodesNoVersion"]=sum(1 for node in db_nodes if not node[1]) or 0
- metrics["NodesToUpgrade"]=metrics["TotalNodes"] - metrics["NodesLatestV"] - metrics["NodesNoVersion"]
-
- # Windows has to build load average over 5 seconds. The first 5 seconds returns 0's
- # I don't plan on supporting windows, but if this get's modular, I don't want this
- # issue to be skipped
- #if platform.system() == "Windows":
- # discard=psutil.getloadavg()
- # time.sleep(5)
- metrics["LoadAverage1"],metrics["LoadAverage5"],metrics["LoadAverage15"]=psutil.getloadavg()
- # Get CPU Metrics over 1 second
- metrics["IdleCpuPercent"],metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
- # Really we returned Idle percent, subtract from 100 to get used.
- metrics["UsedCpuPercent"] = 100 - metrics["IdleCpuPercent"]
- data=psutil.virtual_memory()
- #print(data)
- metrics["UsedMemPercent"]=data.percent
- metrics["FreeMemPercent"]=100-metrics["UsedMemPercent"]
- data=psutil.disk_io_counters()
- # This only checks the drive mapped to the first node and will need to be updated
- # when we eventually support multiple drives
- data=psutil.disk_usage(node_storage)
- metrics["UsedHDPercent"]=data.percent
- metrics["TotalHDBytes"]=data.total
- end_time=time.time()
- end_disk_counters=psutil.disk_io_counters()
- end_net_counters=psutil.net_io_counters()
- metrics["HDWriteBytes"]=int((end_disk_counters.write_bytes-start_disk_counters.write_bytes)/(end_time-start_time))
- metrics["HDReadBytes"]=int((end_disk_counters.read_bytes-start_disk_counters.read_bytes)/(end_time-start_time))
- metrics["NetWriteBytes"]=int((end_net_counters.bytes_sent-start_net_counters.bytes_sent)/(end_time-start_time))
- metrics["NetReadBytes"]=int((end_net_counters.bytes_recv-start_net_counters.bytes_recv)/(end_time-start_time))
- #print (json.dumps(metrics,indent=2))
- # How close (out of 100) to removal limit will we be with a max bytes per node (2GB default)
- # For running nodes with Porpoise(tm).
- metrics["NodeHDCrisis"]=int((((metrics["TotalNodes"])*CRISIS_BYTES)/(metrics["TotalHDBytes"]*(remove_limit/100)))*100)
- return metrics
-
- # Update node with metrics result
- def update_node_from_metrics(id,metrics,metadata):
- try:
- # We check the binary version in other code, so lets stop clobbering it when a node is stopped
- card={'status': metrics["status"], 'timestamp': int(time.time()),
- 'uptime': metrics["uptime"], 'records': metrics["records"],
- 'shunned': metrics["shunned"],
- 'peer_id': metadata["peer_id"]}
- if "version" in metadata:
- card['version']=metadata["version"]
- with S() as session:
- session.query(Node).filter(Node.id == id).\
- update(card)
- session.commit()
- except Exception as error:
- template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
- message = template.format(type(error).__name__, error.args)
- logging.warning(message)
- return False
- else:
- return True
-
- # Set Node status
- def set_node_status(id,status):
- logging.info("Setting node status: {0} {1}".format(id,status))
- try:
- with S() as session:
- session.query(Node).filter(Node.id == id).\
- update({'status': status, 'timestamp': int(time.time())})
- session.commit()
- except:
- return False
- else:
- return True
-
- # Update metrics after checking counters
- def update_counters(old,config):
- # Are we already removing a node
- if old["RemovingNodes"]:
- with S() as session:
- removals=session.execute(select(Node.timestamp,Node.id)\
- .where(Node.status == REMOVING)\
- .order_by(Node.timestamp.asc())).all()
- # Iterate through active removals
- records_to_remove = len(removals)
- for check in removals:
- # If the DelayRemove timer has expired, delete the entry
- if isinstance(check[0],int) and \
- check[0] < (int(time.time()) - (config["DelayRemove"]*60)):
- logging.info("Deleting removed node "+str(check[1]))
- with S() as session:
- session.execute(delete(Node).where(Node.id==check[1]))
- session.commit()
- records_to_remove-=1
- old["RemovingNodes"]=records_to_remove
- # Are we already upgrading a node
- if old["UpgradingNodes"]:
- with S() as session:
- upgrades=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
- .where(Node.status == UPGRADING)\
- .order_by(Node.timestamp.asc())).all()
- # Iterate through active upgrades
- records_to_upgrade = len(upgrades)
- for check in upgrades:
- # If the DelayUpgrade timer has expired, check on status
- if isinstance(check[0],int) and \
- check[0] < (int(time.time()) - (config["DelayUpgrade"]*60)):
- logging.info("Updating upgraded node "+str(check[1]))
- node_metrics=read_node_metrics(check[2],check[3])
- node_metadata=read_node_metadata(check[2],check[3])
- if node_metrics and node_metadata:
- update_node_from_metrics(check[1],node_metrics,node_metadata)
- records_to_upgrade-=1
- old["UpgradingNodes"]=records_to_upgrade
- # Are we already restarting a node
- if old["RestartingNodes"]:
- with S() as session:
- restarts=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
- .where(Node.status == RESTARTING)\
- .order_by(Node.timestamp.asc())).all()
- # Iterate through active upgrades
- records_to_restart = len(restarts)
- for check in restarts:
- # If the DelayUpgrade timer has expired, check on status
- if isinstance(check[0],int) and \
- check[0] < (int(time.time()) - (config["DelayStart"]*60)):
- logging.info("Updating restarted node "+str(check[1]))
- node_metrics=read_node_metrics(check[2],check[3])
- node_metadata=read_node_metadata(check[2],check[3])
- if node_metrics and node_metadata:
- update_node_from_metrics(check[1],node_metrics,node_metadata)
- records_to_restart-=1
- old["RestartingNodes"]=records_to_restart
- return(old)
-
- # Enable firewall for port
- def enable_firewall(port,node):
- logging.info("enable firewall port {0}/udp".format(port))
- # Close ufw firewall
- try:
- subprocess.run(['sudo','ufw','allow',"{0}/udp".format(port),'comment',node], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'EF Error:', err )
-
- # Disable firewall for port
- def disable_firewall(port):
- logging.info("disable firewall port {0}/udp".format(port))
- # Close ufw firewall
- try:
- subprocess.run(['sudo','ufw','delete','allow',"{0}/udp".format(port)], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'DF ERROR:', err )
-
- # Start a systemd node
- def start_systemd_node(node):
- logging.info("Starting node "+str(node.id))
- # Try to start the service
- try:
- p = subprocess.run(['sudo', 'systemctl', 'start', node.service], stdout=subprocess.PIPE,stderr=subprocess.STDOUT).stdout.decode('utf-8')
- if re.match(r'Failed to start',p):
- logging.error( 'SSN2 ERROR:', p )
- return False
- except subprocess.CalledProcessError as err:
- logging.error( 'SSN1 ERROR:', err )
- return False
- # Open a firewall hole for the data port
- enable_firewall(node.port,node.service)
- # Update node status
- set_node_status(node.id,RESTARTING)
- return True
-
- # Stop a systemd node
- def stop_systemd_node(node):
- logging.info("Stopping node: "+node.service)
- # Send a stop signal to the process
- try:
- subprocess.run(['sudo', 'systemctl', 'stop', node.service], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'SSN2 ERROR:', err )
- disable_firewall(node.port)
- set_node_status(node.id,STOPPED)
-
- return True
-
- # Upgrade a node
- def upgrade_node(node,metrics):
- logging.info("Upgrading node "+str(node.id))
- # Copy current node binary
- try:
- subprocess.run(['sudo', 'cp', '-f', metrics["antnode"], node.binary])
- except subprocess.CalledProcessError as err:
- logging.error( 'UN1 ERROR:', err )
- try:
- subprocess.run(['sudo', 'systemctl', 'restart', node.service])
- except subprocess.CalledProcessError as err:
- logging.error( 'UN2 ERROR:', err )
- version=get_antnode_version(node.binary)
- try:
- with S() as session:
- session.query(Node).filter(Node.id == node.id).\
- update({'status': UPGRADING, 'timestamp': int(time.time()),
- 'version': metrics["AntNodeVersion"]})
- session.commit()
- except:
- return False
- else:
- return True
-
- # Remove a node
- def remove_node(id):
- logging.info("Removing node "+str(id))
-
- with S() as session:
- node = session.execute(select(Node).where(Node.id == id)).first()
- # Grab Node from Row
- node=node[0]
- if stop_systemd_node(node):
- # Mark this node as REMOVING
- set_node_status(id,REMOVING)
-
- nodename=f"antnode{node.nodename}"
- # Remove node data and log
- try:
- subprocess.run(['sudo', 'rm', '-rf', node.root_dir, f"/var/log/antnode/{nodename}"])
- except subprocess.CalledProcessError as err:
- logging.error( 'RN1 ERROR:', err )
- # Remove systemd service file
- try:
- subprocess.run(['sudo', 'rm', '-f', f"/etc/systemd/system/{node.service}"])
- except subprocess.CalledProcessError as err:
- logging.error( 'RN2 ERROR:', err )
- # Tell system to reload systemd files
- try:
- subprocess.run(['sudo', 'systemctl', 'daemon-reload'])
- except subprocess.CalledProcessError as err:
- logging.error( 'RN3 ERROR:', err )
- #print(json.dumps(node,indent=2))
-
- # Rescan nodes for status
- def update_nodes():
- with S() as session:
- nodes=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port,Node.status)\
- .where(Node.status != DISABLED)\
- .order_by(Node.timestamp.asc())).all()
- # Iterate through all records
- for check in nodes:
- # Check on status
- if isinstance(check[0],int):
- logging.debug("Updating info on node "+str(check[1]))
- node_metrics=read_node_metrics(check[2],check[3])
- node_metadata=read_node_metadata(check[2],check[3])
- if node_metrics and node_metadata:
- # Don't write updates for stopped nodes that are already marked as stopped
- if node_metadata["status"]==STOPPED and check[4]==STOPPED:
- continue
- update_node_from_metrics(check[1],node_metrics,node_metadata)
-
- # Create a new node
- def create_node(config,metrics):
- logging.info("Creating new node")
- # Create a holding place for the new node
- card = {}
- # Find the next available node number by first looking for holes
- sql = text('select n1.id + 1 as id from node n1 ' + \
- 'left join node n2 on n2.id = n1.id + 1 ' + \
- 'where n2.id is null ' + \
- 'and n1.id <> (select max(id) from node) ' + \
- 'order by n1.id;')
- with S() as session:
- result = session.execute(sql).first()
- if result:
- card['id']=result[0]
- # Otherwise get the max node number and add 1
- else:
- with S() as session:
- result = session.execute(select(Node.id).order_by(Node.id.desc())).first()
- card['id']=result[0]+1
- # Set the node name
- card['nodename']=f'{card['id']:04}'
- card['service']=f'antnode{card['nodename']}.service'
- card['user']='ant'
- card['version']=metrics["AntNodeVersion"]
- card['root_dir']=f"{config['NodeStorage']}/antnode{card['nodename']}"
- card['binary']=f"{card['root_dir']}/antnode"
- card['port']=config["PortStart"]*1000+card['id']
- card['metrics_port']=13*1000+card['id']
- card['network']='evm-arbitrum-one'
- card['wallet']=config["RewardsAddress"]
- card['peer_id']=''
- card['status']=STOPPED
- card['timestamp']=int(time.time())
- card['records']=0
- card['uptime']=0
- card['shunned']=0
- card['age']=card['timestamp']
- card['host']=ANM_HOST
- log_dir=f"/var/log/antnode/antnode{card['nodename']}"
- # Create the node directory and log directory
- try:
- subprocess.run(['sudo','mkdir','-p',card["root_dir"],log_dir], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'CN1 ERROR:', err )
- # Copy the binary to the node directory
- try:
- subprocess.run(['sudo','cp',metrics["antnode"],card["root_dir"]], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'CN2 ERROR:', err )
- # Change owner of the node directory and log directories
- try:
- subprocess.run(['sudo','chown','-R',f'{card["user"]}:{card["user"]}',card["root_dir"],log_dir], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'CN3 ERROR:', err )
- # build the systemd service unit
- service=f"""[Unit]
- Description=antnode{card['nodename']}
- [Service]
- User={card['user']}
- ExecStart={card['binary']} --bootstrap-cache-dir /var/antctl/bootstrap-cache --root-dir {card['root_dir']} --port {card['port']} --enable-metrics-server --metrics-server-port {card['metrics_port']} --log-output-dest {log_dir} --max-log-files 1 --max-archived-log-files 1 --rewards-address {card['wallet']} {card['network']}
- Restart=always
- #RestartSec=300
- """
- # Write the systemd service unit with sudo tee since we're running as not root
- try:
- subprocess.run(['sudo','tee',f'/etc/systemd/system/{card["service"]}'],input=service,text=True, stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'CN4 ERROR:', err )
- # Reload systemd service files to get our new one
- try:
- subprocess.run(['sudo','systemctl','daemon-reload'], stdout=subprocess.PIPE)
- except subprocess.CalledProcessError as err:
- logging.error( 'CN5 ERROR:', err )
- # Add the new node to the database
- with S() as session:
- session.execute(
- insert(Node),[card]
- )
- session.commit()
- # Now we grab the node object from the database to pass to start node
- with S() as session:
- card=session.execute(select(Node).where(Node.id == card['id'])).first()
- # Get the Node object from the Row
- card=card[0]
- # Start the new node
- return start_systemd_node(card)
- #print(json.dumps(card,indent=2))
- return True
-
-
- # Make a decision about what to do
- def choose_action(config,metrics,db_nodes):
- # Gather knowlege
- features={}
- features["AllowCpu"]=metrics["UsedCpuPercent"] < config["CpuLessThan"]
- features["AllowMem"]=metrics["UsedMemPercent"] < config["MemLessThan"]
- features["AllowHD"]=metrics["UsedHDPercent"] < config["HDLessThan"]
- features["RemCpu"]=metrics["UsedCpuPercent"] > config["CpuRemove"]
- features["RemMem"]=metrics["UsedMemPercent"] > config["MemRemove"]
- features["RemHD"]=metrics["UsedHDPercent"] > config["HDRemove"]
- features["AllowNodeCap"]=metrics["RunningNodes"] < config["NodeCap"]
- # These are new features, so ignore them if not configured
- if (config["NetIOReadLessThan"]+config["NetIOReadRemove"]+
- config["NetIOWriteLessThan"]+config["NetIOWriteRemove"]>1):
- features["AllowNetIO"]=metrics["NetReadBytes"] < config["NetIOReadLessThan"] and \
- metrics["NetWriteBytes"] < config["NetIOWriteLessThan"]
- features["RemoveNetIO"]=metrics["NetReadBytes"] > config["NetIORemove"] or \
- metrics["NetWriteBytes"] > config["NetIORemove"]
- else:
- features["AllowNetIO"]=True
- features["RemoveNetIO"]=False
- if (config["HDIOReadLessThan"]+config["HDIOReadRemove"]+
- config["HDIOWriteLessThan"]+config["HDIOWriteRemove"]>1):
- features["AllowHDIO"]=metrics["HDReadBytes"] < config["HDIOReadLessThan"] and \
- metrics["HDWriteBytes"] < config["HDIOWriteLessThan"]
- features["RemoveHDIO"]=metrics["HDReadBytes"] > config["HDIORemove"] or \
- metrics["HDWriteBytes"] > config["HDtIORemove"]
- else:
- features["AllowHDIO"]=True
- features["RemoveHDIO"]=False
- features["LoadAllow"] = metrics["LoadAverage1"] < config["DesiredLoadAverage"] and \
- metrics["LoadAverage5"] < config["DesiredLoadAverage"] and \
- metrics["LoadAverage15"] < config["DesiredLoadAverage"]
- features["LoadNotAllow"] = metrics["LoadAverage1"] > config["MaxLoadAverageAllowed"] or \
- metrics["LoadAverage5"] > config["MaxLoadAverageAllowed"] or \
- metrics["LoadAverage15"] > config["MaxLoadAverageAllowed"]
- # Check records for expired status
- metrics=update_counters(metrics,config)
- # If we have other thing going on, don't add more nodes
- features["AddNewNode"]=sum([ metrics.get(m, 0) \
- for m in ['UpgradingNodes',
- 'RestartingNodes','MigratingNodes',
- 'RemovingNodes'] ]) == 0 and \
- features["AllowCpu"] and features["AllowHD"] and \
- features["AllowMem"] and features["AllowNodeCap"] and \
- features["AllowHDIO"] and features["AllowNetIO"] and \
- features["LoadAllow"]
- # Are we overlimit on nodes
- features["Remove"] =features["LoadNotAllow"] or features["RemCpu"] or \
- features["RemHD"] or features["RemMem"] or \
- features["RemoveHDIO"] or features["RemoveNetIO"] or \
- metrics["TotalNodes"] > config["NodeCap"]
- # If we have nodes to upgrade
- if metrics["NodesToUpgrade"] >= 1:
- # Make sure current version is equal or newer than version on first node.
- if Version(metrics["AntNodeVersion"]) < Version(db_nodes[0][1]):
- logging.warning("node upgrade cancelled due to lower version")
- features["Upgrade"]=False
- else:
- if features["Remove"]:
- logging.info("Can't upgrade while removing is required")
- features["Upgrade"]=False
- else:
- features["Upgrade"]=True
- else:
- features["Upgrade"]=False
-
-
- logging.info(json.dumps(features,indent=2))
- ##### Decisions
-
- # Actually, removing DEAD nodes take priority
- if metrics["DeadNodes"] > 1:
- with S() as session:
- broken=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
- .where(Node.status == DEAD)\
- .order_by(Node.timestamp.asc())).all()
- # Iterate through dead nodes and remove them all
- for check in broken:
- # Remove broken nodes
- logging.info("Removing dead node "+str(check[1]))
- remove_node(check[1])
- return {"status": "removed-dead-nodes"}
- # If we have nodes with no version number, update from binary
- if metrics["NodesNoVersion"] > 1:
- with S() as session:
- no_version=session.execute(select(Node.timestamp,Node.id,Node.binary)\
- .where(Node.version == '')\
- .order_by(Node.timestamp.asc())).all()
- # Iterate through nodes with no version number
- for check in no_version:
- # Update version number from binary
- version=get_antnode_version(check[2])
- logging.info(f"Updating version number for node {check[1]} to {version}")
- with S() as session:
- session.query(Node).filter(Node.id == check[1]).\
- update({'version': version})
- session.commit()
-
- # If we're restarting, wait patiently as metrics could be skewed
- if metrics["RestartingNodes"]:
- logging.info("Still waiting for RestartDelay")
- return {"status": RESTARTING}
- # If we still have unexpired upgrade records, wait
- if metrics["UpgradingNodes"]:
- logging.info("Still waiting for UpgradeDelay")
- return {"status": UPGRADING}
- # First if we're removing, that takes top priority
- if features["Remove"]:
- # If we still have unexpired removal records, wait
- if metrics["RemovingNodes"]:
- logging.info("Still waiting for RemoveDelay")
- return {"status": REMOVING}
- # If we're under HD pressure or trimming node cap, remove nodes
- if features["RemHD"] or metrics["TotalNodes"] > config["NodeCap"]:
- # Start removing with stopped nodes
- if metrics["StoppedNodes"] > 0:
- # What is the youngest stopped node
- with S() as session:
- youngest=session.execute(select(Node.id)\
- .where(Node.status == STOPPED)\
- .order_by(Node.age.desc())).first()
- if youngest:
- # Remove the youngest node
- remove_node(youngest[0])
- return{"status": REMOVING}
- # No low hanging fruit. let's start with the youngest running node
- with S() as session:
- youngest=session.execute(select(Node.id)\
- .where(Node.status == RUNNING)\
- .order_by(Node.age.desc())).first()
- if youngest:
- # Remove the youngest node
- remove_node(youngest[0])
- return{"status": REMOVING}
- return{"status": "nothing-to-remove"}
- # Otherwise, let's try just stopping a node to bring IO/Mem/Cpu down
+ # Make a decision about what to do (new implementation using DecisionEngine)
+ def choose_action(machine_config, metrics, dry_run):
+ """Plan and execute actions using DecisionEngine and ActionExecutor.
+
+ This function now acts as a thin wrapper around the new decision engine
+ and action executor classes.
+
+ Args:
+ machine_config: Machine configuration dictionary
+ metrics: Current system metrics
+ dry_run: If True, log actions without executing
+
+ Returns:
+ Dictionary with execution status
+ """
+ # Check records for expired status (must be done before planning)
+ if not dry_run:
+ metrics = update_counters(S, metrics, machine_config)
+
+ # Handle nodes with no version number (done before planning)
+ if metrics["nodes_no_version"] > 0:
+ if dry_run:
+ logging.warning("DRYRUN: Update NoVersion nodes")
  else:
- # If we just stopped a node, wait
- if int(config["LastStoppedAt"] or 0) > (int(time.time()) - (config["DelayRemove"]*60)):
- logging.info("Still waiting for RemoveDelay")
- return {"status": 'waiting-to-stop'}
- # Start with the youngest running node
  with S() as session:
- youngest=session.execute(select(Node)\
- .where(Node.status == RUNNING)\
- .order_by(Node.age.desc())).first()
- if youngest:
- # Stop the youngest node
- stop_systemd_node(youngest[0])
- # Update the last stopped time
+ no_version = session.execute(
+ select(Node.timestamp, Node.id, Node.binary)
+ .where(Node.version == "")
+ .order_by(Node.timestamp.asc())
+ ).all()
+ # Iterate through nodes with no version number
+ for check in no_version:
+ # Update version number from binary
+ version = get_antnode_version(check[2])
+ logging.info(
+ f"Updating version number for node {check[1]} to {version}"
+ )
  with S() as session:
- session.query(Machine).filter(Machine.id == 1).\
- update({'LastStoppedAt': int(time.time())})
+ session.query(Node).filter(Node.id == check[1]).update(
+ {"version": version}
+ )
  session.commit()
- return{"status": STOPPED}
- else:
- return{"status": "nothing-to-stop"}
-
- # Do we have upgrading to do?
- if features["Upgrade"]:
- # Let's find the oldest running node not using the current version
- with S() as session:
- oldest=session.execute(select(Node)\
- .where(Node.status == RUNNING)\
- .where(Node.version != metrics["AntNodeVersion"])
- .order_by(Node.age.asc())).first()
- if oldest:
- # Get Node from Row
- oldest = oldest[0]
- # If we don't have a version number from metadata, grab from binary
- if not oldest.version:
- oldest.version=get_antnode_version(oldest.binary)
- #print(json.dumps(oldest))
- # Upgrade the oldest node
- upgrade_node(oldest,metrics)
- return{"status": UPGRADING}
-
- # If AddNewNode
- # If stopped nodes available
- # Check oldest stopped version
- # If out of date
- # upgrade node which starts it
- # else
- # restart node
- # else
- # Create a Node which starts it
- if features["AddNewNode"]:
- # Start adding with stopped nodes
- if metrics["StoppedNodes"] > 0:
- # What is the oldest stopped node
- with S() as session:
- oldest=session.execute(select(Node)\
- .where(Node.status == STOPPED)\
- .order_by(Node.age.asc())).first()
- if oldest:
- # Get Node from Row
- oldest=oldest[0]
- # If we don't have a version number from metadata, grab from binary
- if not oldest.version:
- oldest.version=get_antnode_version(oldest.binary)
- # If the stopped version is old, upgrade it
- if Version(metrics["AntNodeVersion"]) > Version(oldest.version):
- upgrade_node(oldest,metrics)
- return{"status": UPGRADING}
- else:
- if start_systemd_node(oldest):
- return{"status": RESTARTING}
- else:
- return{"status": "failed-start-node"}
- # Hmm, still in Start mode, we shouldn't get here
- return {"status": 'START'}
- # Still in Add mode, add a new node
- if metrics["TotalNodes"] < config["NodeCap"]:
- if create_node(config,metrics):
- return {"status": "ADD"}
- else:
- return {"status": "failed-create-node"}
- else:
- return {"status": "node-cap-reached"}
- # If we have nothing to do, Survey the node ports
- update_nodes()
- return{"status": "idle"}
+
+ # Use the new DecisionEngine to plan actions
+ engine = DecisionEngine(machine_config, metrics)
+ actions = engine.plan_actions()
+
+ # Log the computed features for debugging
+ logging.info(json.dumps(engine.get_features(), indent=2))
+
+ # Use ActionExecutor to execute the planned actions
+ executor = ActionExecutor(S)
+ result = executor.execute(actions, machine_config, metrics, dry_run)
+
+ return result
+

  def main():
+
+ # Are we already running
+ if os.path.exists(LOCK_FILE):
+ logging.warning("wnm still running")
+ sys.exit(1)
+
  # We're starting, so lets create a lock file
  try:
- with open('/var/antctl/wnm_active', 'w') as file:
+ with open(LOCK_FILE, "w") as file:
  file.write(str(int(time.time())))
- except:
- logging.error("Unable to create lock file, exiting")
+ except (PermissionError, OSError) as e:
+ logging.error(f"Unable to create lock file: {e}")
  sys.exit(1)

- # See if we already have a known state in the database
- with S() as session:
- db_nodes=session.execute(select(Node.status,Node.version,
- Node.host,Node.metrics_port,
- Node.port,Node.age,Node.id,
- Node.timestamp)).all()
- anm_config=session.execute(select(Machine)).all()
-
- if db_nodes:
- # anm_config by default loads a parameter array,
- # use the __json__ method to return a dict from the first node
- anm_config = json.loads(json.dumps(anm_config[0][0])) or load_anm_config()
- metrics=get_machine_metrics(anm_config["NodeStorage"],anm_config["HDRemove"])
- #node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
- #print(db_nodes[0])
- #print(node_metrics)
- #print(anm_config)
- #print(json.dumps(anm_config,indent=4))
- #print("Node: ",db_nodes)
- logging.info("Found {counter} nodes migrated".format(counter=len(db_nodes)))
-
+ # Config should have loaded the machine_config
+ if machine_config:
+ logging.info("Machine: " + json.dumps(machine_config))
  else:
- anm_config = load_anm_config()
- #print(anm_config)
- Workers = survey_machine() or []
-
- #""""
- with S() as session:
- session.execute(
- insert(Node),Workers
- )
- session.commit()
- #"""
+ logging.error("Unable to load machine config, exiting")
+ sys.exit(1)
+ # Check for config updates
+ if config_updates:
+ logging.info("Update: " + json.dumps(config_updates))
+ if options.dry_run:
+ logging.warning("Dry run, not saving requested updates")
+ # Create a dictionary for the machine config
+ # Machine by default returns a parameter array,
+ # use the __json__ method to return a dict
+ local_config = json.loads(json.dumps(machine_config))
+ # Apply the local config with the requested updates
+ local_config.update(config_updates)
+ else:
+ # Store the config changes to the database
+ apply_config_updates(config_updates)
+ # Create a working dictionary for the machine config
+ # Machine by default returns a parameter array,
+ # use the __json__ method to return a dict
+ local_config = json.loads(json.dumps(machine_config))
+ else:
+ local_config = json.loads(json.dumps(machine_config))
+
+ metrics = get_machine_metrics(
+ S,
+ local_config["node_storage"],
+ local_config["hd_remove"],
+ local_config["crisis_bytes"],
+ )
+ logging.info(json.dumps(metrics, indent=2))
+
+ # Do we already have nodes
+ if metrics["total_nodes"] == 0:
+ # Are we migrating an anm server
+ if options.init and options.migrate_anm:
+ Workers = survey_machine(machine_config) or []
+ if Workers:
+ if options.dry_run:
+ logging.warning(f"DRYRUN: Not saving {len(Workers)} detected nodes")
+ else:
+ with S() as session:
+ session.execute(insert(Node), Workers)
+ session.commit()
+ # Reload metrics
+ metrics = get_machine_metrics(
+ S,
+ local_config["node_storage"],
+ local_config["hd_remove"],
+ local_config["crisis_bytes"],
+ )
+ logging.info(
+ "Found {counter} nodes defined".format(
+ counter=metrics["total_nodes"]
+ )
+ )
+ else:
+ logging.warning("Requested migration but no nodes found")
+ else:
+ logging.info("No nodes found")
+ else:
+ logging.info(
+ "Found {counter} nodes configured".format(counter=metrics["total_nodes"])
+ )

- with S() as session:
- session.execute(
- insert(Machine),[anm_config]
+ # Check for reports
+ if options.report:
+ from wnm.reports import generate_node_status_report, generate_node_status_details_report
+
+ # If survey action is specified, run it first
+ if options.force_action == "survey":
+ logging.info("Running survey before generating report")
+ executor = ActionExecutor(S)
+ survey_result = executor.execute_forced_action(
+ "survey",
+ local_config,
+ metrics,
+ service_name=options.service_name,
+ dry_run=options.dry_run,
  )
- session.commit()
+ logging.info(f"Survey result: {survey_result}")

- # Now load subset of data to work with
- with S() as session:
- db_nodes=session.execute(select(Node.status,Node.version,
- Node.host,Node.metrics_port,
- Node.port,Node.age,Node.id,
- Node.timestamp)).all()
-
-
-
- #print(json.dumps(anm_config,indent=4))
- logging.info("Found {counter} nodes configured".format(counter=len(db_nodes)))
-
- #versions = [v[1] for worker in Workers if (v := worker.get('version'))]
- #data = Counter(ver for ver in versions)
+ # Generate the report
+ if options.report == "node-status":
+ report_output = generate_node_status_report(
+ S, options.service_name, options.report_format
+ )
+ elif options.report == "node-status-details":
+ report_output = generate_node_status_details_report(
+ S, options.service_name, options.report_format
+ )
+ else:
+ report_output = f"Unknown report type: {options.report}"
+
+ print(report_output)
+ os.remove(LOCK_FILE)
+ sys.exit(0)
+
+ # Check for forced actions
+ if options.force_action:
+ logging.info(f"Executing forced action: {options.force_action}")
+ executor = ActionExecutor(S)
+ this_action = executor.execute_forced_action(
+ options.force_action,
+ local_config,
+ metrics,
+ service_name=options.service_name,
+ dry_run=options.dry_run,
+ count=options.count if hasattr(options, 'count') else 1,
+ )
+ else:
+ this_action = choose_action(local_config, metrics, options.dry_run)

+ print("Action:", json.dumps(this_action, indent=2))

- data = Counter(status[0] for status in db_nodes)
- #print(data)
- print("Running Nodes:",data[RUNNING])
- print("Restarting Nodes:",data[RESTARTING])
- print("Stopped Nodes:",data[STOPPED])
- print("Upgrading Nodes:",data[UPGRADING])
- print("Removing Nodes:",data[REMOVING])
- data = Counter(ver[1] for ver in db_nodes)
- print("Versions:",data)
+ os.remove(LOCK_FILE)
+ sys.exit(1)

- machine_metrics = get_machine_metrics(anm_config['NodeStorage'],anm_config["HDRemove"])
- print(json.dumps(anm_config,indent=2))
- print(json.dumps(machine_metrics,indent=2))
- this_action=choose_action(anm_config,machine_metrics,db_nodes)
- print("Action:",json.dumps(this_action,indent=2))
- # Remove lock file
- os.remove("/var/antctl/wnm_active")

  if __name__ == "__main__":
  main()
+ # print(options.MemRemove)

  print("End of program")