wnm 0.0.9-py3-none-any.whl → 0.0.10-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wnm might be problematic.

wnm/__main__.py CHANGED
@@ -1,1185 +1,236 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
- import re
5
- import shutil
6
- import subprocess
7
4
  import sys
8
5
  import time
9
- from collections import Counter
10
6
 
11
- import psutil
12
- import requests
13
- from dotenv import load_dotenv
14
- from packaging.version import Version
15
- from sqlalchemy import create_engine, delete, insert, select, text, update
16
- from sqlalchemy.orm import scoped_session, sessionmaker
7
+ from sqlalchemy import insert, select
17
8
 
18
- from wnm.models import Base, Machine, Node
9
+ from wnm.config import (
10
+ LOCK_FILE,
11
+ S,
12
+ apply_config_updates,
13
+ config_updates,
14
+ machine_config,
15
+ options,
16
+ )
17
+ from wnm.decision_engine import DecisionEngine
18
+ from wnm.executor import ActionExecutor
19
+ from wnm.migration import survey_machine
20
+ from wnm.models import Node
21
+ from wnm.utils import (
22
+ get_antnode_version,
23
+ get_machine_metrics,
24
+ update_counters,
25
+ )
19
26
 
20
27
  logging.basicConfig(level=logging.INFO)
21
28
  # Info level logging for sqlalchemy is too verbose, only use when needed
22
29
  logging.getLogger("sqlalchemy.engine.Engine").disabled = True
23
30
 
24
- # import .env
25
- basedir = os.path.abspath(os.path.dirname(__file__))
26
- load_dotenv(os.path.join(basedir, ".env"))
27
-
28
- # simulate arg/yaml configuration
29
- config = {}
30
- config["db"] = "sqlite:///colony.db"
31
- config["DonateAddress"] = (
32
- os.getenv("DonateAddress") or "0x00455d78f850b0358E8cea5be24d415E01E107CF"
33
- )
34
- config["ANMHost"] = os.getenv("ANMHost") or "127.0.0.1"
35
- config["CrisisBytes"] = os.getenv("CrisisBytes") or 2 * 10**9 # default 2gb/node
36
-
37
-
38
- # Setup Database engine
39
- engine = create_engine(config["db"], echo=True)
40
-
41
- # Generate ORM
42
- Base.metadata.create_all(engine)
43
-
44
- # Create a connection to the ORM
45
- session_factory = sessionmaker(bind=engine)
46
- S = scoped_session(session_factory)
47
-
48
-
49
- # if WNM_CONFIG or -c parameter are set, check for existing config
50
- # else:
51
-
52
- # Primary node for want of one
53
- QUEEN = 1
54
-
55
- # Donation address
56
- DONATE = config["DonateAddress"]
57
- # Keep these as strings so they can be grepped in logs
58
- STOPPED = "STOPPED" # 0 Node is not responding to it's metrics port
59
- RUNNING = "RUNNING" # 1 Node is responding to it's metrics port
60
- UPGRADING = "UPGRADING" # 2 Upgrade in progress
61
- DISABLED = "DISABLED" # -1 Do not start
62
- RESTARTING = "RESTARTING" # 3 re/starting a server intionally
63
- MIGRATING = "MIGRATING" # 4 Moving volumes in progress
64
- REMOVING = "REMOVING" # 5 Removing node in progress
65
- DEAD = "DEAD" # -86 Broken node to cleanup
66
-
67
- ANM_HOST = config["ANMHost"]
68
- # Baseline bytes per node
69
- CRISIS_BYTES = config["CrisisBytes"]
70
31
 
71
32
  # A storage place for ant node data
72
33
  Workers = []
73
34
 
74
- # Detect ANM (but don't upgrade)
75
- if os.path.exists("/var/antctl/system"):
76
- # Is anm scheduled to run
77
- if os.path.exists("/etc/cron.d/anm"):
78
- # remove cron to disable old anm
79
- try:
80
- subprocess.run(["sudo", "rm", "/etc/cron.d/anm"])
81
- except Exception as error:
82
- template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
83
- message = template.format(type(error).__name__, error.args)
84
- logging.info(message)
85
- sys.exit(1)
86
- os.remove("/etc/cron.d/anm")
87
- # Is anm sitll running? We'll wait
88
- if os.path.exists("/var/antctl/block"):
89
- logging.info("anm still running, waiting...")
90
- sys.exit(1)
91
-
92
- # Are we already running
93
- if os.path.exists("/var/antctl/wnm_active"):
94
- logging.info("wnm still running")
95
- sys.exit(1)
96
-
97
-
98
- # Get anm configuration
99
- def load_anm_config():
100
- anm_config = {}
101
-
102
- # Let's get the real count of CPU's available to this process
103
- anm_config["CpuCount"] = len(os.sched_getaffinity(0))
104
-
105
- # What can we save from /var/antctl/config
106
- if os.path.exists("/var/antctl/config"):
107
- load_dotenv("/var/antctl/config")
108
- anm_config["NodeCap"] = int(os.getenv("NodeCap") or 20)
109
- anm_config["CpuLessThan"] = int(os.getenv("CpuLessThan") or 50)
110
- anm_config["CpuRemove"] = int(os.getenv("CpuRemove") or 70)
111
- anm_config["MemLessThan"] = int(os.getenv("MemLessThan") or 70)
112
- anm_config["MemRemove"] = int(os.getenv("MemRemove") or 90)
113
- anm_config["HDLessThan"] = int(os.getenv("HDLessThan") or 70)
114
- anm_config["HDRemove"] = int(os.getenv("HDRemove") or 90)
115
- anm_config["DelayStart"] = int(os.getenv("DelayStart") or 5)
116
- anm_config["DelayUpgrade"] = int(os.getenv("DelayUpgrade") or 5)
117
- anm_config["DelayRestart"] = int(os.getenv("DelayRestart") or 10)
118
- anm_config["DelayRemove"] = int(os.getenv("DelayRemove") or 300)
119
- anm_config["NodeStorage"] = os.getenv("NodeStorage") or "/var/antctl/services"
120
- # Default to the faucet donation address
121
- try:
122
- anm_config["RewardsAddress"] = re.findall(
123
- r"--rewards-address ([\dA-Fa-fXx]+)", os.getenv("RewardsAddress")
124
- )[0]
125
- except:
126
- try:
127
- anm_config["RewardsAddress"] = re.findall(
128
- r"([\dA-Fa-fXx]+)", os.getenv("RewardsAddress")
129
- )[0]
130
- except:
131
- logging.warning("Unable to detect RewardsAddress")
132
- sys.exit(1)
133
- anm_config["DonateAddress"] = os.getenv("DonateAddress") or DONATE
134
- anm_config["MaxLoadAverageAllowed"] = float(
135
- os.getenv("MaxLoadAverageAllowed") or anm_config["CpuCount"]
136
- )
137
- anm_config["DesiredLoadAverage"] = float(
138
- os.getenv("DesiredLoadAverage") or (anm_config["CpuCount"] * 0.6)
139
- )
140
-
141
- try:
142
- with open("/usr/bin/anms.sh", "r") as file:
143
- data = file.read()
144
- anm_config["PortStart"] = int(re.findall(r"ntpr\=(\d+)", data)[0])
145
- except:
146
- anm_config["PortStart"] = 55
147
-
148
- anm_config["HDIOReadLessThan"] = float(os.getenv("HDIOReadLessThan") or 0.0)
149
- anm_config["HDIOReadRemove"] = float(os.getenv("HDIOReadRemove") or 0.0)
150
- anm_config["HDIOWriteLessThan"] = float(os.getenv("HDIOWriteLessThan") or 0.0)
151
- anm_config["HDIOWriteRemove"] = float(os.getenv("HDIOWriteRemove") or 0.0)
152
- anm_config["NetIOReadLessThan"] = float(os.getenv("NetIOReadLessThan") or 0.0)
153
- anm_config["NetIOReadRemove"] = float(os.getenv("NetIOReadRemove") or 0.0)
154
- anm_config["NetIOWriteLessThan"] = float(os.getenv("NetIOWriteLessThan") or 0.0)
155
- anm_config["NetIOWriteRemove"] = float(os.getenv("NetIOWriteRemove") or 0.0)
156
- # Timer for last stopped nodes
157
- anm_config["LastStoppedAt"] = 0
158
-
159
- return anm_config
160
-
161
-
162
- # Read confirm from systemd service file
163
- def read_systemd_service(antnode):
164
- details = {}
165
- try:
166
- with open("/etc/systemd/system/" + antnode, "r") as file:
167
- data = file.read()
168
- details["id"] = int(re.findall(r"antnode(\d+)", antnode)[0])
169
- details["binary"] = re.findall(r"ExecStart=([^ ]+)", data)[0]
170
- details["user"] = re.findall(r"User=(\w+)", data)[0]
171
- details["root_dir"] = re.findall(r"--root-dir ([\w\/]+)", data)[0]
172
- details["port"] = int(re.findall(r"--port (\d+)", data)[0])
173
- details["metrics_port"] = int(
174
- re.findall(r"--metrics-server-port (\d+)", data)[0]
175
- )
176
- details["wallet"] = re.findall(r"--rewards-address ([^ ]+)", data)[0]
177
- details["network"] = re.findall(r"--rewards-address [^ ]+ ([\w\-]+)", data)[0]
178
- except:
179
- pass
180
-
181
- return details
182
-
183
-
184
- # Read data from metadata endpoint
185
- def read_node_metadata(host, port):
186
- # Only return version number when we have one, to stop clobbering the binary check
187
- try:
188
- url = "http://{0}:{1}/metadata".format(host, port)
189
- response = requests.get(url)
190
- data = response.text
191
- except requests.exceptions.ConnectionError:
192
- logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
193
- return {"status": STOPPED, "peer_id": ""}
194
- except Exception as error:
195
- template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
196
- message = template.format(type(error).__name__, error.args)
197
- logging.info(message)
198
- return {"status": STOPPED, "peer_id": ""}
199
- # collect a dict to return
200
- card = {}
201
- try:
202
- card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}', data)[0]
203
- except:
204
- logging.info("No version found")
205
- try:
206
- card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}', data)[0]
207
- except:
208
- card["peer_id"] = ""
209
- card["status"] = RUNNING if "version" in card else STOPPED
210
- return card
211
-
212
-
213
- # Read data from metrics port
214
- def read_node_metrics(host, port):
215
- metrics = {}
216
- try:
217
- url = "http://{0}:{1}/metrics".format(host, port)
218
- response = requests.get(url)
219
- metrics["status"] = RUNNING
220
- metrics["uptime"] = int(
221
- (re.findall(r"ant_node_uptime ([\d]+)", response.text) or [0])[0]
222
- )
223
- metrics["records"] = int(
224
- (
225
- re.findall(r"ant_networking_records_stored ([\d]+)", response.text)
226
- or [0]
227
- )[0]
228
- )
229
- metrics["shunned"] = int(
230
- (
231
- re.findall(
232
- r"ant_networking_shunned_by_close_group ([\d]+)", response.text
233
- )
234
- or [0]
235
- )[0]
236
- )
237
- except requests.exceptions.ConnectionError:
238
- logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
239
- metrics["status"] = STOPPED
240
- metrics["uptime"] = 0
241
- metrics["records"] = 0
242
- metrics["shunned"] = 0
243
- except Exception as error:
244
- template = "in:RNM - An exception of type {0} occurred. Arguments:\n{1!r}"
245
- message = template.format(type(error).__name__, error.args)
246
- logging.info(message)
247
- metrics["status"] = STOPPED
248
- metrics["uptime"] = 0
249
- metrics["records"] = 0
250
- metrics["shunned"] = 0
251
- return metrics
35
+ # Detect ANM
252
36
 
253
37
 
254
- # Read antnode binary version
255
- def get_antnode_version(binary):
256
- try:
257
- data = subprocess.run(
258
- [binary, "--version"], stdout=subprocess.PIPE
259
- ).stdout.decode("utf-8")
260
- return re.findall(r"Autonomi Node v([\d\.]+)", data)[0]
261
- except Exception as error:
262
- template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
263
- message = template.format(type(error).__name__, error.args)
264
- logging.info(message)
265
- return 0
38
+ # Make a decision about what to do (new implementation using DecisionEngine)
39
+ def choose_action(machine_config, metrics, dry_run):
40
+ """Plan and execute actions using DecisionEngine and ActionExecutor.
266
41
 
42
+ This function now acts as a thin wrapper around the new decision engine
43
+ and action executor classes.
267
44
 
268
- # Determine how long this node has been around by looking at it's secret_key file
269
- def get_node_age(root_dir):
270
- try:
271
- return int(os.stat("{0}/secret-key".format(root_dir)).st_mtime)
272
- except:
273
- return 0
45
+ Args:
46
+ machine_config: Machine configuration dictionary
47
+ metrics: Current system metrics
48
+ dry_run: If True, log actions without executing
274
49
 
50
+ Returns:
51
+ Dictionary with execution status
52
+ """
53
+ # Check records for expired status (must be done before planning)
54
+ if not dry_run:
55
+ metrics = update_counters(S, metrics, machine_config)
275
56
 
276
- # Survey nodes by reading metadata from metrics ports or binary --version
277
- def survey_anm_nodes(antnodes):
278
- # Build a list of node dictionaries to return
279
- details = []
280
- # Iterate on nodes
281
- for node in antnodes:
282
- # Initialize a dict
283
- logging.debug(
284
- "{0} surveying node {1} ".format(time.strftime("%Y-%m-%d %H:%M"), node)
285
- )
286
- if not re.findall(r"antnode([\d]+).service", node):
287
- logging.info("can't decode " + str(node))
288
- continue
289
- card = {
290
- "nodename": re.findall(r"antnode([\d]+).service", node)[0],
291
- "service": node,
292
- "timestamp": int(time.time()),
293
- "host": ANM_HOST or "127.0.0.1",
294
- }
295
- # Load what systemd has configured
296
- card.update(read_systemd_service(node))
297
- # print(json.dumps(card,indent=2))
298
- # Read metadata from metrics_port
299
- metadata = read_node_metadata(card["host"], card["metrics_port"])
300
- # print(json.dumps(metadata,indent=2))
301
- if (
302
- isinstance(metadata, dict)
303
- and "status" in metadata
304
- and metadata["status"] == RUNNING
305
- ):
306
- # soak up metadata
307
- card.update(metadata)
308
- # The ports up, so grab metrics too
309
- card.update(read_node_metrics(card["host"], card["metrics_port"]))
310
- # Else run binary to get version
57
+ # Handle nodes with no version number (done before planning)
58
+ if metrics["nodes_no_version"] > 0:
59
+ if dry_run:
60
+ logging.warning("DRYRUN: Update NoVersion nodes")
311
61
  else:
312
- # If the root directory of the node is missing, it's a bad node
313
- if not os.path.isdir(card["root_dir"]):
314
- card["status"] = DEAD
315
- card["version"] = ""
316
- else:
317
- card["status"] = STOPPED
318
- card["version"] = get_antnode_version(card["binary"])
319
- card["peer_id"] = ""
320
- card["records"] = 0
321
- card["uptime"] = 0
322
- card["shunned"] = 0
323
- card["age"] = get_node_age(card["root_dir"])
324
- # harcoded for anm
325
- card["host"] = ANM_HOST
326
- # Append the node dict to the detail list
327
- details.append(card)
328
-
329
- return details
330
-
331
-
332
- # Survey server instance
333
- def survey_machine():
334
- # Make a bucket
335
- antnodes = []
336
- # For all service files
337
- for file in os.listdir("/etc/systemd/system"):
338
- # Find antnodes
339
- if re.match(r"antnode[\d]+\.service", file):
340
- antnodes.append(file)
341
- # if len(antnodes)>=5:
342
- # break
343
- # Iterate over defined nodes and get details
344
- # Ingests a list of service files and outputs a list of dictionaries
345
- return survey_anm_nodes(antnodes)
346
-
347
-
348
- # Read system status
349
- def get_machine_metrics(node_storage, remove_limit):
350
- metrics = {}
351
-
352
- with S() as session:
353
- db_nodes = session.execute(select(Node.status, Node.version)).all()
354
-
355
- # Get some initial stats for comparing after a few seconds
356
- # We start these counters AFTER reading the database
357
- start_time = time.time()
358
- start_disk_counters = psutil.disk_io_counters()
359
- start_net_counters = psutil.net_io_counters()
360
-
361
- metrics["TotalNodes"] = len(db_nodes)
362
- data = Counter(node[0] for node in db_nodes)
363
- metrics["RunningNodes"] = data[RUNNING]
364
- metrics["StoppedNodes"] = data[STOPPED]
365
- metrics["RestartingNodes"] = data[RESTARTING]
366
- metrics["UpgradingNodes"] = data[UPGRADING]
367
- metrics["MigratingNodes"] = data[MIGRATING]
368
- metrics["RemovingNodes"] = data[REMOVING]
369
- metrics["DeadNodes"] = data[DEAD]
370
- metrics["antnode"] = shutil.which("antnode")
371
- if not metrics["antnode"]:
372
- logging.warning("Unable to locate current antnode binary, exiting")
373
- sys.exit(1)
374
- metrics["AntNodeVersion"] = get_antnode_version(metrics["antnode"])
375
- metrics["NodesLatestV"] = (
376
- sum(1 for node in db_nodes if node[1] == metrics["AntNodeVersion"]) or 0
377
- )
378
- metrics["NodesNoVersion"] = sum(1 for node in db_nodes if not node[1]) or 0
379
- metrics["NodesToUpgrade"] = (
380
- metrics["TotalNodes"] - metrics["NodesLatestV"] - metrics["NodesNoVersion"]
381
- )
382
-
383
- # Windows has to build load average over 5 seconds. The first 5 seconds returns 0's
384
- # I don't plan on supporting windows, but if this get's modular, I don't want this
385
- # issue to be skipped
386
- # if platform.system() == "Windows":
387
- # discard=psutil.getloadavg()
388
- # time.sleep(5)
389
- metrics["LoadAverage1"], metrics["LoadAverage5"], metrics["LoadAverage15"] = (
390
- psutil.getloadavg()
391
- )
392
- # Get CPU Metrics over 1 second
393
- metrics["IdleCpuPercent"], metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
394
- # Really we returned Idle percent, subtract from 100 to get used.
395
- metrics["UsedCpuPercent"] = 100 - metrics["IdleCpuPercent"]
396
- data = psutil.virtual_memory()
397
- # print(data)
398
- metrics["UsedMemPercent"] = data.percent
399
- metrics["FreeMemPercent"] = 100 - metrics["UsedMemPercent"]
400
- data = psutil.disk_io_counters()
401
- # This only checks the drive mapped to the first node and will need to be updated
402
- # when we eventually support multiple drives
403
- data = psutil.disk_usage(node_storage)
404
- metrics["UsedHDPercent"] = data.percent
405
- metrics["TotalHDBytes"] = data.total
406
- end_time = time.time()
407
- end_disk_counters = psutil.disk_io_counters()
408
- end_net_counters = psutil.net_io_counters()
409
- metrics["HDWriteBytes"] = int(
410
- (end_disk_counters.write_bytes - start_disk_counters.write_bytes)
411
- / (end_time - start_time)
412
- )
413
- metrics["HDReadBytes"] = int(
414
- (end_disk_counters.read_bytes - start_disk_counters.read_bytes)
415
- / (end_time - start_time)
416
- )
417
- metrics["NetWriteBytes"] = int(
418
- (end_net_counters.bytes_sent - start_net_counters.bytes_sent)
419
- / (end_time - start_time)
420
- )
421
- metrics["NetReadBytes"] = int(
422
- (end_net_counters.bytes_recv - start_net_counters.bytes_recv)
423
- / (end_time - start_time)
424
- )
425
- # print (json.dumps(metrics,indent=2))
426
- # How close (out of 100) to removal limit will we be with a max bytes per node (2GB default)
427
- # For running nodes with Porpoise(tm).
428
- metrics["NodeHDCrisis"] = int(
429
- (
430
- ((metrics["TotalNodes"]) * CRISIS_BYTES)
431
- / (metrics["TotalHDBytes"] * (remove_limit / 100))
432
- )
433
- * 100
434
- )
435
- return metrics
436
-
437
-
438
- # Update node with metrics result
439
- def update_node_from_metrics(id, metrics, metadata):
440
- try:
441
- # We check the binary version in other code, so lets stop clobbering it when a node is stopped
442
- card = {
443
- "status": metrics["status"],
444
- "timestamp": int(time.time()),
445
- "uptime": metrics["uptime"],
446
- "records": metrics["records"],
447
- "shunned": metrics["shunned"],
448
- "peer_id": metadata["peer_id"],
449
- }
450
- if "version" in metadata:
451
- card["version"] = metadata["version"]
452
- with S() as session:
453
- session.query(Node).filter(Node.id == id).update(card)
454
- session.commit()
455
- except Exception as error:
456
- template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
457
- message = template.format(type(error).__name__, error.args)
458
- logging.warning(message)
459
- return False
460
- else:
461
- return True
462
-
463
-
464
- # Set Node status
465
- def set_node_status(id, status):
466
- logging.info("Setting node status: {0} {1}".format(id, status))
467
- try:
468
- with S() as session:
469
- session.query(Node).filter(Node.id == id).update(
470
- {"status": status, "timestamp": int(time.time())}
471
- )
472
- session.commit()
473
- except:
474
- return False
475
- else:
476
- return True
477
-
478
-
479
- # Update metrics after checking counters
480
- def update_counters(old, config):
481
- # Are we already removing a node
482
- if old["RemovingNodes"]:
483
- with S() as session:
484
- removals = session.execute(
485
- select(Node.timestamp, Node.id)
486
- .where(Node.status == REMOVING)
487
- .order_by(Node.timestamp.asc())
488
- ).all()
489
- # Iterate through active removals
490
- records_to_remove = len(removals)
491
- for check in removals:
492
- # If the DelayRemove timer has expired, delete the entry
493
- if isinstance(check[0], int) and check[0] < (
494
- int(time.time()) - (config["DelayRemove"] * 60)
495
- ):
496
- logging.info("Deleting removed node " + str(check[1]))
62
+ with S() as session:
63
+ no_version = session.execute(
64
+ select(Node.timestamp, Node.id, Node.binary)
65
+ .where(Node.version == "")
66
+ .order_by(Node.timestamp.asc())
67
+ ).all()
68
+ # Iterate through nodes with no version number
69
+ for check in no_version:
70
+ # Update version number from binary
71
+ version = get_antnode_version(check[2])
72
+ logging.info(
73
+ f"Updating version number for node {check[1]} to {version}"
74
+ )
497
75
  with S() as session:
498
- session.execute(delete(Node).where(Node.id == check[1]))
76
+ session.query(Node).filter(Node.id == check[1]).update(
77
+ {"version": version}
78
+ )
499
79
  session.commit()
500
- records_to_remove -= 1
501
- old["RemovingNodes"] = records_to_remove
502
- # Are we already upgrading a node
503
- if old["UpgradingNodes"]:
504
- with S() as session:
505
- upgrades = session.execute(
506
- select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
507
- .where(Node.status == UPGRADING)
508
- .order_by(Node.timestamp.asc())
509
- ).all()
510
- # Iterate through active upgrades
511
- records_to_upgrade = len(upgrades)
512
- for check in upgrades:
513
- # If the DelayUpgrade timer has expired, check on status
514
- if isinstance(check[0], int) and check[0] < (
515
- int(time.time()) - (config["DelayUpgrade"] * 60)
516
- ):
517
- logging.info("Updating upgraded node " + str(check[1]))
518
- node_metrics = read_node_metrics(check[2], check[3])
519
- node_metadata = read_node_metadata(check[2], check[3])
520
- if node_metrics and node_metadata:
521
- update_node_from_metrics(check[1], node_metrics, node_metadata)
522
- records_to_upgrade -= 1
523
- old["UpgradingNodes"] = records_to_upgrade
524
- # Are we already restarting a node
525
- if old["RestartingNodes"]:
526
- with S() as session:
527
- restarts = session.execute(
528
- select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
529
- .where(Node.status == RESTARTING)
530
- .order_by(Node.timestamp.asc())
531
- ).all()
532
- # Iterate through active upgrades
533
- records_to_restart = len(restarts)
534
- for check in restarts:
535
- # If the DelayUpgrade timer has expired, check on status
536
- if isinstance(check[0], int) and check[0] < (
537
- int(time.time()) - (config["DelayStart"] * 60)
538
- ):
539
- logging.info("Updating restarted node " + str(check[1]))
540
- node_metrics = read_node_metrics(check[2], check[3])
541
- node_metadata = read_node_metadata(check[2], check[3])
542
- if node_metrics and node_metadata:
543
- update_node_from_metrics(check[1], node_metrics, node_metadata)
544
- records_to_restart -= 1
545
- old["RestartingNodes"] = records_to_restart
546
- return old
547
-
548
-
549
- # Enable firewall for port
550
- def enable_firewall(port, node):
551
- logging.info("enable firewall port {0}/udp".format(port))
552
- # Close ufw firewall
553
- try:
554
- subprocess.run(
555
- ["sudo", "ufw", "allow", "{0}/udp".format(port), "comment", node],
556
- stdout=subprocess.PIPE,
557
- )
558
- except subprocess.CalledProcessError as err:
559
- logging.error("EF Error:", err)
560
-
561
-
562
- # Disable firewall for port
563
- def disable_firewall(port):
564
- logging.info("disable firewall port {0}/udp".format(port))
565
- # Close ufw firewall
566
- try:
567
- subprocess.run(
568
- ["sudo", "ufw", "delete", "allow", "{0}/udp".format(port)],
569
- stdout=subprocess.PIPE,
570
- )
571
- except subprocess.CalledProcessError as err:
572
- logging.error("DF ERROR:", err)
573
-
574
-
575
- # Start a systemd node
576
- def start_systemd_node(node):
577
- logging.info("Starting node " + str(node.id))
578
- # Try to start the service
579
- try:
580
- p = subprocess.run(
581
- ["sudo", "systemctl", "start", node.service],
582
- stdout=subprocess.PIPE,
583
- stderr=subprocess.STDOUT,
584
- ).stdout.decode("utf-8")
585
- if re.match(r"Failed to start", p):
586
- logging.error("SSN2 ERROR:", p)
587
- return False
588
- except subprocess.CalledProcessError as err:
589
- logging.error("SSN1 ERROR:", err)
590
- return False
591
- # Open a firewall hole for the data port
592
- enable_firewall(node.port, node.service)
593
- # Update node status
594
- set_node_status(node.id, RESTARTING)
595
- return True
596
-
597
-
598
- # Stop a systemd node
599
- def stop_systemd_node(node):
600
- logging.info("Stopping node: " + node.service)
601
- # Send a stop signal to the process
602
- try:
603
- subprocess.run(
604
- ["sudo", "systemctl", "stop", node.service], stdout=subprocess.PIPE
605
- )
606
- except subprocess.CalledProcessError as err:
607
- logging.error("SSN2 ERROR:", err)
608
- disable_firewall(node.port)
609
- set_node_status(node.id, STOPPED)
610
-
611
- return True
612
-
613
-
614
- # Upgrade a node
615
- def upgrade_node(node, metrics):
616
- logging.info("Upgrading node " + str(node.id))
617
- # Copy current node binary
618
- try:
619
- subprocess.run(["sudo", "cp", "-f", metrics["antnode"], node.binary])
620
- except subprocess.CalledProcessError as err:
621
- logging.error("UN1 ERROR:", err)
622
- try:
623
- subprocess.run(["sudo", "systemctl", "restart", node.service])
624
- except subprocess.CalledProcessError as err:
625
- logging.error("UN2 ERROR:", err)
626
- version = get_antnode_version(node.binary)
627
- try:
628
- with S() as session:
629
- session.query(Node).filter(Node.id == node.id).update(
630
- {
631
- "status": UPGRADING,
632
- "timestamp": int(time.time()),
633
- "version": metrics["AntNodeVersion"],
634
- }
635
- )
636
- session.commit()
637
- except:
638
- return False
639
- else:
640
- return True
641
80
 
81
+ # Use the new DecisionEngine to plan actions
82
+ engine = DecisionEngine(machine_config, metrics)
83
+ actions = engine.plan_actions()
642
84
 
643
- # Remove a node
644
- def remove_node(id):
645
- logging.info("Removing node " + str(id))
85
+ # Log the computed features for debugging
86
+ logging.info(json.dumps(engine.get_features(), indent=2))
646
87
 
647
- with S() as session:
648
- node = session.execute(select(Node).where(Node.id == id)).first()
649
- # Grab Node from Row
650
- node = node[0]
651
- if stop_systemd_node(node):
652
- # Mark this node as REMOVING
653
- set_node_status(id, REMOVING)
88
+ # Use ActionExecutor to execute the planned actions
89
+ executor = ActionExecutor(S)
90
+ result = executor.execute(actions, machine_config, metrics, dry_run)
654
91
 
655
- nodename = f"antnode{node.nodename}"
656
- # Remove node data and log
657
- try:
658
- subprocess.run(
659
- ["sudo", "rm", "-rf", node.root_dir, f"/var/log/antnode/{nodename}"]
660
- )
661
- except subprocess.CalledProcessError as err:
662
- logging.error("RN1 ERROR:", err)
663
- # Remove systemd service file
664
- try:
665
- subprocess.run(["sudo", "rm", "-f", f"/etc/systemd/system/{node.service}"])
666
- except subprocess.CalledProcessError as err:
667
- logging.error("RN2 ERROR:", err)
668
- # Tell system to reload systemd files
669
- try:
670
- subprocess.run(["sudo", "systemctl", "daemon-reload"])
671
- except subprocess.CalledProcessError as err:
672
- logging.error("RN3 ERROR:", err)
673
- # print(json.dumps(node,indent=2))
92
+ return result
674
93
 
675
94
 
676
- # Rescan nodes for status
677
- def update_nodes():
678
- with S() as session:
679
- nodes = session.execute(
680
- select(Node.timestamp, Node.id, Node.host, Node.metrics_port, Node.status)
681
- .where(Node.status != DISABLED)
682
- .order_by(Node.timestamp.asc())
683
- ).all()
684
- # Iterate through all records
685
- for check in nodes:
686
- # Check on status
687
- if isinstance(check[0], int):
688
- logging.debug("Updating info on node " + str(check[1]))
689
- node_metrics = read_node_metrics(check[2], check[3])
690
- node_metadata = read_node_metadata(check[2], check[3])
691
- if node_metrics and node_metadata:
692
- # Don't write updates for stopped nodes that are already marked as stopped
693
- if node_metadata["status"] == STOPPED and check[4] == STOPPED:
694
- continue
695
- update_node_from_metrics(check[1], node_metrics, node_metadata)
95
+ def main():
696
96
 
97
+ # Are we already running
98
+ if os.path.exists(LOCK_FILE):
99
+ logging.warning("wnm still running")
100
+ sys.exit(1)
697
101
 
698
- # Create a new node
699
- def create_node(config, metrics):
700
- logging.info("Creating new node")
701
- # Create a holding place for the new node
702
- card = {}
703
- # Find the next available node number by first looking for holes
704
- sql = text(
705
- "select n1.id + 1 as id from node n1 "
706
- + "left join node n2 on n2.id = n1.id + 1 "
707
- + "where n2.id is null "
708
- + "and n1.id <> (select max(id) from node) "
709
- + "order by n1.id;"
710
- )
711
- with S() as session:
712
- result = session.execute(sql).first()
713
- if result:
714
- card["id"] = result[0]
715
- # Otherwise get the max node number and add 1
716
- else:
717
- with S() as session:
718
- result = session.execute(select(Node.id).order_by(Node.id.desc())).first()
719
- card["id"] = result[0] + 1
720
- # Set the node name
721
- card["nodename"] = f"{card['id']:04}"
722
- card["service"] = f"antnode{card['nodename']}.service"
723
- card["user"] = "ant"
724
- card["version"] = metrics["AntNodeVersion"]
725
- card["root_dir"] = f"{config['NodeStorage']}/antnode{card['nodename']}"
726
- card["binary"] = f"{card['root_dir']}/antnode"
727
- card["port"] = config["PortStart"] * 1000 + card["id"]
728
- card["metrics_port"] = 13 * 1000 + card["id"]
729
- card["network"] = "evm-arbitrum-one"
730
- card["wallet"] = config["RewardsAddress"]
731
- card["peer_id"] = ""
732
- card["status"] = STOPPED
733
- card["timestamp"] = int(time.time())
734
- card["records"] = 0
735
- card["uptime"] = 0
736
- card["shunned"] = 0
737
- card["age"] = card["timestamp"]
738
- card["host"] = ANM_HOST
739
- log_dir = f"/var/log/antnode/antnode{card['nodename']}"
740
- # Create the node directory and log directory
741
- try:
742
- subprocess.run(
743
- ["sudo", "mkdir", "-p", card["root_dir"], log_dir], stdout=subprocess.PIPE
744
- )
745
- except subprocess.CalledProcessError as err:
746
- logging.error("CN1 ERROR:", err)
747
- # Copy the binary to the node directory
748
- try:
749
- subprocess.run(
750
- ["sudo", "cp", metrics["antnode"], card["root_dir"]], stdout=subprocess.PIPE
751
- )
752
- except subprocess.CalledProcessError as err:
753
- logging.error("CN2 ERROR:", err)
754
- # Change owner of the node directory and log directories
755
- try:
756
- subprocess.run(
757
- [
758
- "sudo",
759
- "chown",
760
- "-R",
761
- f'{card["user"]}:{card["user"]}',
762
- card["root_dir"],
763
- log_dir,
764
- ],
765
- stdout=subprocess.PIPE,
766
- )
767
- except subprocess.CalledProcessError as err:
768
- logging.error("CN3 ERROR:", err)
769
- # build the systemd service unit
770
- service = f"""[Unit]
771
- Description=antnode{card['nodename']}
772
- [Service]
773
- User={card['user']}
774
- ExecStart={card['binary']} --bootstrap-cache-dir /var/antctl/bootstrap-cache --root-dir {card['root_dir']} --port {card['port']} --enable-metrics-server --metrics-server-port {card['metrics_port']} --log-output-dest {log_dir} --max-log-files 1 --max-archived-log-files 1 --rewards-address {card['wallet']} {card['network']}
775
- Restart=always
776
- #RestartSec=300
777
- """
778
- # Write the systemd service unit with sudo tee since we're running as not root
779
- try:
780
- subprocess.run(
781
- ["sudo", "tee", f'/etc/systemd/system/{card["service"]}'],
782
- input=service,
783
- text=True,
784
- stdout=subprocess.PIPE,
785
- )
786
- except subprocess.CalledProcessError as err:
787
- logging.error("CN4 ERROR:", err)
788
- # Reload systemd service files to get our new one
102
+ # We're starting, so lets create a lock file
789
103
  try:
790
- subprocess.run(["sudo", "systemctl", "daemon-reload"], stdout=subprocess.PIPE)
791
- except subprocess.CalledProcessError as err:
792
- logging.error("CN5 ERROR:", err)
793
- # Add the new node to the database
794
- with S() as session:
795
- session.execute(insert(Node), [card])
796
- session.commit()
797
- # Now we grab the node object from the database to pass to start node
798
- with S() as session:
799
- card = session.execute(select(Node).where(Node.id == card["id"])).first()
800
- # Get the Node object from the Row
801
- card = card[0]
802
- # Start the new node
803
- return start_systemd_node(card)
804
- # print(json.dumps(card,indent=2))
805
- return True
806
-
104
+ with open(LOCK_FILE, "w") as file:
105
+ file.write(str(int(time.time())))
106
+ except (PermissionError, OSError) as e:
107
+ logging.error(f"Unable to create lock file: {e}")
108
+ sys.exit(1)
807
109
 
808
- # Make a decision about what to do
809
- def choose_action(config, metrics, db_nodes):
810
- # Gather knowlege
811
- features = {}
812
- features["AllowCpu"] = metrics["UsedCpuPercent"] < config["CpuLessThan"]
813
- features["AllowMem"] = metrics["UsedMemPercent"] < config["MemLessThan"]
814
- features["AllowHD"] = metrics["UsedHDPercent"] < config["HDLessThan"]
815
- features["RemCpu"] = metrics["UsedCpuPercent"] > config["CpuRemove"]
816
- features["RemMem"] = metrics["UsedMemPercent"] > config["MemRemove"]
817
- features["RemHD"] = metrics["UsedHDPercent"] > config["HDRemove"]
818
- features["AllowNodeCap"] = metrics["RunningNodes"] < config["NodeCap"]
819
- # These are new features, so ignore them if not configured
820
- if (
821
- config["NetIOReadLessThan"]
822
- + config["NetIOReadRemove"]
823
- + config["NetIOWriteLessThan"]
824
- + config["NetIOWriteRemove"]
825
- > 1
826
- ):
827
- features["AllowNetIO"] = (
828
- metrics["NetReadBytes"] < config["NetIOReadLessThan"]
829
- and metrics["NetWriteBytes"] < config["NetIOWriteLessThan"]
830
- )
831
- features["RemoveNetIO"] = (
832
- metrics["NetReadBytes"] > config["NetIORemove"]
833
- or metrics["NetWriteBytes"] > config["NetIORemove"]
834
- )
835
- else:
836
- features["AllowNetIO"] = True
837
- features["RemoveNetIO"] = False
838
- if (
839
- config["HDIOReadLessThan"]
840
- + config["HDIOReadRemove"]
841
- + config["HDIOWriteLessThan"]
842
- + config["HDIOWriteRemove"]
843
- > 1
844
- ):
845
- features["AllowHDIO"] = (
846
- metrics["HDReadBytes"] < config["HDIOReadLessThan"]
847
- and metrics["HDWriteBytes"] < config["HDIOWriteLessThan"]
848
- )
849
- features["RemoveHDIO"] = (
850
- metrics["HDReadBytes"] > config["HDIORemove"]
851
- or metrics["HDWriteBytes"] > config["HDtIORemove"]
852
- )
110
+ # Config should have loaded the machine_config
111
+ if machine_config:
112
+ logging.info("Machine: " + json.dumps(machine_config))
853
113
  else:
854
- features["AllowHDIO"] = True
855
- features["RemoveHDIO"] = False
856
- features["LoadAllow"] = (
857
- metrics["LoadAverage1"] < config["DesiredLoadAverage"]
858
- and metrics["LoadAverage5"] < config["DesiredLoadAverage"]
859
- and metrics["LoadAverage15"] < config["DesiredLoadAverage"]
860
- )
861
- features["LoadNotAllow"] = (
862
- metrics["LoadAverage1"] > config["MaxLoadAverageAllowed"]
863
- or metrics["LoadAverage5"] > config["MaxLoadAverageAllowed"]
864
- or metrics["LoadAverage15"] > config["MaxLoadAverageAllowed"]
865
- )
866
- # Check records for expired status
867
- metrics = update_counters(metrics, config)
868
- # If we have other thing going on, don't add more nodes
869
- features["AddNewNode"] = (
870
- sum(
871
- [
872
- metrics.get(m, 0)
873
- for m in [
874
- "UpgradingNodes",
875
- "RestartingNodes",
876
- "MigratingNodes",
877
- "RemovingNodes",
878
- ]
879
- ]
880
- )
881
- == 0
882
- and features["AllowCpu"]
883
- and features["AllowHD"]
884
- and features["AllowMem"]
885
- and features["AllowNodeCap"]
886
- and features["AllowHDIO"]
887
- and features["AllowNetIO"]
888
- and features["LoadAllow"]
889
- and metrics["TotalNodes"] < config["NodeCap"]
890
- )
891
- # Are we overlimit on nodes
892
- features["Remove"] = (
893
- features["LoadNotAllow"]
894
- or features["RemCpu"]
895
- or features["RemHD"]
896
- or features["RemMem"]
897
- or features["RemoveHDIO"]
898
- or features["RemoveNetIO"]
899
- or metrics["TotalNodes"] > config["NodeCap"]
900
- )
901
- # If we have nodes to upgrade
902
- if metrics["NodesToUpgrade"] >= 1:
903
- # Make sure current version is equal or newer than version on first node.
904
- if Version(metrics["AntNodeVersion"]) < Version(db_nodes[0][1]):
905
- logging.warning("node upgrade cancelled due to lower version")
906
- features["Upgrade"] = False
114
+ logging.error("Unable to load machine config, exiting")
115
+ sys.exit(1)
116
+ # Check for config updates
117
+ if config_updates:
118
+ logging.info("Update: " + json.dumps(config_updates))
119
+ if options.dry_run:
120
+ logging.warning("Dry run, not saving requested updates")
121
+ # Create a dictionary for the machine config
122
+ # Machine by default returns a parameter array,
123
+ # use the __json__ method to return a dict
124
+ local_config = json.loads(json.dumps(machine_config))
125
+ # Apply the local config with the requested updates
126
+ local_config.update(config_updates)
907
127
  else:
908
- if features["Remove"]:
909
- logging.info("Can't upgrade while removing is required")
910
- features["Upgrade"] = False
911
- else:
912
- features["Upgrade"] = True
128
+ # Store the config changes to the database
129
+ apply_config_updates(config_updates)
130
+ # Create a working dictionary for the machine config
131
+ # Machine by default returns a parameter array,
132
+ # use the __json__ method to return a dict
133
+ local_config = json.loads(json.dumps(machine_config))
913
134
  else:
914
- features["Upgrade"] = False
915
-
916
- logging.info(json.dumps(features, indent=2))
917
- ##### Decisions
918
-
919
- # Actually, removing DEAD nodes take priority
920
- if metrics["DeadNodes"] > 1:
921
- with S() as session:
922
- broken = session.execute(
923
- select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
924
- .where(Node.status == DEAD)
925
- .order_by(Node.timestamp.asc())
926
- ).all()
927
- # Iterate through dead nodes and remove them all
928
- for check in broken:
929
- # Remove broken nodes
930
- logging.info("Removing dead node " + str(check[1]))
931
- remove_node(check[1])
932
- return {"status": "removed-dead-nodes"}
933
- # If we have nodes with no version number, update from binary
934
- if metrics["NodesNoVersion"] > 1:
935
- with S() as session:
936
- no_version = session.execute(
937
- select(Node.timestamp, Node.id, Node.binary)
938
- .where(Node.version == "")
939
- .order_by(Node.timestamp.asc())
940
- ).all()
941
- # Iterate through nodes with no version number
942
- for check in no_version:
943
- # Update version number from binary
944
- version = get_antnode_version(check[2])
945
- logging.info(f"Updating version number for node {check[1]} to {version}")
946
- with S() as session:
947
- session.query(Node).filter(Node.id == check[1]).update(
948
- {"version": version}
949
- )
950
- session.commit()
951
-
952
- # If we're restarting, wait patiently as metrics could be skewed
953
- if metrics["RestartingNodes"]:
954
- logging.info("Still waiting for RestartDelay")
955
- return {"status": RESTARTING}
956
- # If we still have unexpired upgrade records, wait
957
- if metrics["UpgradingNodes"]:
958
- logging.info("Still waiting for UpgradeDelay")
959
- return {"status": UPGRADING}
960
- # First if we're removing, that takes top priority
961
- if features["Remove"]:
962
- # If we still have unexpired removal records, wait
963
- if metrics["RemovingNodes"]:
964
- logging.info("Still waiting for RemoveDelay")
965
- return {"status": REMOVING}
966
- # If we're under HD pressure or trimming node cap, remove nodes
967
- if features["RemHD"] or metrics["TotalNodes"] > config["NodeCap"]:
968
- # Start removing with stopped nodes
969
- if metrics["StoppedNodes"] > 0:
970
- # What is the youngest stopped node
971
- with S() as session:
972
- youngest = session.execute(
973
- select(Node.id)
974
- .where(Node.status == STOPPED)
975
- .order_by(Node.age.desc())
976
- ).first()
977
- if youngest:
978
- # Remove the youngest node
979
- remove_node(youngest[0])
980
- return {"status": REMOVING}
981
- # No low hanging fruit. let's start with the youngest running node
982
- with S() as session:
983
- youngest = session.execute(
984
- select(Node.id)
985
- .where(Node.status == RUNNING)
986
- .order_by(Node.age.desc())
987
- ).first()
988
- if youngest:
989
- # Remove the youngest node
990
- remove_node(youngest[0])
991
- return {"status": REMOVING}
992
- return {"status": "nothing-to-remove"}
993
- # Otherwise, let's try just stopping a node to bring IO/Mem/Cpu down
994
- else:
995
- # If we just stopped a node, wait
996
- if int(config["LastStoppedAt"] or 0) > (
997
- int(time.time()) - (config["DelayRemove"] * 60)
998
- ):
999
- logging.info("Still waiting for RemoveDelay")
1000
- return {"status": "waiting-to-stop"}
1001
- # Start with the youngest running node
1002
- with S() as session:
1003
- youngest = session.execute(
1004
- select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
1005
- ).first()
1006
- if youngest:
1007
- # Stop the youngest node
1008
- stop_systemd_node(youngest[0])
1009
- # Update the last stopped time
1010
- with S() as session:
1011
- session.query(Machine).filter(Machine.id == 1).update(
1012
- {"LastStoppedAt": int(time.time())}
1013
- )
1014
- session.commit()
1015
- return {"status": STOPPED}
1016
- else:
1017
- return {"status": "nothing-to-stop"}
1018
-
1019
- # Do we have upgrading to do?
1020
- if features["Upgrade"]:
1021
- # Let's find the oldest running node not using the current version
1022
- with S() as session:
1023
- oldest = session.execute(
1024
- select(Node)
1025
- .where(Node.status == RUNNING)
1026
- .where(Node.version != metrics["AntNodeVersion"])
1027
- .order_by(Node.age.asc())
1028
- ).first()
1029
- if oldest:
1030
- # Get Node from Row
1031
- oldest = oldest[0]
1032
- # If we don't have a version number from metadata, grab from binary
1033
- if not oldest.version:
1034
- oldest.version = get_antnode_version(oldest.binary)
1035
- # print(json.dumps(oldest))
1036
- # Upgrade the oldest node
1037
- upgrade_node(oldest, metrics)
1038
- return {"status": UPGRADING}
1039
-
1040
- # If AddNewNode
1041
- # If stopped nodes available
1042
- # Check oldest stopped version
1043
- # If out of date
1044
- # upgrade node which starts it
1045
- # else
1046
- # restart node
1047
- # else
1048
- # Create a Node which starts it
1049
- if features["AddNewNode"]:
1050
- # Start adding with stopped nodes
1051
- if metrics["StoppedNodes"] > 0:
1052
- # What is the oldest stopped node
1053
- with S() as session:
1054
- oldest = session.execute(
1055
- select(Node).where(Node.status == STOPPED).order_by(Node.age.asc())
1056
- ).first()
1057
- if oldest:
1058
- # Get Node from Row
1059
- oldest = oldest[0]
1060
- # If we don't have a version number from metadata, grab from binary
1061
- if not oldest.version:
1062
- oldest.version = get_antnode_version(oldest.binary)
1063
- # If the stopped version is old, upgrade it
1064
- if Version(metrics["AntNodeVersion"]) > Version(oldest.version):
1065
- upgrade_node(oldest, metrics)
1066
- return {"status": UPGRADING}
135
+ local_config = json.loads(json.dumps(machine_config))
136
+
137
+ metrics = get_machine_metrics(
138
+ S,
139
+ local_config["node_storage"],
140
+ local_config["hd_remove"],
141
+ local_config["crisis_bytes"],
142
+ )
143
+ logging.info(json.dumps(metrics, indent=2))
144
+
145
+ # Do we already have nodes
146
+ if metrics["total_nodes"] == 0:
147
+ # Are we migrating an anm server
148
+ if options.init and options.migrate_anm:
149
+ Workers = survey_machine(machine_config) or []
150
+ if Workers:
151
+ if options.dry_run:
152
+ logging.warning(f"DRYRUN: Not saving {len(Workers)} detected nodes")
1067
153
  else:
1068
- if start_systemd_node(oldest):
1069
- return {"status": RESTARTING}
1070
- else:
1071
- return {"status": "failed-start-node"}
1072
- # Hmm, still in Start mode, we shouldn't get here
1073
- return {"status": "START"}
1074
- # Still in Add mode, add a new node
1075
- if metrics["TotalNodes"] < config["NodeCap"]:
1076
- if create_node(config, metrics):
1077
- return {"status": "ADD"}
154
+ with S() as session:
155
+ session.execute(insert(Node), Workers)
156
+ session.commit()
157
+ # Reload metrics
158
+ metrics = get_machine_metrics(
159
+ S,
160
+ local_config["node_storage"],
161
+ local_config["hd_remove"],
162
+ local_config["crisis_bytes"],
163
+ )
164
+ logging.info(
165
+ "Found {counter} nodes defined".format(
166
+ counter=metrics["total_nodes"]
167
+ )
168
+ )
1078
169
  else:
1079
- return {"status": "failed-create-node"}
170
+ logging.warning("Requested migration but no nodes found")
1080
171
  else:
1081
- return {"status": "node-cap-reached"}
1082
- # If we have nothing to do, Survey the node ports
1083
- update_nodes()
1084
- return {"status": "idle"}
1085
-
1086
-
1087
- def main():
1088
- # We're starting, so lets create a lock file
1089
- try:
1090
- with open("/var/antctl/wnm_active", "w") as file:
1091
- file.write(str(int(time.time())))
1092
- except:
1093
- logging.error("Unable to create lock file, exiting")
1094
- sys.exit(1)
1095
-
1096
- # See if we already have a known state in the database
1097
- with S() as session:
1098
- db_nodes = session.execute(
1099
- select(
1100
- Node.status,
1101
- Node.version,
1102
- Node.host,
1103
- Node.metrics_port,
1104
- Node.port,
1105
- Node.age,
1106
- Node.id,
1107
- Node.timestamp,
172
+ logging.info("No nodes found")
173
+ else:
174
+ logging.info(
175
+ "Found {counter} nodes configured".format(counter=metrics["total_nodes"])
176
+ )
177
+
178
+ # Check for reports
179
+ if options.report:
180
+ from wnm.reports import generate_node_status_report, generate_node_status_details_report
181
+
182
+ # If survey action is specified, run it first
183
+ if options.force_action == "survey":
184
+ logging.info("Running survey before generating report")
185
+ executor = ActionExecutor(S)
186
+ survey_result = executor.execute_forced_action(
187
+ "survey",
188
+ local_config,
189
+ metrics,
190
+ service_name=options.service_name,
191
+ dry_run=options.dry_run,
1108
192
  )
1109
- ).all()
1110
- anm_config = session.execute(select(Machine)).all()
1111
-
1112
- if db_nodes:
1113
- # anm_config by default loads a parameter array,
1114
- # use the __json__ method to return a dict from the first node
1115
- anm_config = json.loads(json.dumps(anm_config[0][0])) or load_anm_config()
1116
- metrics = get_machine_metrics(anm_config["NodeStorage"], anm_config["HDRemove"])
1117
- # node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
1118
- # print(db_nodes[0])
1119
- # print(node_metrics)
1120
- # print(anm_config)
1121
- # print(json.dumps(anm_config,indent=4))
1122
- # print("Node: ",db_nodes)
1123
- logging.info("Found {counter} nodes migrated".format(counter=len(db_nodes)))
193
+ logging.info(f"Survey result: {survey_result}")
1124
194
 
195
+ # Generate the report
196
+ if options.report == "node-status":
197
+ report_output = generate_node_status_report(
198
+ S, options.service_name, options.report_format
199
+ )
200
+ elif options.report == "node-status-details":
201
+ report_output = generate_node_status_details_report(
202
+ S, options.service_name, options.report_format
203
+ )
204
+ else:
205
+ report_output = f"Unknown report type: {options.report}"
206
+
207
+ print(report_output)
208
+ os.remove(LOCK_FILE)
209
+ sys.exit(0)
210
+
211
+ # Check for forced actions
212
+ if options.force_action:
213
+ logging.info(f"Executing forced action: {options.force_action}")
214
+ executor = ActionExecutor(S)
215
+ this_action = executor.execute_forced_action(
216
+ options.force_action,
217
+ local_config,
218
+ metrics,
219
+ service_name=options.service_name,
220
+ dry_run=options.dry_run,
221
+ count=options.count if hasattr(options, 'count') else 1,
222
+ )
1125
223
  else:
1126
- anm_config = load_anm_config()
1127
- # print(anm_config)
1128
- Workers = survey_machine() or []
1129
-
1130
- # """"
1131
- with S() as session:
1132
- session.execute(insert(Node), Workers)
1133
- session.commit()
1134
- # """
1135
-
1136
- with S() as session:
1137
- session.execute(insert(Machine), [anm_config])
1138
- session.commit()
1139
-
1140
- # Now load subset of data to work with
1141
- with S() as session:
1142
- db_nodes = session.execute(
1143
- select(
1144
- Node.status,
1145
- Node.version,
1146
- Node.host,
1147
- Node.metrics_port,
1148
- Node.port,
1149
- Node.age,
1150
- Node.id,
1151
- Node.timestamp,
1152
- )
1153
- ).all()
1154
-
1155
- # print(json.dumps(anm_config,indent=4))
1156
- logging.info("Found {counter} nodes configured".format(counter=len(db_nodes)))
224
+ this_action = choose_action(local_config, metrics, options.dry_run)
1157
225
 
1158
- # versions = [v[1] for worker in Workers if (v := worker.get('version'))]
1159
- # data = Counter(ver for ver in versions)
1160
-
1161
- data = Counter(status[0] for status in db_nodes)
1162
- # print(data)
1163
- print("Running Nodes:", data[RUNNING])
1164
- print("Restarting Nodes:", data[RESTARTING])
1165
- print("Stopped Nodes:", data[STOPPED])
1166
- print("Upgrading Nodes:", data[UPGRADING])
1167
- print("Removing Nodes:", data[REMOVING])
1168
- data = Counter(ver[1] for ver in db_nodes)
1169
- print("Versions:", data)
1170
-
1171
- machine_metrics = get_machine_metrics(
1172
- anm_config["NodeStorage"], anm_config["HDRemove"]
1173
- )
1174
- print(json.dumps(anm_config, indent=2))
1175
- print(json.dumps(machine_metrics, indent=2))
1176
- this_action = choose_action(anm_config, machine_metrics, db_nodes)
1177
226
  print("Action:", json.dumps(this_action, indent=2))
1178
- # Remove lock file
1179
- os.remove("/var/antctl/wnm_active")
227
+
228
+ os.remove(LOCK_FILE)
229
+ sys.exit(1)
1180
230
 
1181
231
 
1182
232
  if __name__ == "__main__":
1183
233
  main()
234
+ # print(options.MemRemove)
1184
235
 
1185
236
  print("End of program")