wnm-0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wnm might be problematic.

wnm/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """A service to manage a cluster of decentralized Autonomi nodes"""
+
+ __version__ = "0.0.4"
wnm/__main__.py ADDED
@@ -0,0 +1,976 @@
+ import os, sys
+ import re, json, requests, time
+ import subprocess, logging
+ from collections import Counter
+ from packaging.version import Version
+ from dotenv import load_dotenv
+ import psutil, shutil
+
+ from wnm.models import Base, Machine, Node
+ from sqlalchemy import create_engine, select, insert, update, delete, text
+ from sqlalchemy.orm import sessionmaker, scoped_session
+
+ logging.basicConfig(level=logging.INFO)
+ # Info-level logging for sqlalchemy is too verbose, only enable when needed
+ logging.getLogger('sqlalchemy.engine.Engine').disabled = True
+
+ # import .env
+ basedir = os.path.abspath(os.path.dirname(__file__))
+ load_dotenv(os.path.join(basedir, '.env'))
+
+ # simulate arg/yaml configuration
+ config = {}
+ config['db']='sqlite:///colony.db'
+ config['DonateAddress'] = os.getenv('DonateAddress') or '0x00455d78f850b0358E8cea5be24d415E01E107CF'
+ config['ANMHost'] = os.getenv('ANMHost') or '127.0.0.1'
+ config['CrisisBytes'] = os.getenv('CrisisBytes') or 2 * 10 ** 9 # default 2 GB per node
+
+
+ # Set up the database engine
+ engine = create_engine(config["db"], echo=True)
+
+ # Generate ORM
+ Base.metadata.create_all(engine)
+
+ # Create a connection to the ORM
+ session_factory = sessionmaker(bind=engine)
+ S = scoped_session(session_factory)
+
+
+ # if WNM_CONFIG or the -c parameter is set, check for existing config
+ # else:
+
+ # Primary node for want of one
+ QUEEN=1
+
+ # Donation address
+ DONATE=config["DonateAddress"]
+ # Keep these as strings so they can be grepped in logs
+ STOPPED="STOPPED" #0 Node is not responding to its metrics port
+ RUNNING="RUNNING" #1 Node is responding to its metrics port
+ UPGRADING="UPGRADING" #2 Upgrade in progress
+ DISABLED="DISABLED" #-1 Do not start
+ RESTARTING="RESTARTING" #3 Re/starting a server intentionally
+ MIGRATING="MIGRATING" #4 Moving volumes in progress
+ REMOVING="REMOVING" #5 Removing node in progress
+ DEAD="DEAD" #-86 Broken node to clean up
+
+ ANM_HOST=config["ANMHost"]
+ # Baseline bytes per node
+ CRISIS_BYTES=config["CrisisBytes"]
+
+ # A storage place for ant node data
+ Workers=[]
+
+ # Detect ANM (but don't upgrade)
+ if os.path.exists("/var/antctl/system"):
+ # Is anm scheduled to run?
+ if os.path.exists("/etc/cron.d/anm"):
+ # Remove the cron job to disable old anm
+ os.remove("/etc/cron.d/anm")
+ # Is anm still running? We'll wait
+ if os.path.exists("/var/antctl/block"):
+ logging.info("anm still running, waiting...")
+ sys.exit(1)
+
+ # Are we already running?
+ if os.path.exists("/var/antctl/wnm_active"):
+ logging.info("wnm still running")
+ sys.exit(1)
+
+
+ # Get anm configuration
+ def load_anm_config():
+ anm_config = {}
+
+ # Let's get the real count of CPUs available to this process
+ anm_config["CpuCount"] = len(os.sched_getaffinity(0))
+
+ # What can we save from /var/antctl/config
+ if os.path.exists("/var/antctl/config"):
+ load_dotenv("/var/antctl/config")
+ anm_config["NodeCap"] = int(os.getenv('NodeCap') or 20)
+ anm_config["CpuLessThan"] = int(os.getenv('CpuLessThan') or 50)
+ anm_config["CpuRemove"] = int(os.getenv('CpuRemove') or 70)
+ anm_config["MemLessThan"] = int(os.getenv('MemLessThan') or 70)
+ anm_config["MemRemove"] = int(os.getenv('MemRemove') or 90)
+ anm_config["HDLessThan"] = int(os.getenv('HDLessThan') or 70)
+ anm_config["HDRemove"] = int(os.getenv('HDRemove') or 90)
+ anm_config["DelayStart"] = int(os.getenv('DelayStart') or 5)
+ anm_config["DelayUpgrade"] = int(os.getenv('DelayUpgrade') or 5)
+ anm_config["DelayRestart"] = int(os.getenv('DelayRestart') or 10)
+ anm_config["DelayRemove"] = int(os.getenv('DelayRemove') or 300)
+ anm_config["NodeStorage"] = os.getenv('NodeStorage') or "/var/antctl/services"
+ # Default to the faucet donation address
+ try:
+ anm_config["RewardsAddress"] = re.findall(r"--rewards-address ([\dA-Fa-fXx]+)",os.getenv('RewardsAddress'))[0]
+ except:
+ try:
+ anm_config["RewardsAddress"] = re.findall(r"([\dA-Fa-fXx]+)",os.getenv("RewardsAddress"))[0]
+ except:
+ logging.warning("Unable to detect RewardsAddress")
+ sys.exit(1)
+ anm_config["DonateAddress"]=os.getenv("DonateAddress") or DONATE
+ anm_config["MaxLoadAverageAllowed"]=float(os.getenv("MaxLoadAverageAllowed") or anm_config["CpuCount"])
+ anm_config["DesiredLoadAverage"]=float(os.getenv("DesiredLoadAverage") or (anm_config["CpuCount"] * .6))
+
+ try:
+ with open('/usr/bin/anms.sh', 'r') as file:
+ data = file.read()
+ anm_config["PortStart"]=int(re.findall(r"ntpr\=(\d+)",data)[0])
+ except:
+ anm_config["PortStart"]=55
+
+ anm_config["HDIOReadLessThan"] = float(os.getenv('HDIOReadLessThan') or 0.0)
+ anm_config["HDIOReadRemove"] = float(os.getenv('HDIOReadRemove') or 0.0)
+ anm_config["HDIOWriteLessThan"] = float(os.getenv('HDIOWriteLessThan') or 0.0)
+ anm_config["HDIOWriteRemove"] = float(os.getenv('HDIOWriteRemove') or 0.0)
+ anm_config["NetIOReadLessThan"] = float(os.getenv('NetIOReadLessThan') or 0.0)
+ anm_config["NetIOReadRemove"] = float(os.getenv('NetIOReadRemove') or 0.0)
+ anm_config["NetIOWriteLessThan"] = float(os.getenv('NetIOWriteLessThan') or 0.0)
+ anm_config["NetIOWriteRemove"] = float(os.getenv('NetIOWriteRemove') or 0.0)
+ # Timer for last stopped nodes
+ anm_config["LastStoppedAt"]=0
+
+
+ return anm_config
+
+ # Read config from systemd service file
+ def read_systemd_service(antnode):
+ details={}
+ try:
+ with open('/etc/systemd/system/'+antnode, 'r') as file:
+ data = file.read()
+ details['id']=int(re.findall(r"antnode(\d+)",antnode)[0])
+ details['binary']=re.findall(r"ExecStart=([^ ]+)",data)[0]
+ details["user"]=re.findall(r"User=(\w+)",data)[0]
+ details["root_dir"]=re.findall(r"--root-dir ([\w\/]+)",data)[0]
+ details["port"]=int(re.findall(r"--port (\d+)",data)[0])
+ details["metrics_port"]=int(re.findall(r"--metrics-server-port (\d+)",data)[0])
+ details["wallet"]=re.findall(r"--rewards-address ([^ ]+)",data)[0]
+ details["network"]=re.findall(r"--rewards-address [^ ]+ ([\w\-]+)",data)[0]
+ except:
+ pass
+
+ return details
+
+ # Read data from metadata endpoint
+ def read_node_metadata(host,port):
+ # Only return version number when we have one, to stop clobbering the binary check
+ try:
+ url = "http://{0}:{1}/metadata".format(host,port)
+ response = requests.get(url)
+ data=response.text
+ except requests.exceptions.ConnectionError:
+ logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
+ return {"status": STOPPED, "peer_id":""}
+ except Exception as error:
+ template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
+ message = template.format(type(error).__name__, error.args)
+ logging.info(message)
+ return {"status": STOPPED, "peer_id":""}
+ # collect a dict to return
+ card={}
+ try:
+ card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}',data)[0]
+ except:
+ logging.info('No version found')
+ try:
+ card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}',data)[0]
+ except:
+ card["peer_id"] = ""
+ card["status"] = RUNNING if "version" in card else STOPPED
+ return card
+
+ # Read data from metrics port
+ def read_node_metrics(host,port):
+ metrics={}
+ try:
+ url = "http://{0}:{1}/metrics".format(host,port)
+ response = requests.get(url)
+ metrics["status"] = RUNNING
+ metrics["uptime"] = int((re.findall(r'ant_node_uptime ([\d]+)',response.text) or [0])[0])
+ metrics["records"] = int((re.findall(r'ant_networking_records_stored ([\d]+)',response.text) or [0])[0])
+ metrics["shunned"] = int((re.findall(r'ant_networking_shunned_by_close_group ([\d]+)',response.text) or [0])[0])
+ except requests.exceptions.ConnectionError:
+ logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
+ metrics["status"] = STOPPED
+ metrics["uptime"] = 0
+ metrics["records"] = 0
+ metrics["shunned"] = 0
+ except Exception as error:
+ template = "in:RNM - An exception of type {0} occurred. Arguments:\n{1!r}"
+ message = template.format(type(error).__name__, error.args)
+ logging.info(message)
+ metrics["status"] = STOPPED
+ metrics["uptime"] = 0
+ metrics["records"] = 0
+ metrics["shunned"] = 0
+ return metrics
+
+ # Read antnode binary version
+ def get_antnode_version(binary):
+ try:
+ data = subprocess.run([binary, '--version'], stdout=subprocess.PIPE).stdout.decode('utf-8')
+ return re.findall(r'Autonomi Node v([\d\.]+)',data)[0]
+ except Exception as error:
+ template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
+ message = template.format(type(error).__name__, error.args)
+ logging.info(message)
+ return 0
+
+ # Determine how long this node has been around by looking at its secret-key file
+ def get_node_age(root_dir):
+ try:
+ return int(os.stat("{0}/secret-key".format(root_dir)).st_mtime)
+ except:
+ return 0
+
+ # Survey nodes by reading metadata from metrics ports or binary --version
+ def survey_anm_nodes(antnodes):
+ # Build a list of node dictionaries to return
+ details=[]
+ # Iterate on nodes
+ for node in antnodes:
+ # Initialize a dict
+ logging.debug("{0} surveying node {1} ".format(time.strftime("%Y-%m-%d %H:%M"),node))
+ if not re.findall(r"antnode([\d]+).service",node):
+ logging.info("can't decode "+str(node))
+ continue
+ card={"nodename":re.findall(r"antnode([\d]+).service",node)[0],
+ "service": node,
+ "timestamp": int(time.time()),
+ "host": ANM_HOST or '127.0.0.1'
+ }
+ # Load what systemd has configured
+ card.update(read_systemd_service(node))
+ #print(json.dumps(card,indent=2))
+ # Read metadata from metrics_port
+ metadata = read_node_metadata(card["host"],card["metrics_port"])
+ #print(json.dumps(metadata,indent=2))
+ if isinstance(metadata,dict) and \
+ "status" in metadata and \
+ metadata["status"]==RUNNING:
+ # soak up metadata
+ card.update(metadata)
+ # The port's up, so grab metrics too
+ card.update(read_node_metrics(card["host"],card["metrics_port"]))
+ # Else run binary to get version
+ else:
+ # If the root directory of the node is missing, it's a bad node
+ if not os.path.isdir(card["root_dir"]):
+ card["status"]=DEAD
+ card["version"]=''
+ else:
+ card["status"]=STOPPED
+ card["version"]=get_antnode_version(card["binary"])
+ card["peer_id"]=''
+ card["records"]=0
+ card["uptime"]=0
+ card["shunned"]=0
+ card["age"]=get_node_age(card["root_dir"])
+ # hardcoded for anm
+ card["host"]=ANM_HOST
+ # Append the node dict to the detail list
+ details.append(card)
+
+ return details
+
+ # Survey server instance
+ def survey_machine():
+ # Make a bucket
+ antnodes=[]
+ # For all service files
+ for file in os.listdir("/etc/systemd/system"):
+ # Find antnodes
+ if re.match(r'antnode[\d]+\.service',file):
+ antnodes.append(file)
+ #if len(antnodes)>=5:
+ # break
+ # Iterate over defined nodes and get details
+ # Ingests a list of service files and outputs a list of dictionaries
+ return survey_anm_nodes(antnodes)
+
+ # Read system status
+ def get_machine_metrics(node_storage,remove_limit):
+ metrics = {}
+
+ with S() as session:
+ db_nodes=session.execute(select(Node.status,Node.version)).all()
+
+ # Get some initial stats for comparing after a few seconds
+ # We start these counters AFTER reading the database
+ start_time=time.time()
+ start_disk_counters=psutil.disk_io_counters()
+ start_net_counters=psutil.net_io_counters()
+
+ metrics["TotalNodes"]=len(db_nodes)
+ data = Counter(node[0] for node in db_nodes)
+ metrics["RunningNodes"] = data[RUNNING]
+ metrics["StoppedNodes"] = data[STOPPED]
+ metrics["RestartingNodes"] = data[RESTARTING]
+ metrics["UpgradingNodes"] = data[UPGRADING]
+ metrics["MigratingNodes"] = data[MIGRATING]
+ metrics["RemovingNodes"] = data[REMOVING]
+ metrics["DeadNodes"] = data[DEAD]
+ metrics["antnode"]=shutil.which("antnode")
+ if not metrics["antnode"]:
+ logging.warning("Unable to locate current antnode binary, exiting")
+ sys.exit(1)
+ metrics["AntNodeVersion"]=get_antnode_version(metrics["antnode"])
+ metrics["NodesLatestV"]=sum(1 for node in db_nodes if node[1]==metrics["AntNodeVersion"]) or 0
+ metrics["NodesNoVersion"]=sum(1 for node in db_nodes if not node[1]) or 0
+ metrics["NodesToUpgrade"]=metrics["TotalNodes"] - metrics["NodesLatestV"] - metrics["NodesNoVersion"]
+
+ # Windows has to build load average over 5 seconds. The first 5 seconds return 0s
+ # I don't plan on supporting Windows, but if this gets modular, I don't want this
+ # issue to be skipped
+ #if platform.system() == "Windows":
+ # discard=psutil.getloadavg()
+ # time.sleep(5)
+ metrics["LoadAverage1"],metrics["LoadAverage5"],metrics["LoadAverage15"]=psutil.getloadavg()
+ # Get CPU Metrics over 1 second
+ metrics["IdleCpuPercent"],metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
+ # Really we returned the idle percent; subtract from 100 to get used.
+ metrics["UsedCpuPercent"] = 100 - metrics["IdleCpuPercent"]
+ data=psutil.virtual_memory()
+ #print(data)
+ metrics["UsedMemPercent"]=data.percent
+ metrics["FreeMemPercent"]=100-metrics["UsedMemPercent"]
+ data=psutil.disk_io_counters()
+ # This only checks the drive mapped to the first node and will need to be updated
+ # when we eventually support multiple drives
+ data=psutil.disk_usage(node_storage)
+ metrics["UsedHDPercent"]=data.percent
+ metrics["TotalHDBytes"]=data.total
+ end_time=time.time()
+ end_disk_counters=psutil.disk_io_counters()
+ end_net_counters=psutil.net_io_counters()
+ metrics["HDWriteBytes"]=int((end_disk_counters.write_bytes-start_disk_counters.write_bytes)/(end_time-start_time))
+ metrics["HDReadBytes"]=int((end_disk_counters.read_bytes-start_disk_counters.read_bytes)/(end_time-start_time))
+ metrics["NetWriteBytes"]=int((end_net_counters.bytes_sent-start_net_counters.bytes_sent)/(end_time-start_time))
+ metrics["NetReadBytes"]=int((end_net_counters.bytes_recv-start_net_counters.bytes_recv)/(end_time-start_time))
+ #print (json.dumps(metrics,indent=2))
+ # How close (out of 100) to removal limit will we be with a max bytes per node (2GB default)
+ # For running nodes with Porpoise(tm).
+ metrics["NodeHDCrisis"]=int((((metrics["TotalNodes"])*CRISIS_BYTES)/(metrics["TotalHDBytes"]*(remove_limit/100)))*100)
+ return metrics
+
+ # Update node with metrics result
+ def update_node_from_metrics(id,metrics,metadata):
+ try:
+ # We check the binary version in other code, so let's stop clobbering it when a node is stopped
+ card={'status': metrics["status"], 'timestamp': int(time.time()),
+ 'uptime': metrics["uptime"], 'records': metrics["records"],
+ 'shunned': metrics["shunned"],
+ 'peer_id': metadata["peer_id"]}
+ if "version" in metadata:
+ card['version']=metadata["version"]
+ with S() as session:
+ session.query(Node).filter(Node.id == id).\
+ update(card)
+ session.commit()
+ except Exception as error:
+ template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
+ message = template.format(type(error).__name__, error.args)
+ logging.warning(message)
+ return False
+ else:
+ return True
+
+ # Set Node status
+ def set_node_status(id,status):
+ logging.info("Setting node status: {0} {1}".format(id,status))
+ try:
+ with S() as session:
+ session.query(Node).filter(Node.id == id).\
+ update({'status': status, 'timestamp': int(time.time())})
+ session.commit()
+ except:
+ return False
+ else:
+ return True
+
+ # Update metrics after checking counters
+ def update_counters(old,config):
+ # Are we already removing a node
+ if old["RemovingNodes"]:
+ with S() as session:
+ removals=session.execute(select(Node.timestamp,Node.id)\
+ .where(Node.status == REMOVING)\
+ .order_by(Node.timestamp.asc())).all()
+ # Iterate through active removals
+ records_to_remove = len(removals)
+ for check in removals:
+ # If the DelayRemove timer has expired, delete the entry
+ if isinstance(check[0],int) and \
+ check[0] < (int(time.time()) - (config["DelayRemove"]*60)):
+ logging.info("Deleting removed node "+str(check[1]))
+ with S() as session:
+ session.execute(delete(Node).where(Node.id==check[1]))
+ session.commit()
+ records_to_remove-=1
+ old["RemovingNodes"]=records_to_remove
+ # Are we already upgrading a node
+ if old["UpgradingNodes"]:
+ with S() as session:
+ upgrades=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
+ .where(Node.status == UPGRADING)\
+ .order_by(Node.timestamp.asc())).all()
+ # Iterate through active upgrades
+ records_to_upgrade = len(upgrades)
+ for check in upgrades:
+ # If the DelayUpgrade timer has expired, check on status
+ if isinstance(check[0],int) and \
+ check[0] < (int(time.time()) - (config["DelayUpgrade"]*60)):
+ logging.info("Updating upgraded node "+str(check[1]))
+ node_metrics=read_node_metrics(check[2],check[3])
+ node_metadata=read_node_metadata(check[2],check[3])
+ if node_metrics and node_metadata:
+ update_node_from_metrics(check[1],node_metrics,node_metadata)
+ records_to_upgrade-=1
+ old["UpgradingNodes"]=records_to_upgrade
+ # Are we already restarting a node
+ if old["RestartingNodes"]:
+ with S() as session:
+ restarts=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
+ .where(Node.status == RESTARTING)\
+ .order_by(Node.timestamp.asc())).all()
+ # Iterate through active restarts
+ records_to_restart = len(restarts)
+ for check in restarts:
+ # If the DelayStart timer has expired, check on status
+ if isinstance(check[0],int) and \
+ check[0] < (int(time.time()) - (config["DelayStart"]*60)):
+ logging.info("Updating restarted node "+str(check[1]))
+ node_metrics=read_node_metrics(check[2],check[3])
+ node_metadata=read_node_metadata(check[2],check[3])
+ if node_metrics and node_metadata:
+ update_node_from_metrics(check[1],node_metrics,node_metadata)
+ records_to_restart-=1
+ old["RestartingNodes"]=records_to_restart
+ return(old)
+
+ # Enable firewall for port
+ def enable_firewall(port,node):
+ logging.info("enable firewall port {0}/udp".format(port))
+ # Open the ufw firewall port
+ try:
+ subprocess.run(['sudo','ufw','allow',"{0}/udp".format(port),'comment',node], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('EF Error: %s', err)
+
+ # Disable firewall for port
+ def disable_firewall(port):
+ logging.info("disable firewall port {0}/udp".format(port))
+ # Close the ufw firewall port
+ try:
+ subprocess.run(['sudo','ufw','delete','allow',"{0}/udp".format(port)], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('DF ERROR: %s', err)
+
+ # Start a systemd node
+ def start_systemd_node(node):
+ logging.info("Starting node "+str(node.id))
+ # Try to start the service
+ try:
+ p = subprocess.run(['sudo', 'systemctl', 'start', node.service], stdout=subprocess.PIPE,stderr=subprocess.STDOUT).stdout.decode('utf-8')
+ if re.match(r'Failed to start',p):
+ logging.error('SSN2 ERROR: %s', p)
+ return False
+ except subprocess.CalledProcessError as err:
+ logging.error('SSN1 ERROR: %s', err)
+ return False
+ # Open a firewall hole for the data port
+ enable_firewall(node.port,node.service)
+ # Update node status
+ set_node_status(node.id,RESTARTING)
+ return True
+
+ # Stop a systemd node
+ def stop_systemd_node(node):
+ logging.info("Stopping node: "+node.service)
+ # Send a stop signal to the process
+ try:
+ subprocess.run(['sudo', 'systemctl', 'stop', node.service], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('SSN2 ERROR: %s', err)
+ disable_firewall(node.port)
+ set_node_status(node.id,STOPPED)
+
+ return True
+
+ # Upgrade a node
+ def upgrade_node(node,metrics):
+ logging.info("Upgrading node "+str(node.id))
+ # Copy current node binary
+ try:
+ subprocess.run(['sudo', 'cp', '-f', metrics["antnode"], node.binary])
+ except subprocess.CalledProcessError as err:
+ logging.error('UN1 ERROR: %s', err)
+ try:
+ subprocess.run(['sudo', 'systemctl', 'restart', node.service])
+ except subprocess.CalledProcessError as err:
+ logging.error('UN2 ERROR: %s', err)
+ version=get_antnode_version(node.binary)
+ try:
+ with S() as session:
+ session.query(Node).filter(Node.id == node.id).\
+ update({'status': UPGRADING, 'timestamp': int(time.time()),
+ 'version': metrics["AntNodeVersion"]})
+ session.commit()
+ except:
+ return False
+ else:
+ return True
+
+ # Remove a node
+ def remove_node(id):
+ logging.info("Removing node "+str(id))
+
+ with S() as session:
+ node = session.execute(select(Node).where(Node.id == id)).first()
+ # Grab Node from Row
+ node=node[0]
+ if stop_systemd_node(node):
+ # Mark this node as REMOVING
+ set_node_status(id,REMOVING)
+
+ nodename=f"antnode{node.nodename}"
+ # Remove node data and log
+ try:
+ subprocess.run(['sudo', 'rm', '-rf', node.root_dir, f"/var/log/antnode/{nodename}"])
+ except subprocess.CalledProcessError as err:
+ logging.error('RN1 ERROR: %s', err)
+ # Remove systemd service file
+ try:
+ subprocess.run(['sudo', 'rm', '-f', f"/etc/systemd/system/{node.service}"])
+ except subprocess.CalledProcessError as err:
+ logging.error('RN2 ERROR: %s', err)
+ # Tell system to reload systemd files
+ try:
+ subprocess.run(['sudo', 'systemctl', 'daemon-reload'])
+ except subprocess.CalledProcessError as err:
+ logging.error('RN3 ERROR: %s', err)
+ #print(json.dumps(node,indent=2))
+
+ # Rescan nodes for status
+ def update_nodes():
+ with S() as session:
+ nodes=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port,Node.status)\
+ .where(Node.status != DISABLED)\
+ .order_by(Node.timestamp.asc())).all()
+ # Iterate through all records
+ for check in nodes:
+ # Check on status
+ if isinstance(check[0],int):
+ logging.debug("Updating info on node "+str(check[1]))
+ node_metrics=read_node_metrics(check[2],check[3])
+ node_metadata=read_node_metadata(check[2],check[3])
+ if node_metrics and node_metadata:
+ # Don't write updates for stopped nodes that are already marked as stopped
+ if node_metadata["status"]==STOPPED and check[4]==STOPPED:
+ continue
+ update_node_from_metrics(check[1],node_metrics,node_metadata)
+
+ # Create a new node
+ def create_node(config,metrics):
+ logging.info("Creating new node")
+ # Create a holding place for the new node
+ card = {}
+ # Find the next available node number by first looking for holes
+ sql = text('select n1.id + 1 as id from node n1 ' + \
+ 'left join node n2 on n2.id = n1.id + 1 ' + \
+ 'where n2.id is null ' + \
+ 'and n1.id <> (select max(id) from node) ' + \
+ 'order by n1.id;')
+ with S() as session:
+ result = session.execute(sql).first()
+ if result:
+ card['id']=result[0]
+ # Otherwise get the max node number and add 1
+ else:
+ with S() as session:
+ result = session.execute(select(Node.id).order_by(Node.id.desc())).first()
+ card['id']=result[0]+1
+ # Set the node name
+ card['nodename']=f'{card['id']:04}'
+ card['service']=f'antnode{card['nodename']}.service'
+ card['user']='ant'
+ card['version']=metrics["AntNodeVersion"]
+ card['root_dir']=f"{config['NodeStorage']}/antnode{card['nodename']}"
+ card['binary']=f"{card['root_dir']}/antnode"
+ card['port']=config["PortStart"]*1000+card['id']
+ card['metrics_port']=13*1000+card['id']
+ card['network']='evm-arbitrum-one'
+ card['wallet']=config["RewardsAddress"]
+ card['peer_id']=''
+ card['status']=STOPPED
+ card['timestamp']=int(time.time())
+ card['records']=0
+ card['uptime']=0
+ card['shunned']=0
+ card['age']=card['timestamp']
+ card['host']=ANM_HOST
+ log_dir=f"/var/log/antnode/antnode{card['nodename']}"
+ # Create the node directory and log directory
+ try:
+ subprocess.run(['sudo','mkdir','-p',card["root_dir"],log_dir], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('CN1 ERROR: %s', err)
+ # Copy the binary to the node directory
+ try:
+ subprocess.run(['sudo','cp',metrics["antnode"],card["root_dir"]], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('CN2 ERROR: %s', err)
+ # Change owner of the node directory and log directories
+ try:
+ subprocess.run(['sudo','chown','-R',f'{card["user"]}:{card["user"]}',card["root_dir"],log_dir], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('CN3 ERROR: %s', err)
+ # Build the systemd service unit
+ service=f"""[Unit]
+ Description=antnode{card['nodename']}
+ [Service]
+ User={card['user']}
+ ExecStart={card['binary']} --bootstrap-cache-dir /var/antctl/bootstrap-cache --root-dir {card['root_dir']} --port {card['port']} --enable-metrics-server --metrics-server-port {card['metrics_port']} --log-output-dest {log_dir} --max-log-files 1 --max-archived-log-files 1 --rewards-address {card['wallet']} {card['network']}
+ Restart=always
+ #RestartSec=300
+ """
+ # Write the systemd service unit with sudo tee since we're not running as root
+ try:
+ subprocess.run(['sudo','tee',f'/etc/systemd/system/{card["service"]}'],input=service,text=True, stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('CN4 ERROR: %s', err)
+ # Reload systemd service files to get our new one
+ try:
+ subprocess.run(['sudo','systemctl','daemon-reload'], stdout=subprocess.PIPE)
+ except subprocess.CalledProcessError as err:
+ logging.error('CN5 ERROR: %s', err)
+ # Add the new node to the database
+ with S() as session:
+ session.execute(
+ insert(Node),[card]
+ )
+ session.commit()
+ # Now we grab the node object from the database to pass to start node
+ with S() as session:
+ card=session.execute(select(Node).where(Node.id == card['id'])).first()
+ # Get the Node object from the Row
+ card=card[0]
+ # Start the new node
+ return start_systemd_node(card)
+ #print(json.dumps(card,indent=2))
+ return True
+
+
+ # Make a decision about what to do
+ def choose_action(config,metrics,db_nodes):
+ # Gather knowledge
+ features={}
+ features["AllowCpu"]=metrics["UsedCpuPercent"] < config["CpuLessThan"]
+ features["AllowMem"]=metrics["UsedMemPercent"] < config["MemLessThan"]
+ features["AllowHD"]=metrics["UsedHDPercent"] < config["HDLessThan"]
+ features["RemCpu"]=metrics["UsedCpuPercent"] > config["CpuRemove"]
+ features["RemMem"]=metrics["UsedMemPercent"] > config["MemRemove"]
+ features["RemHD"]=metrics["UsedHDPercent"] > config["HDRemove"]
+ features["AllowNodeCap"]=metrics["RunningNodes"] < config["NodeCap"]
+ # These are new features, so ignore them if not configured
+ if (config["NetIOReadLessThan"]+config["NetIOReadRemove"]+
+ config["NetIOWriteLessThan"]+config["NetIOWriteRemove"]>1):
+ features["AllowNetIO"]=metrics["NetReadBytes"] < config["NetIOReadLessThan"] and \
+ metrics["NetWriteBytes"] < config["NetIOWriteLessThan"]
+ features["RemoveNetIO"]=metrics["NetReadBytes"] > config["NetIOReadRemove"] or \
+ metrics["NetWriteBytes"] > config["NetIOWriteRemove"]
+ else:
+ features["AllowNetIO"]=True
+ features["RemoveNetIO"]=False
+ if (config["HDIOReadLessThan"]+config["HDIOReadRemove"]+
+ config["HDIOWriteLessThan"]+config["HDIOWriteRemove"]>1):
+ features["AllowHDIO"]=metrics["HDReadBytes"] < config["HDIOReadLessThan"] and \
+ metrics["HDWriteBytes"] < config["HDIOWriteLessThan"]
+ features["RemoveHDIO"]=metrics["HDReadBytes"] > config["HDIOReadRemove"] or \
+ metrics["HDWriteBytes"] > config["HDIOWriteRemove"]
+ else:
+ features["AllowHDIO"]=True
+ features["RemoveHDIO"]=False
+ features["LoadAllow"] = metrics["LoadAverage1"] < config["DesiredLoadAverage"] and \
+ metrics["LoadAverage5"] < config["DesiredLoadAverage"] and \
+ metrics["LoadAverage15"] < config["DesiredLoadAverage"]
+ features["LoadNotAllow"] = metrics["LoadAverage1"] > config["MaxLoadAverageAllowed"] or \
+ metrics["LoadAverage5"] > config["MaxLoadAverageAllowed"] or \
+ metrics["LoadAverage15"] > config["MaxLoadAverageAllowed"]
+ # Check records for expired status
+ metrics=update_counters(metrics,config)
+ # If we have other things going on, don't add more nodes
+ features["AddNewNode"]=sum([ metrics.get(m, 0) \
+ for m in ['UpgradingNodes',
+ 'RestartingNodes','MigratingNodes',
+ 'RemovingNodes'] ]) == 0 and \
+ features["AllowCpu"] and features["AllowHD"] and \
+ features["AllowMem"] and features["AllowNodeCap"] and \
+ features["AllowHDIO"] and features["AllowNetIO"] and \
+ features["LoadAllow"]
+ # Are we over the limit on nodes
+ features["Remove"] =features["LoadNotAllow"] or features["RemCpu"] or \
+ features["RemHD"] or features["RemMem"] or \
+ features["RemoveHDIO"] or features["RemoveNetIO"] or \
+ metrics["TotalNodes"] > config["NodeCap"]
+ # If we have nodes to upgrade
+ if metrics["NodesToUpgrade"] >= 1:
+ # Make sure current version is equal or newer than version on first node.
+ if Version(metrics["AntNodeVersion"]) < Version(db_nodes[0][1]):
+ logging.warning("node upgrade cancelled due to lower version")
+ features["Upgrade"]=False
+ else:
+ if features["Remove"]:
+ logging.info("Can't upgrade while removing is required")
+ features["Upgrade"]=False
+ else:
+ features["Upgrade"]=True
+ else:
+ features["Upgrade"]=False
+
+
+ logging.info(json.dumps(features,indent=2))
+ ##### Decisions
+
+ # Actually, removing DEAD nodes takes priority
+ if metrics["DeadNodes"] > 1:
+ with S() as session:
+ broken=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
+ .where(Node.status == DEAD)\
+ .order_by(Node.timestamp.asc())).all()
+ # Iterate through dead nodes and remove them all
+ for check in broken:
+ # Remove broken nodes
+ logging.info("Removing dead node "+str(check[1]))
+ remove_node(check[1])
+ return {"status": "removed-dead-nodes"}
+ # If we have nodes with no version number, update from binary
+ if metrics["NodesNoVersion"] > 1:
+ with S() as session:
+ no_version=session.execute(select(Node.timestamp,Node.id,Node.binary)\
+ .where(Node.version == '')\
+ .order_by(Node.timestamp.asc())).all()
+ # Iterate through nodes with no version number
+ for check in no_version:
+ # Update version number from binary
+ version=get_antnode_version(check[2])
+ logging.info(f"Updating version number for node {check[1]} to {version}")
+ with S() as session:
+ session.query(Node).filter(Node.id == check[1]).\
+ update({'version': version})
+ session.commit()
+
+ # If we're restarting, wait patiently as metrics could be skewed
+ if metrics["RestartingNodes"]:
+ logging.info("Still waiting for RestartDelay")
+ return {"status": RESTARTING}
+ # If we still have unexpired upgrade records, wait
+ if metrics["UpgradingNodes"]:
+ logging.info("Still waiting for UpgradeDelay")
+ return {"status": UPGRADING}
+ # First, if we're removing, that takes top priority
+ if features["Remove"]:
+ # If we still have unexpired removal records, wait
+ if metrics["RemovingNodes"]:
+ logging.info("Still waiting for RemoveDelay")
+ return {"status": REMOVING}
+ # If we're under HD pressure or trimming node cap, remove nodes
+ if features["RemHD"] or metrics["TotalNodes"] > config["NodeCap"]:
+ # Start removing with stopped nodes
+ if metrics["StoppedNodes"] > 0:
+ # What is the youngest stopped node
+ with S() as session:
+ youngest=session.execute(select(Node.id)\
+ .where(Node.status == STOPPED)\
+ .order_by(Node.age.desc())).first()
+ if youngest:
+ # Remove the youngest node
+ remove_node(youngest[0])
+ return{"status": REMOVING}
+ # No low-hanging fruit. Let's start with the youngest running node
+ with S() as session:
+ youngest=session.execute(select(Node.id)\
+ .where(Node.status == RUNNING)\
+ .order_by(Node.age.desc())).first()
+ if youngest:
+ # Remove the youngest node
+ remove_node(youngest[0])
+ return{"status": REMOVING}
+ return{"status": "nothing-to-remove"}
+ # Otherwise, let's try just stopping a node to bring IO/Mem/Cpu down
+ else:
+ # If we just stopped a node, wait
+ if int(config["LastStoppedAt"] or 0) > (int(time.time()) - (config["DelayRemove"]*60)):
+ logging.info("Still waiting for RemoveDelay")
+ return {"status": 'waiting-to-stop'}
+ # Start with the youngest running node
+ with S() as session:
+ youngest=session.execute(select(Node)\
+ .where(Node.status == RUNNING)\
+ .order_by(Node.age.desc())).first()
+ if youngest:
+ # Stop the youngest node
+ stop_systemd_node(youngest[0])
+ # Update the last stopped time
+ with S() as session:
+ session.query(Machine).filter(Machine.id == 1).\
+ update({'LastStoppedAt': int(time.time())})
+ session.commit()
+ return{"status": STOPPED}
+ else:
+ return{"status": "nothing-to-stop"}
+
+ # Do we have upgrading to do?
+ if features["Upgrade"]:
+ # Let's find the oldest running node not using the current version
+ with S() as session:
+ oldest=session.execute(select(Node)\
+ .where(Node.status == RUNNING)\
+ .where(Node.version != metrics["AntNodeVersion"])
+ .order_by(Node.age.asc())).first()
+ if oldest:
+ # Get Node from Row
+ oldest = oldest[0]
+ # If we don't have a version number from metadata, grab from binary
+ if not oldest.version:
+ oldest.version=get_antnode_version(oldest.binary)
+ #print(json.dumps(oldest))
+ # Upgrade the oldest node
+ upgrade_node(oldest,metrics)
+ return{"status": UPGRADING}
+
+ # If AddNewNode
+ # If stopped nodes available
+ # Check oldest stopped version
+ # If out of date
+ # upgrade node which starts it
+ # else
+ # restart node
+ # else
+ # Create a Node which starts it
+ if features["AddNewNode"]:
+ # Start adding with stopped nodes
+ if metrics["StoppedNodes"] > 0:
+ # What is the oldest stopped node
+ with S() as session:
+ oldest=session.execute(select(Node)\
+ .where(Node.status == STOPPED)\
+ .order_by(Node.age.asc())).first()
+ if oldest:
+ # Get Node from Row
+ oldest=oldest[0]
+ # If we don't have a version number from metadata, grab from binary
+ if not oldest.version:
+ oldest.version=get_antnode_version(oldest.binary)
+ # If the stopped version is old, upgrade it
+ if Version(metrics["AntNodeVersion"]) > Version(oldest.version):
+ upgrade_node(oldest,metrics)
+ return{"status": UPGRADING}
+ else:
+ if start_systemd_node(oldest):
+ return{"status": RESTARTING}
+ else:
+ return{"status": "failed-start-node"}
+ # Hmm, still in Start mode, we shouldn't get here
+ return {"status": 'START'}
+ # Still in Add mode, add a new node
+ if metrics["TotalNodes"] < config["NodeCap"]:
+ if create_node(config,metrics):
+ return {"status": "ADD"}
+ else:
+ return {"status": "failed-create-node"}
+ else:
+ return {"status": "node-cap-reached"}
+ # If we have nothing to do, survey the node ports
+ update_nodes()
+ return{"status": "idle"}
+
+ def main():
+ # We're starting, so let's create a lock file
+ try:
+ with open('/var/antctl/wnm_active', 'w') as file:
+ file.write(str(int(time.time())))
+ except:
+ logging.error("Unable to create lock file, exiting")
+ sys.exit(1)
+
+ # See if we already have a known state in the database
+ with S() as session:
+ db_nodes=session.execute(select(Node.status,Node.version,
+ Node.host,Node.metrics_port,
+ Node.port,Node.age,Node.id,
+ Node.timestamp)).all()
+ anm_config=session.execute(select(Machine)).all()
+
+ if db_nodes:
+ # anm_config by default loads a parameter array,
+ # use the __json__ method to return a dict from the first row
+ anm_config = json.loads(json.dumps(anm_config[0][0])) or load_anm_config()
+ metrics=get_machine_metrics(anm_config["NodeStorage"],anm_config["HDRemove"])
+ #node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
+ #print(db_nodes[0])
+ #print(node_metrics)
+ #print(anm_config)
+ #print(json.dumps(anm_config,indent=4))
+ #print("Node: ",db_nodes)
+ logging.info("Found {counter} nodes migrated".format(counter=len(db_nodes)))
+
+ else:
+ anm_config = load_anm_config()
+ #print(anm_config)
+ Workers = survey_machine() or []
+
+ #""""
+ with S() as session:
+ session.execute(
+ insert(Node),Workers
+ )
+ session.commit()
+ #"""
+
+ with S() as session:
+ session.execute(
+ insert(Machine),[anm_config]
+ )
+ session.commit()
+
+ # Now load a subset of data to work with
+ with S() as session:
+ db_nodes=session.execute(select(Node.status,Node.version,
+ Node.host,Node.metrics_port,
+ Node.port,Node.age,Node.id,
+ Node.timestamp)).all()
+
+
+
+ #print(json.dumps(anm_config,indent=4))
+ logging.info("Found {counter} nodes configured".format(counter=len(db_nodes)))
+
+ #versions = [v[1] for worker in Workers if (v := worker.get('version'))]
+ #data = Counter(ver for ver in versions)
+
+
+ data = Counter(status[0] for status in db_nodes)
+ #print(data)
+ print("Running Nodes:",data[RUNNING])
+ print("Restarting Nodes:",data[RESTARTING])
+ print("Stopped Nodes:",data[STOPPED])
+ print("Upgrading Nodes:",data[UPGRADING])
+ print("Removing Nodes:",data[REMOVING])
+ data = Counter(ver[1] for ver in db_nodes)
+ print("Versions:",data)
+
+ machine_metrics = get_machine_metrics(anm_config['NodeStorage'],anm_config["HDRemove"])
+ print(json.dumps(anm_config,indent=2))
+ print(json.dumps(machine_metrics,indent=2))
+ this_action=choose_action(anm_config,machine_metrics,db_nodes)
+ print("Action:",json.dumps(this_action,indent=2))
+ # Remove lock file
+ os.remove("/var/antctl/wnm_active")
+
+ if __name__ == "__main__":
+ main()
+
+ print("End of program")
wnm/config.py ADDED
@@ -0,0 +1,3 @@
+ import os, sys
+ from dotenv import load_dotenv
+
wnm/models.py ADDED
@@ -0,0 +1,192 @@
+ import json_fix
+
+ # Turn a class into a storable object with ORM
+ from typing import Optional
+ from sqlalchemy import Integer, Unicode, UnicodeText, Float
+ from sqlalchemy import create_engine, select, insert, update
+ from sqlalchemy.orm import sessionmaker, scoped_session
+ from sqlalchemy.orm import DeclarativeBase
+ from sqlalchemy.orm import Mapped, mapped_column
+
+ # create a Base class bound to sqlalchemy
+ class Base(DeclarativeBase):
+ pass
+
+ # Extend the Base class to create our Host info
+ class Machine(Base):
+ __tablename__ = 'machine'
+ # No schema in sqlite3
+ # __table_args__ = {"schema": "colony"}
+ id: Mapped[int] = mapped_column(Integer, primary_key=True)
+ CpuCount: Mapped[int] = mapped_column(Integer)
+ NodeCap: Mapped[int] = mapped_column(Integer)
+ CpuLessThan: Mapped[int] = mapped_column(Integer)
+ CpuRemove: Mapped[int] = mapped_column(Integer)
+ MemLessThan: Mapped[int] = mapped_column(Integer)
+ MemRemove: Mapped[int] = mapped_column(Integer)
+ HDLessThan: Mapped[int] = mapped_column(Integer)
+ HDRemove: Mapped[int] = mapped_column(Integer)
+ DelayStart: Mapped[int] = mapped_column(Integer)
+ DelayUpgrade: Mapped[int] = mapped_column(Integer)
+ DelayRemove: Mapped[int] = mapped_column(Integer)
+ NodeStorage: Mapped[str] = mapped_column(UnicodeText)
+ RewardsAddress: Mapped[str] = mapped_column(UnicodeText)
+ DonateAddress: Mapped[str] = mapped_column(UnicodeText)
+ MaxLoadAverageAllowed: Mapped[float] = mapped_column(Float)
+ DesiredLoadAverage: Mapped[float] = mapped_column(Float)
+ # What port to begin assigning nodes
+ PortStart: Mapped[int] = mapped_column(Integer)
+ HDIOReadLessThan: Mapped[float] = mapped_column(Float)
+ HDIOReadRemove: Mapped[float] = mapped_column(Float)
+ HDIOWriteLessThan: Mapped[float] = mapped_column(Float)
+ HDIOWriteRemove: Mapped[float] = mapped_column(Float)
+ NetIOReadLessThan: Mapped[float] = mapped_column(Float)
+ NetIOReadRemove: Mapped[float] = mapped_column(Float)
+ NetIOWriteLessThan: Mapped[float] = mapped_column(Float)
+ NetIOWriteRemove: Mapped[float] = mapped_column(Float)
+ LastStoppedAt: Mapped[int] = mapped_column(Integer)
+
+ def __init__(self, CpuCount, NodeCap, CpuLessThan, CpuRemove,
+ MemLessThan, MemRemove, HDLessThan, HDRemove,
+ DelayStart, DelayUpgrade, DelayRemove, NodeStorage,
+ RewardsAddress, DonateAddress, MaxLoadAverageAllowed,
+ DesiredLoadAverage, PortStart, HDIOReadLessThan,
+ HDIOReadRemove, HDIOWriteLessThan, HDIOWriteRemove,
+ NetIOReadLessThan, NetIOReadRemove, NetIOWriteLessThan,
+ NetIOWriteRemove, LastStoppedAt):
+
+ self.CpuCount = CpuCount
+ self.NodeCap = NodeCap
+ self.CpuLessThan = CpuLessThan
+ self.CpuRemove = CpuRemove
+ self.MemLessThan = MemLessThan
+ self.MemRemove = MemRemove
+ self.HDLessThan = HDLessThan
+ self.HDRemove = HDRemove
+ self.DelayStart = DelayStart
+ self.DelayUpgrade = DelayUpgrade
+ self.DelayRemove = DelayRemove
+ self.NodeStorage = NodeStorage
+ self.RewardsAddress = RewardsAddress
+ self.DonateAddress = DonateAddress
+ self.MaxLoadAverageAllowed = MaxLoadAverageAllowed
+ self.DesiredLoadAverage = DesiredLoadAverage
+ self.PortStart = PortStart
+ self.HDIOReadLessThan = HDIOReadLessThan
+ self.HDIOReadRemove = HDIOReadRemove
+ self.HDIOWriteLessThan = HDIOWriteLessThan
+ self.HDIOWriteRemove = HDIOWriteRemove
+ self.NetIOReadLessThan = NetIOReadLessThan
+ self.NetIOReadRemove = NetIOReadRemove
+ self.NetIOWriteLessThan = NetIOWriteLessThan
+ self.NetIOWriteRemove = NetIOWriteRemove
+ self.LastStoppedAt = LastStoppedAt
+
+ def __repr__(self):
+ return f'Machine({self.CpuCount},{self.NodeCap},{self.CpuLessThan},{self.CpuRemove}' + \
+ f',{self.MemLessThan},{self.MemRemove},{self.HDLessThan}' + \
+ f',{self.HDRemove},{self.DelayStart},{self.DelayUpgrade}' + \
+ f',{self.DelayRemove}' + \
+ f',"{self.NodeStorage}","{self.RewardsAddress}","{self.DonateAddress}"' + \
+ f',{self.MaxLoadAverageAllowed},{self.DesiredLoadAverage}' + \
+ f',{self.PortStart},{self.HDIOReadLessThan},{self.HDIOReadRemove}' + \
+ f',{self.HDIOWriteLessThan},{self.HDIOWriteRemove}' + \
+ f',{self.NetIOReadLessThan},{self.NetIOReadRemove}' + \
+ f',{self.NetIOWriteLessThan},{self.NetIOWriteRemove}' + \
+ f',{self.LastStoppedAt})'
+
+ def __json__(self):
+ return { 'CpuCount': self.CpuCount, 'NodeCap': self.NodeCap, 'CpuLessThan': self.CpuLessThan,
+ 'CpuRemove': self.CpuRemove, 'MemLessThan': self.MemLessThan, 'MemRemove': self.MemRemove,
+ 'HDLessThan': self.HDLessThan, 'HDRemove': self.HDRemove, 'DelayStart': self.DelayStart,
+ 'DelayUpgrade': self.DelayUpgrade, 'DelayRemove': self.DelayRemove, 'NodeStorage': f"{self.NodeStorage}",
+ 'RewardsAddress': f"{self.RewardsAddress}", 'DonateAddress': f"{self.DonateAddress}",
+ 'MaxLoadAverageAllowed': self.MaxLoadAverageAllowed, 'DesiredLoadAverage': self.DesiredLoadAverage,
+ 'PortStart': self.PortStart, 'HDIOReadLessThan': self.HDIOReadLessThan, 'HDIOReadRemove': self.HDIOReadRemove,
+ 'HDIOWriteLessThan': self.HDIOWriteLessThan, 'HDIOWriteRemove': self.HDIOWriteRemove,
+ 'NetIOReadLessThan': self.NetIOReadLessThan, 'NetIOReadRemove': self.NetIOReadRemove,
+ 'NetIOWriteLessThan': self.NetIOWriteLessThan, 'NetIOWriteRemove': self.NetIOWriteRemove,
+ 'LastStoppedAt': self.LastStoppedAt }
+
+
+ # Extend the Base class to create our Node info
+ class Node(Base):
+ __tablename__ = 'node'
+ # No schema in sqlite3
+ #__table_args__ = {"schema": "colony"}
+ id: Mapped[int] = mapped_column(Integer, primary_key=True)
+ # Maps to antnode{nodename}
+ nodename: Mapped[str] = mapped_column(Unicode(10))
+ # service definition name
+ service: Mapped[str] = mapped_column(UnicodeText)
+ # User running node
+ user: Mapped[str] = mapped_column(Unicode(24))
+ # Full path to node binary
+ binary: Mapped[str] = mapped_column(UnicodeText)
+ # Last polled version of the binary
+ version: Mapped[Optional[str]] = mapped_column(UnicodeText)
+ # Root directory of the node
+ root_dir: Mapped[str] = mapped_column(UnicodeText)
+ # Node open port
+ port: Mapped[int] = mapped_column(Integer)
+ # Node metrics port
+ metrics_port: Mapped[int] = mapped_column(Integer)
+ # Network to use ( Live is evm-arbitrum-one )
+ network: Mapped[str] = mapped_column(UnicodeText)
+ # Rewards address
+ wallet: Mapped[Optional[str]] = mapped_column(Unicode(42),index=True)
+ # Reported peer_id
+ peer_id: Mapped[Optional[str]] = mapped_column(Unicode(52))
+ # Node's last probed status
+ status: Mapped[str] = mapped_column(Unicode(32),index=True)
+ # Timestamp of last update
+ timestamp: Mapped[int] = mapped_column(Integer,index=True)
+ # Number of node records stored as reported by node
+ records: Mapped[int] = mapped_column(Integer,index=True)
+ # Node reported uptime
+ uptime: Mapped[int] = mapped_column(Integer)
+ # Number of shuns
+ shunned: Mapped[int] = mapped_column(Integer)
+ # Timestamp of node first launch
+ age: Mapped[int] = mapped_column(Integer)
+ # Host ip/name for data and metrics ports
+ host: Mapped[Optional[str]] = mapped_column(UnicodeText)
+
+ def __init__(self, id, nodename, service, user, binary, version,
+ root_dir, port, metrics_port, network,
+ wallet, peer_id, status, timestamp, records,
+ uptime, shunned, age, host):
+ self.id = id
+ self.nodename = nodename
+ self.service = service
+ self.user = user
+ self.binary = binary
+ self.version = version
+ self.root_dir = root_dir
+ self.port = port
+ self.metrics_port = metrics_port
+ self.network = network
+ self.wallet = wallet
+ self.peer_id = peer_id
+ self.status = status
+ self.timestamp = timestamp
+ self.records = records
+ self.uptime = uptime
+ self.shunned = shunned
+ self.age = age
+ self.host = host
+
+ def __repr__(self):
+ return f'Node({self.id},"{self.nodename}","{self.service}","{self.user}","{self.binary}"'+\
+ f',"{self.version}","{self.root_dir}",{self.port},{self.metrics_port}' + \
+ f',"{self.network}","{self.wallet}","{self.peer_id}","{self.status}",{self.timestamp}' + \
+ f',{self.records},{self.uptime},{self.shunned},{self.age},"{self.host}")'
+
+ def __json__(self):
+ return { "id": self.id, "nodename": f"{self.nodename}", "service": f"{self.service}",
+ "user": f"{self.user}", "binary": f"{self.binary}", "version": f"{self.version}",
+ "root_dir": f"{self.root_dir}", "port": self.port, "metrics_port": self.metrics_port,
+ "network": f"{self.network}", "wallet": f"{self.wallet}", "peer_id": f"{self.peer_id}",
+ "status": f"{self.status}", "timestamp": self.timestamp, "records": self.records,
+ "uptime": self.uptime, "shunned": self.shunned, "age": self.age,
+ "host": f"{self.host}" }
wnm-0.0.4.dist-info/METADATA ADDED
@@ -0,0 +1,76 @@
+ Metadata-Version: 2.4
+ Name: wnm
+ Version: 0.0.4
+ Summary: Manager for Autonomi nodes
+ Author-email: Troy Johnson <troy@weave.sh>
+ License: GPL-3.0
+ Keywords: Autonomi,antnode,weave,xd7
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+ Classifier: Operating System :: POSIX :: Linux
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: System :: Distributed Computing
+ Requires-Python: >=3.12.3
+ Description-Content-Type: text/markdown
+ Requires-Dist: requests
+ Requires-Dist: packaging
+ Requires-Dist: sqlalchemy
+ Requires-Dist: alembic
+ Requires-Dist: json-fix
+ Requires-Dist: psutil
+ Provides-Extra: dev
+ Requires-Dist: black; extra == "dev"
+ Requires-Dist: isort; extra == "dev"
+ Requires-Dist: build; extra == "dev"
+ Requires-Dist: twine; extra == "dev"
+
+ # Weave Node Manager
+
+ ## Overview
+ Weave Node Manager (wnm) is a Python application designed to manage nodes for decentralized networks.
+
+ ## Features
+ - Update node metrics and statuses.
+ - Manage systemd services and the ufw firewall for Linux nodes.
+ - Support for configuration via YAML, JSON, or command-line parameters.
+
+ ## Installation
+ 1. Clone the repository:
+ ```
+ git clone https://github.com/iweave/weave-node-manager.git
+ ```
+ 2. Navigate to the project directory:
+ ```
+ cd weave-node-manager
+ ```
+ 3. Create a virtual environment:
+ ```
+ python3 -m venv .venv
+ ```
+ 4. Activate the virtual environment:
+ ```
+ . .venv/bin/activate
+ ```
+ 5. Install the required dependencies:
+ ```
+ pip install -r requirements.txt
+ ```
+
+ ## Configuration
+ Configuration can be done through a `.env` file, YAML, or JSON files. The application will prioritize these configurations over default values.
+
+ Upon finding an existing installation of [anm - aatonnomicc node manager](https://github.com/safenetforum-community/NTracking/tree/main/anm), wnm will disable anm and take over management of the cluster. The /var/antctl/config file is only read on first ingestion; configuration priority then moves to the `.env` file or a named configuration file.
+
+ ## Usage
+ To run the application, execute the following command:
+ ```
+ python -m wnm
+ ```
+
+ ## Contributing
+ Contributions are welcome! Please submit a pull request or open an issue for any enhancements or bug fixes.
+
+ ## License
+ This project is licensed under the GNU General Public License v3.0 (GPL-3.0). See the LICENSE file for more details.
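The Configuration section above leans on values ingested from anm's `/var/antctl/config` on the first run. For orientation (not part of the wheel), the fallbacks that `load_anm_config()` in `wnm/__main__.py` applies when a key is missing from that file are, in effect:

```python
# Defaults used by load_anm_config() when a key is absent from /var/antctl/config.
# RewardsAddress has no default; wnm exits if it cannot be detected.
ANM_DEFAULTS = {
    "NodeCap": 20,
    "CpuLessThan": 50, "CpuRemove": 70,
    "MemLessThan": 70, "MemRemove": 90,
    "HDLessThan": 70, "HDRemove": 90,
    "DelayStart": 5, "DelayUpgrade": 5, "DelayRestart": 10, "DelayRemove": 300,
    "NodeStorage": "/var/antctl/services",
    "PortStart": 55,  # parsed from /usr/bin/anms.sh when present
    "HDIOReadLessThan": 0.0, "HDIOReadRemove": 0.0,
    "HDIOWriteLessThan": 0.0, "HDIOWriteRemove": 0.0,
    "NetIOReadLessThan": 0.0, "NetIOReadRemove": 0.0,
    "NetIOWriteLessThan": 0.0, "NetIOWriteRemove": 0.0,
    "LastStoppedAt": 0,
    # MaxLoadAverageAllowed defaults to the CPU count,
    # DesiredLoadAverage to 0.6 * CPU count.
}
```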
wnm-0.0.4.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ wnm/__init__.py,sha256=3WMpwTP3eL5shixA3OMks-dTlTUzqN6pge2F4DQxEFo,91
+ wnm/__main__.py,sha256=IXboB7V4o80ye6N7xNAteoD_CALa4m49RlNduNLcifY,42631
+ wnm/config.py,sha256=nMJlzxZX7NA0iI_QAuJDE0uI3V_ROjP5v99wdKDfuDs,47
+ wnm/models.py,sha256=7lhZkUfFlnDgeEVRngQVuFnzSbJPwbytLMCynAo1j28,9416
+ wnm-0.0.4.dist-info/METADATA,sha256=1UvyYwEc5OyyiVU3_jq2lZZw5mqRB-GwL8VcDfDt44Y,2539
+ wnm-0.0.4.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+ wnm-0.0.4.dist-info/entry_points.txt,sha256=jfoemjoLVPeeiBMHKqAExrHQ4Rhf9IXxL4JCnS7ZYFo,42
+ wnm-0.0.4.dist-info/top_level.txt,sha256=E6dTE5k6efMEB9LaJAZSBu8zzs__l4R55t0-F-LwufI,4
+ wnm-0.0.4.dist-info/RECORD,,
wnm-0.0.4.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (77.0.3)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
wnm-0.0.4.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ wnm = wnm.__main__:main
wnm-0.0.4.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ wnm