wnm 0.0.8__py3-none-any.whl → 0.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wnm might be problematic. Click here for more details.
- wnm/__init__.py +1 -1
- wnm/__main__.py +206 -953
- wnm/actions.py +45 -0
- wnm/common.py +21 -0
- wnm/config.py +653 -1
- wnm/decision_engine.py +388 -0
- wnm/executor.py +1292 -0
- wnm/firewall/__init__.py +13 -0
- wnm/firewall/base.py +71 -0
- wnm/firewall/factory.py +95 -0
- wnm/firewall/null_firewall.py +71 -0
- wnm/firewall/ufw_manager.py +118 -0
- wnm/migration.py +42 -0
- wnm/models.py +389 -122
- wnm/process_managers/__init__.py +23 -0
- wnm/process_managers/base.py +203 -0
- wnm/process_managers/docker_manager.py +371 -0
- wnm/process_managers/factory.py +83 -0
- wnm/process_managers/launchd_manager.py +592 -0
- wnm/process_managers/setsid_manager.py +340 -0
- wnm/process_managers/systemd_manager.py +443 -0
- wnm/reports.py +286 -0
- wnm/utils.py +403 -0
- wnm-0.0.10.dist-info/METADATA +316 -0
- wnm-0.0.10.dist-info/RECORD +28 -0
- {wnm-0.0.8.dist-info → wnm-0.0.10.dist-info}/WHEEL +1 -1
- wnm-0.0.8.dist-info/METADATA +0 -93
- wnm-0.0.8.dist-info/RECORD +0 -9
- {wnm-0.0.8.dist-info → wnm-0.0.10.dist-info}/entry_points.txt +0 -0
- {wnm-0.0.8.dist-info → wnm-0.0.10.dist-info}/top_level.txt +0 -0
wnm/__main__.py
CHANGED
|
@@ -1,983 +1,236 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
import
|
|
8
|
-
|
|
9
|
-
from wnm.
|
|
10
|
-
|
|
11
|
-
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import insert, select
|
|
8
|
+
|
|
9
|
+
from wnm.config import (
|
|
10
|
+
LOCK_FILE,
|
|
11
|
+
S,
|
|
12
|
+
apply_config_updates,
|
|
13
|
+
config_updates,
|
|
14
|
+
machine_config,
|
|
15
|
+
options,
|
|
16
|
+
)
|
|
17
|
+
from wnm.decision_engine import DecisionEngine
|
|
18
|
+
from wnm.executor import ActionExecutor
|
|
19
|
+
from wnm.migration import survey_machine
|
|
20
|
+
from wnm.models import Node
|
|
21
|
+
from wnm.utils import (
|
|
22
|
+
get_antnode_version,
|
|
23
|
+
get_machine_metrics,
|
|
24
|
+
update_counters,
|
|
25
|
+
)
|
|
12
26
|
|
|
13
27
|
logging.basicConfig(level=logging.INFO)
|
|
14
|
-
#Info level logging for sqlalchemy is too verbose, only use when needed
|
|
15
|
-
logging.getLogger(
|
|
16
|
-
|
|
17
|
-
# import .env
|
|
18
|
-
basedir = os.path.abspath(os.path.dirname(__file__))
|
|
19
|
-
load_dotenv(os.path.join(basedir, '.env'))
|
|
20
|
-
|
|
21
|
-
# simulate arg/yaml configuration
|
|
22
|
-
config = {}
|
|
23
|
-
config['db']='sqlite:///colony.db'
|
|
24
|
-
config['DonateAddress'] = os.getenv('DonateAddress') or '0x00455d78f850b0358E8cea5be24d415E01E107CF'
|
|
25
|
-
config['ANMHost'] = os.getenv('ANMHost') or '127.0.0.1'
|
|
26
|
-
config['CrisisBytes'] = os.getenv('CrisisBytes') or 2 * 10 ** 9 # default 2gb/node
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# Setup Database engine
|
|
30
|
-
engine = create_engine(config["db"], echo=True)
|
|
31
|
-
|
|
32
|
-
# Generate ORM
|
|
33
|
-
Base.metadata.create_all(engine)
|
|
34
|
-
|
|
35
|
-
# Create a connection to the ORM
|
|
36
|
-
session_factory = sessionmaker(bind=engine)
|
|
37
|
-
S = scoped_session(session_factory)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# if WNM_CONFIG or -c parameter are set, check for existing config
|
|
41
|
-
# else:
|
|
42
|
-
|
|
43
|
-
# Primary node for want of one
|
|
44
|
-
QUEEN=1
|
|
45
|
-
|
|
46
|
-
# Donation address
|
|
47
|
-
DONATE=config["DonateAddress"]
|
|
48
|
-
#Keep these as strings so they can be grepped in logs
|
|
49
|
-
STOPPED="STOPPED" #0 Node is not responding to it's metrics port
|
|
50
|
-
RUNNING="RUNNING" #1 Node is responding to it's metrics port
|
|
51
|
-
UPGRADING="UPGRADING" #2 Upgrade in progress
|
|
52
|
-
DISABLED="DISABLED" #-1 Do not start
|
|
53
|
-
RESTARTING="RESTARTING" #3 re/starting a server intionally
|
|
54
|
-
MIGRATING="MIGRATING" #4 Moving volumes in progress
|
|
55
|
-
REMOVING="REMOVING" #5 Removing node in progress
|
|
56
|
-
DEAD="DEAD" #-86 Broken node to cleanup
|
|
57
|
-
|
|
58
|
-
ANM_HOST=config["ANMHost"]
|
|
59
|
-
# Baseline bytes per node
|
|
60
|
-
CRISIS_BYTES=config["CrisisBytes"]
|
|
28
|
+
# Info level logging for sqlalchemy is too verbose, only use when needed
|
|
29
|
+
logging.getLogger("sqlalchemy.engine.Engine").disabled = True
|
|
30
|
+
|
|
61
31
|
|
|
62
32
|
# A storage place for ant node data
|
|
63
|
-
Workers=[]
|
|
64
|
-
|
|
65
|
-
# Detect ANM (but don't upgrade)
|
|
66
|
-
if os.path.exists("/var/antctl/system"):
|
|
67
|
-
# Is anm scheduled to run
|
|
68
|
-
if os.path.exists("/etc/cron.d/anm"):
|
|
69
|
-
# remove cron to disable old anm
|
|
70
|
-
try:
|
|
71
|
-
subprocess.run(['sudo','rm', '/etc/cron.d/anm'])
|
|
72
|
-
except Exception as error:
|
|
73
|
-
template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
74
|
-
message = template.format(type(error).__name__, error.args)
|
|
75
|
-
logging.info(message)
|
|
76
|
-
sys.exit(1)
|
|
77
|
-
os.remove("/etc/cron.d/anm")
|
|
78
|
-
# Is anm sitll running? We'll wait
|
|
79
|
-
if os.path.exists("/var/antctl/block"):
|
|
80
|
-
logging.info("anm still running, waiting...")
|
|
81
|
-
sys.exit(1)
|
|
33
|
+
Workers = []
|
|
82
34
|
|
|
83
|
-
#
|
|
84
|
-
if os.path.exists("/var/antctl/wnm_active"):
|
|
85
|
-
logging.info("wnm still running")
|
|
86
|
-
sys.exit(1)
|
|
35
|
+
# Detect ANM
|
|
87
36
|
|
|
88
|
-
# Get anm configuration
|
|
89
|
-
def load_anm_config():
|
|
90
|
-
anm_config = {}
|
|
91
|
-
|
|
92
|
-
# Let's get the real count of CPU's available to this process
|
|
93
|
-
anm_config["CpuCount"] = len(os.sched_getaffinity(0))
|
|
94
|
-
|
|
95
|
-
# What can we save from /var/antctl/config
|
|
96
|
-
if os.path.exists("/var/antctl/config"):
|
|
97
|
-
load_dotenv("/var/antctl/config")
|
|
98
|
-
anm_config["NodeCap"] = int(os.getenv('NodeCap') or 20)
|
|
99
|
-
anm_config["CpuLessThan"] = int(os.getenv('CpuLessThan') or 50)
|
|
100
|
-
anm_config["CpuRemove"] = int(os.getenv('CpuRemove') or 70)
|
|
101
|
-
anm_config["MemLessThan"] = int(os.getenv('MemLessThan') or 70)
|
|
102
|
-
anm_config["MemRemove"] = int(os.getenv('MemRemove') or 90)
|
|
103
|
-
anm_config["HDLessThan"] = int(os.getenv('HDLessThan') or 70)
|
|
104
|
-
anm_config["HDRemove"] = int(os.getenv('HDRemove') or 90)
|
|
105
|
-
anm_config["DelayStart"] = int(os.getenv('DelayStart') or 5)
|
|
106
|
-
anm_config["DelayUpgrade"] = int(os.getenv('DelayUpgrade') or 5)
|
|
107
|
-
anm_config["DelayRestart"] = int(os.getenv('DelayRestart') or 10)
|
|
108
|
-
anm_config["DelayRemove"] = int(os.getenv('DelayRemove') or 300)
|
|
109
|
-
anm_config["NodeStorage"] = os.getenv('NodeStorage') or "/var/antctl/services"
|
|
110
|
-
# Default to the faucet donation address
|
|
111
|
-
try:
|
|
112
|
-
anm_config["RewardsAddress"] = re.findall(r"--rewards-address ([\dA-Fa-fXx]+)",os.getenv('RewardsAddress'))[0]
|
|
113
|
-
except:
|
|
114
|
-
try:
|
|
115
|
-
anm_config["RewardsAddress"] = re.findall(r"([\dA-Fa-fXx]+)",os.getenv("RewardsAddress"))[0]
|
|
116
|
-
except:
|
|
117
|
-
logging.warning("Unable to detect RewardsAddress")
|
|
118
|
-
sys.exit(1)
|
|
119
|
-
anm_config["DonateAddress"]=os.getenv("DonateAddress") or DONATE
|
|
120
|
-
anm_config["MaxLoadAverageAllowed"]=float(os.getenv("MaxLoadAverageAllowed") or anm_config["CpuCount"])
|
|
121
|
-
anm_config["DesiredLoadAverage"]=float(os.getenv("DesiredLoadAverage") or (anm_config["CpuCount"] * .6))
|
|
122
37
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
#
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
details={}
|
|
147
|
-
try:
|
|
148
|
-
with open('/etc/systemd/system/'+antnode, 'r') as file:
|
|
149
|
-
data = file.read()
|
|
150
|
-
details['id']=int(re.findall(r"antnode(\d+)",antnode)[0])
|
|
151
|
-
details['binary']=re.findall(r"ExecStart=([^ ]+)",data)[0]
|
|
152
|
-
details["user"]=re.findall(r"User=(\w+)",data)[0]
|
|
153
|
-
details["root_dir"]=re.findall(r"--root-dir ([\w\/]+)",data)[0]
|
|
154
|
-
details["port"]=int(re.findall(r"--port (\d+)",data)[0])
|
|
155
|
-
details["metrics_port"]=int(re.findall(r"--metrics-server-port (\d+)",data)[0])
|
|
156
|
-
details["wallet"]=re.findall(r"--rewards-address ([^ ]+)",data)[0]
|
|
157
|
-
details["network"]=re.findall(r"--rewards-address [^ ]+ ([\w\-]+)",data)[0]
|
|
158
|
-
except:
|
|
159
|
-
pass
|
|
160
|
-
|
|
161
|
-
return details
|
|
162
|
-
|
|
163
|
-
# Read data from metadata endpoint
|
|
164
|
-
def read_node_metadata(host,port):
|
|
165
|
-
# Only return version number when we have one, to stop clobbering the binary check
|
|
166
|
-
try:
|
|
167
|
-
url = "http://{0}:{1}/metadata".format(host,port)
|
|
168
|
-
response = requests.get(url)
|
|
169
|
-
data=response.text
|
|
170
|
-
except requests.exceptions.ConnectionError:
|
|
171
|
-
logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
|
|
172
|
-
return {"status": STOPPED, "peer_id":""}
|
|
173
|
-
except Exception as error:
|
|
174
|
-
template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
175
|
-
message = template.format(type(error).__name__, error.args)
|
|
176
|
-
logging.info(message)
|
|
177
|
-
return {"status": STOPPED, "peer_id":""}
|
|
178
|
-
# collect a dict to return
|
|
179
|
-
card={}
|
|
180
|
-
try:
|
|
181
|
-
card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}',data)[0]
|
|
182
|
-
except:
|
|
183
|
-
logging.info('No version found')
|
|
184
|
-
try:
|
|
185
|
-
card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}',data)[0]
|
|
186
|
-
except:
|
|
187
|
-
card["peer_id"] = ""
|
|
188
|
-
card["status"] = RUNNING if "version" in card else STOPPED
|
|
189
|
-
return card
|
|
190
|
-
|
|
191
|
-
# Read data from metrics port
|
|
192
|
-
def read_node_metrics(host,port):
|
|
193
|
-
metrics={}
|
|
194
|
-
try:
|
|
195
|
-
url = "http://{0}:{1}/metrics".format(host,port)
|
|
196
|
-
response = requests.get(url)
|
|
197
|
-
metrics["status"] = RUNNING
|
|
198
|
-
metrics["uptime"] = int((re.findall(r'ant_node_uptime ([\d]+)',response.text) or [0])[0])
|
|
199
|
-
metrics["records"] = int((re.findall(r'ant_networking_records_stored ([\d]+)',response.text) or [0])[0])
|
|
200
|
-
metrics["shunned"] = int((re.findall(r'ant_networking_shunned_by_close_group ([\d]+)',response.text) or [0])[0])
|
|
201
|
-
except requests.exceptions.ConnectionError:
|
|
202
|
-
logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
|
|
203
|
-
metrics["status"] = STOPPED
|
|
204
|
-
metrics["uptime"] = 0
|
|
205
|
-
metrics["records"] = 0
|
|
206
|
-
metrics["shunned"] = 0
|
|
207
|
-
except Exception as error:
|
|
208
|
-
template = "in:RNM - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
209
|
-
message = template.format(type(error).__name__, error.args)
|
|
210
|
-
logging.info(message)
|
|
211
|
-
metrics["status"] = STOPPED
|
|
212
|
-
metrics["uptime"] = 0
|
|
213
|
-
metrics["records"] = 0
|
|
214
|
-
metrics["shunned"] = 0
|
|
215
|
-
return metrics
|
|
216
|
-
|
|
217
|
-
# Read antnode binary version
|
|
218
|
-
def get_antnode_version(binary):
|
|
219
|
-
try:
|
|
220
|
-
data = subprocess.run([binary, '--version'], stdout=subprocess.PIPE).stdout.decode('utf-8')
|
|
221
|
-
return re.findall(r'Autonomi Node v([\d\.]+)',data)[0]
|
|
222
|
-
except Exception as error:
|
|
223
|
-
template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
224
|
-
message = template.format(type(error).__name__, error.args)
|
|
225
|
-
logging.info(message)
|
|
226
|
-
return 0
|
|
227
|
-
|
|
228
|
-
# Determine how long this node has been around by looking at it's secret_key file
|
|
229
|
-
def get_node_age(root_dir):
|
|
230
|
-
try:
|
|
231
|
-
return int(os.stat("{0}/secret-key".format(root_dir)).st_mtime)
|
|
232
|
-
except:
|
|
233
|
-
return 0
|
|
234
|
-
|
|
235
|
-
# Survey nodes by reading metadata from metrics ports or binary --version
|
|
236
|
-
def survey_anm_nodes(antnodes):
|
|
237
|
-
# Build a list of node dictionaries to return
|
|
238
|
-
details=[]
|
|
239
|
-
# Iterate on nodes
|
|
240
|
-
for node in antnodes:
|
|
241
|
-
# Initialize a dict
|
|
242
|
-
logging.debug("{0} surveying node {1} ".format(time.strftime("%Y-%m-%d %H:%M"),node))
|
|
243
|
-
if not re.findall(r"antnode([\d]+).service",node):
|
|
244
|
-
logging.info("can't decode "+str(node))
|
|
245
|
-
continue
|
|
246
|
-
card={"nodename":re.findall(r"antnode([\d]+).service",node)[0],
|
|
247
|
-
"service": node,
|
|
248
|
-
"timestamp": int(time.time()),
|
|
249
|
-
"host": ANM_HOST or '127.0.0.1'
|
|
250
|
-
}
|
|
251
|
-
# Load what systemd has configured
|
|
252
|
-
card.update(read_systemd_service(node))
|
|
253
|
-
#print(json.dumps(card,indent=2))
|
|
254
|
-
# Read metadata from metrics_port
|
|
255
|
-
metadata = read_node_metadata(card["host"],card["metrics_port"])
|
|
256
|
-
#print(json.dumps(metadata,indent=2))
|
|
257
|
-
if isinstance(metadata,dict) and \
|
|
258
|
-
"status" in metadata and \
|
|
259
|
-
metadata["status"]==RUNNING:
|
|
260
|
-
# soak up metadata
|
|
261
|
-
card.update(metadata)
|
|
262
|
-
# The ports up, so grab metrics too
|
|
263
|
-
card.update(read_node_metrics(card["host"],card["metrics_port"]))
|
|
264
|
-
# Else run binary to get version
|
|
265
|
-
else:
|
|
266
|
-
# If the root directory of the node is missing, it's a bad node
|
|
267
|
-
if not os.path.isdir(card["root_dir"]):
|
|
268
|
-
card["status"]=DEAD
|
|
269
|
-
card["version"]=''
|
|
270
|
-
else:
|
|
271
|
-
card["status"]=STOPPED
|
|
272
|
-
card["version"]=get_antnode_version(card["binary"])
|
|
273
|
-
card["peer_id"]=''
|
|
274
|
-
card["records"]=0
|
|
275
|
-
card["uptime"]=0
|
|
276
|
-
card["shunned"]=0
|
|
277
|
-
card["age"]=get_node_age(card["root_dir"])
|
|
278
|
-
# harcoded for anm
|
|
279
|
-
card["host"]=ANM_HOST
|
|
280
|
-
# Append the node dict to the detail list
|
|
281
|
-
details.append(card)
|
|
282
|
-
|
|
283
|
-
return details
|
|
284
|
-
|
|
285
|
-
# Survey server instance
|
|
286
|
-
def survey_machine():
|
|
287
|
-
# Make a bucket
|
|
288
|
-
antnodes=[]
|
|
289
|
-
# For all service files
|
|
290
|
-
for file in os.listdir("/etc/systemd/system"):
|
|
291
|
-
# Find antnodes
|
|
292
|
-
if re.match(r'antnode[\d]+\.service',file):
|
|
293
|
-
antnodes.append(file)
|
|
294
|
-
#if len(antnodes)>=5:
|
|
295
|
-
# break
|
|
296
|
-
# Iterate over defined nodes and get details
|
|
297
|
-
# Ingests a list of service files and outputs a list of dictionaries
|
|
298
|
-
return survey_anm_nodes(antnodes)
|
|
299
|
-
|
|
300
|
-
# Read system status
|
|
301
|
-
def get_machine_metrics(node_storage,remove_limit):
|
|
302
|
-
metrics = {}
|
|
303
|
-
|
|
304
|
-
with S() as session:
|
|
305
|
-
db_nodes=session.execute(select(Node.status,Node.version)).all()
|
|
306
|
-
|
|
307
|
-
# Get some initial stats for comparing after a few seconds
|
|
308
|
-
# We start these counters AFTER reading the database
|
|
309
|
-
start_time=time.time()
|
|
310
|
-
start_disk_counters=psutil.disk_io_counters()
|
|
311
|
-
start_net_counters=psutil.net_io_counters()
|
|
312
|
-
|
|
313
|
-
metrics["TotalNodes"]=len(db_nodes)
|
|
314
|
-
data = Counter(node[0] for node in db_nodes)
|
|
315
|
-
metrics["RunningNodes"] = data[RUNNING]
|
|
316
|
-
metrics["StoppedNodes"] = data[STOPPED]
|
|
317
|
-
metrics["RestartingNodes"] = data[RESTARTING]
|
|
318
|
-
metrics["UpgradingNodes"] = data[UPGRADING]
|
|
319
|
-
metrics["MigratingNodes"] = data[MIGRATING]
|
|
320
|
-
metrics["RemovingNodes"] = data[REMOVING]
|
|
321
|
-
metrics["DeadNodes"] = data[DEAD]
|
|
322
|
-
metrics["antnode"]=shutil.which("antnode")
|
|
323
|
-
if not metrics["antnode"]:
|
|
324
|
-
logging.warning("Unable to locate current antnode binary, exiting")
|
|
325
|
-
sys.exit(1)
|
|
326
|
-
metrics["AntNodeVersion"]=get_antnode_version(metrics["antnode"])
|
|
327
|
-
metrics["NodesLatestV"]=sum(1 for node in db_nodes if node[1]==metrics["AntNodeVersion"]) or 0
|
|
328
|
-
metrics["NodesNoVersion"]=sum(1 for node in db_nodes if not node[1]) or 0
|
|
329
|
-
metrics["NodesToUpgrade"]=metrics["TotalNodes"] - metrics["NodesLatestV"] - metrics["NodesNoVersion"]
|
|
330
|
-
|
|
331
|
-
# Windows has to build load average over 5 seconds. The first 5 seconds returns 0's
|
|
332
|
-
# I don't plan on supporting windows, but if this get's modular, I don't want this
|
|
333
|
-
# issue to be skipped
|
|
334
|
-
#if platform.system() == "Windows":
|
|
335
|
-
# discard=psutil.getloadavg()
|
|
336
|
-
# time.sleep(5)
|
|
337
|
-
metrics["LoadAverage1"],metrics["LoadAverage5"],metrics["LoadAverage15"]=psutil.getloadavg()
|
|
338
|
-
# Get CPU Metrics over 1 second
|
|
339
|
-
metrics["IdleCpuPercent"],metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
|
|
340
|
-
# Really we returned Idle percent, subtract from 100 to get used.
|
|
341
|
-
metrics["UsedCpuPercent"] = 100 - metrics["IdleCpuPercent"]
|
|
342
|
-
data=psutil.virtual_memory()
|
|
343
|
-
#print(data)
|
|
344
|
-
metrics["UsedMemPercent"]=data.percent
|
|
345
|
-
metrics["FreeMemPercent"]=100-metrics["UsedMemPercent"]
|
|
346
|
-
data=psutil.disk_io_counters()
|
|
347
|
-
# This only checks the drive mapped to the first node and will need to be updated
|
|
348
|
-
# when we eventually support multiple drives
|
|
349
|
-
data=psutil.disk_usage(node_storage)
|
|
350
|
-
metrics["UsedHDPercent"]=data.percent
|
|
351
|
-
metrics["TotalHDBytes"]=data.total
|
|
352
|
-
end_time=time.time()
|
|
353
|
-
end_disk_counters=psutil.disk_io_counters()
|
|
354
|
-
end_net_counters=psutil.net_io_counters()
|
|
355
|
-
metrics["HDWriteBytes"]=int((end_disk_counters.write_bytes-start_disk_counters.write_bytes)/(end_time-start_time))
|
|
356
|
-
metrics["HDReadBytes"]=int((end_disk_counters.read_bytes-start_disk_counters.read_bytes)/(end_time-start_time))
|
|
357
|
-
metrics["NetWriteBytes"]=int((end_net_counters.bytes_sent-start_net_counters.bytes_sent)/(end_time-start_time))
|
|
358
|
-
metrics["NetReadBytes"]=int((end_net_counters.bytes_recv-start_net_counters.bytes_recv)/(end_time-start_time))
|
|
359
|
-
#print (json.dumps(metrics,indent=2))
|
|
360
|
-
# How close (out of 100) to removal limit will we be with a max bytes per node (2GB default)
|
|
361
|
-
# For running nodes with Porpoise(tm).
|
|
362
|
-
metrics["NodeHDCrisis"]=int((((metrics["TotalNodes"])*CRISIS_BYTES)/(metrics["TotalHDBytes"]*(remove_limit/100)))*100)
|
|
363
|
-
return metrics
|
|
364
|
-
|
|
365
|
-
# Update node with metrics result
|
|
366
|
-
def update_node_from_metrics(id,metrics,metadata):
|
|
367
|
-
try:
|
|
368
|
-
# We check the binary version in other code, so lets stop clobbering it when a node is stopped
|
|
369
|
-
card={'status': metrics["status"], 'timestamp': int(time.time()),
|
|
370
|
-
'uptime': metrics["uptime"], 'records': metrics["records"],
|
|
371
|
-
'shunned': metrics["shunned"],
|
|
372
|
-
'peer_id': metadata["peer_id"]}
|
|
373
|
-
if "version" in metadata:
|
|
374
|
-
card['version']=metadata["version"]
|
|
375
|
-
with S() as session:
|
|
376
|
-
session.query(Node).filter(Node.id == id).\
|
|
377
|
-
update(card)
|
|
378
|
-
session.commit()
|
|
379
|
-
except Exception as error:
|
|
380
|
-
template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
381
|
-
message = template.format(type(error).__name__, error.args)
|
|
382
|
-
logging.warning(message)
|
|
383
|
-
return False
|
|
384
|
-
else:
|
|
385
|
-
return True
|
|
386
|
-
|
|
387
|
-
# Set Node status
|
|
388
|
-
def set_node_status(id,status):
|
|
389
|
-
logging.info("Setting node status: {0} {1}".format(id,status))
|
|
390
|
-
try:
|
|
391
|
-
with S() as session:
|
|
392
|
-
session.query(Node).filter(Node.id == id).\
|
|
393
|
-
update({'status': status, 'timestamp': int(time.time())})
|
|
394
|
-
session.commit()
|
|
395
|
-
except:
|
|
396
|
-
return False
|
|
397
|
-
else:
|
|
398
|
-
return True
|
|
399
|
-
|
|
400
|
-
# Update metrics after checking counters
|
|
401
|
-
def update_counters(old,config):
|
|
402
|
-
# Are we already removing a node
|
|
403
|
-
if old["RemovingNodes"]:
|
|
404
|
-
with S() as session:
|
|
405
|
-
removals=session.execute(select(Node.timestamp,Node.id)\
|
|
406
|
-
.where(Node.status == REMOVING)\
|
|
407
|
-
.order_by(Node.timestamp.asc())).all()
|
|
408
|
-
# Iterate through active removals
|
|
409
|
-
records_to_remove = len(removals)
|
|
410
|
-
for check in removals:
|
|
411
|
-
# If the DelayRemove timer has expired, delete the entry
|
|
412
|
-
if isinstance(check[0],int) and \
|
|
413
|
-
check[0] < (int(time.time()) - (config["DelayRemove"]*60)):
|
|
414
|
-
logging.info("Deleting removed node "+str(check[1]))
|
|
415
|
-
with S() as session:
|
|
416
|
-
session.execute(delete(Node).where(Node.id==check[1]))
|
|
417
|
-
session.commit()
|
|
418
|
-
records_to_remove-=1
|
|
419
|
-
old["RemovingNodes"]=records_to_remove
|
|
420
|
-
# Are we already upgrading a node
|
|
421
|
-
if old["UpgradingNodes"]:
|
|
422
|
-
with S() as session:
|
|
423
|
-
upgrades=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
|
|
424
|
-
.where(Node.status == UPGRADING)\
|
|
425
|
-
.order_by(Node.timestamp.asc())).all()
|
|
426
|
-
# Iterate through active upgrades
|
|
427
|
-
records_to_upgrade = len(upgrades)
|
|
428
|
-
for check in upgrades:
|
|
429
|
-
# If the DelayUpgrade timer has expired, check on status
|
|
430
|
-
if isinstance(check[0],int) and \
|
|
431
|
-
check[0] < (int(time.time()) - (config["DelayUpgrade"]*60)):
|
|
432
|
-
logging.info("Updating upgraded node "+str(check[1]))
|
|
433
|
-
node_metrics=read_node_metrics(check[2],check[3])
|
|
434
|
-
node_metadata=read_node_metadata(check[2],check[3])
|
|
435
|
-
if node_metrics and node_metadata:
|
|
436
|
-
update_node_from_metrics(check[1],node_metrics,node_metadata)
|
|
437
|
-
records_to_upgrade-=1
|
|
438
|
-
old["UpgradingNodes"]=records_to_upgrade
|
|
439
|
-
# Are we already restarting a node
|
|
440
|
-
if old["RestartingNodes"]:
|
|
441
|
-
with S() as session:
|
|
442
|
-
restarts=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
|
|
443
|
-
.where(Node.status == RESTARTING)\
|
|
444
|
-
.order_by(Node.timestamp.asc())).all()
|
|
445
|
-
# Iterate through active upgrades
|
|
446
|
-
records_to_restart = len(restarts)
|
|
447
|
-
for check in restarts:
|
|
448
|
-
# If the DelayUpgrade timer has expired, check on status
|
|
449
|
-
if isinstance(check[0],int) and \
|
|
450
|
-
check[0] < (int(time.time()) - (config["DelayStart"]*60)):
|
|
451
|
-
logging.info("Updating restarted node "+str(check[1]))
|
|
452
|
-
node_metrics=read_node_metrics(check[2],check[3])
|
|
453
|
-
node_metadata=read_node_metadata(check[2],check[3])
|
|
454
|
-
if node_metrics and node_metadata:
|
|
455
|
-
update_node_from_metrics(check[1],node_metrics,node_metadata)
|
|
456
|
-
records_to_restart-=1
|
|
457
|
-
old["RestartingNodes"]=records_to_restart
|
|
458
|
-
return(old)
|
|
459
|
-
|
|
460
|
-
# Enable firewall for port
|
|
461
|
-
def enable_firewall(port,node):
|
|
462
|
-
logging.info("enable firewall port {0}/udp".format(port))
|
|
463
|
-
# Close ufw firewall
|
|
464
|
-
try:
|
|
465
|
-
subprocess.run(['sudo','ufw','allow',"{0}/udp".format(port),'comment',node], stdout=subprocess.PIPE)
|
|
466
|
-
except subprocess.CalledProcessError as err:
|
|
467
|
-
logging.error( 'EF Error:', err )
|
|
468
|
-
|
|
469
|
-
# Disable firewall for port
|
|
470
|
-
def disable_firewall(port):
|
|
471
|
-
logging.info("disable firewall port {0}/udp".format(port))
|
|
472
|
-
# Close ufw firewall
|
|
473
|
-
try:
|
|
474
|
-
subprocess.run(['sudo','ufw','delete','allow',"{0}/udp".format(port)], stdout=subprocess.PIPE)
|
|
475
|
-
except subprocess.CalledProcessError as err:
|
|
476
|
-
logging.error( 'DF ERROR:', err )
|
|
477
|
-
|
|
478
|
-
# Start a systemd node
|
|
479
|
-
def start_systemd_node(node):
|
|
480
|
-
logging.info("Starting node "+str(node.id))
|
|
481
|
-
# Try to start the service
|
|
482
|
-
try:
|
|
483
|
-
p = subprocess.run(['sudo', 'systemctl', 'start', node.service], stdout=subprocess.PIPE,stderr=subprocess.STDOUT).stdout.decode('utf-8')
|
|
484
|
-
if re.match(r'Failed to start',p):
|
|
485
|
-
logging.error( 'SSN2 ERROR:', p )
|
|
486
|
-
return False
|
|
487
|
-
except subprocess.CalledProcessError as err:
|
|
488
|
-
logging.error( 'SSN1 ERROR:', err )
|
|
489
|
-
return False
|
|
490
|
-
# Open a firewall hole for the data port
|
|
491
|
-
enable_firewall(node.port,node.service)
|
|
492
|
-
# Update node status
|
|
493
|
-
set_node_status(node.id,RESTARTING)
|
|
494
|
-
return True
|
|
495
|
-
|
|
496
|
-
# Stop a systemd node
|
|
497
|
-
def stop_systemd_node(node):
|
|
498
|
-
logging.info("Stopping node: "+node.service)
|
|
499
|
-
# Send a stop signal to the process
|
|
500
|
-
try:
|
|
501
|
-
subprocess.run(['sudo', 'systemctl', 'stop', node.service], stdout=subprocess.PIPE)
|
|
502
|
-
except subprocess.CalledProcessError as err:
|
|
503
|
-
logging.error( 'SSN2 ERROR:', err )
|
|
504
|
-
disable_firewall(node.port)
|
|
505
|
-
set_node_status(node.id,STOPPED)
|
|
506
|
-
|
|
507
|
-
return True
|
|
508
|
-
|
|
509
|
-
# Upgrade a node
|
|
510
|
-
def upgrade_node(node,metrics):
|
|
511
|
-
logging.info("Upgrading node "+str(node.id))
|
|
512
|
-
# Copy current node binary
|
|
513
|
-
try:
|
|
514
|
-
subprocess.run(['sudo', 'cp', '-f', metrics["antnode"], node.binary])
|
|
515
|
-
except subprocess.CalledProcessError as err:
|
|
516
|
-
logging.error( 'UN1 ERROR:', err )
|
|
517
|
-
try:
|
|
518
|
-
subprocess.run(['sudo', 'systemctl', 'restart', node.service])
|
|
519
|
-
except subprocess.CalledProcessError as err:
|
|
520
|
-
logging.error( 'UN2 ERROR:', err )
|
|
521
|
-
version=get_antnode_version(node.binary)
|
|
522
|
-
try:
|
|
523
|
-
with S() as session:
|
|
524
|
-
session.query(Node).filter(Node.id == node.id).\
|
|
525
|
-
update({'status': UPGRADING, 'timestamp': int(time.time()),
|
|
526
|
-
'version': metrics["AntNodeVersion"]})
|
|
527
|
-
session.commit()
|
|
528
|
-
except:
|
|
529
|
-
return False
|
|
530
|
-
else:
|
|
531
|
-
return True
|
|
532
|
-
|
|
533
|
-
# Remove a node
|
|
534
|
-
def remove_node(id):
|
|
535
|
-
logging.info("Removing node "+str(id))
|
|
536
|
-
|
|
537
|
-
with S() as session:
|
|
538
|
-
node = session.execute(select(Node).where(Node.id == id)).first()
|
|
539
|
-
# Grab Node from Row
|
|
540
|
-
node=node[0]
|
|
541
|
-
if stop_systemd_node(node):
|
|
542
|
-
# Mark this node as REMOVING
|
|
543
|
-
set_node_status(id,REMOVING)
|
|
544
|
-
|
|
545
|
-
nodename=f"antnode{node.nodename}"
|
|
546
|
-
# Remove node data and log
|
|
547
|
-
try:
|
|
548
|
-
subprocess.run(['sudo', 'rm', '-rf', node.root_dir, f"/var/log/antnode/{nodename}"])
|
|
549
|
-
except subprocess.CalledProcessError as err:
|
|
550
|
-
logging.error( 'RN1 ERROR:', err )
|
|
551
|
-
# Remove systemd service file
|
|
552
|
-
try:
|
|
553
|
-
subprocess.run(['sudo', 'rm', '-f', f"/etc/systemd/system/{node.service}"])
|
|
554
|
-
except subprocess.CalledProcessError as err:
|
|
555
|
-
logging.error( 'RN2 ERROR:', err )
|
|
556
|
-
# Tell system to reload systemd files
|
|
557
|
-
try:
|
|
558
|
-
subprocess.run(['sudo', 'systemctl', 'daemon-reload'])
|
|
559
|
-
except subprocess.CalledProcessError as err:
|
|
560
|
-
logging.error( 'RN3 ERROR:', err )
|
|
561
|
-
#print(json.dumps(node,indent=2))
|
|
562
|
-
|
|
563
|
-
# Rescan nodes for status
|
|
564
|
-
def update_nodes():
|
|
565
|
-
with S() as session:
|
|
566
|
-
nodes=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port,Node.status)\
|
|
567
|
-
.where(Node.status != DISABLED)\
|
|
568
|
-
.order_by(Node.timestamp.asc())).all()
|
|
569
|
-
# Iterate through all records
|
|
570
|
-
for check in nodes:
|
|
571
|
-
# Check on status
|
|
572
|
-
if isinstance(check[0],int):
|
|
573
|
-
logging.debug("Updating info on node "+str(check[1]))
|
|
574
|
-
node_metrics=read_node_metrics(check[2],check[3])
|
|
575
|
-
node_metadata=read_node_metadata(check[2],check[3])
|
|
576
|
-
if node_metrics and node_metadata:
|
|
577
|
-
# Don't write updates for stopped nodes that are already marked as stopped
|
|
578
|
-
if node_metadata["status"]==STOPPED and check[4]==STOPPED:
|
|
579
|
-
continue
|
|
580
|
-
update_node_from_metrics(check[1],node_metrics,node_metadata)
|
|
581
|
-
|
|
582
|
-
# Create a new node
|
|
583
|
-
def create_node(config,metrics):
|
|
584
|
-
logging.info("Creating new node")
|
|
585
|
-
# Create a holding place for the new node
|
|
586
|
-
card = {}
|
|
587
|
-
# Find the next available node number by first looking for holes
|
|
588
|
-
sql = text('select n1.id + 1 as id from node n1 ' + \
|
|
589
|
-
'left join node n2 on n2.id = n1.id + 1 ' + \
|
|
590
|
-
'where n2.id is null ' + \
|
|
591
|
-
'and n1.id <> (select max(id) from node) ' + \
|
|
592
|
-
'order by n1.id;')
|
|
593
|
-
with S() as session:
|
|
594
|
-
result = session.execute(sql).first()
|
|
595
|
-
if result:
|
|
596
|
-
card['id']=result[0]
|
|
597
|
-
# Otherwise get the max node number and add 1
|
|
598
|
-
else:
|
|
599
|
-
with S() as session:
|
|
600
|
-
result = session.execute(select(Node.id).order_by(Node.id.desc())).first()
|
|
601
|
-
card['id']=result[0]+1
|
|
602
|
-
# Set the node name
|
|
603
|
-
card['nodename']=f'{card['id']:04}'
|
|
604
|
-
card['service']=f'antnode{card['nodename']}.service'
|
|
605
|
-
card['user']='ant'
|
|
606
|
-
card['version']=metrics["AntNodeVersion"]
|
|
607
|
-
card['root_dir']=f"{config['NodeStorage']}/antnode{card['nodename']}"
|
|
608
|
-
card['binary']=f"{card['root_dir']}/antnode"
|
|
609
|
-
card['port']=config["PortStart"]*1000+card['id']
|
|
610
|
-
card['metrics_port']=13*1000+card['id']
|
|
611
|
-
card['network']='evm-arbitrum-one'
|
|
612
|
-
card['wallet']=config["RewardsAddress"]
|
|
613
|
-
card['peer_id']=''
|
|
614
|
-
card['status']=STOPPED
|
|
615
|
-
card['timestamp']=int(time.time())
|
|
616
|
-
card['records']=0
|
|
617
|
-
card['uptime']=0
|
|
618
|
-
card['shunned']=0
|
|
619
|
-
card['age']=card['timestamp']
|
|
620
|
-
card['host']=ANM_HOST
|
|
621
|
-
log_dir=f"/var/log/antnode/antnode{card['nodename']}"
|
|
622
|
-
# Create the node directory and log directory
|
|
623
|
-
try:
|
|
624
|
-
subprocess.run(['sudo','mkdir','-p',card["root_dir"],log_dir], stdout=subprocess.PIPE)
|
|
625
|
-
except subprocess.CalledProcessError as err:
|
|
626
|
-
logging.error( 'CN1 ERROR:', err )
|
|
627
|
-
# Copy the binary to the node directory
|
|
628
|
-
try:
|
|
629
|
-
subprocess.run(['sudo','cp',metrics["antnode"],card["root_dir"]], stdout=subprocess.PIPE)
|
|
630
|
-
except subprocess.CalledProcessError as err:
|
|
631
|
-
logging.error( 'CN2 ERROR:', err )
|
|
632
|
-
# Change owner of the node directory and log directories
|
|
633
|
-
try:
|
|
634
|
-
subprocess.run(['sudo','chown','-R',f'{card["user"]}:{card["user"]}',card["root_dir"],log_dir], stdout=subprocess.PIPE)
|
|
635
|
-
except subprocess.CalledProcessError as err:
|
|
636
|
-
logging.error( 'CN3 ERROR:', err )
|
|
637
|
-
# build the systemd service unit
|
|
638
|
-
service=f"""[Unit]
|
|
639
|
-
Description=antnode{card['nodename']}
|
|
640
|
-
[Service]
|
|
641
|
-
User={card['user']}
|
|
642
|
-
ExecStart={card['binary']} --bootstrap-cache-dir /var/antctl/bootstrap-cache --root-dir {card['root_dir']} --port {card['port']} --enable-metrics-server --metrics-server-port {card['metrics_port']} --log-output-dest {log_dir} --max-log-files 1 --max-archived-log-files 1 --rewards-address {card['wallet']} {card['network']}
|
|
643
|
-
Restart=always
|
|
644
|
-
#RestartSec=300
|
|
645
|
-
"""
|
|
646
|
-
# Write the systemd service unit with sudo tee since we're running as not root
|
|
647
|
-
try:
|
|
648
|
-
subprocess.run(['sudo','tee',f'/etc/systemd/system/{card["service"]}'],input=service,text=True, stdout=subprocess.PIPE)
|
|
649
|
-
except subprocess.CalledProcessError as err:
|
|
650
|
-
logging.error( 'CN4 ERROR:', err )
|
|
651
|
-
# Reload systemd service files to get our new one
|
|
652
|
-
try:
|
|
653
|
-
subprocess.run(['sudo','systemctl','daemon-reload'], stdout=subprocess.PIPE)
|
|
654
|
-
except subprocess.CalledProcessError as err:
|
|
655
|
-
logging.error( 'CN5 ERROR:', err )
|
|
656
|
-
# Add the new node to the database
|
|
657
|
-
with S() as session:
|
|
658
|
-
session.execute(
|
|
659
|
-
insert(Node),[card]
|
|
660
|
-
)
|
|
661
|
-
session.commit()
|
|
662
|
-
# Now we grab the node object from the database to pass to start node
|
|
663
|
-
with S() as session:
|
|
664
|
-
card=session.execute(select(Node).where(Node.id == card['id'])).first()
|
|
665
|
-
# Get the Node object from the Row
|
|
666
|
-
card=card[0]
|
|
667
|
-
# Start the new node
|
|
668
|
-
return start_systemd_node(card)
|
|
669
|
-
#print(json.dumps(card,indent=2))
|
|
670
|
-
return True
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
# Make a decision about what to do
|
|
674
|
-
def choose_action(config,metrics,db_nodes):
|
|
675
|
-
# Gather knowlege
|
|
676
|
-
features={}
|
|
677
|
-
features["AllowCpu"]=metrics["UsedCpuPercent"] < config["CpuLessThan"]
|
|
678
|
-
features["AllowMem"]=metrics["UsedMemPercent"] < config["MemLessThan"]
|
|
679
|
-
features["AllowHD"]=metrics["UsedHDPercent"] < config["HDLessThan"]
|
|
680
|
-
features["RemCpu"]=metrics["UsedCpuPercent"] > config["CpuRemove"]
|
|
681
|
-
features["RemMem"]=metrics["UsedMemPercent"] > config["MemRemove"]
|
|
682
|
-
features["RemHD"]=metrics["UsedHDPercent"] > config["HDRemove"]
|
|
683
|
-
features["AllowNodeCap"]=metrics["RunningNodes"] < config["NodeCap"]
|
|
684
|
-
# These are new features, so ignore them if not configured
|
|
685
|
-
if (config["NetIOReadLessThan"]+config["NetIOReadRemove"]+
|
|
686
|
-
config["NetIOWriteLessThan"]+config["NetIOWriteRemove"]>1):
|
|
687
|
-
features["AllowNetIO"]=metrics["NetReadBytes"] < config["NetIOReadLessThan"] and \
|
|
688
|
-
metrics["NetWriteBytes"] < config["NetIOWriteLessThan"]
|
|
689
|
-
features["RemoveNetIO"]=metrics["NetReadBytes"] > config["NetIORemove"] or \
|
|
690
|
-
metrics["NetWriteBytes"] > config["NetIORemove"]
|
|
691
|
-
else:
|
|
692
|
-
features["AllowNetIO"]=True
|
|
693
|
-
features["RemoveNetIO"]=False
|
|
694
|
-
if (config["HDIOReadLessThan"]+config["HDIOReadRemove"]+
|
|
695
|
-
config["HDIOWriteLessThan"]+config["HDIOWriteRemove"]>1):
|
|
696
|
-
features["AllowHDIO"]=metrics["HDReadBytes"] < config["HDIOReadLessThan"] and \
|
|
697
|
-
metrics["HDWriteBytes"] < config["HDIOWriteLessThan"]
|
|
698
|
-
features["RemoveHDIO"]=metrics["HDReadBytes"] > config["HDIORemove"] or \
|
|
699
|
-
metrics["HDWriteBytes"] > config["HDtIORemove"]
|
|
700
|
-
else:
|
|
701
|
-
features["AllowHDIO"]=True
|
|
702
|
-
features["RemoveHDIO"]=False
|
|
703
|
-
features["LoadAllow"] = metrics["LoadAverage1"] < config["DesiredLoadAverage"] and \
|
|
704
|
-
metrics["LoadAverage5"] < config["DesiredLoadAverage"] and \
|
|
705
|
-
metrics["LoadAverage15"] < config["DesiredLoadAverage"]
|
|
706
|
-
features["LoadNotAllow"] = metrics["LoadAverage1"] > config["MaxLoadAverageAllowed"] or \
|
|
707
|
-
metrics["LoadAverage5"] > config["MaxLoadAverageAllowed"] or \
|
|
708
|
-
metrics["LoadAverage15"] > config["MaxLoadAverageAllowed"]
|
|
709
|
-
# Check records for expired status
|
|
710
|
-
metrics=update_counters(metrics,config)
|
|
711
|
-
# If we have other thing going on, don't add more nodes
|
|
712
|
-
features["AddNewNode"]=sum([ metrics.get(m, 0) \
|
|
713
|
-
for m in ['UpgradingNodes',
|
|
714
|
-
'RestartingNodes','MigratingNodes',
|
|
715
|
-
'RemovingNodes'] ]) == 0 and \
|
|
716
|
-
features["AllowCpu"] and features["AllowHD"] and \
|
|
717
|
-
features["AllowMem"] and features["AllowNodeCap"] and \
|
|
718
|
-
features["AllowHDIO"] and features["AllowNetIO"] and \
|
|
719
|
-
features["LoadAllow"]
|
|
720
|
-
# Are we overlimit on nodes
|
|
721
|
-
features["Remove"] =features["LoadNotAllow"] or features["RemCpu"] or \
|
|
722
|
-
features["RemHD"] or features["RemMem"] or \
|
|
723
|
-
features["RemoveHDIO"] or features["RemoveNetIO"] or \
|
|
724
|
-
metrics["TotalNodes"] > config["NodeCap"]
|
|
725
|
-
# If we have nodes to upgrade
|
|
726
|
-
if metrics["NodesToUpgrade"] >= 1:
|
|
727
|
-
# Make sure current version is equal or newer than version on first node.
|
|
728
|
-
if Version(metrics["AntNodeVersion"]) < Version(db_nodes[0][1]):
|
|
729
|
-
logging.warning("node upgrade cancelled due to lower version")
|
|
730
|
-
features["Upgrade"]=False
|
|
731
|
-
else:
|
|
732
|
-
if features["Remove"]:
|
|
733
|
-
logging.info("Can't upgrade while removing is required")
|
|
734
|
-
features["Upgrade"]=False
|
|
735
|
-
else:
|
|
736
|
-
features["Upgrade"]=True
|
|
737
|
-
else:
|
|
738
|
-
features["Upgrade"]=False
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
logging.info(json.dumps(features,indent=2))
|
|
742
|
-
##### Decisions
|
|
743
|
-
|
|
744
|
-
# Actually, removing DEAD nodes take priority
|
|
745
|
-
if metrics["DeadNodes"] > 1:
|
|
746
|
-
with S() as session:
|
|
747
|
-
broken=session.execute(select(Node.timestamp,Node.id,Node.host,Node.metrics_port)\
|
|
748
|
-
.where(Node.status == DEAD)\
|
|
749
|
-
.order_by(Node.timestamp.asc())).all()
|
|
750
|
-
# Iterate through dead nodes and remove them all
|
|
751
|
-
for check in broken:
|
|
752
|
-
# Remove broken nodes
|
|
753
|
-
logging.info("Removing dead node "+str(check[1]))
|
|
754
|
-
remove_node(check[1])
|
|
755
|
-
return {"status": "removed-dead-nodes"}
|
|
756
|
-
# If we have nodes with no version number, update from binary
|
|
757
|
-
if metrics["NodesNoVersion"] > 1:
|
|
758
|
-
with S() as session:
|
|
759
|
-
no_version=session.execute(select(Node.timestamp,Node.id,Node.binary)\
|
|
760
|
-
.where(Node.version == '')\
|
|
761
|
-
.order_by(Node.timestamp.asc())).all()
|
|
762
|
-
# Iterate through nodes with no version number
|
|
763
|
-
for check in no_version:
|
|
764
|
-
# Update version number from binary
|
|
765
|
-
version=get_antnode_version(check[2])
|
|
766
|
-
logging.info(f"Updating version number for node {check[1]} to {version}")
|
|
767
|
-
with S() as session:
|
|
768
|
-
session.query(Node).filter(Node.id == check[1]).\
|
|
769
|
-
update({'version': version})
|
|
770
|
-
session.commit()
|
|
771
|
-
|
|
772
|
-
# If we're restarting, wait patiently as metrics could be skewed
|
|
773
|
-
if metrics["RestartingNodes"]:
|
|
774
|
-
logging.info("Still waiting for RestartDelay")
|
|
775
|
-
return {"status": RESTARTING}
|
|
776
|
-
# If we still have unexpired upgrade records, wait
|
|
777
|
-
if metrics["UpgradingNodes"]:
|
|
778
|
-
logging.info("Still waiting for UpgradeDelay")
|
|
779
|
-
return {"status": UPGRADING}
|
|
780
|
-
# First if we're removing, that takes top priority
|
|
781
|
-
if features["Remove"]:
|
|
782
|
-
# If we still have unexpired removal records, wait
|
|
783
|
-
if metrics["RemovingNodes"]:
|
|
784
|
-
logging.info("Still waiting for RemoveDelay")
|
|
785
|
-
return {"status": REMOVING}
|
|
786
|
-
# If we're under HD pressure or trimming node cap, remove nodes
|
|
787
|
-
if features["RemHD"] or metrics["TotalNodes"] > config["NodeCap"]:
|
|
788
|
-
# Start removing with stopped nodes
|
|
789
|
-
if metrics["StoppedNodes"] > 0:
|
|
790
|
-
# What is the youngest stopped node
|
|
791
|
-
with S() as session:
|
|
792
|
-
youngest=session.execute(select(Node.id)\
|
|
793
|
-
.where(Node.status == STOPPED)\
|
|
794
|
-
.order_by(Node.age.desc())).first()
|
|
795
|
-
if youngest:
|
|
796
|
-
# Remove the youngest node
|
|
797
|
-
remove_node(youngest[0])
|
|
798
|
-
return{"status": REMOVING}
|
|
799
|
-
# No low hanging fruit. let's start with the youngest running node
|
|
800
|
-
with S() as session:
|
|
801
|
-
youngest=session.execute(select(Node.id)\
|
|
802
|
-
.where(Node.status == RUNNING)\
|
|
803
|
-
.order_by(Node.age.desc())).first()
|
|
804
|
-
if youngest:
|
|
805
|
-
# Remove the youngest node
|
|
806
|
-
remove_node(youngest[0])
|
|
807
|
-
return{"status": REMOVING}
|
|
808
|
-
return{"status": "nothing-to-remove"}
|
|
809
|
-
# Otherwise, let's try just stopping a node to bring IO/Mem/Cpu down
|
|
38
|
+
# Make a decision about what to do (new implementation using DecisionEngine)
|
|
39
|
+
def choose_action(machine_config, metrics, dry_run):
|
|
40
|
+
"""Plan and execute actions using DecisionEngine and ActionExecutor.
|
|
41
|
+
|
|
42
|
+
This function now acts as a thin wrapper around the new decision engine
|
|
43
|
+
and action executor classes.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
machine_config: Machine configuration dictionary
|
|
47
|
+
metrics: Current system metrics
|
|
48
|
+
dry_run: If True, log actions without executing
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Dictionary with execution status
|
|
52
|
+
"""
|
|
53
|
+
# Check records for expired status (must be done before planning)
|
|
54
|
+
if not dry_run:
|
|
55
|
+
metrics = update_counters(S, metrics, machine_config)
|
|
56
|
+
|
|
57
|
+
# Handle nodes with no version number (done before planning)
|
|
58
|
+
if metrics["nodes_no_version"] > 0:
|
|
59
|
+
if dry_run:
|
|
60
|
+
logging.warning("DRYRUN: Update NoVersion nodes")
|
|
810
61
|
else:
|
|
811
|
-
# If we just stopped a node, wait
|
|
812
|
-
if int(config["LastStoppedAt"] or 0) > (int(time.time()) - (config["DelayRemove"]*60)):
|
|
813
|
-
logging.info("Still waiting for RemoveDelay")
|
|
814
|
-
return {"status": 'waiting-to-stop'}
|
|
815
|
-
# Start with the youngest running node
|
|
816
62
|
with S() as session:
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
63
|
+
no_version = session.execute(
|
|
64
|
+
select(Node.timestamp, Node.id, Node.binary)
|
|
65
|
+
.where(Node.version == "")
|
|
66
|
+
.order_by(Node.timestamp.asc())
|
|
67
|
+
).all()
|
|
68
|
+
# Iterate through nodes with no version number
|
|
69
|
+
for check in no_version:
|
|
70
|
+
# Update version number from binary
|
|
71
|
+
version = get_antnode_version(check[2])
|
|
72
|
+
logging.info(
|
|
73
|
+
f"Updating version number for node {check[1]} to {version}"
|
|
74
|
+
)
|
|
824
75
|
with S() as session:
|
|
825
|
-
session.query(
|
|
826
|
-
|
|
76
|
+
session.query(Node).filter(Node.id == check[1]).update(
|
|
77
|
+
{"version": version}
|
|
78
|
+
)
|
|
827
79
|
session.commit()
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
oldest = oldest[0]
|
|
843
|
-
# If we don't have a version number from metadata, grab from binary
|
|
844
|
-
if not oldest.version:
|
|
845
|
-
oldest.version=get_antnode_version(oldest.binary)
|
|
846
|
-
#print(json.dumps(oldest))
|
|
847
|
-
# Upgrade the oldest node
|
|
848
|
-
upgrade_node(oldest,metrics)
|
|
849
|
-
return{"status": UPGRADING}
|
|
850
|
-
|
|
851
|
-
# If AddNewNode
|
|
852
|
-
# If stopped nodes available
|
|
853
|
-
# Check oldest stopped version
|
|
854
|
-
# If out of date
|
|
855
|
-
# upgrade node which starts it
|
|
856
|
-
# else
|
|
857
|
-
# restart node
|
|
858
|
-
# else
|
|
859
|
-
# Create a Node which starts it
|
|
860
|
-
if features["AddNewNode"]:
|
|
861
|
-
# Start adding with stopped nodes
|
|
862
|
-
if metrics["StoppedNodes"] > 0:
|
|
863
|
-
# What is the oldest stopped node
|
|
864
|
-
with S() as session:
|
|
865
|
-
oldest=session.execute(select(Node)\
|
|
866
|
-
.where(Node.status == STOPPED)\
|
|
867
|
-
.order_by(Node.age.asc())).first()
|
|
868
|
-
if oldest:
|
|
869
|
-
# Get Node from Row
|
|
870
|
-
oldest=oldest[0]
|
|
871
|
-
# If we don't have a version number from metadata, grab from binary
|
|
872
|
-
if not oldest.version:
|
|
873
|
-
oldest.version=get_antnode_version(oldest.binary)
|
|
874
|
-
# If the stopped version is old, upgrade it
|
|
875
|
-
if Version(metrics["AntNodeVersion"]) > Version(oldest.version):
|
|
876
|
-
upgrade_node(oldest,metrics)
|
|
877
|
-
return{"status": UPGRADING}
|
|
878
|
-
else:
|
|
879
|
-
if start_systemd_node(oldest):
|
|
880
|
-
return{"status": RESTARTING}
|
|
881
|
-
else:
|
|
882
|
-
return{"status": "failed-start-node"}
|
|
883
|
-
# Hmm, still in Start mode, we shouldn't get here
|
|
884
|
-
return {"status": 'START'}
|
|
885
|
-
# Still in Add mode, add a new node
|
|
886
|
-
if metrics["TotalNodes"] < config["NodeCap"]:
|
|
887
|
-
if create_node(config,metrics):
|
|
888
|
-
return {"status": "ADD"}
|
|
889
|
-
else:
|
|
890
|
-
return {"status": "failed-create-node"}
|
|
891
|
-
else:
|
|
892
|
-
return {"status": "node-cap-reached"}
|
|
893
|
-
# If we have nothing to do, Survey the node ports
|
|
894
|
-
update_nodes()
|
|
895
|
-
return{"status": "idle"}
|
|
80
|
+
|
|
81
|
+
# Use the new DecisionEngine to plan actions
|
|
82
|
+
engine = DecisionEngine(machine_config, metrics)
|
|
83
|
+
actions = engine.plan_actions()
|
|
84
|
+
|
|
85
|
+
# Log the computed features for debugging
|
|
86
|
+
logging.info(json.dumps(engine.get_features(), indent=2))
|
|
87
|
+
|
|
88
|
+
# Use ActionExecutor to execute the planned actions
|
|
89
|
+
executor = ActionExecutor(S)
|
|
90
|
+
result = executor.execute(actions, machine_config, metrics, dry_run)
|
|
91
|
+
|
|
92
|
+
return result
|
|
93
|
+
|
|
896
94
|
|
|
897
95
|
def main():
|
|
96
|
+
|
|
97
|
+
# Are we already running
|
|
98
|
+
if os.path.exists(LOCK_FILE):
|
|
99
|
+
logging.warning("wnm still running")
|
|
100
|
+
sys.exit(1)
|
|
101
|
+
|
|
898
102
|
# We're starting, so lets create a lock file
|
|
899
103
|
try:
|
|
900
|
-
with open(
|
|
104
|
+
with open(LOCK_FILE, "w") as file:
|
|
901
105
|
file.write(str(int(time.time())))
|
|
902
|
-
except:
|
|
903
|
-
logging.error("Unable to create lock file
|
|
106
|
+
except (PermissionError, OSError) as e:
|
|
107
|
+
logging.error(f"Unable to create lock file: {e}")
|
|
904
108
|
sys.exit(1)
|
|
905
109
|
|
|
906
|
-
#
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
Node.host,Node.metrics_port,
|
|
910
|
-
Node.port,Node.age,Node.id,
|
|
911
|
-
Node.timestamp)).all()
|
|
912
|
-
anm_config=session.execute(select(Machine)).all()
|
|
913
|
-
|
|
914
|
-
if db_nodes:
|
|
915
|
-
# anm_config by default loads a parameter array,
|
|
916
|
-
# use the __json__ method to return a dict from the first node
|
|
917
|
-
anm_config = json.loads(json.dumps(anm_config[0][0])) or load_anm_config()
|
|
918
|
-
metrics=get_machine_metrics(anm_config["NodeStorage"],anm_config["HDRemove"])
|
|
919
|
-
#node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
|
|
920
|
-
#print(db_nodes[0])
|
|
921
|
-
#print(node_metrics)
|
|
922
|
-
#print(anm_config)
|
|
923
|
-
#print(json.dumps(anm_config,indent=4))
|
|
924
|
-
#print("Node: ",db_nodes)
|
|
925
|
-
logging.info("Found {counter} nodes migrated".format(counter=len(db_nodes)))
|
|
926
|
-
|
|
110
|
+
# Config should have loaded the machine_config
|
|
111
|
+
if machine_config:
|
|
112
|
+
logging.info("Machine: " + json.dumps(machine_config))
|
|
927
113
|
else:
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
114
|
+
logging.error("Unable to load machine config, exiting")
|
|
115
|
+
sys.exit(1)
|
|
116
|
+
# Check for config updates
|
|
117
|
+
if config_updates:
|
|
118
|
+
logging.info("Update: " + json.dumps(config_updates))
|
|
119
|
+
if options.dry_run:
|
|
120
|
+
logging.warning("Dry run, not saving requested updates")
|
|
121
|
+
# Create a dictionary for the machine config
|
|
122
|
+
# Machine by default returns a parameter array,
|
|
123
|
+
# use the __json__ method to return a dict
|
|
124
|
+
local_config = json.loads(json.dumps(machine_config))
|
|
125
|
+
# Apply the local config with the requested updates
|
|
126
|
+
local_config.update(config_updates)
|
|
127
|
+
else:
|
|
128
|
+
# Store the config changes to the database
|
|
129
|
+
apply_config_updates(config_updates)
|
|
130
|
+
# Create a working dictionary for the machine config
|
|
131
|
+
# Machine by default returns a parameter array,
|
|
132
|
+
# use the __json__ method to return a dict
|
|
133
|
+
local_config = json.loads(json.dumps(machine_config))
|
|
134
|
+
else:
|
|
135
|
+
local_config = json.loads(json.dumps(machine_config))
|
|
136
|
+
|
|
137
|
+
metrics = get_machine_metrics(
|
|
138
|
+
S,
|
|
139
|
+
local_config["node_storage"],
|
|
140
|
+
local_config["hd_remove"],
|
|
141
|
+
local_config["crisis_bytes"],
|
|
142
|
+
)
|
|
143
|
+
logging.info(json.dumps(metrics, indent=2))
|
|
144
|
+
|
|
145
|
+
# Do we already have nodes
|
|
146
|
+
if metrics["total_nodes"] == 0:
|
|
147
|
+
# Are we migrating an anm server
|
|
148
|
+
if options.init and options.migrate_anm:
|
|
149
|
+
Workers = survey_machine(machine_config) or []
|
|
150
|
+
if Workers:
|
|
151
|
+
if options.dry_run:
|
|
152
|
+
logging.warning(f"DRYRUN: Not saving {len(Workers)} detected nodes")
|
|
153
|
+
else:
|
|
154
|
+
with S() as session:
|
|
155
|
+
session.execute(insert(Node), Workers)
|
|
156
|
+
session.commit()
|
|
157
|
+
# Reload metrics
|
|
158
|
+
metrics = get_machine_metrics(
|
|
159
|
+
S,
|
|
160
|
+
local_config["node_storage"],
|
|
161
|
+
local_config["hd_remove"],
|
|
162
|
+
local_config["crisis_bytes"],
|
|
163
|
+
)
|
|
164
|
+
logging.info(
|
|
165
|
+
"Found {counter} nodes defined".format(
|
|
166
|
+
counter=metrics["total_nodes"]
|
|
167
|
+
)
|
|
168
|
+
)
|
|
169
|
+
else:
|
|
170
|
+
logging.warning("Requested migration but no nodes found")
|
|
171
|
+
else:
|
|
172
|
+
logging.info("No nodes found")
|
|
173
|
+
else:
|
|
174
|
+
logging.info(
|
|
175
|
+
"Found {counter} nodes configured".format(counter=metrics["total_nodes"])
|
|
176
|
+
)
|
|
939
177
|
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
178
|
+
# Check for reports
|
|
179
|
+
if options.report:
|
|
180
|
+
from wnm.reports import generate_node_status_report, generate_node_status_details_report
|
|
181
|
+
|
|
182
|
+
# If survey action is specified, run it first
|
|
183
|
+
if options.force_action == "survey":
|
|
184
|
+
logging.info("Running survey before generating report")
|
|
185
|
+
executor = ActionExecutor(S)
|
|
186
|
+
survey_result = executor.execute_forced_action(
|
|
187
|
+
"survey",
|
|
188
|
+
local_config,
|
|
189
|
+
metrics,
|
|
190
|
+
service_name=options.service_name,
|
|
191
|
+
dry_run=options.dry_run,
|
|
943
192
|
)
|
|
944
|
-
|
|
193
|
+
logging.info(f"Survey result: {survey_result}")
|
|
945
194
|
|
|
946
|
-
#
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
195
|
+
# Generate the report
|
|
196
|
+
if options.report == "node-status":
|
|
197
|
+
report_output = generate_node_status_report(
|
|
198
|
+
S, options.service_name, options.report_format
|
|
199
|
+
)
|
|
200
|
+
elif options.report == "node-status-details":
|
|
201
|
+
report_output = generate_node_status_details_report(
|
|
202
|
+
S, options.service_name, options.report_format
|
|
203
|
+
)
|
|
204
|
+
else:
|
|
205
|
+
report_output = f"Unknown report type: {options.report}"
|
|
206
|
+
|
|
207
|
+
print(report_output)
|
|
208
|
+
os.remove(LOCK_FILE)
|
|
209
|
+
sys.exit(0)
|
|
210
|
+
|
|
211
|
+
# Check for forced actions
|
|
212
|
+
if options.force_action:
|
|
213
|
+
logging.info(f"Executing forced action: {options.force_action}")
|
|
214
|
+
executor = ActionExecutor(S)
|
|
215
|
+
this_action = executor.execute_forced_action(
|
|
216
|
+
options.force_action,
|
|
217
|
+
local_config,
|
|
218
|
+
metrics,
|
|
219
|
+
service_name=options.service_name,
|
|
220
|
+
dry_run=options.dry_run,
|
|
221
|
+
count=options.count if hasattr(options, 'count') else 1,
|
|
222
|
+
)
|
|
223
|
+
else:
|
|
224
|
+
this_action = choose_action(local_config, metrics, options.dry_run)
|
|
960
225
|
|
|
226
|
+
print("Action:", json.dumps(this_action, indent=2))
|
|
961
227
|
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
print("Running Nodes:",data[RUNNING])
|
|
965
|
-
print("Restarting Nodes:",data[RESTARTING])
|
|
966
|
-
print("Stopped Nodes:",data[STOPPED])
|
|
967
|
-
print("Upgrading Nodes:",data[UPGRADING])
|
|
968
|
-
print("Removing Nodes:",data[REMOVING])
|
|
969
|
-
data = Counter(ver[1] for ver in db_nodes)
|
|
970
|
-
print("Versions:",data)
|
|
228
|
+
os.remove(LOCK_FILE)
|
|
229
|
+
sys.exit(1)
|
|
971
230
|
|
|
972
|
-
machine_metrics = get_machine_metrics(anm_config['NodeStorage'],anm_config["HDRemove"])
|
|
973
|
-
print(json.dumps(anm_config,indent=2))
|
|
974
|
-
print(json.dumps(machine_metrics,indent=2))
|
|
975
|
-
this_action=choose_action(anm_config,machine_metrics,db_nodes)
|
|
976
|
-
print("Action:",json.dumps(this_action,indent=2))
|
|
977
|
-
# Remove lock file
|
|
978
|
-
os.remove("/var/antctl/wnm_active")
|
|
979
231
|
|
|
980
232
|
if __name__ == "__main__":
|
|
981
233
|
main()
|
|
234
|
+
# print(options.MemRemove)
|
|
982
235
|
|
|
983
236
|
print("End of program")
|