wnm 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wnm might be problematic. Click here for more details.
- wnm/__init__.py +1 -1
- wnm/__main__.py +647 -445
- wnm/models.py +151 -63
- {wnm-0.0.8.dist-info → wnm-0.0.9.dist-info}/METADATA +4 -2
- wnm-0.0.9.dist-info/RECORD +9 -0
- wnm-0.0.8.dist-info/RECORD +0 -9
- {wnm-0.0.8.dist-info → wnm-0.0.9.dist-info}/WHEEL +0 -0
- {wnm-0.0.8.dist-info → wnm-0.0.9.dist-info}/entry_points.txt +0 -0
- {wnm-0.0.8.dist-info → wnm-0.0.9.dist-info}/top_level.txt +0 -0
wnm/__main__.py
CHANGED
|
@@ -1,29 +1,38 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
4
9
|
from collections import Counter
|
|
5
|
-
|
|
10
|
+
|
|
11
|
+
import psutil
|
|
12
|
+
import requests
|
|
6
13
|
from dotenv import load_dotenv
|
|
7
|
-
import
|
|
14
|
+
from packaging.version import Version
|
|
15
|
+
from sqlalchemy import create_engine, delete, insert, select, text, update
|
|
16
|
+
from sqlalchemy.orm import scoped_session, sessionmaker
|
|
8
17
|
|
|
9
18
|
from wnm.models import Base, Machine, Node
|
|
10
|
-
from sqlalchemy import create_engine, select, insert, update, delete, text
|
|
11
|
-
from sqlalchemy.orm import sessionmaker, scoped_session
|
|
12
19
|
|
|
13
20
|
logging.basicConfig(level=logging.INFO)
|
|
14
|
-
#Info level logging for sqlalchemy is too verbose, only use when needed
|
|
15
|
-
logging.getLogger(
|
|
16
|
-
|
|
21
|
+
# Info level logging for sqlalchemy is too verbose, only use when needed
|
|
22
|
+
logging.getLogger("sqlalchemy.engine.Engine").disabled = True
|
|
23
|
+
|
|
17
24
|
# import .env
|
|
18
25
|
basedir = os.path.abspath(os.path.dirname(__file__))
|
|
19
|
-
load_dotenv(os.path.join(basedir,
|
|
26
|
+
load_dotenv(os.path.join(basedir, ".env"))
|
|
20
27
|
|
|
21
28
|
# simulate arg/yaml configuration
|
|
22
29
|
config = {}
|
|
23
|
-
config[
|
|
24
|
-
config[
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
config["db"] = "sqlite:///colony.db"
|
|
31
|
+
config["DonateAddress"] = (
|
|
32
|
+
os.getenv("DonateAddress") or "0x00455d78f850b0358E8cea5be24d415E01E107CF"
|
|
33
|
+
)
|
|
34
|
+
config["ANMHost"] = os.getenv("ANMHost") or "127.0.0.1"
|
|
35
|
+
config["CrisisBytes"] = os.getenv("CrisisBytes") or 2 * 10**9 # default 2gb/node
|
|
27
36
|
|
|
28
37
|
|
|
29
38
|
# Setup Database engine
|
|
@@ -41,26 +50,26 @@ S = scoped_session(session_factory)
|
|
|
41
50
|
# else:
|
|
42
51
|
|
|
43
52
|
# Primary node for want of one
|
|
44
|
-
QUEEN=1
|
|
53
|
+
QUEEN = 1
|
|
45
54
|
|
|
46
55
|
# Donation address
|
|
47
|
-
DONATE=config["DonateAddress"]
|
|
48
|
-
#Keep these as strings so they can be grepped in logs
|
|
49
|
-
STOPPED="STOPPED"
|
|
50
|
-
RUNNING="RUNNING"
|
|
51
|
-
UPGRADING="UPGRADING"
|
|
52
|
-
DISABLED="DISABLED"
|
|
53
|
-
RESTARTING="RESTARTING"
|
|
54
|
-
MIGRATING="MIGRATING"
|
|
55
|
-
REMOVING="REMOVING"
|
|
56
|
-
DEAD="DEAD"
|
|
57
|
-
|
|
58
|
-
ANM_HOST=config["ANMHost"]
|
|
56
|
+
DONATE = config["DonateAddress"]
|
|
57
|
+
# Keep these as strings so they can be grepped in logs
|
|
58
|
+
STOPPED = "STOPPED" # 0 Node is not responding to it's metrics port
|
|
59
|
+
RUNNING = "RUNNING" # 1 Node is responding to it's metrics port
|
|
60
|
+
UPGRADING = "UPGRADING" # 2 Upgrade in progress
|
|
61
|
+
DISABLED = "DISABLED" # -1 Do not start
|
|
62
|
+
RESTARTING = "RESTARTING" # 3 re/starting a server intionally
|
|
63
|
+
MIGRATING = "MIGRATING" # 4 Moving volumes in progress
|
|
64
|
+
REMOVING = "REMOVING" # 5 Removing node in progress
|
|
65
|
+
DEAD = "DEAD" # -86 Broken node to cleanup
|
|
66
|
+
|
|
67
|
+
ANM_HOST = config["ANMHost"]
|
|
59
68
|
# Baseline bytes per node
|
|
60
|
-
CRISIS_BYTES=config["CrisisBytes"]
|
|
69
|
+
CRISIS_BYTES = config["CrisisBytes"]
|
|
61
70
|
|
|
62
71
|
# A storage place for ant node data
|
|
63
|
-
Workers=[]
|
|
72
|
+
Workers = []
|
|
64
73
|
|
|
65
74
|
# Detect ANM (but don't upgrade)
|
|
66
75
|
if os.path.exists("/var/antctl/system"):
|
|
@@ -68,7 +77,7 @@ if os.path.exists("/var/antctl/system"):
|
|
|
68
77
|
if os.path.exists("/etc/cron.d/anm"):
|
|
69
78
|
# remove cron to disable old anm
|
|
70
79
|
try:
|
|
71
|
-
subprocess.run([
|
|
80
|
+
subprocess.run(["sudo", "rm", "/etc/cron.d/anm"])
|
|
72
81
|
except Exception as error:
|
|
73
82
|
template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
74
83
|
message = template.format(type(error).__name__, error.args)
|
|
@@ -85,6 +94,7 @@ if os.path.exists("/var/antctl/wnm_active"):
|
|
|
85
94
|
logging.info("wnm still running")
|
|
86
95
|
sys.exit(1)
|
|
87
96
|
|
|
97
|
+
|
|
88
98
|
# Get anm configuration
|
|
89
99
|
def load_anm_config():
|
|
90
100
|
anm_config = {}
|
|
@@ -95,111 +105,137 @@ def load_anm_config():
|
|
|
95
105
|
# What can we save from /var/antctl/config
|
|
96
106
|
if os.path.exists("/var/antctl/config"):
|
|
97
107
|
load_dotenv("/var/antctl/config")
|
|
98
|
-
anm_config["NodeCap"] = int(os.getenv(
|
|
99
|
-
anm_config["CpuLessThan"] = int(os.getenv(
|
|
100
|
-
anm_config["CpuRemove"] = int(os.getenv(
|
|
101
|
-
anm_config["MemLessThan"] = int(os.getenv(
|
|
102
|
-
anm_config["MemRemove"] = int(os.getenv(
|
|
103
|
-
anm_config["HDLessThan"] = int(os.getenv(
|
|
104
|
-
anm_config["HDRemove"] = int(os.getenv(
|
|
105
|
-
anm_config["DelayStart"] = int(os.getenv(
|
|
106
|
-
anm_config["DelayUpgrade"] = int(os.getenv(
|
|
107
|
-
anm_config["DelayRestart"] = int(os.getenv(
|
|
108
|
-
anm_config["DelayRemove"] = int(os.getenv(
|
|
109
|
-
anm_config["NodeStorage"] = os.getenv(
|
|
108
|
+
anm_config["NodeCap"] = int(os.getenv("NodeCap") or 20)
|
|
109
|
+
anm_config["CpuLessThan"] = int(os.getenv("CpuLessThan") or 50)
|
|
110
|
+
anm_config["CpuRemove"] = int(os.getenv("CpuRemove") or 70)
|
|
111
|
+
anm_config["MemLessThan"] = int(os.getenv("MemLessThan") or 70)
|
|
112
|
+
anm_config["MemRemove"] = int(os.getenv("MemRemove") or 90)
|
|
113
|
+
anm_config["HDLessThan"] = int(os.getenv("HDLessThan") or 70)
|
|
114
|
+
anm_config["HDRemove"] = int(os.getenv("HDRemove") or 90)
|
|
115
|
+
anm_config["DelayStart"] = int(os.getenv("DelayStart") or 5)
|
|
116
|
+
anm_config["DelayUpgrade"] = int(os.getenv("DelayUpgrade") or 5)
|
|
117
|
+
anm_config["DelayRestart"] = int(os.getenv("DelayRestart") or 10)
|
|
118
|
+
anm_config["DelayRemove"] = int(os.getenv("DelayRemove") or 300)
|
|
119
|
+
anm_config["NodeStorage"] = os.getenv("NodeStorage") or "/var/antctl/services"
|
|
110
120
|
# Default to the faucet donation address
|
|
111
121
|
try:
|
|
112
|
-
anm_config["RewardsAddress"] = re.findall(
|
|
122
|
+
anm_config["RewardsAddress"] = re.findall(
|
|
123
|
+
r"--rewards-address ([\dA-Fa-fXx]+)", os.getenv("RewardsAddress")
|
|
124
|
+
)[0]
|
|
113
125
|
except:
|
|
114
126
|
try:
|
|
115
|
-
anm_config["RewardsAddress"] = re.findall(
|
|
127
|
+
anm_config["RewardsAddress"] = re.findall(
|
|
128
|
+
r"([\dA-Fa-fXx]+)", os.getenv("RewardsAddress")
|
|
129
|
+
)[0]
|
|
116
130
|
except:
|
|
117
131
|
logging.warning("Unable to detect RewardsAddress")
|
|
118
132
|
sys.exit(1)
|
|
119
|
-
anm_config["DonateAddress"]=os.getenv("DonateAddress") or DONATE
|
|
120
|
-
anm_config["MaxLoadAverageAllowed"]=float(
|
|
121
|
-
|
|
133
|
+
anm_config["DonateAddress"] = os.getenv("DonateAddress") or DONATE
|
|
134
|
+
anm_config["MaxLoadAverageAllowed"] = float(
|
|
135
|
+
os.getenv("MaxLoadAverageAllowed") or anm_config["CpuCount"]
|
|
136
|
+
)
|
|
137
|
+
anm_config["DesiredLoadAverage"] = float(
|
|
138
|
+
os.getenv("DesiredLoadAverage") or (anm_config["CpuCount"] * 0.6)
|
|
139
|
+
)
|
|
122
140
|
|
|
123
141
|
try:
|
|
124
|
-
with open(
|
|
142
|
+
with open("/usr/bin/anms.sh", "r") as file:
|
|
125
143
|
data = file.read()
|
|
126
|
-
anm_config["PortStart"]=int(re.findall(r"ntpr\=(\d+)",data)[0])
|
|
144
|
+
anm_config["PortStart"] = int(re.findall(r"ntpr\=(\d+)", data)[0])
|
|
127
145
|
except:
|
|
128
|
-
anm_config["PortStart"]=55
|
|
129
|
-
|
|
130
|
-
anm_config["HDIOReadLessThan"] = float(os.getenv('HDIOReadLessThan') or 0.0)
|
|
131
|
-
anm_config["HDIOReadRemove"] = float(os.getenv('HDIOReadRemove') or 0.0)
|
|
132
|
-
anm_config["HDIOWriteLessThan"] = float(os.getenv('HDIOWriteLessThan') or 0.0)
|
|
133
|
-
anm_config["HDIOWriteRemove"] = float(os.getenv('HDIOWriteRemove') or 0.0)
|
|
134
|
-
anm_config["NetIOReadLessThan"] = float(os.getenv('NetIOReadLessThan') or 0.0)
|
|
135
|
-
anm_config["NetIOReadRemove"] = float(os.getenv('NetIOReadRemove') or 0.0)
|
|
136
|
-
anm_config["NetIOWriteLessThan"] = float(os.getenv('NetIOWriteLessThan') or 0.0)
|
|
137
|
-
anm_config["NetIOWriteRemove"] = float(os.getenv('NetIOWriteRemove') or 0.0)
|
|
138
|
-
# Timer for last stopped nodes
|
|
139
|
-
anm_config["LastStoppedAt"]=0
|
|
146
|
+
anm_config["PortStart"] = 55
|
|
140
147
|
|
|
148
|
+
anm_config["HDIOReadLessThan"] = float(os.getenv("HDIOReadLessThan") or 0.0)
|
|
149
|
+
anm_config["HDIOReadRemove"] = float(os.getenv("HDIOReadRemove") or 0.0)
|
|
150
|
+
anm_config["HDIOWriteLessThan"] = float(os.getenv("HDIOWriteLessThan") or 0.0)
|
|
151
|
+
anm_config["HDIOWriteRemove"] = float(os.getenv("HDIOWriteRemove") or 0.0)
|
|
152
|
+
anm_config["NetIOReadLessThan"] = float(os.getenv("NetIOReadLessThan") or 0.0)
|
|
153
|
+
anm_config["NetIOReadRemove"] = float(os.getenv("NetIOReadRemove") or 0.0)
|
|
154
|
+
anm_config["NetIOWriteLessThan"] = float(os.getenv("NetIOWriteLessThan") or 0.0)
|
|
155
|
+
anm_config["NetIOWriteRemove"] = float(os.getenv("NetIOWriteRemove") or 0.0)
|
|
156
|
+
# Timer for last stopped nodes
|
|
157
|
+
anm_config["LastStoppedAt"] = 0
|
|
141
158
|
|
|
142
159
|
return anm_config
|
|
143
160
|
|
|
161
|
+
|
|
144
162
|
# Read configuration from systemd service file
|
|
145
163
|
def read_systemd_service(antnode):
|
|
146
|
-
details={}
|
|
164
|
+
details = {}
|
|
147
165
|
try:
|
|
148
|
-
with open(
|
|
166
|
+
with open("/etc/systemd/system/" + antnode, "r") as file:
|
|
149
167
|
data = file.read()
|
|
150
|
-
details[
|
|
151
|
-
details[
|
|
152
|
-
details["user"]=re.findall(r"User=(\w+)",data)[0]
|
|
153
|
-
details["root_dir"]=re.findall(r"--root-dir ([\w\/]+)",data)[0]
|
|
154
|
-
details["port"]=int(re.findall(r"--port (\d+)",data)[0])
|
|
155
|
-
details["metrics_port"]=int(
|
|
156
|
-
|
|
157
|
-
|
|
168
|
+
details["id"] = int(re.findall(r"antnode(\d+)", antnode)[0])
|
|
169
|
+
details["binary"] = re.findall(r"ExecStart=([^ ]+)", data)[0]
|
|
170
|
+
details["user"] = re.findall(r"User=(\w+)", data)[0]
|
|
171
|
+
details["root_dir"] = re.findall(r"--root-dir ([\w\/]+)", data)[0]
|
|
172
|
+
details["port"] = int(re.findall(r"--port (\d+)", data)[0])
|
|
173
|
+
details["metrics_port"] = int(
|
|
174
|
+
re.findall(r"--metrics-server-port (\d+)", data)[0]
|
|
175
|
+
)
|
|
176
|
+
details["wallet"] = re.findall(r"--rewards-address ([^ ]+)", data)[0]
|
|
177
|
+
details["network"] = re.findall(r"--rewards-address [^ ]+ ([\w\-]+)", data)[0]
|
|
158
178
|
except:
|
|
159
179
|
pass
|
|
160
|
-
|
|
180
|
+
|
|
161
181
|
return details
|
|
162
182
|
|
|
183
|
+
|
|
163
184
|
# Read data from metadata endpoint
|
|
164
|
-
def read_node_metadata(host,port):
|
|
185
|
+
def read_node_metadata(host, port):
|
|
165
186
|
# Only return version number when we have one, to stop clobbering the binary check
|
|
166
187
|
try:
|
|
167
|
-
url = "http://{0}:{1}/metadata".format(host,port)
|
|
188
|
+
url = "http://{0}:{1}/metadata".format(host, port)
|
|
168
189
|
response = requests.get(url)
|
|
169
|
-
data=response.text
|
|
190
|
+
data = response.text
|
|
170
191
|
except requests.exceptions.ConnectionError:
|
|
171
|
-
logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
|
|
172
|
-
return {"status": STOPPED, "peer_id":""}
|
|
192
|
+
logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
|
|
193
|
+
return {"status": STOPPED, "peer_id": ""}
|
|
173
194
|
except Exception as error:
|
|
174
195
|
template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
175
196
|
message = template.format(type(error).__name__, error.args)
|
|
176
197
|
logging.info(message)
|
|
177
|
-
return {"status": STOPPED, "peer_id":""}
|
|
198
|
+
return {"status": STOPPED, "peer_id": ""}
|
|
178
199
|
# collect a dict to return
|
|
179
|
-
card={}
|
|
200
|
+
card = {}
|
|
180
201
|
try:
|
|
181
|
-
card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}',data)[0]
|
|
202
|
+
card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}', data)[0]
|
|
182
203
|
except:
|
|
183
|
-
logging.info(
|
|
204
|
+
logging.info("No version found")
|
|
184
205
|
try:
|
|
185
|
-
card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}',data)[0]
|
|
206
|
+
card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}', data)[0]
|
|
186
207
|
except:
|
|
187
208
|
card["peer_id"] = ""
|
|
188
209
|
card["status"] = RUNNING if "version" in card else STOPPED
|
|
189
210
|
return card
|
|
190
211
|
|
|
212
|
+
|
|
191
213
|
# Read data from metrics port
|
|
192
|
-
def read_node_metrics(host,port):
|
|
193
|
-
metrics={}
|
|
214
|
+
def read_node_metrics(host, port):
|
|
215
|
+
metrics = {}
|
|
194
216
|
try:
|
|
195
|
-
url = "http://{0}:{1}/metrics".format(host,port)
|
|
217
|
+
url = "http://{0}:{1}/metrics".format(host, port)
|
|
196
218
|
response = requests.get(url)
|
|
197
219
|
metrics["status"] = RUNNING
|
|
198
|
-
metrics["uptime"] = int(
|
|
199
|
-
|
|
200
|
-
|
|
220
|
+
metrics["uptime"] = int(
|
|
221
|
+
(re.findall(r"ant_node_uptime ([\d]+)", response.text) or [0])[0]
|
|
222
|
+
)
|
|
223
|
+
metrics["records"] = int(
|
|
224
|
+
(
|
|
225
|
+
re.findall(r"ant_networking_records_stored ([\d]+)", response.text)
|
|
226
|
+
or [0]
|
|
227
|
+
)[0]
|
|
228
|
+
)
|
|
229
|
+
metrics["shunned"] = int(
|
|
230
|
+
(
|
|
231
|
+
re.findall(
|
|
232
|
+
r"ant_networking_shunned_by_close_group ([\d]+)", response.text
|
|
233
|
+
)
|
|
234
|
+
or [0]
|
|
235
|
+
)[0]
|
|
236
|
+
)
|
|
201
237
|
except requests.exceptions.ConnectionError:
|
|
202
|
-
logging.debug("Connection Refused on port: {0}:{1}".format(host,str(port)))
|
|
238
|
+
logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
|
|
203
239
|
metrics["status"] = STOPPED
|
|
204
240
|
metrics["uptime"] = 0
|
|
205
241
|
metrics["records"] = 0
|
|
@@ -213,104 +249,116 @@ def read_node_metrics(host,port):
|
|
|
213
249
|
metrics["records"] = 0
|
|
214
250
|
metrics["shunned"] = 0
|
|
215
251
|
return metrics
|
|
216
|
-
|
|
252
|
+
|
|
253
|
+
|
|
217
254
|
# Read antnode binary version
|
|
218
255
|
def get_antnode_version(binary):
|
|
219
256
|
try:
|
|
220
|
-
data = subprocess.run(
|
|
221
|
-
|
|
257
|
+
data = subprocess.run(
|
|
258
|
+
[binary, "--version"], stdout=subprocess.PIPE
|
|
259
|
+
).stdout.decode("utf-8")
|
|
260
|
+
return re.findall(r"Autonomi Node v([\d\.]+)", data)[0]
|
|
222
261
|
except Exception as error:
|
|
223
262
|
template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
224
263
|
message = template.format(type(error).__name__, error.args)
|
|
225
264
|
logging.info(message)
|
|
226
265
|
return 0
|
|
227
|
-
|
|
266
|
+
|
|
267
|
+
|
|
228
268
|
# Determine how long this node has been around by looking at its secret-key file
|
|
229
269
|
def get_node_age(root_dir):
|
|
230
270
|
try:
|
|
231
271
|
return int(os.stat("{0}/secret-key".format(root_dir)).st_mtime)
|
|
232
272
|
except:
|
|
233
273
|
return 0
|
|
234
|
-
|
|
274
|
+
|
|
275
|
+
|
|
235
276
|
# Survey nodes by reading metadata from metrics ports or binary --version
|
|
236
277
|
def survey_anm_nodes(antnodes):
|
|
237
278
|
# Build a list of node dictionaries to return
|
|
238
|
-
details=[]
|
|
279
|
+
details = []
|
|
239
280
|
# Iterate on nodes
|
|
240
281
|
for node in antnodes:
|
|
241
282
|
# Initialize a dict
|
|
242
|
-
logging.debug(
|
|
243
|
-
|
|
244
|
-
|
|
283
|
+
logging.debug(
|
|
284
|
+
"{0} surveying node {1} ".format(time.strftime("%Y-%m-%d %H:%M"), node)
|
|
285
|
+
)
|
|
286
|
+
if not re.findall(r"antnode([\d]+).service", node):
|
|
287
|
+
logging.info("can't decode " + str(node))
|
|
245
288
|
continue
|
|
246
|
-
card={
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
289
|
+
card = {
|
|
290
|
+
"nodename": re.findall(r"antnode([\d]+).service", node)[0],
|
|
291
|
+
"service": node,
|
|
292
|
+
"timestamp": int(time.time()),
|
|
293
|
+
"host": ANM_HOST or "127.0.0.1",
|
|
294
|
+
}
|
|
251
295
|
# Load what systemd has configured
|
|
252
296
|
card.update(read_systemd_service(node))
|
|
253
|
-
#print(json.dumps(card,indent=2))
|
|
297
|
+
# print(json.dumps(card,indent=2))
|
|
254
298
|
# Read metadata from metrics_port
|
|
255
|
-
metadata = read_node_metadata(card["host"],card["metrics_port"])
|
|
256
|
-
#print(json.dumps(metadata,indent=2))
|
|
257
|
-
if
|
|
258
|
-
|
|
259
|
-
|
|
299
|
+
metadata = read_node_metadata(card["host"], card["metrics_port"])
|
|
300
|
+
# print(json.dumps(metadata,indent=2))
|
|
301
|
+
if (
|
|
302
|
+
isinstance(metadata, dict)
|
|
303
|
+
and "status" in metadata
|
|
304
|
+
and metadata["status"] == RUNNING
|
|
305
|
+
):
|
|
260
306
|
# soak up metadata
|
|
261
307
|
card.update(metadata)
|
|
262
308
|
# The ports up, so grab metrics too
|
|
263
|
-
card.update(read_node_metrics(card["host"],card["metrics_port"]))
|
|
309
|
+
card.update(read_node_metrics(card["host"], card["metrics_port"]))
|
|
264
310
|
# Else run binary to get version
|
|
265
311
|
else:
|
|
266
312
|
# If the root directory of the node is missing, it's a bad node
|
|
267
313
|
if not os.path.isdir(card["root_dir"]):
|
|
268
|
-
card["status"]=DEAD
|
|
269
|
-
card["version"]=
|
|
314
|
+
card["status"] = DEAD
|
|
315
|
+
card["version"] = ""
|
|
270
316
|
else:
|
|
271
|
-
card["status"]=STOPPED
|
|
272
|
-
card["version"]=get_antnode_version(card["binary"])
|
|
273
|
-
card["peer_id"]=
|
|
274
|
-
card["records"]=0
|
|
275
|
-
card["uptime"]=0
|
|
276
|
-
card["shunned"]=0
|
|
277
|
-
card["age"]=get_node_age(card["root_dir"])
|
|
317
|
+
card["status"] = STOPPED
|
|
318
|
+
card["version"] = get_antnode_version(card["binary"])
|
|
319
|
+
card["peer_id"] = ""
|
|
320
|
+
card["records"] = 0
|
|
321
|
+
card["uptime"] = 0
|
|
322
|
+
card["shunned"] = 0
|
|
323
|
+
card["age"] = get_node_age(card["root_dir"])
|
|
278
324
|
# hardcoded for anm
|
|
279
|
-
card["host"]=ANM_HOST
|
|
325
|
+
card["host"] = ANM_HOST
|
|
280
326
|
# Append the node dict to the detail list
|
|
281
327
|
details.append(card)
|
|
282
|
-
|
|
328
|
+
|
|
283
329
|
return details
|
|
284
330
|
|
|
331
|
+
|
|
285
332
|
# Survey server instance
|
|
286
333
|
def survey_machine():
|
|
287
334
|
# Make a bucket
|
|
288
|
-
antnodes=[]
|
|
335
|
+
antnodes = []
|
|
289
336
|
# For all service files
|
|
290
337
|
for file in os.listdir("/etc/systemd/system"):
|
|
291
338
|
# Find antnodes
|
|
292
|
-
if re.match(r
|
|
339
|
+
if re.match(r"antnode[\d]+\.service", file):
|
|
293
340
|
antnodes.append(file)
|
|
294
|
-
#if len(antnodes)>=5:
|
|
341
|
+
# if len(antnodes)>=5:
|
|
295
342
|
# break
|
|
296
343
|
# Iterate over defined nodes and get details
|
|
297
344
|
# Ingests a list of service files and outputs a list of dictionaries
|
|
298
345
|
return survey_anm_nodes(antnodes)
|
|
299
346
|
|
|
347
|
+
|
|
300
348
|
# Read system status
|
|
301
|
-
def get_machine_metrics(node_storage,remove_limit):
|
|
349
|
+
def get_machine_metrics(node_storage, remove_limit):
|
|
302
350
|
metrics = {}
|
|
303
351
|
|
|
304
352
|
with S() as session:
|
|
305
|
-
db_nodes=session.execute(select(Node.status,Node.version)).all()
|
|
306
|
-
|
|
353
|
+
db_nodes = session.execute(select(Node.status, Node.version)).all()
|
|
354
|
+
|
|
307
355
|
# Get some initial stats for comparing after a few seconds
|
|
308
356
|
# We start these counters AFTER reading the database
|
|
309
|
-
start_time=time.time()
|
|
310
|
-
start_disk_counters=psutil.disk_io_counters()
|
|
311
|
-
start_net_counters=psutil.net_io_counters()
|
|
357
|
+
start_time = time.time()
|
|
358
|
+
start_disk_counters = psutil.disk_io_counters()
|
|
359
|
+
start_net_counters = psutil.net_io_counters()
|
|
312
360
|
|
|
313
|
-
metrics["TotalNodes"]=len(db_nodes)
|
|
361
|
+
metrics["TotalNodes"] = len(db_nodes)
|
|
314
362
|
data = Counter(node[0] for node in db_nodes)
|
|
315
363
|
metrics["RunningNodes"] = data[RUNNING]
|
|
316
364
|
metrics["StoppedNodes"] = data[STOPPED]
|
|
@@ -319,62 +367,90 @@ def get_machine_metrics(node_storage,remove_limit):
|
|
|
319
367
|
metrics["MigratingNodes"] = data[MIGRATING]
|
|
320
368
|
metrics["RemovingNodes"] = data[REMOVING]
|
|
321
369
|
metrics["DeadNodes"] = data[DEAD]
|
|
322
|
-
metrics["antnode"]=shutil.which("antnode")
|
|
370
|
+
metrics["antnode"] = shutil.which("antnode")
|
|
323
371
|
if not metrics["antnode"]:
|
|
324
372
|
logging.warning("Unable to locate current antnode binary, exiting")
|
|
325
373
|
sys.exit(1)
|
|
326
|
-
metrics["AntNodeVersion"]=get_antnode_version(metrics["antnode"])
|
|
327
|
-
metrics["NodesLatestV"]=
|
|
328
|
-
|
|
329
|
-
|
|
374
|
+
metrics["AntNodeVersion"] = get_antnode_version(metrics["antnode"])
|
|
375
|
+
metrics["NodesLatestV"] = (
|
|
376
|
+
sum(1 for node in db_nodes if node[1] == metrics["AntNodeVersion"]) or 0
|
|
377
|
+
)
|
|
378
|
+
metrics["NodesNoVersion"] = sum(1 for node in db_nodes if not node[1]) or 0
|
|
379
|
+
metrics["NodesToUpgrade"] = (
|
|
380
|
+
metrics["TotalNodes"] - metrics["NodesLatestV"] - metrics["NodesNoVersion"]
|
|
381
|
+
)
|
|
330
382
|
|
|
331
383
|
# Windows has to build load average over 5 seconds. The first 5 seconds returns 0's
|
|
332
|
-
# I don't plan on supporting windows, but if this get's modular, I don't want this
|
|
384
|
+
# I don't plan on supporting windows, but if this gets modular, I don't want this
|
|
333
385
|
# issue to be skipped
|
|
334
|
-
#if platform.system() == "Windows":
|
|
386
|
+
# if platform.system() == "Windows":
|
|
335
387
|
# discard=psutil.getloadavg()
|
|
336
388
|
# time.sleep(5)
|
|
337
|
-
metrics["LoadAverage1"],metrics["LoadAverage5"],metrics["LoadAverage15"]=
|
|
389
|
+
metrics["LoadAverage1"], metrics["LoadAverage5"], metrics["LoadAverage15"] = (
|
|
390
|
+
psutil.getloadavg()
|
|
391
|
+
)
|
|
338
392
|
# Get CPU Metrics over 1 second
|
|
339
|
-
metrics["IdleCpuPercent"],metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
|
|
393
|
+
metrics["IdleCpuPercent"], metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
|
|
340
394
|
# Really we returned Idle percent, subtract from 100 to get used.
|
|
341
395
|
metrics["UsedCpuPercent"] = 100 - metrics["IdleCpuPercent"]
|
|
342
|
-
data=psutil.virtual_memory()
|
|
343
|
-
#print(data)
|
|
344
|
-
metrics["UsedMemPercent"]=data.percent
|
|
345
|
-
metrics["FreeMemPercent"]=100-metrics["UsedMemPercent"]
|
|
346
|
-
data=psutil.disk_io_counters()
|
|
396
|
+
data = psutil.virtual_memory()
|
|
397
|
+
# print(data)
|
|
398
|
+
metrics["UsedMemPercent"] = data.percent
|
|
399
|
+
metrics["FreeMemPercent"] = 100 - metrics["UsedMemPercent"]
|
|
400
|
+
data = psutil.disk_io_counters()
|
|
347
401
|
# This only checks the drive mapped to the first node and will need to be updated
|
|
348
402
|
# when we eventually support multiple drives
|
|
349
|
-
data=psutil.disk_usage(node_storage)
|
|
350
|
-
metrics["UsedHDPercent"]=data.percent
|
|
351
|
-
metrics["TotalHDBytes"]=data.total
|
|
352
|
-
end_time=time.time()
|
|
353
|
-
end_disk_counters=psutil.disk_io_counters()
|
|
354
|
-
end_net_counters=psutil.net_io_counters()
|
|
355
|
-
metrics["HDWriteBytes"]=int(
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
403
|
+
data = psutil.disk_usage(node_storage)
|
|
404
|
+
metrics["UsedHDPercent"] = data.percent
|
|
405
|
+
metrics["TotalHDBytes"] = data.total
|
|
406
|
+
end_time = time.time()
|
|
407
|
+
end_disk_counters = psutil.disk_io_counters()
|
|
408
|
+
end_net_counters = psutil.net_io_counters()
|
|
409
|
+
metrics["HDWriteBytes"] = int(
|
|
410
|
+
(end_disk_counters.write_bytes - start_disk_counters.write_bytes)
|
|
411
|
+
/ (end_time - start_time)
|
|
412
|
+
)
|
|
413
|
+
metrics["HDReadBytes"] = int(
|
|
414
|
+
(end_disk_counters.read_bytes - start_disk_counters.read_bytes)
|
|
415
|
+
/ (end_time - start_time)
|
|
416
|
+
)
|
|
417
|
+
metrics["NetWriteBytes"] = int(
|
|
418
|
+
(end_net_counters.bytes_sent - start_net_counters.bytes_sent)
|
|
419
|
+
/ (end_time - start_time)
|
|
420
|
+
)
|
|
421
|
+
metrics["NetReadBytes"] = int(
|
|
422
|
+
(end_net_counters.bytes_recv - start_net_counters.bytes_recv)
|
|
423
|
+
/ (end_time - start_time)
|
|
424
|
+
)
|
|
425
|
+
# print (json.dumps(metrics,indent=2))
|
|
360
426
|
# How close (out of 100) to removal limit will we be with a max bytes per node (2GB default)
|
|
361
427
|
# For running nodes with Porpoise(tm).
|
|
362
|
-
metrics["NodeHDCrisis"]=int(
|
|
428
|
+
metrics["NodeHDCrisis"] = int(
|
|
429
|
+
(
|
|
430
|
+
((metrics["TotalNodes"]) * CRISIS_BYTES)
|
|
431
|
+
/ (metrics["TotalHDBytes"] * (remove_limit / 100))
|
|
432
|
+
)
|
|
433
|
+
* 100
|
|
434
|
+
)
|
|
363
435
|
return metrics
|
|
364
436
|
|
|
437
|
+
|
|
365
438
|
# Update node with metrics result
|
|
366
|
-
def update_node_from_metrics(id,metrics,metadata):
|
|
439
|
+
def update_node_from_metrics(id, metrics, metadata):
|
|
367
440
|
try:
|
|
368
441
|
# We check the binary version in other code, so let's stop clobbering it when a node is stopped
|
|
369
|
-
card={
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
442
|
+
card = {
|
|
443
|
+
"status": metrics["status"],
|
|
444
|
+
"timestamp": int(time.time()),
|
|
445
|
+
"uptime": metrics["uptime"],
|
|
446
|
+
"records": metrics["records"],
|
|
447
|
+
"shunned": metrics["shunned"],
|
|
448
|
+
"peer_id": metadata["peer_id"],
|
|
449
|
+
}
|
|
373
450
|
if "version" in metadata:
|
|
374
|
-
card[
|
|
451
|
+
card["version"] = metadata["version"]
|
|
375
452
|
with S() as session:
|
|
376
|
-
session.query(Node).filter(Node.id == id)
|
|
377
|
-
update(card)
|
|
453
|
+
session.query(Node).filter(Node.id == id).update(card)
|
|
378
454
|
session.commit()
|
|
379
455
|
except Exception as error:
|
|
380
456
|
template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
@@ -383,259 +459,315 @@ def update_node_from_metrics(id,metrics,metadata):
|
|
|
383
459
|
return False
|
|
384
460
|
else:
|
|
385
461
|
return True
|
|
386
|
-
|
|
462
|
+
|
|
463
|
+
|
|
387
464
|
# Set Node status
|
|
388
|
-
def set_node_status(id,status):
|
|
389
|
-
logging.info("Setting node status: {0} {1}".format(id,status))
|
|
465
|
+
def set_node_status(id, status):
|
|
466
|
+
logging.info("Setting node status: {0} {1}".format(id, status))
|
|
390
467
|
try:
|
|
391
468
|
with S() as session:
|
|
392
|
-
session.query(Node).filter(Node.id == id)
|
|
393
|
-
|
|
469
|
+
session.query(Node).filter(Node.id == id).update(
|
|
470
|
+
{"status": status, "timestamp": int(time.time())}
|
|
471
|
+
)
|
|
394
472
|
session.commit()
|
|
395
473
|
except:
|
|
396
474
|
return False
|
|
397
475
|
else:
|
|
398
476
|
return True
|
|
399
477
|
|
|
478
|
+
|
|
400
479
|
# Update metrics after checking counters
|
|
401
|
-
def update_counters(old,config):
|
|
480
|
+
def update_counters(old, config):
|
|
402
481
|
# Are we already removing a node
|
|
403
482
|
if old["RemovingNodes"]:
|
|
404
483
|
with S() as session:
|
|
405
|
-
removals=session.execute(
|
|
406
|
-
|
|
407
|
-
|
|
484
|
+
removals = session.execute(
|
|
485
|
+
select(Node.timestamp, Node.id)
|
|
486
|
+
.where(Node.status == REMOVING)
|
|
487
|
+
.order_by(Node.timestamp.asc())
|
|
488
|
+
).all()
|
|
408
489
|
# Iterate through active removals
|
|
409
490
|
records_to_remove = len(removals)
|
|
410
491
|
for check in removals:
|
|
411
492
|
# If the DelayRemove timer has expired, delete the entry
|
|
412
|
-
if isinstance(check[0],int) and
|
|
413
|
-
|
|
414
|
-
|
|
493
|
+
if isinstance(check[0], int) and check[0] < (
|
|
494
|
+
int(time.time()) - (config["DelayRemove"] * 60)
|
|
495
|
+
):
|
|
496
|
+
logging.info("Deleting removed node " + str(check[1]))
|
|
415
497
|
with S() as session:
|
|
416
|
-
session.execute(delete(Node).where(Node.id==check[1]))
|
|
498
|
+
session.execute(delete(Node).where(Node.id == check[1]))
|
|
417
499
|
session.commit()
|
|
418
|
-
records_to_remove-=1
|
|
419
|
-
old["RemovingNodes"]=records_to_remove
|
|
500
|
+
records_to_remove -= 1
|
|
501
|
+
old["RemovingNodes"] = records_to_remove
|
|
420
502
|
# Are we already upgrading a node
|
|
421
503
|
if old["UpgradingNodes"]:
|
|
422
504
|
with S() as session:
|
|
423
|
-
upgrades=session.execute(
|
|
424
|
-
|
|
425
|
-
|
|
505
|
+
upgrades = session.execute(
|
|
506
|
+
select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
|
|
507
|
+
.where(Node.status == UPGRADING)
|
|
508
|
+
.order_by(Node.timestamp.asc())
|
|
509
|
+
).all()
|
|
426
510
|
# Iterate through active upgrades
|
|
427
511
|
records_to_upgrade = len(upgrades)
|
|
428
512
|
for check in upgrades:
|
|
429
513
|
# If the DelayUpgrade timer has expired, check on status
|
|
430
|
-
if isinstance(check[0],int) and
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
514
|
+
if isinstance(check[0], int) and check[0] < (
|
|
515
|
+
int(time.time()) - (config["DelayUpgrade"] * 60)
|
|
516
|
+
):
|
|
517
|
+
logging.info("Updating upgraded node " + str(check[1]))
|
|
518
|
+
node_metrics = read_node_metrics(check[2], check[3])
|
|
519
|
+
node_metadata = read_node_metadata(check[2], check[3])
|
|
435
520
|
if node_metrics and node_metadata:
|
|
436
|
-
update_node_from_metrics(check[1],node_metrics,node_metadata)
|
|
437
|
-
records_to_upgrade-=1
|
|
438
|
-
old["UpgradingNodes"]=records_to_upgrade
|
|
521
|
+
update_node_from_metrics(check[1], node_metrics, node_metadata)
|
|
522
|
+
records_to_upgrade -= 1
|
|
523
|
+
old["UpgradingNodes"] = records_to_upgrade
|
|
439
524
|
# Are we already restarting a node
|
|
440
525
|
if old["RestartingNodes"]:
|
|
441
526
|
with S() as session:
|
|
442
|
-
restarts=session.execute(
|
|
443
|
-
|
|
444
|
-
|
|
527
|
+
restarts = session.execute(
|
|
528
|
+
select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
|
|
529
|
+
.where(Node.status == RESTARTING)
|
|
530
|
+
.order_by(Node.timestamp.asc())
|
|
531
|
+
).all()
|
|
445
532
|
# Iterate through active restarts
|
|
446
533
|
records_to_restart = len(restarts)
|
|
447
534
|
for check in restarts:
|
|
448
535
|
# If the DelayStart timer has expired, check on status
|
|
449
|
-
if isinstance(check[0],int) and
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
536
|
+
if isinstance(check[0], int) and check[0] < (
|
|
537
|
+
int(time.time()) - (config["DelayStart"] * 60)
|
|
538
|
+
):
|
|
539
|
+
logging.info("Updating restarted node " + str(check[1]))
|
|
540
|
+
node_metrics = read_node_metrics(check[2], check[3])
|
|
541
|
+
node_metadata = read_node_metadata(check[2], check[3])
|
|
454
542
|
if node_metrics and node_metadata:
|
|
455
|
-
update_node_from_metrics(check[1],node_metrics,node_metadata)
|
|
456
|
-
records_to_restart-=1
|
|
457
|
-
old["RestartingNodes"]=records_to_restart
|
|
458
|
-
return
|
|
543
|
+
update_node_from_metrics(check[1], node_metrics, node_metadata)
|
|
544
|
+
records_to_restart -= 1
|
|
545
|
+
old["RestartingNodes"] = records_to_restart
|
|
546
|
+
return old
|
|
547
|
+
|
|
459
548
|
|
|
460
549
|
# Enable firewall for port
def enable_firewall(port, node):
    """Open a UDP port in ufw.

    Runs ``sudo ufw allow <port>/udp comment <node>``. A failing ufw call is
    logged and otherwise ignored; nothing is returned.

    Args:
        port: Port number to open (formatted into ``"<port>/udp"``).
        node: Text attached to the rule as its ufw comment.
    """
    logging.info("enable firewall port {0}/udp".format(port))
    # Open the port in the ufw firewall
    try:
        subprocess.run(
            ["sudo", "ufw", "allow", "{0}/udp".format(port), "comment", node],
            stdout=subprocess.PIPE,
            # Without check=True a non-zero exit never raises, which left the
            # handler below unreachable.
            check=True,
        )
    except subprocess.CalledProcessError as err:
        # The exception needs a %s placeholder; passed as a bare extra
        # argument it makes logging fail while formatting the record.
        logging.error("EF Error: %s", err)
|
|
560
|
+
|
|
468
561
|
|
|
469
562
|
# Disable firewall for port
def disable_firewall(port):
    """Remove the ufw ``allow`` rule for a UDP port.

    Runs ``sudo ufw delete allow <port>/udp``. A failing ufw call is logged
    and otherwise ignored; nothing is returned.

    Args:
        port: Port number whose rule is deleted (formatted into
            ``"<port>/udp"``).
    """
    logging.info("disable firewall port {0}/udp".format(port))
    # Close ufw firewall
    try:
        subprocess.run(
            ["sudo", "ufw", "delete", "allow", "{0}/udp".format(port)],
            stdout=subprocess.PIPE,
            # Without check=True a non-zero exit never raises, which left the
            # handler below unreachable.
            check=True,
        )
    except subprocess.CalledProcessError as err:
        # The exception needs a %s placeholder; passed as a bare extra
        # argument it makes logging fail while formatting the record.
        logging.error("DF ERROR: %s", err)
|
|
573
|
+
|
|
477
574
|
|
|
478
575
|
# Start a systemd node
def start_systemd_node(node):
    """Start a node's systemd service, open its port and mark it RESTARTING.

    Args:
        node: Object providing ``id``, ``service`` and ``port`` attributes.

    Returns:
        True when the service was started; False when ``systemctl start``
        exited non-zero or its output contained "Failed to start". On
        failure the firewall and the node status are left untouched.
    """
    logging.info("Starting node " + str(node.id))
    # Try to start the service
    try:
        output = subprocess.run(
            ["sudo", "systemctl", "start", node.service],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            # Treat any non-zero exit as a failure, not only the ones whose
            # output happens to begin with "Failed to start".
            check=True,
        ).stdout.decode("utf-8", errors="replace")
    except subprocess.CalledProcessError as err:
        captured = (err.output or b"").decode("utf-8", errors="replace").strip()
        logging.error("SSN1 ERROR: %s: %s", err, captured)
        return False
    # Substring test so the message is found even when other output
    # precedes it.
    if "Failed to start" in output:
        logging.error("SSN2 ERROR: %s", output)
        return False
    # Open a firewall hole for the data port
    enable_firewall(node.port, node.service)
    # Update node status
    set_node_status(node.id, RESTARTING)
    return True
|
|
495
596
|
|
|
597
|
+
|
|
496
598
|
# Stop a systemd node
def stop_systemd_node(node):
    """Stop a node's systemd service, close its port and mark it STOPPED.

    A failing ``systemctl stop`` is logged but does not abort: the firewall
    rule is still removed and the status is still set to STOPPED.

    Args:
        node: Object providing ``id``, ``service`` and ``port`` attributes.

    Returns:
        Always True.
    """
    logging.info("Stopping node: " + node.service)
    # Send a stop signal to the process
    try:
        subprocess.run(
            ["sudo", "systemctl", "stop", node.service],
            stdout=subprocess.PIPE,
            # Without check=True a non-zero exit never raises, which left the
            # handler below unreachable.
            check=True,
        )
    except subprocess.CalledProcessError as err:
        # %s placeholder is required for the exception to be rendered.
        logging.error("SSN2 ERROR: %s", err)
    disable_firewall(node.port)
    set_node_status(node.id, STOPPED)

    return True
|
|
508
612
|
|
|
613
|
+
|
|
509
614
|
# Upgrade a node
def upgrade_node(node, metrics):
    """Replace a node's binary, restart its service and mark it UPGRADING.

    Failures of the copy or the restart are logged and the function carries
    on, so the database row is updated either way.

    Args:
        node: Object providing ``id``, ``binary`` and ``service`` attributes.
        metrics: Mapping with ``"antnode"`` (source path passed to ``cp``)
            and ``"AntNodeVersion"`` (version stored on the node row).

    Returns:
        True when the database row was updated, False when that update
        raised.
    """
    logging.info("Upgrading node " + str(node.id))
    # Copy current node binary
    try:
        # check=True makes a failed copy raise so it is actually reported.
        subprocess.run(
            ["sudo", "cp", "-f", metrics["antnode"], node.binary], check=True
        )
    except subprocess.CalledProcessError as err:
        logging.error("UN1 ERROR: %s", err)
    try:
        subprocess.run(["sudo", "systemctl", "restart", node.service], check=True)
    except subprocess.CalledProcessError as err:
        logging.error("UN2 ERROR: %s", err)
    # NOTE(review): the row below stores metrics["AntNodeVersion"], not this
    # value; confirm which of the two is meant to be recorded.
    version = get_antnode_version(node.binary)
    logging.debug("Node %s binary reports version %s", node.id, version)
    try:
        with S() as session:
            session.query(Node).filter(Node.id == node.id).update(
                {
                    "status": UPGRADING,
                    "timestamp": int(time.time()),
                    "version": metrics["AntNodeVersion"],
                }
            )
            session.commit()
    except Exception:
        # Report the failure instead of swallowing it; the False return
        # value is kept for callers.
        logging.exception("UN3 ERROR: could not record upgrade of node %s", node.id)
        return False
    else:
        return True
|
|
532
641
|
|
|
642
|
+
|
|
533
643
|
# Remove a node
def remove_node(id):
    """Stop a node and delete its data, logs and systemd unit file.

    The node is stopped, marked REMOVING, and then its root directory, its
    log directory and its unit file are deleted, followed by a systemd
    daemon reload. Each failing command is logged and the remaining steps
    still run.

    Args:
        id: Value matched against ``Node.id`` to find the node.

    Returns:
        None. Also returns early, after logging, when no such node exists.
    """
    logging.info("Removing node " + str(id))

    with S() as session:
        row = session.execute(select(Node).where(Node.id == id)).first()
    # .first() yields None when nothing matches; indexing it would raise.
    if row is None:
        logging.error("RN0 ERROR: node %s not found", id)
        return
    # Grab Node from Row
    node = row[0]
    if stop_systemd_node(node):
        # Mark this node as REMOVING
        set_node_status(id, REMOVING)

        nodename = f"antnode{node.nodename}"
        cleanup_steps = (
            # Remove node data and log
            (
                "RN1",
                ["sudo", "rm", "-rf", node.root_dir, f"/var/log/antnode/{nodename}"],
            ),
            # Remove systemd service file
            ("RN2", ["sudo", "rm", "-f", f"/etc/systemd/system/{node.service}"]),
            # Tell system to reload systemd files
            ("RN3", ["sudo", "systemctl", "daemon-reload"]),
        )
        for tag, command in cleanup_steps:
            try:
                # check=True so a failing command reaches the handler.
                subprocess.run(command, check=True)
            except subprocess.CalledProcessError as err:
                logging.error("%s ERROR: %s", tag, err)
|
|
674
|
+
|
|
562
675
|
|
|
563
676
|
# Rescan nodes for status
def update_nodes():
    """Refresh the stored state of every node whose status is not DISABLED.

    Rows are read oldest timestamp first. For each row with an integer
    timestamp the node's metrics and metadata are fetched from its host and
    metrics port, and the result is written back, except when the node is
    reported STOPPED and is already stored as STOPPED.
    """
    query = (
        select(Node.timestamp, Node.id, Node.host, Node.metrics_port, Node.status)
        .where(Node.status != DISABLED)
        .order_by(Node.timestamp.asc())
    )
    with S() as session:
        rows = session.execute(query).all()

    for stamp, node_id, host, metrics_port, stored_status in rows:
        # Only rows carrying an integer timestamp are examined
        if not isinstance(stamp, int):
            continue
        logging.debug("Updating info on node " + str(node_id))
        fetched_metrics = read_node_metrics(host, metrics_port)
        fetched_metadata = read_node_metadata(host, metrics_port)
        if not (fetched_metrics and fetched_metadata):
            continue
        # Nothing to write when a stopped node is already recorded as stopped
        if fetched_metadata["status"] == STOPPED and stored_status == STOPPED:
            continue
        update_node_from_metrics(node_id, fetched_metrics, fetched_metadata)
|
|
696
|
+
|
|
697
|
+
|
|
582
698
|
# Create a new node
|
|
583
|
-
def create_node(config,metrics):
|
|
699
|
+
def create_node(config, metrics):
|
|
584
700
|
logging.info("Creating new node")
|
|
585
701
|
# Create a holding place for the new node
|
|
586
702
|
card = {}
|
|
587
703
|
# Find the next available node number by first looking for holes
|
|
588
|
-
sql = text(
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
704
|
+
sql = text(
|
|
705
|
+
"select n1.id + 1 as id from node n1 "
|
|
706
|
+
+ "left join node n2 on n2.id = n1.id + 1 "
|
|
707
|
+
+ "where n2.id is null "
|
|
708
|
+
+ "and n1.id <> (select max(id) from node) "
|
|
709
|
+
+ "order by n1.id;"
|
|
710
|
+
)
|
|
593
711
|
with S() as session:
|
|
594
712
|
result = session.execute(sql).first()
|
|
595
713
|
if result:
|
|
596
|
-
card[
|
|
714
|
+
card["id"] = result[0]
|
|
597
715
|
# Otherwise get the max node number and add 1
|
|
598
716
|
else:
|
|
599
717
|
with S() as session:
|
|
600
718
|
result = session.execute(select(Node.id).order_by(Node.id.desc())).first()
|
|
601
|
-
card[
|
|
719
|
+
card["id"] = result[0] + 1
|
|
602
720
|
# Set the node name
|
|
603
|
-
card[
|
|
604
|
-
card[
|
|
605
|
-
card[
|
|
606
|
-
card[
|
|
607
|
-
card[
|
|
608
|
-
card[
|
|
609
|
-
card[
|
|
610
|
-
card[
|
|
611
|
-
card[
|
|
612
|
-
card[
|
|
613
|
-
card[
|
|
614
|
-
card[
|
|
615
|
-
card[
|
|
616
|
-
card[
|
|
617
|
-
card[
|
|
618
|
-
card[
|
|
619
|
-
card[
|
|
620
|
-
card[
|
|
621
|
-
log_dir=f"/var/log/antnode/antnode{card['nodename']}"
|
|
721
|
+
card["nodename"] = f"{card['id']:04}"
|
|
722
|
+
card["service"] = f"antnode{card['nodename']}.service"
|
|
723
|
+
card["user"] = "ant"
|
|
724
|
+
card["version"] = metrics["AntNodeVersion"]
|
|
725
|
+
card["root_dir"] = f"{config['NodeStorage']}/antnode{card['nodename']}"
|
|
726
|
+
card["binary"] = f"{card['root_dir']}/antnode"
|
|
727
|
+
card["port"] = config["PortStart"] * 1000 + card["id"]
|
|
728
|
+
card["metrics_port"] = 13 * 1000 + card["id"]
|
|
729
|
+
card["network"] = "evm-arbitrum-one"
|
|
730
|
+
card["wallet"] = config["RewardsAddress"]
|
|
731
|
+
card["peer_id"] = ""
|
|
732
|
+
card["status"] = STOPPED
|
|
733
|
+
card["timestamp"] = int(time.time())
|
|
734
|
+
card["records"] = 0
|
|
735
|
+
card["uptime"] = 0
|
|
736
|
+
card["shunned"] = 0
|
|
737
|
+
card["age"] = card["timestamp"]
|
|
738
|
+
card["host"] = ANM_HOST
|
|
739
|
+
log_dir = f"/var/log/antnode/antnode{card['nodename']}"
|
|
622
740
|
# Create the node directory and log directory
|
|
623
741
|
try:
|
|
624
|
-
subprocess.run(
|
|
742
|
+
subprocess.run(
|
|
743
|
+
["sudo", "mkdir", "-p", card["root_dir"], log_dir], stdout=subprocess.PIPE
|
|
744
|
+
)
|
|
625
745
|
except subprocess.CalledProcessError as err:
|
|
626
|
-
|
|
746
|
+
logging.error("CN1 ERROR:", err)
|
|
627
747
|
# Copy the binary to the node directory
|
|
628
748
|
try:
|
|
629
|
-
subprocess.run(
|
|
749
|
+
subprocess.run(
|
|
750
|
+
["sudo", "cp", metrics["antnode"], card["root_dir"]], stdout=subprocess.PIPE
|
|
751
|
+
)
|
|
630
752
|
except subprocess.CalledProcessError as err:
|
|
631
|
-
|
|
753
|
+
logging.error("CN2 ERROR:", err)
|
|
632
754
|
# Change owner of the node directory and log directories
|
|
633
755
|
try:
|
|
634
|
-
subprocess.run(
|
|
756
|
+
subprocess.run(
|
|
757
|
+
[
|
|
758
|
+
"sudo",
|
|
759
|
+
"chown",
|
|
760
|
+
"-R",
|
|
761
|
+
f'{card["user"]}:{card["user"]}',
|
|
762
|
+
card["root_dir"],
|
|
763
|
+
log_dir,
|
|
764
|
+
],
|
|
765
|
+
stdout=subprocess.PIPE,
|
|
766
|
+
)
|
|
635
767
|
except subprocess.CalledProcessError as err:
|
|
636
|
-
|
|
768
|
+
logging.error("CN3 ERROR:", err)
|
|
637
769
|
# build the systemd service unit
|
|
638
|
-
service=f"""[Unit]
|
|
770
|
+
service = f"""[Unit]
|
|
639
771
|
Description=antnode{card['nodename']}
|
|
640
772
|
[Service]
|
|
641
773
|
User={card['user']}
|
|
@@ -645,130 +777,178 @@ Restart=always
|
|
|
645
777
|
"""
|
|
646
778
|
# Write the systemd service unit with sudo tee since we're running as not root
|
|
647
779
|
try:
|
|
648
|
-
subprocess.run(
|
|
780
|
+
subprocess.run(
|
|
781
|
+
["sudo", "tee", f'/etc/systemd/system/{card["service"]}'],
|
|
782
|
+
input=service,
|
|
783
|
+
text=True,
|
|
784
|
+
stdout=subprocess.PIPE,
|
|
785
|
+
)
|
|
649
786
|
except subprocess.CalledProcessError as err:
|
|
650
|
-
|
|
787
|
+
logging.error("CN4 ERROR:", err)
|
|
651
788
|
# Reload systemd service files to get our new one
|
|
652
789
|
try:
|
|
653
|
-
subprocess.run([
|
|
790
|
+
subprocess.run(["sudo", "systemctl", "daemon-reload"], stdout=subprocess.PIPE)
|
|
654
791
|
except subprocess.CalledProcessError as err:
|
|
655
|
-
|
|
792
|
+
logging.error("CN5 ERROR:", err)
|
|
656
793
|
# Add the new node to the database
|
|
657
794
|
with S() as session:
|
|
658
|
-
session.execute(
|
|
659
|
-
insert(Node),[card]
|
|
660
|
-
)
|
|
795
|
+
session.execute(insert(Node), [card])
|
|
661
796
|
session.commit()
|
|
662
797
|
# Now we grab the node object from the database to pass to start node
|
|
663
798
|
with S() as session:
|
|
664
|
-
card=session.execute(select(Node).where(Node.id == card[
|
|
799
|
+
card = session.execute(select(Node).where(Node.id == card["id"])).first()
|
|
665
800
|
# Get the Node object from the Row
|
|
666
|
-
card=card[0]
|
|
801
|
+
card = card[0]
|
|
667
802
|
# Start the new node
|
|
668
803
|
return start_systemd_node(card)
|
|
669
|
-
#print(json.dumps(card,indent=2))
|
|
804
|
+
# print(json.dumps(card,indent=2))
|
|
670
805
|
return True
|
|
671
|
-
|
|
806
|
+
|
|
672
807
|
|
|
673
808
|
# Make a decision about what to do
|
|
674
|
-
def choose_action(config,metrics,db_nodes):
|
|
809
|
+
def choose_action(config, metrics, db_nodes):
|
|
675
810
|
# Gather knowledge
|
|
676
|
-
features={}
|
|
677
|
-
features["AllowCpu"]=metrics["UsedCpuPercent"] < config["CpuLessThan"]
|
|
678
|
-
features["AllowMem"]=metrics["UsedMemPercent"] < config["MemLessThan"]
|
|
679
|
-
features["AllowHD"]=metrics["UsedHDPercent"] < config["HDLessThan"]
|
|
680
|
-
features["RemCpu"]=metrics["UsedCpuPercent"] > config["CpuRemove"]
|
|
681
|
-
features["RemMem"]=metrics["UsedMemPercent"] > config["MemRemove"]
|
|
682
|
-
features["RemHD"]=metrics["UsedHDPercent"] > config["HDRemove"]
|
|
683
|
-
features["AllowNodeCap"]=metrics["RunningNodes"] < config["NodeCap"]
|
|
811
|
+
features = {}
|
|
812
|
+
features["AllowCpu"] = metrics["UsedCpuPercent"] < config["CpuLessThan"]
|
|
813
|
+
features["AllowMem"] = metrics["UsedMemPercent"] < config["MemLessThan"]
|
|
814
|
+
features["AllowHD"] = metrics["UsedHDPercent"] < config["HDLessThan"]
|
|
815
|
+
features["RemCpu"] = metrics["UsedCpuPercent"] > config["CpuRemove"]
|
|
816
|
+
features["RemMem"] = metrics["UsedMemPercent"] > config["MemRemove"]
|
|
817
|
+
features["RemHD"] = metrics["UsedHDPercent"] > config["HDRemove"]
|
|
818
|
+
features["AllowNodeCap"] = metrics["RunningNodes"] < config["NodeCap"]
|
|
684
819
|
# These are new features, so ignore them if not configured
|
|
685
|
-
if (
|
|
686
|
-
config["
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
820
|
+
if (
|
|
821
|
+
config["NetIOReadLessThan"]
|
|
822
|
+
+ config["NetIOReadRemove"]
|
|
823
|
+
+ config["NetIOWriteLessThan"]
|
|
824
|
+
+ config["NetIOWriteRemove"]
|
|
825
|
+
> 1
|
|
826
|
+
):
|
|
827
|
+
features["AllowNetIO"] = (
|
|
828
|
+
metrics["NetReadBytes"] < config["NetIOReadLessThan"]
|
|
829
|
+
and metrics["NetWriteBytes"] < config["NetIOWriteLessThan"]
|
|
830
|
+
)
|
|
831
|
+
features["RemoveNetIO"] = (
|
|
832
|
+
metrics["NetReadBytes"] > config["NetIORemove"]
|
|
833
|
+
or metrics["NetWriteBytes"] > config["NetIORemove"]
|
|
834
|
+
)
|
|
691
835
|
else:
|
|
692
|
-
features["AllowNetIO"]=True
|
|
693
|
-
features["RemoveNetIO"]=False
|
|
694
|
-
if (
|
|
695
|
-
config["
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
836
|
+
features["AllowNetIO"] = True
|
|
837
|
+
features["RemoveNetIO"] = False
|
|
838
|
+
if (
|
|
839
|
+
config["HDIOReadLessThan"]
|
|
840
|
+
+ config["HDIOReadRemove"]
|
|
841
|
+
+ config["HDIOWriteLessThan"]
|
|
842
|
+
+ config["HDIOWriteRemove"]
|
|
843
|
+
> 1
|
|
844
|
+
):
|
|
845
|
+
features["AllowHDIO"] = (
|
|
846
|
+
metrics["HDReadBytes"] < config["HDIOReadLessThan"]
|
|
847
|
+
and metrics["HDWriteBytes"] < config["HDIOWriteLessThan"]
|
|
848
|
+
)
|
|
849
|
+
features["RemoveHDIO"] = (
|
|
850
|
+
metrics["HDReadBytes"] > config["HDIORemove"]
|
|
851
|
+
or metrics["HDWriteBytes"] > config["HDtIORemove"]
|
|
852
|
+
)
|
|
700
853
|
else:
|
|
701
|
-
features["AllowHDIO"]=True
|
|
702
|
-
features["RemoveHDIO"]=False
|
|
703
|
-
features["LoadAllow"] =
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
854
|
+
features["AllowHDIO"] = True
|
|
855
|
+
features["RemoveHDIO"] = False
|
|
856
|
+
features["LoadAllow"] = (
|
|
857
|
+
metrics["LoadAverage1"] < config["DesiredLoadAverage"]
|
|
858
|
+
and metrics["LoadAverage5"] < config["DesiredLoadAverage"]
|
|
859
|
+
and metrics["LoadAverage15"] < config["DesiredLoadAverage"]
|
|
860
|
+
)
|
|
861
|
+
features["LoadNotAllow"] = (
|
|
862
|
+
metrics["LoadAverage1"] > config["MaxLoadAverageAllowed"]
|
|
863
|
+
or metrics["LoadAverage5"] > config["MaxLoadAverageAllowed"]
|
|
864
|
+
or metrics["LoadAverage15"] > config["MaxLoadAverageAllowed"]
|
|
865
|
+
)
|
|
709
866
|
# Check records for expired status
|
|
710
|
-
metrics=update_counters(metrics,config)
|
|
867
|
+
metrics = update_counters(metrics, config)
|
|
711
868
|
# If we have other things going on, don't add more nodes
|
|
712
|
-
features["AddNewNode"]=
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
869
|
+
features["AddNewNode"] = (
|
|
870
|
+
sum(
|
|
871
|
+
[
|
|
872
|
+
metrics.get(m, 0)
|
|
873
|
+
for m in [
|
|
874
|
+
"UpgradingNodes",
|
|
875
|
+
"RestartingNodes",
|
|
876
|
+
"MigratingNodes",
|
|
877
|
+
"RemovingNodes",
|
|
878
|
+
]
|
|
879
|
+
]
|
|
880
|
+
)
|
|
881
|
+
== 0
|
|
882
|
+
and features["AllowCpu"]
|
|
883
|
+
and features["AllowHD"]
|
|
884
|
+
and features["AllowMem"]
|
|
885
|
+
and features["AllowNodeCap"]
|
|
886
|
+
and features["AllowHDIO"]
|
|
887
|
+
and features["AllowNetIO"]
|
|
888
|
+
and features["LoadAllow"]
|
|
889
|
+
and metrics["TotalNodes"] < config["NodeCap"]
|
|
890
|
+
)
|
|
720
891
|
# Are we overlimit on nodes
|
|
721
|
-
features["Remove"] =
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
892
|
+
features["Remove"] = (
|
|
893
|
+
features["LoadNotAllow"]
|
|
894
|
+
or features["RemCpu"]
|
|
895
|
+
or features["RemHD"]
|
|
896
|
+
or features["RemMem"]
|
|
897
|
+
or features["RemoveHDIO"]
|
|
898
|
+
or features["RemoveNetIO"]
|
|
899
|
+
or metrics["TotalNodes"] > config["NodeCap"]
|
|
900
|
+
)
|
|
725
901
|
# If we have nodes to upgrade
|
|
726
902
|
if metrics["NodesToUpgrade"] >= 1:
|
|
727
903
|
# Make sure current version is equal or newer than version on first node.
|
|
728
904
|
if Version(metrics["AntNodeVersion"]) < Version(db_nodes[0][1]):
|
|
729
905
|
logging.warning("node upgrade cancelled due to lower version")
|
|
730
|
-
features["Upgrade"]=False
|
|
906
|
+
features["Upgrade"] = False
|
|
731
907
|
else:
|
|
732
908
|
if features["Remove"]:
|
|
733
909
|
logging.info("Can't upgrade while removing is required")
|
|
734
|
-
features["Upgrade"]=False
|
|
910
|
+
features["Upgrade"] = False
|
|
735
911
|
else:
|
|
736
|
-
features["Upgrade"]=True
|
|
912
|
+
features["Upgrade"] = True
|
|
737
913
|
else:
|
|
738
|
-
features["Upgrade"]=False
|
|
739
|
-
|
|
914
|
+
features["Upgrade"] = False
|
|
740
915
|
|
|
741
|
-
logging.info(json.dumps(features,indent=2))
|
|
916
|
+
logging.info(json.dumps(features, indent=2))
|
|
742
917
|
##### Decisions
|
|
743
918
|
|
|
744
919
|
# Actually, removing DEAD nodes takes priority
|
|
745
920
|
if metrics["DeadNodes"] > 1:
|
|
746
921
|
with S() as session:
|
|
747
|
-
broken=session.execute(
|
|
748
|
-
|
|
749
|
-
|
|
922
|
+
broken = session.execute(
|
|
923
|
+
select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
|
|
924
|
+
.where(Node.status == DEAD)
|
|
925
|
+
.order_by(Node.timestamp.asc())
|
|
926
|
+
).all()
|
|
750
927
|
# Iterate through dead nodes and remove them all
|
|
751
928
|
for check in broken:
|
|
752
929
|
# Remove broken nodes
|
|
753
|
-
logging.info("Removing dead node "+str(check[1]))
|
|
930
|
+
logging.info("Removing dead node " + str(check[1]))
|
|
754
931
|
remove_node(check[1])
|
|
755
932
|
return {"status": "removed-dead-nodes"}
|
|
756
933
|
# If we have nodes with no version number, update from binary
|
|
757
934
|
if metrics["NodesNoVersion"] > 1:
|
|
758
935
|
with S() as session:
|
|
759
|
-
no_version=session.execute(
|
|
760
|
-
|
|
761
|
-
|
|
936
|
+
no_version = session.execute(
|
|
937
|
+
select(Node.timestamp, Node.id, Node.binary)
|
|
938
|
+
.where(Node.version == "")
|
|
939
|
+
.order_by(Node.timestamp.asc())
|
|
940
|
+
).all()
|
|
762
941
|
# Iterate through nodes with no version number
|
|
763
942
|
for check in no_version:
|
|
764
943
|
# Update version number from binary
|
|
765
|
-
version=get_antnode_version(check[2])
|
|
944
|
+
version = get_antnode_version(check[2])
|
|
766
945
|
logging.info(f"Updating version number for node {check[1]} to {version}")
|
|
767
946
|
with S() as session:
|
|
768
|
-
session.query(Node).filter(Node.id == check[1])
|
|
769
|
-
|
|
947
|
+
session.query(Node).filter(Node.id == check[1]).update(
|
|
948
|
+
{"version": version}
|
|
949
|
+
)
|
|
770
950
|
session.commit()
|
|
771
|
-
|
|
951
|
+
|
|
772
952
|
# If we're restarting, wait patiently as metrics could be skewed
|
|
773
953
|
if metrics["RestartingNodes"]:
|
|
774
954
|
logging.info("Still waiting for RestartDelay")
|
|
@@ -789,65 +969,74 @@ def choose_action(config,metrics,db_nodes):
|
|
|
789
969
|
if metrics["StoppedNodes"] > 0:
|
|
790
970
|
# What is the youngest stopped node
|
|
791
971
|
with S() as session:
|
|
792
|
-
youngest=session.execute(
|
|
793
|
-
|
|
794
|
-
|
|
972
|
+
youngest = session.execute(
|
|
973
|
+
select(Node.id)
|
|
974
|
+
.where(Node.status == STOPPED)
|
|
975
|
+
.order_by(Node.age.desc())
|
|
976
|
+
).first()
|
|
795
977
|
if youngest:
|
|
796
978
|
# Remove the youngest node
|
|
797
979
|
remove_node(youngest[0])
|
|
798
|
-
return{"status": REMOVING}
|
|
980
|
+
return {"status": REMOVING}
|
|
799
981
|
# No low hanging fruit. let's start with the youngest running node
|
|
800
982
|
with S() as session:
|
|
801
|
-
youngest=session.execute(
|
|
802
|
-
|
|
803
|
-
|
|
983
|
+
youngest = session.execute(
|
|
984
|
+
select(Node.id)
|
|
985
|
+
.where(Node.status == RUNNING)
|
|
986
|
+
.order_by(Node.age.desc())
|
|
987
|
+
).first()
|
|
804
988
|
if youngest:
|
|
805
989
|
# Remove the youngest node
|
|
806
990
|
remove_node(youngest[0])
|
|
807
|
-
return{"status": REMOVING}
|
|
808
|
-
return{"status": "nothing-to-remove"}
|
|
991
|
+
return {"status": REMOVING}
|
|
992
|
+
return {"status": "nothing-to-remove"}
|
|
809
993
|
# Otherwise, let's try just stopping a node to bring IO/Mem/Cpu down
|
|
810
994
|
else:
|
|
811
995
|
# If we just stopped a node, wait
|
|
812
|
-
if int(config["LastStoppedAt"] or 0) > (
|
|
996
|
+
if int(config["LastStoppedAt"] or 0) > (
|
|
997
|
+
int(time.time()) - (config["DelayRemove"] * 60)
|
|
998
|
+
):
|
|
813
999
|
logging.info("Still waiting for RemoveDelay")
|
|
814
|
-
return {"status":
|
|
1000
|
+
return {"status": "waiting-to-stop"}
|
|
815
1001
|
# Start with the youngest running node
|
|
816
1002
|
with S() as session:
|
|
817
|
-
youngest=session.execute(
|
|
818
|
-
|
|
819
|
-
|
|
1003
|
+
youngest = session.execute(
|
|
1004
|
+
select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
|
|
1005
|
+
).first()
|
|
820
1006
|
if youngest:
|
|
821
1007
|
# Stop the youngest node
|
|
822
1008
|
stop_systemd_node(youngest[0])
|
|
823
1009
|
# Update the last stopped time
|
|
824
1010
|
with S() as session:
|
|
825
|
-
session.query(Machine).filter(Machine.id == 1)
|
|
826
|
-
|
|
1011
|
+
session.query(Machine).filter(Machine.id == 1).update(
|
|
1012
|
+
{"LastStoppedAt": int(time.time())}
|
|
1013
|
+
)
|
|
827
1014
|
session.commit()
|
|
828
|
-
return{"status": STOPPED}
|
|
1015
|
+
return {"status": STOPPED}
|
|
829
1016
|
else:
|
|
830
|
-
return{"status": "nothing-to-stop"}
|
|
831
|
-
|
|
1017
|
+
return {"status": "nothing-to-stop"}
|
|
1018
|
+
|
|
832
1019
|
# Do we have upgrading to do?
|
|
833
|
-
if features["Upgrade"]:
|
|
1020
|
+
if features["Upgrade"]:
|
|
834
1021
|
# Let's find the oldest running node not using the current version
|
|
835
1022
|
with S() as session:
|
|
836
|
-
oldest=session.execute(
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
1023
|
+
oldest = session.execute(
|
|
1024
|
+
select(Node)
|
|
1025
|
+
.where(Node.status == RUNNING)
|
|
1026
|
+
.where(Node.version != metrics["AntNodeVersion"])
|
|
1027
|
+
.order_by(Node.age.asc())
|
|
1028
|
+
).first()
|
|
840
1029
|
if oldest:
|
|
841
1030
|
# Get Node from Row
|
|
842
1031
|
oldest = oldest[0]
|
|
843
1032
|
# If we don't have a version number from metadata, grab from binary
|
|
844
1033
|
if not oldest.version:
|
|
845
|
-
oldest.version=get_antnode_version(oldest.binary)
|
|
846
|
-
#print(json.dumps(oldest))
|
|
1034
|
+
oldest.version = get_antnode_version(oldest.binary)
|
|
1035
|
+
# print(json.dumps(oldest))
|
|
847
1036
|
# Upgrade the oldest node
|
|
848
|
-
upgrade_node(oldest,metrics)
|
|
849
|
-
return{"status": UPGRADING}
|
|
850
|
-
|
|
1037
|
+
upgrade_node(oldest, metrics)
|
|
1038
|
+
return {"status": UPGRADING}
|
|
1039
|
+
|
|
851
1040
|
# If AddNewNode
|
|
852
1041
|
# If stopped nodes available
|
|
853
1042
|
# Check oldest stopped version
|
|
@@ -862,29 +1051,29 @@ def choose_action(config,metrics,db_nodes):
|
|
|
862
1051
|
if metrics["StoppedNodes"] > 0:
|
|
863
1052
|
# What is the oldest stopped node
|
|
864
1053
|
with S() as session:
|
|
865
|
-
oldest=session.execute(
|
|
866
|
-
|
|
867
|
-
|
|
1054
|
+
oldest = session.execute(
|
|
1055
|
+
select(Node).where(Node.status == STOPPED).order_by(Node.age.asc())
|
|
1056
|
+
).first()
|
|
868
1057
|
if oldest:
|
|
869
1058
|
# Get Node from Row
|
|
870
|
-
oldest=oldest[0]
|
|
1059
|
+
oldest = oldest[0]
|
|
871
1060
|
# If we don't have a version number from metadata, grab from binary
|
|
872
1061
|
if not oldest.version:
|
|
873
|
-
oldest.version=get_antnode_version(oldest.binary)
|
|
1062
|
+
oldest.version = get_antnode_version(oldest.binary)
|
|
874
1063
|
# If the stopped version is old, upgrade it
|
|
875
1064
|
if Version(metrics["AntNodeVersion"]) > Version(oldest.version):
|
|
876
|
-
upgrade_node(oldest,metrics)
|
|
877
|
-
return{"status": UPGRADING}
|
|
1065
|
+
upgrade_node(oldest, metrics)
|
|
1066
|
+
return {"status": UPGRADING}
|
|
878
1067
|
else:
|
|
879
1068
|
if start_systemd_node(oldest):
|
|
880
|
-
return{"status": RESTARTING}
|
|
1069
|
+
return {"status": RESTARTING}
|
|
881
1070
|
else:
|
|
882
|
-
return{"status": "failed-start-node"}
|
|
1071
|
+
return {"status": "failed-start-node"}
|
|
883
1072
|
# Hmm, still in Start mode, we shouldn't get here
|
|
884
|
-
return {"status":
|
|
1073
|
+
return {"status": "START"}
|
|
885
1074
|
# Still in Add mode, add a new node
|
|
886
1075
|
if metrics["TotalNodes"] < config["NodeCap"]:
|
|
887
|
-
if create_node(config,metrics):
|
|
1076
|
+
if create_node(config, metrics):
|
|
888
1077
|
return {"status": "ADD"}
|
|
889
1078
|
else:
|
|
890
1079
|
return {"status": "failed-create-node"}
|
|
@@ -892,12 +1081,13 @@ def choose_action(config,metrics,db_nodes):
|
|
|
892
1081
|
return {"status": "node-cap-reached"}
|
|
893
1082
|
# If we have nothing to do, Survey the node ports
|
|
894
1083
|
update_nodes()
|
|
895
|
-
return{"status": "idle"}
|
|
1084
|
+
return {"status": "idle"}
|
|
1085
|
+
|
|
896
1086
|
|
|
897
1087
|
def main():
|
|
898
1088
|
# We're starting, so let's create a lock file
|
|
899
1089
|
try:
|
|
900
|
-
with open(
|
|
1090
|
+
with open("/var/antctl/wnm_active", "w") as file:
|
|
901
1091
|
file.write(str(int(time.time())))
|
|
902
1092
|
except:
|
|
903
1093
|
logging.error("Unable to create lock file, exiting")
|
|
@@ -905,78 +1095,90 @@ def main():
|
|
|
905
1095
|
|
|
906
1096
|
# See if we already have a known state in the database
|
|
907
1097
|
with S() as session:
|
|
908
|
-
db_nodes=session.execute(
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
1098
|
+
db_nodes = session.execute(
|
|
1099
|
+
select(
|
|
1100
|
+
Node.status,
|
|
1101
|
+
Node.version,
|
|
1102
|
+
Node.host,
|
|
1103
|
+
Node.metrics_port,
|
|
1104
|
+
Node.port,
|
|
1105
|
+
Node.age,
|
|
1106
|
+
Node.id,
|
|
1107
|
+
Node.timestamp,
|
|
1108
|
+
)
|
|
1109
|
+
).all()
|
|
1110
|
+
anm_config = session.execute(select(Machine)).all()
|
|
913
1111
|
|
|
914
1112
|
if db_nodes:
|
|
915
|
-
# anm_config by default loads a parameter array,
|
|
1113
|
+
# anm_config by default loads a parameter array,
|
|
916
1114
|
# use the __json__ method to return a dict from the first node
|
|
917
1115
|
anm_config = json.loads(json.dumps(anm_config[0][0])) or load_anm_config()
|
|
918
|
-
metrics=get_machine_metrics(anm_config["NodeStorage"],anm_config["HDRemove"])
|
|
919
|
-
#node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
|
|
920
|
-
#print(db_nodes[0])
|
|
921
|
-
#print(node_metrics)
|
|
922
|
-
#print(anm_config)
|
|
923
|
-
#print(json.dumps(anm_config,indent=4))
|
|
924
|
-
#print("Node: ",db_nodes)
|
|
1116
|
+
metrics = get_machine_metrics(anm_config["NodeStorage"], anm_config["HDRemove"])
|
|
1117
|
+
# node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
|
|
1118
|
+
# print(db_nodes[0])
|
|
1119
|
+
# print(node_metrics)
|
|
1120
|
+
# print(anm_config)
|
|
1121
|
+
# print(json.dumps(anm_config,indent=4))
|
|
1122
|
+
# print("Node: ",db_nodes)
|
|
925
1123
|
logging.info("Found {counter} nodes migrated".format(counter=len(db_nodes)))
|
|
926
1124
|
|
|
927
1125
|
else:
|
|
928
1126
|
anm_config = load_anm_config()
|
|
929
|
-
#print(anm_config)
|
|
1127
|
+
# print(anm_config)
|
|
930
1128
|
Workers = survey_machine() or []
|
|
931
1129
|
|
|
932
|
-
#""""
|
|
1130
|
+
# """"
|
|
933
1131
|
with S() as session:
|
|
934
|
-
session.execute(
|
|
935
|
-
insert(Node),Workers
|
|
936
|
-
)
|
|
1132
|
+
session.execute(insert(Node), Workers)
|
|
937
1133
|
session.commit()
|
|
938
|
-
#"""
|
|
1134
|
+
# """
|
|
939
1135
|
|
|
940
1136
|
with S() as session:
|
|
941
|
-
session.execute(
|
|
942
|
-
insert(Machine),[anm_config]
|
|
943
|
-
)
|
|
1137
|
+
session.execute(insert(Machine), [anm_config])
|
|
944
1138
|
session.commit()
|
|
945
1139
|
|
|
946
1140
|
# Now load subset of data to work with
|
|
947
1141
|
with S() as session:
|
|
948
|
-
db_nodes=session.execute(
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
1142
|
+
db_nodes = session.execute(
|
|
1143
|
+
select(
|
|
1144
|
+
Node.status,
|
|
1145
|
+
Node.version,
|
|
1146
|
+
Node.host,
|
|
1147
|
+
Node.metrics_port,
|
|
1148
|
+
Node.port,
|
|
1149
|
+
Node.age,
|
|
1150
|
+
Node.id,
|
|
1151
|
+
Node.timestamp,
|
|
1152
|
+
)
|
|
1153
|
+
).all()
|
|
952
1154
|
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
#print(json.dumps(anm_config,indent=4))
|
|
1155
|
+
# print(json.dumps(anm_config,indent=4))
|
|
956
1156
|
logging.info("Found {counter} nodes configured".format(counter=len(db_nodes)))
|
|
957
1157
|
|
|
958
|
-
#versions = [v[1] for worker in Workers if (v := worker.get('version'))]
|
|
959
|
-
#data = Counter(ver for ver in versions)
|
|
960
|
-
|
|
1158
|
+
# versions = [v[1] for worker in Workers if (v := worker.get('version'))]
|
|
1159
|
+
# data = Counter(ver for ver in versions)
|
|
961
1160
|
|
|
962
1161
|
data = Counter(status[0] for status in db_nodes)
|
|
963
|
-
#print(data)
|
|
964
|
-
print("Running Nodes:",data[RUNNING])
|
|
965
|
-
print("Restarting Nodes:",data[RESTARTING])
|
|
966
|
-
print("Stopped Nodes:",data[STOPPED])
|
|
967
|
-
print("Upgrading Nodes:",data[UPGRADING])
|
|
968
|
-
print("Removing Nodes:",data[REMOVING])
|
|
1162
|
+
# print(data)
|
|
1163
|
+
print("Running Nodes:", data[RUNNING])
|
|
1164
|
+
print("Restarting Nodes:", data[RESTARTING])
|
|
1165
|
+
print("Stopped Nodes:", data[STOPPED])
|
|
1166
|
+
print("Upgrading Nodes:", data[UPGRADING])
|
|
1167
|
+
print("Removing Nodes:", data[REMOVING])
|
|
969
1168
|
data = Counter(ver[1] for ver in db_nodes)
|
|
970
|
-
print("Versions:",data)
|
|
1169
|
+
print("Versions:", data)
|
|
971
1170
|
|
|
972
|
-
machine_metrics = get_machine_metrics(
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
print(
|
|
1171
|
+
machine_metrics = get_machine_metrics(
|
|
1172
|
+
anm_config["NodeStorage"], anm_config["HDRemove"]
|
|
1173
|
+
)
|
|
1174
|
+
print(json.dumps(anm_config, indent=2))
|
|
1175
|
+
print(json.dumps(machine_metrics, indent=2))
|
|
1176
|
+
this_action = choose_action(anm_config, machine_metrics, db_nodes)
|
|
1177
|
+
print("Action:", json.dumps(this_action, indent=2))
|
|
977
1178
|
# Remove lock file
|
|
978
1179
|
os.remove("/var/antctl/wnm_active")
|
|
979
1180
|
|
|
1181
|
+
|
|
980
1182
|
if __name__ == "__main__":
|
|
981
1183
|
main()
|
|
982
1184
|
|