wnm 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wnm might be problematic. Click here for more details.
- wnm/__init__.py +1 -1
- wnm/__main__.py +184 -1133
- wnm/actions.py +45 -0
- wnm/common.py +21 -0
- wnm/config.py +653 -1
- wnm/decision_engine.py +388 -0
- wnm/executor.py +1292 -0
- wnm/firewall/__init__.py +13 -0
- wnm/firewall/base.py +71 -0
- wnm/firewall/factory.py +95 -0
- wnm/firewall/null_firewall.py +71 -0
- wnm/firewall/ufw_manager.py +118 -0
- wnm/migration.py +42 -0
- wnm/models.py +305 -126
- wnm/process_managers/__init__.py +23 -0
- wnm/process_managers/base.py +203 -0
- wnm/process_managers/docker_manager.py +371 -0
- wnm/process_managers/factory.py +83 -0
- wnm/process_managers/launchd_manager.py +592 -0
- wnm/process_managers/setsid_manager.py +340 -0
- wnm/process_managers/systemd_manager.py +529 -0
- wnm/reports.py +286 -0
- wnm/utils.py +403 -0
- wnm-0.0.11.dist-info/METADATA +316 -0
- wnm-0.0.11.dist-info/RECORD +28 -0
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/WHEEL +1 -1
- wnm-0.0.9.dist-info/METADATA +0 -95
- wnm-0.0.9.dist-info/RECORD +0 -9
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/entry_points.txt +0 -0
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/top_level.txt +0 -0
wnm/__main__.py
CHANGED
|
@@ -1,1185 +1,236 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
|
-
import re
|
|
5
|
-
import shutil
|
|
6
|
-
import subprocess
|
|
7
4
|
import sys
|
|
8
5
|
import time
|
|
9
|
-
from collections import Counter
|
|
10
6
|
|
|
11
|
-
import psutil
|
|
12
|
-
import requests
|
|
13
|
-
from dotenv import load_dotenv
|
|
14
|
-
from packaging.version import Version
|
|
15
|
-
from sqlalchemy import create_engine, delete, insert, select, text, update
|
|
16
|
-
from sqlalchemy.orm import scoped_session, sessionmaker
|
|
7
|
+
from sqlalchemy import insert, select
|
|
17
8
|
|
|
18
|
-
from wnm.
|
|
9
|
+
from wnm.config import (
|
|
10
|
+
LOCK_FILE,
|
|
11
|
+
S,
|
|
12
|
+
apply_config_updates,
|
|
13
|
+
config_updates,
|
|
14
|
+
machine_config,
|
|
15
|
+
options,
|
|
16
|
+
)
|
|
17
|
+
from wnm.decision_engine import DecisionEngine
|
|
18
|
+
from wnm.executor import ActionExecutor
|
|
19
|
+
from wnm.migration import survey_machine
|
|
20
|
+
from wnm.models import Node
|
|
21
|
+
from wnm.utils import (
|
|
22
|
+
get_antnode_version,
|
|
23
|
+
get_machine_metrics,
|
|
24
|
+
update_counters,
|
|
25
|
+
)
|
|
19
26
|
|
|
20
27
|
logging.basicConfig(level=logging.INFO)
|
|
21
28
|
# Info level logging for sqlalchemy is too verbose, only use when needed
|
|
22
29
|
logging.getLogger("sqlalchemy.engine.Engine").disabled = True
|
|
23
30
|
|
|
24
|
-
# import .env
|
|
25
|
-
basedir = os.path.abspath(os.path.dirname(__file__))
|
|
26
|
-
load_dotenv(os.path.join(basedir, ".env"))
|
|
27
|
-
|
|
28
|
-
# simulate arg/yaml configuration
|
|
29
|
-
config = {}
|
|
30
|
-
config["db"] = "sqlite:///colony.db"
|
|
31
|
-
config["DonateAddress"] = (
|
|
32
|
-
os.getenv("DonateAddress") or "0x00455d78f850b0358E8cea5be24d415E01E107CF"
|
|
33
|
-
)
|
|
34
|
-
config["ANMHost"] = os.getenv("ANMHost") or "127.0.0.1"
|
|
35
|
-
config["CrisisBytes"] = os.getenv("CrisisBytes") or 2 * 10**9 # default 2gb/node
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# Setup Database engine
|
|
39
|
-
engine = create_engine(config["db"], echo=True)
|
|
40
|
-
|
|
41
|
-
# Generate ORM
|
|
42
|
-
Base.metadata.create_all(engine)
|
|
43
|
-
|
|
44
|
-
# Create a connection to the ORM
|
|
45
|
-
session_factory = sessionmaker(bind=engine)
|
|
46
|
-
S = scoped_session(session_factory)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
# if WNM_CONFIG or -c parameter are set, check for existing config
|
|
50
|
-
# else:
|
|
51
|
-
|
|
52
|
-
# Primary node for want of one
|
|
53
|
-
QUEEN = 1
|
|
54
|
-
|
|
55
|
-
# Donation address
|
|
56
|
-
DONATE = config["DonateAddress"]
|
|
57
|
-
# Keep these as strings so they can be grepped in logs
|
|
58
|
-
STOPPED = "STOPPED" # 0 Node is not responding to it's metrics port
|
|
59
|
-
RUNNING = "RUNNING" # 1 Node is responding to it's metrics port
|
|
60
|
-
UPGRADING = "UPGRADING" # 2 Upgrade in progress
|
|
61
|
-
DISABLED = "DISABLED" # -1 Do not start
|
|
62
|
-
RESTARTING = "RESTARTING" # 3 re/starting a server intionally
|
|
63
|
-
MIGRATING = "MIGRATING" # 4 Moving volumes in progress
|
|
64
|
-
REMOVING = "REMOVING" # 5 Removing node in progress
|
|
65
|
-
DEAD = "DEAD" # -86 Broken node to cleanup
|
|
66
|
-
|
|
67
|
-
ANM_HOST = config["ANMHost"]
|
|
68
|
-
# Baseline bytes per node
|
|
69
|
-
CRISIS_BYTES = config["CrisisBytes"]
|
|
70
31
|
|
|
71
32
|
# A storage place for ant node data
|
|
72
33
|
Workers = []
|
|
73
34
|
|
|
74
|
-
# Detect ANM
|
|
75
|
-
if os.path.exists("/var/antctl/system"):
|
|
76
|
-
# Is anm scheduled to run
|
|
77
|
-
if os.path.exists("/etc/cron.d/anm"):
|
|
78
|
-
# remove cron to disable old anm
|
|
79
|
-
try:
|
|
80
|
-
subprocess.run(["sudo", "rm", "/etc/cron.d/anm"])
|
|
81
|
-
except Exception as error:
|
|
82
|
-
template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
83
|
-
message = template.format(type(error).__name__, error.args)
|
|
84
|
-
logging.info(message)
|
|
85
|
-
sys.exit(1)
|
|
86
|
-
os.remove("/etc/cron.d/anm")
|
|
87
|
-
# Is anm sitll running? We'll wait
|
|
88
|
-
if os.path.exists("/var/antctl/block"):
|
|
89
|
-
logging.info("anm still running, waiting...")
|
|
90
|
-
sys.exit(1)
|
|
91
|
-
|
|
92
|
-
# Are we already running
|
|
93
|
-
if os.path.exists("/var/antctl/wnm_active"):
|
|
94
|
-
logging.info("wnm still running")
|
|
95
|
-
sys.exit(1)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
# Get anm configuration
|
|
99
|
-
def load_anm_config():
|
|
100
|
-
anm_config = {}
|
|
101
|
-
|
|
102
|
-
# Let's get the real count of CPU's available to this process
|
|
103
|
-
anm_config["CpuCount"] = len(os.sched_getaffinity(0))
|
|
104
|
-
|
|
105
|
-
# What can we save from /var/antctl/config
|
|
106
|
-
if os.path.exists("/var/antctl/config"):
|
|
107
|
-
load_dotenv("/var/antctl/config")
|
|
108
|
-
anm_config["NodeCap"] = int(os.getenv("NodeCap") or 20)
|
|
109
|
-
anm_config["CpuLessThan"] = int(os.getenv("CpuLessThan") or 50)
|
|
110
|
-
anm_config["CpuRemove"] = int(os.getenv("CpuRemove") or 70)
|
|
111
|
-
anm_config["MemLessThan"] = int(os.getenv("MemLessThan") or 70)
|
|
112
|
-
anm_config["MemRemove"] = int(os.getenv("MemRemove") or 90)
|
|
113
|
-
anm_config["HDLessThan"] = int(os.getenv("HDLessThan") or 70)
|
|
114
|
-
anm_config["HDRemove"] = int(os.getenv("HDRemove") or 90)
|
|
115
|
-
anm_config["DelayStart"] = int(os.getenv("DelayStart") or 5)
|
|
116
|
-
anm_config["DelayUpgrade"] = int(os.getenv("DelayUpgrade") or 5)
|
|
117
|
-
anm_config["DelayRestart"] = int(os.getenv("DelayRestart") or 10)
|
|
118
|
-
anm_config["DelayRemove"] = int(os.getenv("DelayRemove") or 300)
|
|
119
|
-
anm_config["NodeStorage"] = os.getenv("NodeStorage") or "/var/antctl/services"
|
|
120
|
-
# Default to the faucet donation address
|
|
121
|
-
try:
|
|
122
|
-
anm_config["RewardsAddress"] = re.findall(
|
|
123
|
-
r"--rewards-address ([\dA-Fa-fXx]+)", os.getenv("RewardsAddress")
|
|
124
|
-
)[0]
|
|
125
|
-
except:
|
|
126
|
-
try:
|
|
127
|
-
anm_config["RewardsAddress"] = re.findall(
|
|
128
|
-
r"([\dA-Fa-fXx]+)", os.getenv("RewardsAddress")
|
|
129
|
-
)[0]
|
|
130
|
-
except:
|
|
131
|
-
logging.warning("Unable to detect RewardsAddress")
|
|
132
|
-
sys.exit(1)
|
|
133
|
-
anm_config["DonateAddress"] = os.getenv("DonateAddress") or DONATE
|
|
134
|
-
anm_config["MaxLoadAverageAllowed"] = float(
|
|
135
|
-
os.getenv("MaxLoadAverageAllowed") or anm_config["CpuCount"]
|
|
136
|
-
)
|
|
137
|
-
anm_config["DesiredLoadAverage"] = float(
|
|
138
|
-
os.getenv("DesiredLoadAverage") or (anm_config["CpuCount"] * 0.6)
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
try:
|
|
142
|
-
with open("/usr/bin/anms.sh", "r") as file:
|
|
143
|
-
data = file.read()
|
|
144
|
-
anm_config["PortStart"] = int(re.findall(r"ntpr\=(\d+)", data)[0])
|
|
145
|
-
except:
|
|
146
|
-
anm_config["PortStart"] = 55
|
|
147
|
-
|
|
148
|
-
anm_config["HDIOReadLessThan"] = float(os.getenv("HDIOReadLessThan") or 0.0)
|
|
149
|
-
anm_config["HDIOReadRemove"] = float(os.getenv("HDIOReadRemove") or 0.0)
|
|
150
|
-
anm_config["HDIOWriteLessThan"] = float(os.getenv("HDIOWriteLessThan") or 0.0)
|
|
151
|
-
anm_config["HDIOWriteRemove"] = float(os.getenv("HDIOWriteRemove") or 0.0)
|
|
152
|
-
anm_config["NetIOReadLessThan"] = float(os.getenv("NetIOReadLessThan") or 0.0)
|
|
153
|
-
anm_config["NetIOReadRemove"] = float(os.getenv("NetIOReadRemove") or 0.0)
|
|
154
|
-
anm_config["NetIOWriteLessThan"] = float(os.getenv("NetIOWriteLessThan") or 0.0)
|
|
155
|
-
anm_config["NetIOWriteRemove"] = float(os.getenv("NetIOWriteRemove") or 0.0)
|
|
156
|
-
# Timer for last stopped nodes
|
|
157
|
-
anm_config["LastStoppedAt"] = 0
|
|
158
|
-
|
|
159
|
-
return anm_config
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
# Read confirm from systemd service file
|
|
163
|
-
def read_systemd_service(antnode):
|
|
164
|
-
details = {}
|
|
165
|
-
try:
|
|
166
|
-
with open("/etc/systemd/system/" + antnode, "r") as file:
|
|
167
|
-
data = file.read()
|
|
168
|
-
details["id"] = int(re.findall(r"antnode(\d+)", antnode)[0])
|
|
169
|
-
details["binary"] = re.findall(r"ExecStart=([^ ]+)", data)[0]
|
|
170
|
-
details["user"] = re.findall(r"User=(\w+)", data)[0]
|
|
171
|
-
details["root_dir"] = re.findall(r"--root-dir ([\w\/]+)", data)[0]
|
|
172
|
-
details["port"] = int(re.findall(r"--port (\d+)", data)[0])
|
|
173
|
-
details["metrics_port"] = int(
|
|
174
|
-
re.findall(r"--metrics-server-port (\d+)", data)[0]
|
|
175
|
-
)
|
|
176
|
-
details["wallet"] = re.findall(r"--rewards-address ([^ ]+)", data)[0]
|
|
177
|
-
details["network"] = re.findall(r"--rewards-address [^ ]+ ([\w\-]+)", data)[0]
|
|
178
|
-
except:
|
|
179
|
-
pass
|
|
180
|
-
|
|
181
|
-
return details
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
# Read data from metadata endpoint
|
|
185
|
-
def read_node_metadata(host, port):
|
|
186
|
-
# Only return version number when we have one, to stop clobbering the binary check
|
|
187
|
-
try:
|
|
188
|
-
url = "http://{0}:{1}/metadata".format(host, port)
|
|
189
|
-
response = requests.get(url)
|
|
190
|
-
data = response.text
|
|
191
|
-
except requests.exceptions.ConnectionError:
|
|
192
|
-
logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
|
|
193
|
-
return {"status": STOPPED, "peer_id": ""}
|
|
194
|
-
except Exception as error:
|
|
195
|
-
template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
196
|
-
message = template.format(type(error).__name__, error.args)
|
|
197
|
-
logging.info(message)
|
|
198
|
-
return {"status": STOPPED, "peer_id": ""}
|
|
199
|
-
# collect a dict to return
|
|
200
|
-
card = {}
|
|
201
|
-
try:
|
|
202
|
-
card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}', data)[0]
|
|
203
|
-
except:
|
|
204
|
-
logging.info("No version found")
|
|
205
|
-
try:
|
|
206
|
-
card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}', data)[0]
|
|
207
|
-
except:
|
|
208
|
-
card["peer_id"] = ""
|
|
209
|
-
card["status"] = RUNNING if "version" in card else STOPPED
|
|
210
|
-
return card
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
# Read data from metrics port
|
|
214
|
-
def read_node_metrics(host, port):
|
|
215
|
-
metrics = {}
|
|
216
|
-
try:
|
|
217
|
-
url = "http://{0}:{1}/metrics".format(host, port)
|
|
218
|
-
response = requests.get(url)
|
|
219
|
-
metrics["status"] = RUNNING
|
|
220
|
-
metrics["uptime"] = int(
|
|
221
|
-
(re.findall(r"ant_node_uptime ([\d]+)", response.text) or [0])[0]
|
|
222
|
-
)
|
|
223
|
-
metrics["records"] = int(
|
|
224
|
-
(
|
|
225
|
-
re.findall(r"ant_networking_records_stored ([\d]+)", response.text)
|
|
226
|
-
or [0]
|
|
227
|
-
)[0]
|
|
228
|
-
)
|
|
229
|
-
metrics["shunned"] = int(
|
|
230
|
-
(
|
|
231
|
-
re.findall(
|
|
232
|
-
r"ant_networking_shunned_by_close_group ([\d]+)", response.text
|
|
233
|
-
)
|
|
234
|
-
or [0]
|
|
235
|
-
)[0]
|
|
236
|
-
)
|
|
237
|
-
except requests.exceptions.ConnectionError:
|
|
238
|
-
logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
|
|
239
|
-
metrics["status"] = STOPPED
|
|
240
|
-
metrics["uptime"] = 0
|
|
241
|
-
metrics["records"] = 0
|
|
242
|
-
metrics["shunned"] = 0
|
|
243
|
-
except Exception as error:
|
|
244
|
-
template = "in:RNM - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
245
|
-
message = template.format(type(error).__name__, error.args)
|
|
246
|
-
logging.info(message)
|
|
247
|
-
metrics["status"] = STOPPED
|
|
248
|
-
metrics["uptime"] = 0
|
|
249
|
-
metrics["records"] = 0
|
|
250
|
-
metrics["shunned"] = 0
|
|
251
|
-
return metrics
|
|
35
|
+
# Detect ANM
|
|
252
36
|
|
|
253
37
|
|
|
254
|
-
#
|
|
255
|
-
def get_antnode_version(binary):
|
|
256
|
-
try:
|
|
257
|
-
data = subprocess.run(
|
|
258
|
-
[binary, "--version"], stdout=subprocess.PIPE
|
|
259
|
-
).stdout.decode("utf-8")
|
|
260
|
-
return re.findall(r"Autonomi Node v([\d\.]+)", data)[0]
|
|
261
|
-
except Exception as error:
|
|
262
|
-
template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
263
|
-
message = template.format(type(error).__name__, error.args)
|
|
264
|
-
logging.info(message)
|
|
265
|
-
return 0
|
|
38
|
+
# Make a decision about what to do (new implementation using DecisionEngine)
|
|
39
|
+
def choose_action(machine_config, metrics, dry_run):
|
|
40
|
+
"""Plan and execute actions using DecisionEngine and ActionExecutor.
|
|
266
41
|
|
|
42
|
+
This function now acts as a thin wrapper around the new decision engine
|
|
43
|
+
and action executor classes.
|
|
267
44
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
except:
|
|
273
|
-
return 0
|
|
45
|
+
Args:
|
|
46
|
+
machine_config: Machine configuration dictionary
|
|
47
|
+
metrics: Current system metrics
|
|
48
|
+
dry_run: If True, log actions without executing
|
|
274
49
|
|
|
50
|
+
Returns:
|
|
51
|
+
Dictionary with execution status
|
|
52
|
+
"""
|
|
53
|
+
# Check records for expired status (must be done before planning)
|
|
54
|
+
if not dry_run:
|
|
55
|
+
metrics = update_counters(S, metrics, machine_config)
|
|
275
56
|
|
|
276
|
-
#
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
# Iterate on nodes
|
|
281
|
-
for node in antnodes:
|
|
282
|
-
# Initialize a dict
|
|
283
|
-
logging.debug(
|
|
284
|
-
"{0} surveying node {1} ".format(time.strftime("%Y-%m-%d %H:%M"), node)
|
|
285
|
-
)
|
|
286
|
-
if not re.findall(r"antnode([\d]+).service", node):
|
|
287
|
-
logging.info("can't decode " + str(node))
|
|
288
|
-
continue
|
|
289
|
-
card = {
|
|
290
|
-
"nodename": re.findall(r"antnode([\d]+).service", node)[0],
|
|
291
|
-
"service": node,
|
|
292
|
-
"timestamp": int(time.time()),
|
|
293
|
-
"host": ANM_HOST or "127.0.0.1",
|
|
294
|
-
}
|
|
295
|
-
# Load what systemd has configured
|
|
296
|
-
card.update(read_systemd_service(node))
|
|
297
|
-
# print(json.dumps(card,indent=2))
|
|
298
|
-
# Read metadata from metrics_port
|
|
299
|
-
metadata = read_node_metadata(card["host"], card["metrics_port"])
|
|
300
|
-
# print(json.dumps(metadata,indent=2))
|
|
301
|
-
if (
|
|
302
|
-
isinstance(metadata, dict)
|
|
303
|
-
and "status" in metadata
|
|
304
|
-
and metadata["status"] == RUNNING
|
|
305
|
-
):
|
|
306
|
-
# soak up metadata
|
|
307
|
-
card.update(metadata)
|
|
308
|
-
# The ports up, so grab metrics too
|
|
309
|
-
card.update(read_node_metrics(card["host"], card["metrics_port"]))
|
|
310
|
-
# Else run binary to get version
|
|
57
|
+
# Handle nodes with no version number (done before planning)
|
|
58
|
+
if metrics["nodes_no_version"] > 0:
|
|
59
|
+
if dry_run:
|
|
60
|
+
logging.warning("DRYRUN: Update NoVersion nodes")
|
|
311
61
|
else:
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
card["host"] = ANM_HOST
|
|
326
|
-
# Append the node dict to the detail list
|
|
327
|
-
details.append(card)
|
|
328
|
-
|
|
329
|
-
return details
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
# Survey server instance
|
|
333
|
-
def survey_machine():
|
|
334
|
-
# Make a bucket
|
|
335
|
-
antnodes = []
|
|
336
|
-
# For all service files
|
|
337
|
-
for file in os.listdir("/etc/systemd/system"):
|
|
338
|
-
# Find antnodes
|
|
339
|
-
if re.match(r"antnode[\d]+\.service", file):
|
|
340
|
-
antnodes.append(file)
|
|
341
|
-
# if len(antnodes)>=5:
|
|
342
|
-
# break
|
|
343
|
-
# Iterate over defined nodes and get details
|
|
344
|
-
# Ingests a list of service files and outputs a list of dictionaries
|
|
345
|
-
return survey_anm_nodes(antnodes)
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
# Read system status
|
|
349
|
-
def get_machine_metrics(node_storage, remove_limit):
|
|
350
|
-
metrics = {}
|
|
351
|
-
|
|
352
|
-
with S() as session:
|
|
353
|
-
db_nodes = session.execute(select(Node.status, Node.version)).all()
|
|
354
|
-
|
|
355
|
-
# Get some initial stats for comparing after a few seconds
|
|
356
|
-
# We start these counters AFTER reading the database
|
|
357
|
-
start_time = time.time()
|
|
358
|
-
start_disk_counters = psutil.disk_io_counters()
|
|
359
|
-
start_net_counters = psutil.net_io_counters()
|
|
360
|
-
|
|
361
|
-
metrics["TotalNodes"] = len(db_nodes)
|
|
362
|
-
data = Counter(node[0] for node in db_nodes)
|
|
363
|
-
metrics["RunningNodes"] = data[RUNNING]
|
|
364
|
-
metrics["StoppedNodes"] = data[STOPPED]
|
|
365
|
-
metrics["RestartingNodes"] = data[RESTARTING]
|
|
366
|
-
metrics["UpgradingNodes"] = data[UPGRADING]
|
|
367
|
-
metrics["MigratingNodes"] = data[MIGRATING]
|
|
368
|
-
metrics["RemovingNodes"] = data[REMOVING]
|
|
369
|
-
metrics["DeadNodes"] = data[DEAD]
|
|
370
|
-
metrics["antnode"] = shutil.which("antnode")
|
|
371
|
-
if not metrics["antnode"]:
|
|
372
|
-
logging.warning("Unable to locate current antnode binary, exiting")
|
|
373
|
-
sys.exit(1)
|
|
374
|
-
metrics["AntNodeVersion"] = get_antnode_version(metrics["antnode"])
|
|
375
|
-
metrics["NodesLatestV"] = (
|
|
376
|
-
sum(1 for node in db_nodes if node[1] == metrics["AntNodeVersion"]) or 0
|
|
377
|
-
)
|
|
378
|
-
metrics["NodesNoVersion"] = sum(1 for node in db_nodes if not node[1]) or 0
|
|
379
|
-
metrics["NodesToUpgrade"] = (
|
|
380
|
-
metrics["TotalNodes"] - metrics["NodesLatestV"] - metrics["NodesNoVersion"]
|
|
381
|
-
)
|
|
382
|
-
|
|
383
|
-
# Windows has to build load average over 5 seconds. The first 5 seconds returns 0's
|
|
384
|
-
# I don't plan on supporting windows, but if this get's modular, I don't want this
|
|
385
|
-
# issue to be skipped
|
|
386
|
-
# if platform.system() == "Windows":
|
|
387
|
-
# discard=psutil.getloadavg()
|
|
388
|
-
# time.sleep(5)
|
|
389
|
-
metrics["LoadAverage1"], metrics["LoadAverage5"], metrics["LoadAverage15"] = (
|
|
390
|
-
psutil.getloadavg()
|
|
391
|
-
)
|
|
392
|
-
# Get CPU Metrics over 1 second
|
|
393
|
-
metrics["IdleCpuPercent"], metrics["IOWait"] = psutil.cpu_times_percent(1)[3:5]
|
|
394
|
-
# Really we returned Idle percent, subtract from 100 to get used.
|
|
395
|
-
metrics["UsedCpuPercent"] = 100 - metrics["IdleCpuPercent"]
|
|
396
|
-
data = psutil.virtual_memory()
|
|
397
|
-
# print(data)
|
|
398
|
-
metrics["UsedMemPercent"] = data.percent
|
|
399
|
-
metrics["FreeMemPercent"] = 100 - metrics["UsedMemPercent"]
|
|
400
|
-
data = psutil.disk_io_counters()
|
|
401
|
-
# This only checks the drive mapped to the first node and will need to be updated
|
|
402
|
-
# when we eventually support multiple drives
|
|
403
|
-
data = psutil.disk_usage(node_storage)
|
|
404
|
-
metrics["UsedHDPercent"] = data.percent
|
|
405
|
-
metrics["TotalHDBytes"] = data.total
|
|
406
|
-
end_time = time.time()
|
|
407
|
-
end_disk_counters = psutil.disk_io_counters()
|
|
408
|
-
end_net_counters = psutil.net_io_counters()
|
|
409
|
-
metrics["HDWriteBytes"] = int(
|
|
410
|
-
(end_disk_counters.write_bytes - start_disk_counters.write_bytes)
|
|
411
|
-
/ (end_time - start_time)
|
|
412
|
-
)
|
|
413
|
-
metrics["HDReadBytes"] = int(
|
|
414
|
-
(end_disk_counters.read_bytes - start_disk_counters.read_bytes)
|
|
415
|
-
/ (end_time - start_time)
|
|
416
|
-
)
|
|
417
|
-
metrics["NetWriteBytes"] = int(
|
|
418
|
-
(end_net_counters.bytes_sent - start_net_counters.bytes_sent)
|
|
419
|
-
/ (end_time - start_time)
|
|
420
|
-
)
|
|
421
|
-
metrics["NetReadBytes"] = int(
|
|
422
|
-
(end_net_counters.bytes_recv - start_net_counters.bytes_recv)
|
|
423
|
-
/ (end_time - start_time)
|
|
424
|
-
)
|
|
425
|
-
# print (json.dumps(metrics,indent=2))
|
|
426
|
-
# How close (out of 100) to removal limit will we be with a max bytes per node (2GB default)
|
|
427
|
-
# For running nodes with Porpoise(tm).
|
|
428
|
-
metrics["NodeHDCrisis"] = int(
|
|
429
|
-
(
|
|
430
|
-
((metrics["TotalNodes"]) * CRISIS_BYTES)
|
|
431
|
-
/ (metrics["TotalHDBytes"] * (remove_limit / 100))
|
|
432
|
-
)
|
|
433
|
-
* 100
|
|
434
|
-
)
|
|
435
|
-
return metrics
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
# Update node with metrics result
|
|
439
|
-
def update_node_from_metrics(id, metrics, metadata):
|
|
440
|
-
try:
|
|
441
|
-
# We check the binary version in other code, so lets stop clobbering it when a node is stopped
|
|
442
|
-
card = {
|
|
443
|
-
"status": metrics["status"],
|
|
444
|
-
"timestamp": int(time.time()),
|
|
445
|
-
"uptime": metrics["uptime"],
|
|
446
|
-
"records": metrics["records"],
|
|
447
|
-
"shunned": metrics["shunned"],
|
|
448
|
-
"peer_id": metadata["peer_id"],
|
|
449
|
-
}
|
|
450
|
-
if "version" in metadata:
|
|
451
|
-
card["version"] = metadata["version"]
|
|
452
|
-
with S() as session:
|
|
453
|
-
session.query(Node).filter(Node.id == id).update(card)
|
|
454
|
-
session.commit()
|
|
455
|
-
except Exception as error:
|
|
456
|
-
template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
|
|
457
|
-
message = template.format(type(error).__name__, error.args)
|
|
458
|
-
logging.warning(message)
|
|
459
|
-
return False
|
|
460
|
-
else:
|
|
461
|
-
return True
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
# Set Node status
|
|
465
|
-
def set_node_status(id, status):
|
|
466
|
-
logging.info("Setting node status: {0} {1}".format(id, status))
|
|
467
|
-
try:
|
|
468
|
-
with S() as session:
|
|
469
|
-
session.query(Node).filter(Node.id == id).update(
|
|
470
|
-
{"status": status, "timestamp": int(time.time())}
|
|
471
|
-
)
|
|
472
|
-
session.commit()
|
|
473
|
-
except:
|
|
474
|
-
return False
|
|
475
|
-
else:
|
|
476
|
-
return True
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
# Update metrics after checking counters
|
|
480
|
-
def update_counters(old, config):
|
|
481
|
-
# Are we already removing a node
|
|
482
|
-
if old["RemovingNodes"]:
|
|
483
|
-
with S() as session:
|
|
484
|
-
removals = session.execute(
|
|
485
|
-
select(Node.timestamp, Node.id)
|
|
486
|
-
.where(Node.status == REMOVING)
|
|
487
|
-
.order_by(Node.timestamp.asc())
|
|
488
|
-
).all()
|
|
489
|
-
# Iterate through active removals
|
|
490
|
-
records_to_remove = len(removals)
|
|
491
|
-
for check in removals:
|
|
492
|
-
# If the DelayRemove timer has expired, delete the entry
|
|
493
|
-
if isinstance(check[0], int) and check[0] < (
|
|
494
|
-
int(time.time()) - (config["DelayRemove"] * 60)
|
|
495
|
-
):
|
|
496
|
-
logging.info("Deleting removed node " + str(check[1]))
|
|
62
|
+
with S() as session:
|
|
63
|
+
no_version = session.execute(
|
|
64
|
+
select(Node.timestamp, Node.id, Node.binary)
|
|
65
|
+
.where(Node.version == "")
|
|
66
|
+
.order_by(Node.timestamp.asc())
|
|
67
|
+
).all()
|
|
68
|
+
# Iterate through nodes with no version number
|
|
69
|
+
for check in no_version:
|
|
70
|
+
# Update version number from binary
|
|
71
|
+
version = get_antnode_version(check[2])
|
|
72
|
+
logging.info(
|
|
73
|
+
f"Updating version number for node {check[1]} to {version}"
|
|
74
|
+
)
|
|
497
75
|
with S() as session:
|
|
498
|
-
session.
|
|
76
|
+
session.query(Node).filter(Node.id == check[1]).update(
|
|
77
|
+
{"version": version}
|
|
78
|
+
)
|
|
499
79
|
session.commit()
|
|
500
|
-
records_to_remove -= 1
|
|
501
|
-
old["RemovingNodes"] = records_to_remove
|
|
502
|
-
# Are we already upgrading a node
|
|
503
|
-
if old["UpgradingNodes"]:
|
|
504
|
-
with S() as session:
|
|
505
|
-
upgrades = session.execute(
|
|
506
|
-
select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
|
|
507
|
-
.where(Node.status == UPGRADING)
|
|
508
|
-
.order_by(Node.timestamp.asc())
|
|
509
|
-
).all()
|
|
510
|
-
# Iterate through active upgrades
|
|
511
|
-
records_to_upgrade = len(upgrades)
|
|
512
|
-
for check in upgrades:
|
|
513
|
-
# If the DelayUpgrade timer has expired, check on status
|
|
514
|
-
if isinstance(check[0], int) and check[0] < (
|
|
515
|
-
int(time.time()) - (config["DelayUpgrade"] * 60)
|
|
516
|
-
):
|
|
517
|
-
logging.info("Updating upgraded node " + str(check[1]))
|
|
518
|
-
node_metrics = read_node_metrics(check[2], check[3])
|
|
519
|
-
node_metadata = read_node_metadata(check[2], check[3])
|
|
520
|
-
if node_metrics and node_metadata:
|
|
521
|
-
update_node_from_metrics(check[1], node_metrics, node_metadata)
|
|
522
|
-
records_to_upgrade -= 1
|
|
523
|
-
old["UpgradingNodes"] = records_to_upgrade
|
|
524
|
-
# Are we already restarting a node
|
|
525
|
-
if old["RestartingNodes"]:
|
|
526
|
-
with S() as session:
|
|
527
|
-
restarts = session.execute(
|
|
528
|
-
select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
|
|
529
|
-
.where(Node.status == RESTARTING)
|
|
530
|
-
.order_by(Node.timestamp.asc())
|
|
531
|
-
).all()
|
|
532
|
-
# Iterate through active upgrades
|
|
533
|
-
records_to_restart = len(restarts)
|
|
534
|
-
for check in restarts:
|
|
535
|
-
# If the DelayUpgrade timer has expired, check on status
|
|
536
|
-
if isinstance(check[0], int) and check[0] < (
|
|
537
|
-
int(time.time()) - (config["DelayStart"] * 60)
|
|
538
|
-
):
|
|
539
|
-
logging.info("Updating restarted node " + str(check[1]))
|
|
540
|
-
node_metrics = read_node_metrics(check[2], check[3])
|
|
541
|
-
node_metadata = read_node_metadata(check[2], check[3])
|
|
542
|
-
if node_metrics and node_metadata:
|
|
543
|
-
update_node_from_metrics(check[1], node_metrics, node_metadata)
|
|
544
|
-
records_to_restart -= 1
|
|
545
|
-
old["RestartingNodes"] = records_to_restart
|
|
546
|
-
return old
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
# Enable firewall for port
|
|
550
|
-
def enable_firewall(port, node):
|
|
551
|
-
logging.info("enable firewall port {0}/udp".format(port))
|
|
552
|
-
# Close ufw firewall
|
|
553
|
-
try:
|
|
554
|
-
subprocess.run(
|
|
555
|
-
["sudo", "ufw", "allow", "{0}/udp".format(port), "comment", node],
|
|
556
|
-
stdout=subprocess.PIPE,
|
|
557
|
-
)
|
|
558
|
-
except subprocess.CalledProcessError as err:
|
|
559
|
-
logging.error("EF Error:", err)
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
# Disable firewall for port
|
|
563
|
-
def disable_firewall(port):
|
|
564
|
-
logging.info("disable firewall port {0}/udp".format(port))
|
|
565
|
-
# Close ufw firewall
|
|
566
|
-
try:
|
|
567
|
-
subprocess.run(
|
|
568
|
-
["sudo", "ufw", "delete", "allow", "{0}/udp".format(port)],
|
|
569
|
-
stdout=subprocess.PIPE,
|
|
570
|
-
)
|
|
571
|
-
except subprocess.CalledProcessError as err:
|
|
572
|
-
logging.error("DF ERROR:", err)
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
# Start a systemd node
|
|
576
|
-
def start_systemd_node(node):
|
|
577
|
-
logging.info("Starting node " + str(node.id))
|
|
578
|
-
# Try to start the service
|
|
579
|
-
try:
|
|
580
|
-
p = subprocess.run(
|
|
581
|
-
["sudo", "systemctl", "start", node.service],
|
|
582
|
-
stdout=subprocess.PIPE,
|
|
583
|
-
stderr=subprocess.STDOUT,
|
|
584
|
-
).stdout.decode("utf-8")
|
|
585
|
-
if re.match(r"Failed to start", p):
|
|
586
|
-
logging.error("SSN2 ERROR:", p)
|
|
587
|
-
return False
|
|
588
|
-
except subprocess.CalledProcessError as err:
|
|
589
|
-
logging.error("SSN1 ERROR:", err)
|
|
590
|
-
return False
|
|
591
|
-
# Open a firewall hole for the data port
|
|
592
|
-
enable_firewall(node.port, node.service)
|
|
593
|
-
# Update node status
|
|
594
|
-
set_node_status(node.id, RESTARTING)
|
|
595
|
-
return True
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
# Stop a systemd node
|
|
599
|
-
def stop_systemd_node(node):
|
|
600
|
-
logging.info("Stopping node: " + node.service)
|
|
601
|
-
# Send a stop signal to the process
|
|
602
|
-
try:
|
|
603
|
-
subprocess.run(
|
|
604
|
-
["sudo", "systemctl", "stop", node.service], stdout=subprocess.PIPE
|
|
605
|
-
)
|
|
606
|
-
except subprocess.CalledProcessError as err:
|
|
607
|
-
logging.error("SSN2 ERROR:", err)
|
|
608
|
-
disable_firewall(node.port)
|
|
609
|
-
set_node_status(node.id, STOPPED)
|
|
610
|
-
|
|
611
|
-
return True
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
# Upgrade a node
|
|
615
|
-
def upgrade_node(node, metrics):
|
|
616
|
-
logging.info("Upgrading node " + str(node.id))
|
|
617
|
-
# Copy current node binary
|
|
618
|
-
try:
|
|
619
|
-
subprocess.run(["sudo", "cp", "-f", metrics["antnode"], node.binary])
|
|
620
|
-
except subprocess.CalledProcessError as err:
|
|
621
|
-
logging.error("UN1 ERROR:", err)
|
|
622
|
-
try:
|
|
623
|
-
subprocess.run(["sudo", "systemctl", "restart", node.service])
|
|
624
|
-
except subprocess.CalledProcessError as err:
|
|
625
|
-
logging.error("UN2 ERROR:", err)
|
|
626
|
-
version = get_antnode_version(node.binary)
|
|
627
|
-
try:
|
|
628
|
-
with S() as session:
|
|
629
|
-
session.query(Node).filter(Node.id == node.id).update(
|
|
630
|
-
{
|
|
631
|
-
"status": UPGRADING,
|
|
632
|
-
"timestamp": int(time.time()),
|
|
633
|
-
"version": metrics["AntNodeVersion"],
|
|
634
|
-
}
|
|
635
|
-
)
|
|
636
|
-
session.commit()
|
|
637
|
-
except:
|
|
638
|
-
return False
|
|
639
|
-
else:
|
|
640
|
-
return True
|
|
641
80
|
|
|
81
|
+
# Use the new DecisionEngine to plan actions
|
|
82
|
+
engine = DecisionEngine(machine_config, metrics)
|
|
83
|
+
actions = engine.plan_actions()
|
|
642
84
|
|
|
643
|
-
#
|
|
644
|
-
|
|
645
|
-
logging.info("Removing node " + str(id))
|
|
85
|
+
# Log the computed features for debugging
|
|
86
|
+
logging.info(json.dumps(engine.get_features(), indent=2))
|
|
646
87
|
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
node = node[0]
|
|
651
|
-
if stop_systemd_node(node):
|
|
652
|
-
# Mark this node as REMOVING
|
|
653
|
-
set_node_status(id, REMOVING)
|
|
88
|
+
# Use ActionExecutor to execute the planned actions
|
|
89
|
+
executor = ActionExecutor(S)
|
|
90
|
+
result = executor.execute(actions, machine_config, metrics, dry_run)
|
|
654
91
|
|
|
655
|
-
|
|
656
|
-
# Remove node data and log
|
|
657
|
-
try:
|
|
658
|
-
subprocess.run(
|
|
659
|
-
["sudo", "rm", "-rf", node.root_dir, f"/var/log/antnode/{nodename}"]
|
|
660
|
-
)
|
|
661
|
-
except subprocess.CalledProcessError as err:
|
|
662
|
-
logging.error("RN1 ERROR:", err)
|
|
663
|
-
# Remove systemd service file
|
|
664
|
-
try:
|
|
665
|
-
subprocess.run(["sudo", "rm", "-f", f"/etc/systemd/system/{node.service}"])
|
|
666
|
-
except subprocess.CalledProcessError as err:
|
|
667
|
-
logging.error("RN2 ERROR:", err)
|
|
668
|
-
# Tell system to reload systemd files
|
|
669
|
-
try:
|
|
670
|
-
subprocess.run(["sudo", "systemctl", "daemon-reload"])
|
|
671
|
-
except subprocess.CalledProcessError as err:
|
|
672
|
-
logging.error("RN3 ERROR:", err)
|
|
673
|
-
# print(json.dumps(node,indent=2))
|
|
92
|
+
return result
|
|
674
93
|
|
|
675
94
|
|
|
676
|
-
|
|
677
|
-
def update_nodes():
|
|
678
|
-
with S() as session:
|
|
679
|
-
nodes = session.execute(
|
|
680
|
-
select(Node.timestamp, Node.id, Node.host, Node.metrics_port, Node.status)
|
|
681
|
-
.where(Node.status != DISABLED)
|
|
682
|
-
.order_by(Node.timestamp.asc())
|
|
683
|
-
).all()
|
|
684
|
-
# Iterate through all records
|
|
685
|
-
for check in nodes:
|
|
686
|
-
# Check on status
|
|
687
|
-
if isinstance(check[0], int):
|
|
688
|
-
logging.debug("Updating info on node " + str(check[1]))
|
|
689
|
-
node_metrics = read_node_metrics(check[2], check[3])
|
|
690
|
-
node_metadata = read_node_metadata(check[2], check[3])
|
|
691
|
-
if node_metrics and node_metadata:
|
|
692
|
-
# Don't write updates for stopped nodes that are already marked as stopped
|
|
693
|
-
if node_metadata["status"] == STOPPED and check[4] == STOPPED:
|
|
694
|
-
continue
|
|
695
|
-
update_node_from_metrics(check[1], node_metrics, node_metadata)
|
|
95
|
+
def main():
|
|
696
96
|
|
|
97
|
+
# Are we already running
|
|
98
|
+
if os.path.exists(LOCK_FILE):
|
|
99
|
+
logging.warning("wnm still running")
|
|
100
|
+
sys.exit(1)
|
|
697
101
|
|
|
698
|
-
#
|
|
699
|
-
def create_node(config, metrics):
|
|
700
|
-
logging.info("Creating new node")
|
|
701
|
-
# Create a holding place for the new node
|
|
702
|
-
card = {}
|
|
703
|
-
# Find the next available node number by first looking for holes
|
|
704
|
-
sql = text(
|
|
705
|
-
"select n1.id + 1 as id from node n1 "
|
|
706
|
-
+ "left join node n2 on n2.id = n1.id + 1 "
|
|
707
|
-
+ "where n2.id is null "
|
|
708
|
-
+ "and n1.id <> (select max(id) from node) "
|
|
709
|
-
+ "order by n1.id;"
|
|
710
|
-
)
|
|
711
|
-
with S() as session:
|
|
712
|
-
result = session.execute(sql).first()
|
|
713
|
-
if result:
|
|
714
|
-
card["id"] = result[0]
|
|
715
|
-
# Otherwise get the max node number and add 1
|
|
716
|
-
else:
|
|
717
|
-
with S() as session:
|
|
718
|
-
result = session.execute(select(Node.id).order_by(Node.id.desc())).first()
|
|
719
|
-
card["id"] = result[0] + 1
|
|
720
|
-
# Set the node name
|
|
721
|
-
card["nodename"] = f"{card['id']:04}"
|
|
722
|
-
card["service"] = f"antnode{card['nodename']}.service"
|
|
723
|
-
card["user"] = "ant"
|
|
724
|
-
card["version"] = metrics["AntNodeVersion"]
|
|
725
|
-
card["root_dir"] = f"{config['NodeStorage']}/antnode{card['nodename']}"
|
|
726
|
-
card["binary"] = f"{card['root_dir']}/antnode"
|
|
727
|
-
card["port"] = config["PortStart"] * 1000 + card["id"]
|
|
728
|
-
card["metrics_port"] = 13 * 1000 + card["id"]
|
|
729
|
-
card["network"] = "evm-arbitrum-one"
|
|
730
|
-
card["wallet"] = config["RewardsAddress"]
|
|
731
|
-
card["peer_id"] = ""
|
|
732
|
-
card["status"] = STOPPED
|
|
733
|
-
card["timestamp"] = int(time.time())
|
|
734
|
-
card["records"] = 0
|
|
735
|
-
card["uptime"] = 0
|
|
736
|
-
card["shunned"] = 0
|
|
737
|
-
card["age"] = card["timestamp"]
|
|
738
|
-
card["host"] = ANM_HOST
|
|
739
|
-
log_dir = f"/var/log/antnode/antnode{card['nodename']}"
|
|
740
|
-
# Create the node directory and log directory
|
|
741
|
-
try:
|
|
742
|
-
subprocess.run(
|
|
743
|
-
["sudo", "mkdir", "-p", card["root_dir"], log_dir], stdout=subprocess.PIPE
|
|
744
|
-
)
|
|
745
|
-
except subprocess.CalledProcessError as err:
|
|
746
|
-
logging.error("CN1 ERROR:", err)
|
|
747
|
-
# Copy the binary to the node directory
|
|
748
|
-
try:
|
|
749
|
-
subprocess.run(
|
|
750
|
-
["sudo", "cp", metrics["antnode"], card["root_dir"]], stdout=subprocess.PIPE
|
|
751
|
-
)
|
|
752
|
-
except subprocess.CalledProcessError as err:
|
|
753
|
-
logging.error("CN2 ERROR:", err)
|
|
754
|
-
# Change owner of the node directory and log directories
|
|
755
|
-
try:
|
|
756
|
-
subprocess.run(
|
|
757
|
-
[
|
|
758
|
-
"sudo",
|
|
759
|
-
"chown",
|
|
760
|
-
"-R",
|
|
761
|
-
f'{card["user"]}:{card["user"]}',
|
|
762
|
-
card["root_dir"],
|
|
763
|
-
log_dir,
|
|
764
|
-
],
|
|
765
|
-
stdout=subprocess.PIPE,
|
|
766
|
-
)
|
|
767
|
-
except subprocess.CalledProcessError as err:
|
|
768
|
-
logging.error("CN3 ERROR:", err)
|
|
769
|
-
# build the systemd service unit
|
|
770
|
-
service = f"""[Unit]
|
|
771
|
-
Description=antnode{card['nodename']}
|
|
772
|
-
[Service]
|
|
773
|
-
User={card['user']}
|
|
774
|
-
ExecStart={card['binary']} --bootstrap-cache-dir /var/antctl/bootstrap-cache --root-dir {card['root_dir']} --port {card['port']} --enable-metrics-server --metrics-server-port {card['metrics_port']} --log-output-dest {log_dir} --max-log-files 1 --max-archived-log-files 1 --rewards-address {card['wallet']} {card['network']}
|
|
775
|
-
Restart=always
|
|
776
|
-
#RestartSec=300
|
|
777
|
-
"""
|
|
778
|
-
# Write the systemd service unit with sudo tee since we're running as not root
|
|
779
|
-
try:
|
|
780
|
-
subprocess.run(
|
|
781
|
-
["sudo", "tee", f'/etc/systemd/system/{card["service"]}'],
|
|
782
|
-
input=service,
|
|
783
|
-
text=True,
|
|
784
|
-
stdout=subprocess.PIPE,
|
|
785
|
-
)
|
|
786
|
-
except subprocess.CalledProcessError as err:
|
|
787
|
-
logging.error("CN4 ERROR:", err)
|
|
788
|
-
# Reload systemd service files to get our new one
|
|
102
|
+
# We're starting, so lets create a lock file
|
|
789
103
|
try:
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
session.execute(insert(Node), [card])
|
|
796
|
-
session.commit()
|
|
797
|
-
# Now we grab the node object from the database to pass to start node
|
|
798
|
-
with S() as session:
|
|
799
|
-
card = session.execute(select(Node).where(Node.id == card["id"])).first()
|
|
800
|
-
# Get the Node object from the Row
|
|
801
|
-
card = card[0]
|
|
802
|
-
# Start the new node
|
|
803
|
-
return start_systemd_node(card)
|
|
804
|
-
# print(json.dumps(card,indent=2))
|
|
805
|
-
return True
|
|
806
|
-
|
|
104
|
+
with open(LOCK_FILE, "w") as file:
|
|
105
|
+
file.write(str(int(time.time())))
|
|
106
|
+
except (PermissionError, OSError) as e:
|
|
107
|
+
logging.error(f"Unable to create lock file: {e}")
|
|
108
|
+
sys.exit(1)
|
|
807
109
|
|
|
808
|
-
#
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
features = {}
|
|
812
|
-
features["AllowCpu"] = metrics["UsedCpuPercent"] < config["CpuLessThan"]
|
|
813
|
-
features["AllowMem"] = metrics["UsedMemPercent"] < config["MemLessThan"]
|
|
814
|
-
features["AllowHD"] = metrics["UsedHDPercent"] < config["HDLessThan"]
|
|
815
|
-
features["RemCpu"] = metrics["UsedCpuPercent"] > config["CpuRemove"]
|
|
816
|
-
features["RemMem"] = metrics["UsedMemPercent"] > config["MemRemove"]
|
|
817
|
-
features["RemHD"] = metrics["UsedHDPercent"] > config["HDRemove"]
|
|
818
|
-
features["AllowNodeCap"] = metrics["RunningNodes"] < config["NodeCap"]
|
|
819
|
-
# These are new features, so ignore them if not configured
|
|
820
|
-
if (
|
|
821
|
-
config["NetIOReadLessThan"]
|
|
822
|
-
+ config["NetIOReadRemove"]
|
|
823
|
-
+ config["NetIOWriteLessThan"]
|
|
824
|
-
+ config["NetIOWriteRemove"]
|
|
825
|
-
> 1
|
|
826
|
-
):
|
|
827
|
-
features["AllowNetIO"] = (
|
|
828
|
-
metrics["NetReadBytes"] < config["NetIOReadLessThan"]
|
|
829
|
-
and metrics["NetWriteBytes"] < config["NetIOWriteLessThan"]
|
|
830
|
-
)
|
|
831
|
-
features["RemoveNetIO"] = (
|
|
832
|
-
metrics["NetReadBytes"] > config["NetIORemove"]
|
|
833
|
-
or metrics["NetWriteBytes"] > config["NetIORemove"]
|
|
834
|
-
)
|
|
835
|
-
else:
|
|
836
|
-
features["AllowNetIO"] = True
|
|
837
|
-
features["RemoveNetIO"] = False
|
|
838
|
-
if (
|
|
839
|
-
config["HDIOReadLessThan"]
|
|
840
|
-
+ config["HDIOReadRemove"]
|
|
841
|
-
+ config["HDIOWriteLessThan"]
|
|
842
|
-
+ config["HDIOWriteRemove"]
|
|
843
|
-
> 1
|
|
844
|
-
):
|
|
845
|
-
features["AllowHDIO"] = (
|
|
846
|
-
metrics["HDReadBytes"] < config["HDIOReadLessThan"]
|
|
847
|
-
and metrics["HDWriteBytes"] < config["HDIOWriteLessThan"]
|
|
848
|
-
)
|
|
849
|
-
features["RemoveHDIO"] = (
|
|
850
|
-
metrics["HDReadBytes"] > config["HDIORemove"]
|
|
851
|
-
or metrics["HDWriteBytes"] > config["HDtIORemove"]
|
|
852
|
-
)
|
|
110
|
+
# Config should have loaded the machine_config
|
|
111
|
+
if machine_config:
|
|
112
|
+
logging.info("Machine: " + json.dumps(machine_config))
|
|
853
113
|
else:
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
metrics = update_counters(metrics, config)
|
|
868
|
-
# If we have other thing going on, don't add more nodes
|
|
869
|
-
features["AddNewNode"] = (
|
|
870
|
-
sum(
|
|
871
|
-
[
|
|
872
|
-
metrics.get(m, 0)
|
|
873
|
-
for m in [
|
|
874
|
-
"UpgradingNodes",
|
|
875
|
-
"RestartingNodes",
|
|
876
|
-
"MigratingNodes",
|
|
877
|
-
"RemovingNodes",
|
|
878
|
-
]
|
|
879
|
-
]
|
|
880
|
-
)
|
|
881
|
-
== 0
|
|
882
|
-
and features["AllowCpu"]
|
|
883
|
-
and features["AllowHD"]
|
|
884
|
-
and features["AllowMem"]
|
|
885
|
-
and features["AllowNodeCap"]
|
|
886
|
-
and features["AllowHDIO"]
|
|
887
|
-
and features["AllowNetIO"]
|
|
888
|
-
and features["LoadAllow"]
|
|
889
|
-
and metrics["TotalNodes"] < config["NodeCap"]
|
|
890
|
-
)
|
|
891
|
-
# Are we overlimit on nodes
|
|
892
|
-
features["Remove"] = (
|
|
893
|
-
features["LoadNotAllow"]
|
|
894
|
-
or features["RemCpu"]
|
|
895
|
-
or features["RemHD"]
|
|
896
|
-
or features["RemMem"]
|
|
897
|
-
or features["RemoveHDIO"]
|
|
898
|
-
or features["RemoveNetIO"]
|
|
899
|
-
or metrics["TotalNodes"] > config["NodeCap"]
|
|
900
|
-
)
|
|
901
|
-
# If we have nodes to upgrade
|
|
902
|
-
if metrics["NodesToUpgrade"] >= 1:
|
|
903
|
-
# Make sure current version is equal or newer than version on first node.
|
|
904
|
-
if Version(metrics["AntNodeVersion"]) < Version(db_nodes[0][1]):
|
|
905
|
-
logging.warning("node upgrade cancelled due to lower version")
|
|
906
|
-
features["Upgrade"] = False
|
|
114
|
+
logging.error("Unable to load machine config, exiting")
|
|
115
|
+
sys.exit(1)
|
|
116
|
+
# Check for config updates
|
|
117
|
+
if config_updates:
|
|
118
|
+
logging.info("Update: " + json.dumps(config_updates))
|
|
119
|
+
if options.dry_run:
|
|
120
|
+
logging.warning("Dry run, not saving requested updates")
|
|
121
|
+
# Create a dictionary for the machine config
|
|
122
|
+
# Machine by default returns a parameter array,
|
|
123
|
+
# use the __json__ method to return a dict
|
|
124
|
+
local_config = json.loads(json.dumps(machine_config))
|
|
125
|
+
# Apply the local config with the requested updates
|
|
126
|
+
local_config.update(config_updates)
|
|
907
127
|
else:
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
128
|
+
# Store the config changes to the database
|
|
129
|
+
apply_config_updates(config_updates)
|
|
130
|
+
# Create a working dictionary for the machine config
|
|
131
|
+
# Machine by default returns a parameter array,
|
|
132
|
+
# use the __json__ method to return a dict
|
|
133
|
+
local_config = json.loads(json.dumps(machine_config))
|
|
913
134
|
else:
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
return {"status": "removed-dead-nodes"}
|
|
933
|
-
# If we have nodes with no version number, update from binary
|
|
934
|
-
if metrics["NodesNoVersion"] > 1:
|
|
935
|
-
with S() as session:
|
|
936
|
-
no_version = session.execute(
|
|
937
|
-
select(Node.timestamp, Node.id, Node.binary)
|
|
938
|
-
.where(Node.version == "")
|
|
939
|
-
.order_by(Node.timestamp.asc())
|
|
940
|
-
).all()
|
|
941
|
-
# Iterate through nodes with no version number
|
|
942
|
-
for check in no_version:
|
|
943
|
-
# Update version number from binary
|
|
944
|
-
version = get_antnode_version(check[2])
|
|
945
|
-
logging.info(f"Updating version number for node {check[1]} to {version}")
|
|
946
|
-
with S() as session:
|
|
947
|
-
session.query(Node).filter(Node.id == check[1]).update(
|
|
948
|
-
{"version": version}
|
|
949
|
-
)
|
|
950
|
-
session.commit()
|
|
951
|
-
|
|
952
|
-
# If we're restarting, wait patiently as metrics could be skewed
|
|
953
|
-
if metrics["RestartingNodes"]:
|
|
954
|
-
logging.info("Still waiting for RestartDelay")
|
|
955
|
-
return {"status": RESTARTING}
|
|
956
|
-
# If we still have unexpired upgrade records, wait
|
|
957
|
-
if metrics["UpgradingNodes"]:
|
|
958
|
-
logging.info("Still waiting for UpgradeDelay")
|
|
959
|
-
return {"status": UPGRADING}
|
|
960
|
-
# First if we're removing, that takes top priority
|
|
961
|
-
if features["Remove"]:
|
|
962
|
-
# If we still have unexpired removal records, wait
|
|
963
|
-
if metrics["RemovingNodes"]:
|
|
964
|
-
logging.info("Still waiting for RemoveDelay")
|
|
965
|
-
return {"status": REMOVING}
|
|
966
|
-
# If we're under HD pressure or trimming node cap, remove nodes
|
|
967
|
-
if features["RemHD"] or metrics["TotalNodes"] > config["NodeCap"]:
|
|
968
|
-
# Start removing with stopped nodes
|
|
969
|
-
if metrics["StoppedNodes"] > 0:
|
|
970
|
-
# What is the youngest stopped node
|
|
971
|
-
with S() as session:
|
|
972
|
-
youngest = session.execute(
|
|
973
|
-
select(Node.id)
|
|
974
|
-
.where(Node.status == STOPPED)
|
|
975
|
-
.order_by(Node.age.desc())
|
|
976
|
-
).first()
|
|
977
|
-
if youngest:
|
|
978
|
-
# Remove the youngest node
|
|
979
|
-
remove_node(youngest[0])
|
|
980
|
-
return {"status": REMOVING}
|
|
981
|
-
# No low hanging fruit. let's start with the youngest running node
|
|
982
|
-
with S() as session:
|
|
983
|
-
youngest = session.execute(
|
|
984
|
-
select(Node.id)
|
|
985
|
-
.where(Node.status == RUNNING)
|
|
986
|
-
.order_by(Node.age.desc())
|
|
987
|
-
).first()
|
|
988
|
-
if youngest:
|
|
989
|
-
# Remove the youngest node
|
|
990
|
-
remove_node(youngest[0])
|
|
991
|
-
return {"status": REMOVING}
|
|
992
|
-
return {"status": "nothing-to-remove"}
|
|
993
|
-
# Otherwise, let's try just stopping a node to bring IO/Mem/Cpu down
|
|
994
|
-
else:
|
|
995
|
-
# If we just stopped a node, wait
|
|
996
|
-
if int(config["LastStoppedAt"] or 0) > (
|
|
997
|
-
int(time.time()) - (config["DelayRemove"] * 60)
|
|
998
|
-
):
|
|
999
|
-
logging.info("Still waiting for RemoveDelay")
|
|
1000
|
-
return {"status": "waiting-to-stop"}
|
|
1001
|
-
# Start with the youngest running node
|
|
1002
|
-
with S() as session:
|
|
1003
|
-
youngest = session.execute(
|
|
1004
|
-
select(Node).where(Node.status == RUNNING).order_by(Node.age.desc())
|
|
1005
|
-
).first()
|
|
1006
|
-
if youngest:
|
|
1007
|
-
# Stop the youngest node
|
|
1008
|
-
stop_systemd_node(youngest[0])
|
|
1009
|
-
# Update the last stopped time
|
|
1010
|
-
with S() as session:
|
|
1011
|
-
session.query(Machine).filter(Machine.id == 1).update(
|
|
1012
|
-
{"LastStoppedAt": int(time.time())}
|
|
1013
|
-
)
|
|
1014
|
-
session.commit()
|
|
1015
|
-
return {"status": STOPPED}
|
|
1016
|
-
else:
|
|
1017
|
-
return {"status": "nothing-to-stop"}
|
|
1018
|
-
|
|
1019
|
-
# Do we have upgrading to do?
|
|
1020
|
-
if features["Upgrade"]:
|
|
1021
|
-
# Let's find the oldest running node not using the current version
|
|
1022
|
-
with S() as session:
|
|
1023
|
-
oldest = session.execute(
|
|
1024
|
-
select(Node)
|
|
1025
|
-
.where(Node.status == RUNNING)
|
|
1026
|
-
.where(Node.version != metrics["AntNodeVersion"])
|
|
1027
|
-
.order_by(Node.age.asc())
|
|
1028
|
-
).first()
|
|
1029
|
-
if oldest:
|
|
1030
|
-
# Get Node from Row
|
|
1031
|
-
oldest = oldest[0]
|
|
1032
|
-
# If we don't have a version number from metadata, grab from binary
|
|
1033
|
-
if not oldest.version:
|
|
1034
|
-
oldest.version = get_antnode_version(oldest.binary)
|
|
1035
|
-
# print(json.dumps(oldest))
|
|
1036
|
-
# Upgrade the oldest node
|
|
1037
|
-
upgrade_node(oldest, metrics)
|
|
1038
|
-
return {"status": UPGRADING}
|
|
1039
|
-
|
|
1040
|
-
# If AddNewNode
|
|
1041
|
-
# If stopped nodes available
|
|
1042
|
-
# Check oldest stopped version
|
|
1043
|
-
# If out of date
|
|
1044
|
-
# upgrade node which starts it
|
|
1045
|
-
# else
|
|
1046
|
-
# restart node
|
|
1047
|
-
# else
|
|
1048
|
-
# Create a Node which starts it
|
|
1049
|
-
if features["AddNewNode"]:
|
|
1050
|
-
# Start adding with stopped nodes
|
|
1051
|
-
if metrics["StoppedNodes"] > 0:
|
|
1052
|
-
# What is the oldest stopped node
|
|
1053
|
-
with S() as session:
|
|
1054
|
-
oldest = session.execute(
|
|
1055
|
-
select(Node).where(Node.status == STOPPED).order_by(Node.age.asc())
|
|
1056
|
-
).first()
|
|
1057
|
-
if oldest:
|
|
1058
|
-
# Get Node from Row
|
|
1059
|
-
oldest = oldest[0]
|
|
1060
|
-
# If we don't have a version number from metadata, grab from binary
|
|
1061
|
-
if not oldest.version:
|
|
1062
|
-
oldest.version = get_antnode_version(oldest.binary)
|
|
1063
|
-
# If the stopped version is old, upgrade it
|
|
1064
|
-
if Version(metrics["AntNodeVersion"]) > Version(oldest.version):
|
|
1065
|
-
upgrade_node(oldest, metrics)
|
|
1066
|
-
return {"status": UPGRADING}
|
|
135
|
+
local_config = json.loads(json.dumps(machine_config))
|
|
136
|
+
|
|
137
|
+
metrics = get_machine_metrics(
|
|
138
|
+
S,
|
|
139
|
+
local_config["node_storage"],
|
|
140
|
+
local_config["hd_remove"],
|
|
141
|
+
local_config["crisis_bytes"],
|
|
142
|
+
)
|
|
143
|
+
logging.info(json.dumps(metrics, indent=2))
|
|
144
|
+
|
|
145
|
+
# Do we already have nodes
|
|
146
|
+
if metrics["total_nodes"] == 0:
|
|
147
|
+
# Are we migrating an anm server
|
|
148
|
+
if options.init and options.migrate_anm:
|
|
149
|
+
Workers = survey_machine(machine_config) or []
|
|
150
|
+
if Workers:
|
|
151
|
+
if options.dry_run:
|
|
152
|
+
logging.warning(f"DRYRUN: Not saving {len(Workers)} detected nodes")
|
|
1067
153
|
else:
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
154
|
+
with S() as session:
|
|
155
|
+
session.execute(insert(Node), Workers)
|
|
156
|
+
session.commit()
|
|
157
|
+
# Reload metrics
|
|
158
|
+
metrics = get_machine_metrics(
|
|
159
|
+
S,
|
|
160
|
+
local_config["node_storage"],
|
|
161
|
+
local_config["hd_remove"],
|
|
162
|
+
local_config["crisis_bytes"],
|
|
163
|
+
)
|
|
164
|
+
logging.info(
|
|
165
|
+
"Found {counter} nodes defined".format(
|
|
166
|
+
counter=metrics["total_nodes"]
|
|
167
|
+
)
|
|
168
|
+
)
|
|
1078
169
|
else:
|
|
1079
|
-
|
|
170
|
+
logging.warning("Requested migration but no nodes found")
|
|
1080
171
|
else:
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
Node.version,
|
|
1102
|
-
Node.host,
|
|
1103
|
-
Node.metrics_port,
|
|
1104
|
-
Node.port,
|
|
1105
|
-
Node.age,
|
|
1106
|
-
Node.id,
|
|
1107
|
-
Node.timestamp,
|
|
172
|
+
logging.info("No nodes found")
|
|
173
|
+
else:
|
|
174
|
+
logging.info(
|
|
175
|
+
"Found {counter} nodes configured".format(counter=metrics["total_nodes"])
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
# Check for reports
|
|
179
|
+
if options.report:
|
|
180
|
+
from wnm.reports import generate_node_status_report, generate_node_status_details_report
|
|
181
|
+
|
|
182
|
+
# If survey action is specified, run it first
|
|
183
|
+
if options.force_action == "survey":
|
|
184
|
+
logging.info("Running survey before generating report")
|
|
185
|
+
executor = ActionExecutor(S)
|
|
186
|
+
survey_result = executor.execute_forced_action(
|
|
187
|
+
"survey",
|
|
188
|
+
local_config,
|
|
189
|
+
metrics,
|
|
190
|
+
service_name=options.service_name,
|
|
191
|
+
dry_run=options.dry_run,
|
|
1108
192
|
)
|
|
1109
|
-
|
|
1110
|
-
anm_config = session.execute(select(Machine)).all()
|
|
1111
|
-
|
|
1112
|
-
if db_nodes:
|
|
1113
|
-
# anm_config by default loads a parameter array,
|
|
1114
|
-
# use the __json__ method to return a dict from the first node
|
|
1115
|
-
anm_config = json.loads(json.dumps(anm_config[0][0])) or load_anm_config()
|
|
1116
|
-
metrics = get_machine_metrics(anm_config["NodeStorage"], anm_config["HDRemove"])
|
|
1117
|
-
# node_metrics = read_node_metrics(db_nodes[0][2],db_nodes[0][3])
|
|
1118
|
-
# print(db_nodes[0])
|
|
1119
|
-
# print(node_metrics)
|
|
1120
|
-
# print(anm_config)
|
|
1121
|
-
# print(json.dumps(anm_config,indent=4))
|
|
1122
|
-
# print("Node: ",db_nodes)
|
|
1123
|
-
logging.info("Found {counter} nodes migrated".format(counter=len(db_nodes)))
|
|
193
|
+
logging.info(f"Survey result: {survey_result}")
|
|
1124
194
|
|
|
195
|
+
# Generate the report
|
|
196
|
+
if options.report == "node-status":
|
|
197
|
+
report_output = generate_node_status_report(
|
|
198
|
+
S, options.service_name, options.report_format
|
|
199
|
+
)
|
|
200
|
+
elif options.report == "node-status-details":
|
|
201
|
+
report_output = generate_node_status_details_report(
|
|
202
|
+
S, options.service_name, options.report_format
|
|
203
|
+
)
|
|
204
|
+
else:
|
|
205
|
+
report_output = f"Unknown report type: {options.report}"
|
|
206
|
+
|
|
207
|
+
print(report_output)
|
|
208
|
+
os.remove(LOCK_FILE)
|
|
209
|
+
sys.exit(0)
|
|
210
|
+
|
|
211
|
+
# Check for forced actions
|
|
212
|
+
if options.force_action:
|
|
213
|
+
logging.info(f"Executing forced action: {options.force_action}")
|
|
214
|
+
executor = ActionExecutor(S)
|
|
215
|
+
this_action = executor.execute_forced_action(
|
|
216
|
+
options.force_action,
|
|
217
|
+
local_config,
|
|
218
|
+
metrics,
|
|
219
|
+
service_name=options.service_name,
|
|
220
|
+
dry_run=options.dry_run,
|
|
221
|
+
count=options.count if hasattr(options, 'count') else 1,
|
|
222
|
+
)
|
|
1125
223
|
else:
|
|
1126
|
-
|
|
1127
|
-
# print(anm_config)
|
|
1128
|
-
Workers = survey_machine() or []
|
|
1129
|
-
|
|
1130
|
-
# """"
|
|
1131
|
-
with S() as session:
|
|
1132
|
-
session.execute(insert(Node), Workers)
|
|
1133
|
-
session.commit()
|
|
1134
|
-
# """
|
|
1135
|
-
|
|
1136
|
-
with S() as session:
|
|
1137
|
-
session.execute(insert(Machine), [anm_config])
|
|
1138
|
-
session.commit()
|
|
1139
|
-
|
|
1140
|
-
# Now load subset of data to work with
|
|
1141
|
-
with S() as session:
|
|
1142
|
-
db_nodes = session.execute(
|
|
1143
|
-
select(
|
|
1144
|
-
Node.status,
|
|
1145
|
-
Node.version,
|
|
1146
|
-
Node.host,
|
|
1147
|
-
Node.metrics_port,
|
|
1148
|
-
Node.port,
|
|
1149
|
-
Node.age,
|
|
1150
|
-
Node.id,
|
|
1151
|
-
Node.timestamp,
|
|
1152
|
-
)
|
|
1153
|
-
).all()
|
|
1154
|
-
|
|
1155
|
-
# print(json.dumps(anm_config,indent=4))
|
|
1156
|
-
logging.info("Found {counter} nodes configured".format(counter=len(db_nodes)))
|
|
224
|
+
this_action = choose_action(local_config, metrics, options.dry_run)
|
|
1157
225
|
|
|
1158
|
-
# versions = [v[1] for worker in Workers if (v := worker.get('version'))]
|
|
1159
|
-
# data = Counter(ver for ver in versions)
|
|
1160
|
-
|
|
1161
|
-
data = Counter(status[0] for status in db_nodes)
|
|
1162
|
-
# print(data)
|
|
1163
|
-
print("Running Nodes:", data[RUNNING])
|
|
1164
|
-
print("Restarting Nodes:", data[RESTARTING])
|
|
1165
|
-
print("Stopped Nodes:", data[STOPPED])
|
|
1166
|
-
print("Upgrading Nodes:", data[UPGRADING])
|
|
1167
|
-
print("Removing Nodes:", data[REMOVING])
|
|
1168
|
-
data = Counter(ver[1] for ver in db_nodes)
|
|
1169
|
-
print("Versions:", data)
|
|
1170
|
-
|
|
1171
|
-
machine_metrics = get_machine_metrics(
|
|
1172
|
-
anm_config["NodeStorage"], anm_config["HDRemove"]
|
|
1173
|
-
)
|
|
1174
|
-
print(json.dumps(anm_config, indent=2))
|
|
1175
|
-
print(json.dumps(machine_metrics, indent=2))
|
|
1176
|
-
this_action = choose_action(anm_config, machine_metrics, db_nodes)
|
|
1177
226
|
print("Action:", json.dumps(this_action, indent=2))
|
|
1178
|
-
|
|
1179
|
-
os.remove(
|
|
227
|
+
|
|
228
|
+
os.remove(LOCK_FILE)
|
|
229
|
+
sys.exit(1)
|
|
1180
230
|
|
|
1181
231
|
|
|
1182
232
|
if __name__ == "__main__":
|
|
1183
233
|
main()
|
|
234
|
+
# print(options.MemRemove)
|
|
1184
235
|
|
|
1185
236
|
print("End of program")
|