wnm 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wnm has been flagged as potentially problematic; review the advisory details for this release before installing or upgrading.
- wnm/__init__.py +1 -1
- wnm/__main__.py +184 -1133
- wnm/actions.py +45 -0
- wnm/common.py +21 -0
- wnm/config.py +653 -1
- wnm/decision_engine.py +388 -0
- wnm/executor.py +1292 -0
- wnm/firewall/__init__.py +13 -0
- wnm/firewall/base.py +71 -0
- wnm/firewall/factory.py +95 -0
- wnm/firewall/null_firewall.py +71 -0
- wnm/firewall/ufw_manager.py +118 -0
- wnm/migration.py +42 -0
- wnm/models.py +305 -126
- wnm/process_managers/__init__.py +23 -0
- wnm/process_managers/base.py +203 -0
- wnm/process_managers/docker_manager.py +371 -0
- wnm/process_managers/factory.py +83 -0
- wnm/process_managers/launchd_manager.py +592 -0
- wnm/process_managers/setsid_manager.py +340 -0
- wnm/process_managers/systemd_manager.py +529 -0
- wnm/reports.py +286 -0
- wnm/utils.py +403 -0
- wnm-0.0.11.dist-info/METADATA +316 -0
- wnm-0.0.11.dist-info/RECORD +28 -0
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/WHEEL +1 -1
- wnm-0.0.9.dist-info/METADATA +0 -95
- wnm-0.0.9.dist-info/RECORD +0 -9
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/entry_points.txt +0 -0
- {wnm-0.0.9.dist-info → wnm-0.0.11.dist-info}/top_level.txt +0 -0
wnm/reports.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Reports module for weave-node-manager (wnm).
|
|
3
|
+
|
|
4
|
+
Provides formatted reporting capabilities for node status and details.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
from sqlalchemy import select
|
|
12
|
+
|
|
13
|
+
from wnm.models import Node
|
|
14
|
+
from wnm.common import RUNNING, STOPPED, UPGRADING, RESTARTING, REMOVING, DISABLED, DEAD
|
|
15
|
+
from wnm.utils import parse_service_names
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class NodeReporter:
    """Generates status reports for nodes stored in the database.

    Two report flavours are supported:
      - node-status: one-line-per-node tabular summary
      - node-status-details: key/value breakdown per node
    """

    def __init__(self, session_factory):
        """Store the session factory and set up a module logger.

        Args:
            session_factory: SQLAlchemy scoped_session factory
        """
        self.S = session_factory
        self.logger = logging.getLogger(__name__)

    def _get_nodes(self, service_names: Optional[List[str]] = None) -> List[Node]:
        """Load Node rows from the database.

        Args:
            service_names: Optional list of service names; when given, the
                result preserves the requested order and missing names are
                logged as warnings.

        Returns:
            List of Node objects (all nodes in id order when no names given).
        """
        with self.S() as session:
            if not service_names:
                # No filter: every node, in numerical id order.
                rows = session.execute(select(Node).order_by(Node.id)).all()
                return [row[0] for row in rows]

            # Specific nodes, preserving the caller's ordering.
            found = []
            for wanted in service_names:
                row = session.execute(
                    select(Node).where(Node.service == wanted)
                ).first()
                if row:
                    found.append(row[0])
                else:
                    self.logger.warning(f"Node {wanted} not found in database")
            return found

    def node_status_report(
        self,
        service_name: Optional[str] = None,
        report_format: str = "text"
    ) -> str:
        """Generate the tabular node status report.

        text format:
            Service Name        Peer ID ... Status  Connected Peers
        json format:
            a single object for one node, an array for several.

        Args:
            service_name: Optional comma-separated list of service names
            report_format: Output format ("text" or "json")

        Returns:
            Formatted string report
        """
        nodes = self._get_nodes(parse_service_names(service_name))

        if not nodes:
            if report_format == "json":
                return json.dumps({"error": "No nodes found"}, indent=2)
            return "No nodes found."

        if report_format == "json":
            payload = [
                {
                    "service_name": n.service,
                    "peer_id": n.peer_id or "-",
                    "status": n.status,
                    "connected_peers": n.connected_peers if n.connected_peers is not None else 0,
                }
                for n in nodes
            ]
            # One node serializes as an object, several as an array.
            return json.dumps(payload[0] if len(payload) == 1 else payload, indent=2)

        # Plain-text table: header row, then one fixed-width row per node.
        out = [f"{'Service Name':<20}{'Peer ID':<55}{'Status':<15}{'Connected Peers':>15}"]
        for n in nodes:
            peers = n.connected_peers if n.connected_peers is not None else 0
            out.append(
                f"{n.service:<20}"
                f"{(n.peer_id or '-'):<55}"
                f"{n.status:<15}"
                f"{peers:>15}"
            )
        return "\n".join(out)

    def node_status_details_report(
        self,
        service_name: Optional[str] = None,
        report_format: str = "text"
    ) -> str:
        """Generate the detailed node status report.

        Args:
            service_name: Optional comma-separated list of service names
            report_format: "text" (key: value lines) or "json" (snake_case keys)

        Returns:
            Formatted string report
        """
        nodes = self._get_nodes(parse_service_names(service_name))

        if not nodes:
            if report_format == "json":
                return json.dumps({"error": "No nodes found"}, indent=2)
            return "No nodes found."

        if report_format == "json":
            return self._format_details_json(nodes)
        return self._format_details_text(nodes)

    def _format_details_text(self, nodes: List[Node]) -> str:
        """Render nodes as blocks of "key: value" lines, blank-line separated.

        Args:
            nodes: List of Node objects

        Returns:
            Formatted text string
        """
        blocks = []
        for node in nodes:
            # Log and binary paths are derived from the node's root_dir;
            # "unknown" when the relevant fields are unset.
            log_path = f"{node.root_dir}/logs" if node.root_dir else "unknown"
            bin_path = (
                f"{node.root_dir}/{node.binary}"
                if node.root_dir and node.binary
                else "unknown"
            )
            entries = [
                f"Service Name: {node.service}",
                f"Version: {node.version or 'unknown'}",
                f"Port: {node.port}",
                f"Metrics Port: {node.metrics_port}",
                f"Data path: {node.root_dir}",
                f"Log path: {log_path}",
                f"Bin Path: {bin_path}",
                f"Connected peers: {node.connected_peers if node.connected_peers is not None else 0}",
                f"Rewards address: {node.wallet or 'unknown'}",
                f"Age: {node.age if node.age is not None else 0}",
                f"Peer ID: {node.peer_id or '-'}",
                f"Status: {node.status}",
            ]
            blocks.append("\n".join(entries))
        # Separate multiple nodes with a blank line.
        return "\n\n".join(blocks)

    def _format_details_json(self, nodes: List[Node]) -> str:
        """Serialize nodes via the model's __json__ (snake_case field names).

        Args:
            nodes: List of Node objects

        Returns:
            JSON formatted string (object for one node, array for several)
        """
        dicts = [node.__json__() for node in nodes]
        return json.dumps(dicts[0] if len(dicts) == 1 else dicts, indent=2)
|
+
|
|
248
|
+
|
|
249
|
+
def generate_node_status_report(
    session_factory,
    service_name: Optional[str] = None,
    report_format: str = "text"
) -> str:
    """Convenience wrapper: build a NodeReporter and run the status report.

    Args:
        session_factory: SQLAlchemy scoped_session factory
        service_name: Optional comma-separated list of service names
        report_format: Output format ("text" or "json")

    Returns:
        Formatted report string
    """
    return NodeReporter(session_factory).node_status_report(
        service_name, report_format
    )
267
|
+
|
|
268
|
+
|
|
269
|
+
def generate_node_status_details_report(
    session_factory,
    service_name: Optional[str] = None,
    report_format: str = "text"
) -> str:
    """Convenience wrapper: build a NodeReporter and run the details report.

    Args:
        session_factory: SQLAlchemy scoped_session factory
        service_name: Optional comma-separated list of service names
        report_format: Output format ("text" or "json")

    Returns:
        Formatted report string
    """
    return NodeReporter(session_factory).node_status_details_report(
        service_name, report_format
    )
|
wnm/utils.py
ADDED
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
from collections import Counter
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
import psutil
|
|
12
|
+
import requests
|
|
13
|
+
from sqlalchemy import create_engine, delete, insert, select, text, update
|
|
14
|
+
from sqlalchemy.orm import scoped_session, sessionmaker
|
|
15
|
+
|
|
16
|
+
from wnm.common import (
|
|
17
|
+
DEAD,
|
|
18
|
+
DISABLED,
|
|
19
|
+
DONATE,
|
|
20
|
+
METRICS_PORT_BASE,
|
|
21
|
+
MIGRATING,
|
|
22
|
+
MIN_NODES_THRESHOLD,
|
|
23
|
+
PORT_MULTIPLIER,
|
|
24
|
+
QUEEN,
|
|
25
|
+
REMOVING,
|
|
26
|
+
RESTARTING,
|
|
27
|
+
RUNNING,
|
|
28
|
+
STOPPED,
|
|
29
|
+
UPGRADING,
|
|
30
|
+
)
|
|
31
|
+
from wnm.config import BOOTSTRAP_CACHE_DIR, LOG_DIR, PLATFORM
|
|
32
|
+
from wnm.models import Base, Machine, Node
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def parse_service_names(service_name_str: Optional[str]) -> Optional[List[str]]:
    """Split a comma-separated service-name string into a list.

    Args:
        service_name_str: Comma-separated names (e.g. "antnode0001,antnode0003");
            may be None or empty.

    Returns:
        List of whitespace-trimmed, non-empty names, or None when the
        input is None/empty. (A string of only separators yields [].)
    """
    if not service_name_str:
        return None
    trimmed = (part.strip() for part in service_name_str.split(","))
    return [name for name in trimmed if name]
51
|
+
|
|
52
|
+
|
|
53
|
+
# Fetch node identity (version / peer id) from its HTTP metadata endpoint
def read_node_metadata(host, port):
    """GET http://host:port/metadata and extract version and peer id.

    "version" is only included in the result when actually found, so a
    stopped node doesn't clobber the version learned from the binary check.

    Returns:
        Dict with "status" (RUNNING/STOPPED), "peer_id" (may be ""), and
        optionally "version".
    """
    try:
        response = requests.get(f"http://{host}:{port}/metadata", timeout=5)
        body = response.text
    except requests.exceptions.ConnectionError:
        logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
        return {"status": STOPPED, "peer_id": ""}
    except Exception as error:
        template = "In RNMd - An exception of type {0} occurred. Arguments:\n{1!r}"
        logging.info(template.format(type(error).__name__, error.args))
        return {"status": STOPPED, "peer_id": ""}

    # Collect a dict to return.
    card = {}
    try:
        card["version"] = re.findall(r'{antnode_version="([\d\.]+)"}', body)[0]
    except (IndexError, KeyError) as exc:
        logging.info(f"No version found: {exc}")
    try:
        card["peer_id"] = re.findall(r'{peer_id="([\w\d]+)"}', body)[0]
    except (IndexError, KeyError) as exc:
        logging.debug(f"No peer_id found: {exc}")
        card["peer_id"] = ""
    # A parseable version is treated as proof of life.
    card["status"] = RUNNING if "version" in card else STOPPED
    return card
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Read data from metrics port
def read_node_metrics(host, port):
    """GET http://host:port/metrics and parse the node's counters.

    Args:
        host: Node host/IP.
        port: Node metrics port.

    Returns:
        Dict with "status", "uptime", "records", "shunned" and
        "connected_peers". On any failure the node is reported as
        STOPPED with zeroed counters.
    """
    # Zeroed "node is down" baseline; previously duplicated verbatim in
    # both exception handlers.
    down = {
        "status": STOPPED,
        "uptime": 0,
        "records": 0,
        "shunned": 0,
        "connected_peers": 0,
    }

    def _first_int(pattern, text):
        # First integer match in the Prometheus exposition text, else 0.
        return int((re.findall(pattern, text) or [0])[0])

    metrics = dict(down)
    try:
        url = "http://{0}:{1}/metrics".format(host, port)
        response = requests.get(url, timeout=5)
        metrics["status"] = RUNNING
        metrics["uptime"] = _first_int(r"ant_node_uptime ([\d]+)", response.text)
        metrics["records"] = _first_int(
            r"ant_networking_records_stored ([\d]+)", response.text
        )
        metrics["shunned"] = _first_int(
            r"ant_networking_shunned_by_close_group ([\d]+)", response.text
        )
        metrics["connected_peers"] = _first_int(
            r"ant_networking_connected_peers ([\d]+)", response.text
        )
    except requests.exceptions.ConnectionError:
        logging.debug("Connection Refused on port: {0}:{1}".format(host, str(port)))
        metrics = dict(down)
    except Exception as error:
        template = "in:RNM - An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(error).__name__, error.args)
        logging.info(message)
        metrics = dict(down)
    return metrics
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Read antnode binary version
def get_antnode_version(binary):
    """Run `binary --version` and parse out the Autonomi Node version.

    Args:
        binary: Path to the antnode executable.

    Returns:
        The version string (e.g. "0.3.1"), or 0 when the binary cannot
        be executed or its output doesn't match the expected banner.
    """
    try:
        output = subprocess.run(
            [binary, "--version"], stdout=subprocess.PIPE
        ).stdout.decode("utf-8")
        return re.findall(r"Autonomi Node v([\d\.]+)", output)[0]
    except Exception as error:
        template = "In GAV - An exception of type {0} occurred. Arguments:\n{1!r}"
        logging.info(template.format(type(error).__name__, error.args))
        return 0
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# A node's age is taken from the mtime of its secret-key file
def get_node_age(root_dir):
    """Return the node's identity-creation time (epoch seconds), or 0.

    Uses the modification time of <root_dir>/secret-key as a proxy for
    when the node was created.
    """
    key_file = "{0}/secret-key".format(root_dir)
    try:
        return int(os.stat(key_file).st_mtime)
    except (FileNotFoundError, OSError) as exc:
        logging.debug(f"Unable to get node age for {root_dir}: {exc}")
        return 0
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Survey nodes by reading metadata from metrics ports or binary --version
def _get_system_boot_time():
    """Best-effort system boot time in epoch seconds; 0 when undetectable.

    macOS: parses `sysctl -n kern.boottime`; elsewhere: `uptime --since`.
    """
    try:
        if PLATFORM == "Darwin":
            out = subprocess.run(
                ["sysctl", "-n", "kern.boottime"],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                check=True,
            ).stdout.decode("utf-8")
            # Output looks like: { sec = 1234567890, usec = 0 }
            match = re.search(r"sec = (\d+)", out)
            if not match:
                raise ValueError("Could not parse kern.boottime")
            return int(match.group(1))
        out = subprocess.run(
            ["uptime", "--since"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        ).stdout.decode("utf-8")
        if re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", out):
            return int(time.mktime(time.strptime(out.strip(), "%Y-%m-%d %H:%M:%S")))
        # BUG FIX: the original left system_start unset on a non-matching
        # `uptime` output, producing a KeyError downstream.
        return 0
    except (subprocess.CalledProcessError, ValueError, OSError) as err:
        # BUG FIX: the original did logging.error("GMM ERROR:", err) — err was
        # passed as a %-format argument with no placeholder, so the message
        # was lost to a logging formatting error. OSError added so a missing
        # external command is handled too.
        logging.error("GMM ERROR: %s", err)
        return 0


def get_machine_metrics(S, node_storage, remove_limit, crisis_bytes):
    """Collect machine-wide and fleet-wide metrics.

    Args:
        S: SQLAlchemy scoped_session factory.
        node_storage: Path whose filesystem holds node data (for disk usage).
        remove_limit: Disk-usage percent at which node removal kicks in.
        crisis_bytes: Max expected bytes per node for the crisis estimate.

    Returns:
        Dict of metrics: node counts by status, version spread, CPU /
        memory / disk / network figures, and `node_hd_crisis` (how close,
        out of 100, a full-size fleet would be to the removal limit).
    """
    metrics = {}

    with S() as session:
        db_nodes = session.execute(select(Node.status, Node.version)).all()

    # Get system start time before we probe metrics.
    metrics["system_start"] = _get_system_boot_time()

    # Snapshot throughput counters AFTER the database read so it doesn't
    # pollute the sampled rates below.
    start_time = time.time()
    start_disk_counters = psutil.disk_io_counters()
    start_net_counters = psutil.net_io_counters()

    metrics["total_nodes"] = len(db_nodes)
    status_counts = Counter(node[0] for node in db_nodes)
    metrics["running_nodes"] = status_counts[RUNNING]
    metrics["stopped_nodes"] = status_counts[STOPPED]
    metrics["restarting_nodes"] = status_counts[RESTARTING]
    metrics["upgrading_nodes"] = status_counts[UPGRADING]
    metrics["migrating_nodes"] = status_counts[MIGRATING]
    metrics["removing_nodes"] = status_counts[REMOVING]
    metrics["dead_nodes"] = status_counts[DEAD]
    metrics["antnode"] = shutil.which("antnode")
    if not metrics["antnode"]:
        logging.warning("Unable to locate current antnode binary, exiting")
        sys.exit(1)
    metrics["antnode_version"] = get_antnode_version(metrics["antnode"])
    # The queen (first) node's version, falling back to the binary's.
    metrics["queen_node_version"] = (
        db_nodes[0][1] if metrics["total_nodes"] > 0 else metrics["antnode_version"]
    )
    metrics["nodes_latest_v"] = (
        sum(1 for node in db_nodes if node[1] == metrics["antnode_version"]) or 0
    )
    metrics["nodes_no_version"] = sum(1 for node in db_nodes if not node[1]) or 0
    metrics["nodes_to_upgrade"] = (
        metrics["total_nodes"] - metrics["nodes_latest_v"] - metrics["nodes_no_version"]
    )
    metrics["nodes_by_version"] = Counter(ver[1] for ver in db_nodes)

    # Windows has to build load average over 5 seconds; the first 5 seconds
    # return 0s. Windows isn't supported today, but this note is kept so a
    # future port doesn't miss the issue.
    metrics["load_average_1"], metrics["load_average_5"], metrics["load_average_15"] = (
        psutil.getloadavg()
    )
    # CPU metrics sampled over 1 second.
    cpu_times = psutil.cpu_times_percent(1)
    if PLATFORM == "Darwin":
        # macOS: cpu_times has (user, nice, system, idle) - no iowait.
        metrics["idle_cpu_percent"] = cpu_times.idle
        metrics["io_wait"] = 0  # Not available on macOS
    else:
        # Linux: cpu_times has (user, nice, system, idle, iowait, ...).
        metrics["idle_cpu_percent"], metrics["io_wait"] = cpu_times[3:5]
    # We sampled idle percent; subtract from 100 to get used.
    metrics["used_cpu_percent"] = 100 - metrics["idle_cpu_percent"]
    vm = psutil.virtual_memory()
    metrics["used_mem_percent"] = vm.percent
    metrics["free_mem_percent"] = 100 - metrics["used_mem_percent"]
    # This only checks the drive mapped to the first node and will need to be
    # updated when we eventually support multiple drives.
    disk = psutil.disk_usage(node_storage)
    metrics["used_hd_percent"] = disk.percent
    metrics["total_hd_bytes"] = disk.total
    end_time = time.time()
    end_disk_counters = psutil.disk_io_counters()
    end_net_counters = psutil.net_io_counters()
    elapsed = end_time - start_time
    metrics["hdio_write_bytes"] = int(
        (end_disk_counters.write_bytes - start_disk_counters.write_bytes) / elapsed
    )
    metrics["hdio_read_bytes"] = int(
        (end_disk_counters.read_bytes - start_disk_counters.read_bytes) / elapsed
    )
    metrics["netio_write_bytes"] = int(
        (end_net_counters.bytes_sent - start_net_counters.bytes_sent) / elapsed
    )
    metrics["netio_read_bytes"] = int(
        (end_net_counters.bytes_recv - start_net_counters.bytes_recv) / elapsed
    )
    # How close (out of 100) to the removal limit we'd be with every node at
    # its max size (crisis_bytes, 2GB default). For running nodes with
    # Porpoise(tm).
    metrics["node_hd_crisis"] = int(
        (
            ((metrics["total_nodes"]) * int(crisis_bytes))
            / (metrics["total_hd_bytes"] * (remove_limit / 100))
        )
        * 100
    )
    return metrics
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# Persist a metrics/metadata probe result onto a node row
def update_node_from_metrics(S, id, metrics, metadata):
    """Write probe results for node `id` back to the database.

    "version" is only written when the probe actually returned one, so a
    stopped node doesn't clobber the version found by the binary check.

    Args:
        S: SQLAlchemy scoped_session factory.
        id: Primary key of the node row to update.
        metrics: Dict from read_node_metrics().
        metadata: Dict from read_node_metadata().

    Returns:
        True on success, False when the database update failed.
    """
    try:
        card = {
            "status": metrics["status"],
            "timestamp": int(time.time()),
            "uptime": metrics["uptime"],
            "records": metrics["records"],
            "shunned": metrics["shunned"],
            "connected_peers": metrics["connected_peers"],
            "peer_id": metadata["peer_id"],
        }
        if "version" in metadata:
            card["version"] = metadata["version"]
        with S() as session:
            session.query(Node).filter(Node.id == id).update(card)
            session.commit()
    except Exception as error:
        template = "In UNFM - An exception of type {0} occurred. Arguments:\n{1!r}"
        logging.warning(template.format(type(error).__name__, error.args))
        return False
    return True
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# Age out in-flight node state transitions and refresh the cached counters
def update_counters(S, old, config):
    """Reconcile in-flight transitions (REMOVING / UPGRADING / RESTARTING).

    For each pending transition whose delay timer has expired:
      - REMOVING: the node row is deleted outright.
      - UPGRADING / RESTARTING: the node is re-probed and its fresh
        metrics/metadata are written back.

    Args:
        S: SQLAlchemy scoped_session factory.
        old: Mutable counters dict (as produced by get_machine_metrics);
            the *_nodes counts are updated in place to reflect remaining work.
        config: Dict providing delay_remove / delay_upgrade / delay_start.

    Returns:
        The updated counters dict.
    """
    # Pending removals.
    if old["removing_nodes"]:
        with S() as session:
            removals = session.execute(
                select(Node.timestamp, Node.id)
                .where(Node.status == REMOVING)
                .order_by(Node.timestamp.asc())
            ).all()
        still_removing = len(removals)
        for stamp, node_id in removals:
            # Delete the entry once its delay_remove window has elapsed.
            if isinstance(stamp, int) and stamp < int(time.time()) - config["delay_remove"]:
                logging.info("Deleting removed node " + str(node_id))
                with S() as session:
                    session.execute(delete(Node).where(Node.id == node_id))
                    session.commit()
                still_removing -= 1
        old["removing_nodes"] = still_removing

    # Pending upgrades.
    if old["upgrading_nodes"]:
        with S() as session:
            upgrades = session.execute(
                select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
                .where(Node.status == UPGRADING)
                .order_by(Node.timestamp.asc())
            ).all()
        still_upgrading = len(upgrades)
        for stamp, node_id, host, m_port in upgrades:
            # Re-probe once the delay_upgrade window has elapsed.
            if isinstance(stamp, int) and stamp < int(time.time()) - config["delay_upgrade"]:
                logging.info("Updating upgraded node " + str(node_id))
                node_metrics = read_node_metrics(host, m_port)
                node_metadata = read_node_metadata(host, m_port)
                if node_metrics and node_metadata:
                    update_node_from_metrics(S, node_id, node_metrics, node_metadata)
                still_upgrading -= 1
        old["upgrading_nodes"] = still_upgrading

    # Pending restarts.
    if old["restarting_nodes"]:
        with S() as session:
            restarts = session.execute(
                select(Node.timestamp, Node.id, Node.host, Node.metrics_port)
                .where(Node.status == RESTARTING)
                .order_by(Node.timestamp.asc())
            ).all()
        still_restarting = len(restarts)
        for stamp, node_id, host, m_port in restarts:
            # Re-probe once the delay_start window has elapsed.
            if isinstance(stamp, int) and stamp < int(time.time()) - config["delay_start"]:
                logging.info("Updating restarted node " + str(node_id))
                node_metrics = read_node_metrics(host, m_port)
                node_metadata = read_node_metadata(host, m_port)
                if node_metrics and node_metadata:
                    update_node_from_metrics(S, node_id, node_metrics, node_metadata)
                still_restarting -= 1
        old["restarting_nodes"] = still_restarting

    return old
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# Refresh stored metrics/metadata for every non-disabled node
def update_nodes(S):
    """Probe all non-DISABLED nodes and persist their current state.

    Skips the database write for nodes that were already marked STOPPED
    and still report STOPPED, to avoid pointless row churn.

    Args:
        S: SQLAlchemy scoped_session factory.
    """
    with S() as session:
        nodes = session.execute(
            select(Node.timestamp, Node.id, Node.host, Node.metrics_port, Node.status)
            .where(Node.status != DISABLED)
            .order_by(Node.timestamp.asc())
        ).all()
    for stamp, node_id, host, m_port, status in nodes:
        # Only rows with a real integer timestamp are probed.
        if not isinstance(stamp, int):
            continue
        logging.debug("Updating info on node " + str(node_id))
        node_metrics = read_node_metrics(host, m_port)
        node_metadata = read_node_metadata(host, m_port)
        if node_metrics and node_metadata:
            # Don't write updates for stopped nodes already marked as stopped.
            if node_metadata["status"] == STOPPED and status == STOPPED:
                continue
            update_node_from_metrics(S, node_id, node_metrics, node_metadata)