sbcli-pre 1.2.4 (zip) → 1.2.5 (zip)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/PKG-INFO +20 -5
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/README.md +19 -4
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/env_var +1 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/PKG-INFO +20 -5
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/SOURCES.txt +5 -5
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_cli/cli.py +115 -113
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/cluster_ops.py +238 -141
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/constants.py +7 -5
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/caching_node_controller.py +6 -8
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/cluster_events.py +0 -9
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/device_controller.py +63 -56
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/events_controller.py +3 -5
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/health_controller.py +40 -30
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/lvol_controller.py +38 -51
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/pool_controller.py +4 -8
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/snapshot_controller.py +3 -9
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/distr_controller.py +9 -13
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/kv_store.py +29 -47
- sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +80 -0
- sbcli_pre-1.2.5/simplyblock_core/models/deployer.py +62 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/events.py +1 -9
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/job_schedule.py +0 -6
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/nvme_device.py +4 -42
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/storage_node.py +1 -9
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/rpc_client.py +10 -55
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/__init__.py +4 -0
- sbcli_pre-1.2.4/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 → sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml +5 -54
- sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +22 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/deploy_stack.sh +0 -2
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +13 -22
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm.yml +2 -17
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/haproxy.cfg +0 -15
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/install_deps.sh +0 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/device_monitor.py +46 -5
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/distr_event_collector.py +11 -10
- sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +136 -0
- sbcli_pre-1.2.4/simplyblock_core/services/tasks_runner_restart.py → sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py +46 -95
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_monitor.py +1 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_stat_collector.py +1 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/port_stat_collector.py +1 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/storage_node_monitor.py +44 -49
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/snode_client.py +0 -12
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/storage_node_ops.py +336 -525
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/utils.py +1 -46
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/app.py +2 -1
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/snode_ops.py +25 -103
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_cluster.py +43 -20
- sbcli_pre-1.2.5/simplyblock_web/blueprints/web_api_deployer.py +394 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_device.py +7 -10
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_lvol.py +5 -9
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_pool.py +5 -14
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_storage_node.py +10 -3
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/node_utils.py +2 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/utils.py +0 -8
- sbcli_pre-1.2.4/simplyblock_core/controllers/tasks_controller.py +0 -103
- sbcli_pre-1.2.4/simplyblock_core/controllers/tasks_events.py +0 -37
- sbcli_pre-1.2.4/simplyblock_core/mgmt_node_ops.py +0 -205
- sbcli_pre-1.2.4/simplyblock_core/services/health_check_service.py +0 -134
- sbcli_pre-1.2.4/simplyblock_core/services/tasks_runner_migration.py +0 -61
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/pyproject.toml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/dependency_links.txt +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/entry_points.txt +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/requires.txt +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/top_level.txt +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/setup.cfg +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/setup.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_cli/main.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/__init__.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/cnode_client.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/compute_node_ops.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/__init__.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/device_events.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/lvol_events.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/mgmt_events.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/pool_events.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/snapshot_events.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/storage_events.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/__init__.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/base_model.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/caching_node.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/cluster.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/compute_node.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/global_settings.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/iface.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/lvol_model.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/mgmt_node.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/pool.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/port_stat.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/snapshot.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/stats.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/pci_utils.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/config_docker.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/devices.json +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/pools.json +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/datasource.yml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/db_config_double.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/db_config_single.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/prometheus.yml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/run_ssh.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/set_db_config.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/stack_deploy_wait.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/__init__.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/caching_node_monitor.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/cap_monitor.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/install_service.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/log_agg_service.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/remove_service.sh +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/service_template.service +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/shell_utils.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/__init__.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/auth_middleware.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/__init__.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_basic.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/caching_node_app.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/caching_node_app_k8s.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/node_webapp.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/snode_app.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/delete.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy_cnode.yaml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy_spdk.yaml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/is_up.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/list_deps.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/rpac.yaml +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/tst.py +0 -0
- {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py (new file)

```diff
@@ -0,0 +1,136 @@
+# coding=utf-8
+import logging
+
+import time
+import sys
+from datetime import datetime
+
+
+from simplyblock_core.controllers import health_controller, storage_events, device_events
+from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.rpc_client import RPCClient
+from simplyblock_core import constants, kv_store
+
+# Import the GELF logger
+from graypy import GELFUDPHandler
+
+def set_node_health_check(snode, health_check_status):
+    snode = db_controller.get_storage_node_by_id(snode.get_id())
+    if snode.health_check == health_check_status:
+        return
+    old_status = snode.health_check
+    snode.health_check = health_check_status
+    snode.updated_at = str(datetime.now())
+    snode.write_to_db(db_store)
+    storage_events.snode_health_check_change(snode, snode.health_check, old_status, caused_by="monitor")
+
+
+def set_device_health_check(cluster_id, device, health_check_status):
+    if device.health_check == health_check_status:
+        return
+    nodes = db_controller.get_storage_nodes()
+    for node in nodes:
+        if node.nvme_devices:
+            for dev in node.nvme_devices:
+                if dev.get_id() == device.get_id():
+                    old_status = dev.health_check
+                    dev.health_check = health_check_status
+                    node.write_to_db(db_store)
+                    device_events.device_health_check_change(
+                        dev, dev.health_check, old_status, caused_by="monitor")
+
+
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_store = kv_store.KVStore()
+db_controller = kv_store.DBController()
+
+logger.info("Starting health check service")
+while True:
+    cluster_id = ""
+    cl = db_controller.get_clusters()
+    if cl:
+        cluster_id = cl[0].get_id()
+
+    snodes = db_controller.get_storage_nodes()
+    if not snodes:
+        logger.error("storage nodes list is empty")
+
+    for snode in snodes:
+        logger.info("Node: %s, status %s", snode.get_id(), snode.status)
+
+        if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+            logger.info(f"Node status is: {snode.status}, skipping")
+            continue
+
+        # 1- check node ping
+        ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+        logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+        # 2- check node API
+        node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+        logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+        if snode.status == StorageNode.STATUS_OFFLINE:
+            set_node_health_check(snode, ping_check & node_api_check)
+            continue
+
+        # 3- check node RPC
+        node_rpc_check = health_controller._check_node_rpc(
+            snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+        logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+        # 4- docker API
+        node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+        logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+        is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+
+        health_check_status = is_node_online
+        if not node_rpc_check:
+            logger.info("Putting all devices to unavailable state because RPC check failed")
+            for dev in snode.nvme_devices:
+                if dev.io_error:
+                    logger.debug(f"Skipping Device action because of io_error {dev.get_id()}")
+                    continue
+                set_device_health_check(cluster_id, dev, False)
+        else:
+            logger.info(f"Node device count: {len(snode.nvme_devices)}")
+            node_devices_check = True
+            node_remote_devices_check = True
+
+            for dev in snode.nvme_devices:
+                if dev.io_error:
+                    logger.debug(f"Skipping Device check because of io_error {dev.get_id()}")
+                    continue
+                ret = health_controller.check_device(dev.get_id())
+                set_device_health_check(cluster_id, dev, ret)
+                if dev.status == dev.STATUS_ONLINE:
+                    node_devices_check &= ret
+
+            logger.info(f"Node remote device: {len(snode.remote_devices)}")
+            rpc_client = RPCClient(
+                snode.mgmt_ip, snode.rpc_port,
+                snode.rpc_username, snode.rpc_password,
+                timeout=5, retry=3)
+            for remote_device in snode.remote_devices:
+                ret = rpc_client.get_bdevs(remote_device.remote_bdev)
+                if ret:
+                    logger.info(f"Checking bdev: {remote_device.remote_bdev} ... ok")
+                else:
+                    logger.info(f"Checking bdev: {remote_device.remote_bdev} ... not found")
+                node_remote_devices_check &= bool(ret)
+
+            health_check_status = is_node_online and node_devices_check and node_remote_devices_check
+        set_node_health_check(snode, health_check_status)
+
+    time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC)
+
```
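For orientation (not part of the package diff): the new service reduces a node's health flag to a boolean AND of the individual probes and, when RPC is reachable, of the per-device checks as well. A minimal, simplified sketch of that aggregation, using plain booleans in place of the real health_controller calls:

```python
# Illustrative sketch only -- plain booleans stand in for the probes that
# health_check_service.py runs (ping, node API, node RPC, docker API).
def node_health(ping_ok, api_ok, rpc_ok, docker_ok, device_checks):
    is_node_online = ping_ok and api_ok and rpc_ok and docker_ok
    if not rpc_ok:
        # RPC unreachable: devices are set unavailable and the node is unhealthy
        return False
    devices_ok = all(device_checks) if device_checks else True
    return is_node_online and devices_ok

# One failing device check is enough to mark the whole node unhealthy
assert node_health(True, True, True, True, [True, False]) is False
```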
sbcli_pre-1.2.4/simplyblock_core/services/tasks_runner_restart.py → sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py

```diff
@@ -5,7 +5,7 @@ import sys
 
 
 from simplyblock_core import constants, kv_store, storage_node_ops
-from simplyblock_core.controllers import device_controller
+from simplyblock_core.controllers import device_controller
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.nvme_device import NVMeDevice
 
@@ -15,19 +15,6 @@ from graypy import GELFUDPHandler
 from simplyblock_core.models.storage_node import StorageNode
 
 
-# configure logging
-logger_handler = logging.StreamHandler(stream=sys.stdout)
-logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
-gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
-logger = logging.getLogger()
-logger.addHandler(gelf_handler)
-logger.addHandler(logger_handler)
-logger.setLevel(logging.DEBUG)
-
-# get DB controller
-db_controller = kv_store.DBController()
-
-
 def _get_node_unavailable_devices_count(node_id):
     node = db_controller.get_storage_node_by_id(node_id)
     devices = []
@@ -44,20 +31,10 @@ def _get_device(task):
     return dev
 
 
-def _validate_no_task_node_restart(cluster_id, node_id):
-    tasks = db_controller.get_job_tasks(cluster_id)
-    for task in tasks:
-        if task.function_name == JobSchedule.FN_NODE_RESTART and task.node_id == node_id:
-            if task.status != JobSchedule.STATUS_DONE:
-                logger.info(f"Task found, skip adding new task: {task.get_id()}")
-                return False
-    return True
-
-
 def task_runner(task):
-    if task.function_name ==
+    if task.function_name == "device_restart":
         return task_runner_device(task)
-    if task.function_name ==
+    if task.function_name == "node_restart":
         return task_runner_node(task)
 
 
@@ -72,55 +49,38 @@ def task_runner_device(task):
         device_controller.device_set_retries_exhausted(device.get_id(), True)
         return True
 
-    if not _validate_no_task_node_restart(task.cluster_id, task.node_id):
-        task.function_result = "canceled: node restart found"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        device_controller.device_set_unavailable(device.get_id())
-        return True
-
-    if task.canceled:
-        task.function_result = "canceled"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        return True
-
     node = db_controller.get_storage_node_by_id(task.node_id)
     if node.status != StorageNode.STATUS_ONLINE:
-        logger.error(f"Node is not online: {node.get_id()},
+        logger.error(f"Node is not online: {node.get_id()} , skipping task: {task.get_id()}")
         task.function_result = "Node is offline"
         task.retry += 1
         task.write_to_db(db_controller.kv_store)
        return False
 
     if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
-        logger.info(f"Device is online: {device.get_id()}")
-        task.function_result = "
+        logger.info(f"Device is online: {device.get_id()}, no restart needed")
+        task.function_result = "skipped because dev is online"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True
 
-
-
-
+    task.status = JobSchedule.STATUS_RUNNING
+    task.write_to_db(db_controller.kv_store)
+
+    # resetting device
+    logger.info(f"Resetting device {device.get_id()}")
+    device_controller.reset_storage_device(device.get_id())
+    time.sleep(5)
+    device = _get_device(task)
+    if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
+        logger.info(f"Device is online: {device.get_id()}")
+        task.function_result = "done"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True
 
-
-
-    task.write_to_db(db_controller.kv_store)
-    tasks_events.task_updated(task)
-
-    # set device online for the first 3 retries
-    if task.retry < 3:
-        logger.info(f"Set device online {device.get_id()}")
-        device_controller.device_set_online(device.get_id())
-    else:
-        logger.info(f"Restarting device {device.get_id()}")
-        device_controller.restart_device(device.get_id(), force=True)
-
-    # check device status
+    logger.info(f"Restarting device {device.get_id()}")
+    device_controller.restart_device(device.get_id(), force=True)
     time.sleep(5)
     device = _get_device(task)
     if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
@@ -144,37 +104,22 @@ def task_runner_node(task):
         storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_UNREACHABLE)
         return True
 
-    if node.
-        logger.info(f"Node is
-        task.function_result =
+    if _get_node_unavailable_devices_count(node.get_id()) == 0:
+        logger.info(f"Node is online: {node.get_id()}, no restart needed")
+        task.function_result = "skipped because node is online"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True
 
-
-
-        task.function_result = "Node is online"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        return True
-
-    if task.canceled:
-        task.function_result = "canceled"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        return True
-
-    if task.status != JobSchedule.STATUS_RUNNING:
-        task.status = JobSchedule.STATUS_RUNNING
-        task.write_to_db(db_controller.kv_store)
-        tasks_events.task_updated(task)
+    task.status = JobSchedule.STATUS_RUNNING
+    task.write_to_db(db_controller.kv_store)
 
     # shutting down node
     logger.info(f"Shutdown node {node.get_id()}")
     ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True)
     if ret:
         logger.info(f"Node shutdown succeeded")
-        time.sleep(
+        time.sleep(5)
 
     # resetting node
     logger.info(f"Restart node {node.get_id()}")
@@ -182,9 +127,8 @@ def task_runner_node(task):
     if ret:
         logger.info(f"Node restart succeeded")
 
-
-
-        logger.info(f"Node is online: {node.get_id()}")
+    if _get_node_unavailable_devices_count(node.get_id()) == 0:
+        logger.info(f"Node is online: {node.get_id()}, no restart needed")
         task.function_result = "done"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
@@ -195,7 +139,19 @@ def task_runner_node(task):
     return False
 
 
-
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_controller = kv_store.DBController()
+
+logger.info("Starting Jobs runner...")
 while True:
     time.sleep(3)
     clusters = db_controller.get_clusters()
@@ -203,16 +159,11 @@ while True:
         logger.error("No clusters found!")
     else:
         for cl in clusters:
-            tasks = db_controller.get_job_tasks(cl.get_id()
+            tasks = db_controller.get_job_tasks(cl.get_id())
             for task in tasks:
                 delay_seconds = constants.TASK_EXEC_INTERVAL_SEC
-
-
-
-
-
-                if res:
-                    tasks_events.task_updated(task)
-                else:
-                    time.sleep(delay_seconds)
-                    delay_seconds *= 2
+                while task.status != JobSchedule.STATUS_DONE:
+                    res = task_runner(task)
+                    if res is False:
+                        time.sleep(delay_seconds)
+                        delay_seconds *= 2
```
{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/capacity_and_stats_collector.py

```diff
@@ -115,7 +115,7 @@ logger.info("Starting stats collector...")
 while True:
 
     pools = db_controller.get_pools()
-    all_lvols = db_controller.get_lvols()
+    all_lvols = db_controller.get_lvols()
     for pool in pools:
         lvols = []
         for lvol in all_lvols:
```
{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/storage_node_monitor.py

```diff
@@ -8,7 +8,7 @@ from datetime import datetime
 
 
 from simplyblock_core import constants, kv_store, cluster_ops, storage_node_ops, distr_controller
-from simplyblock_core.controllers import health_controller, device_controller
+from simplyblock_core.controllers import storage_events, health_controller, device_controller
 from simplyblock_core.models.cluster import Cluster
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
@@ -31,8 +31,8 @@ db_store = kv_store.KVStore()
 db_controller = kv_store.DBController(kv_store=db_store)
 
 
-def get_cluster_target_status(
-    snodes = db_controller.
+def get_cluster_target_status(cluster):
+    snodes = db_controller.get_storage_nodes()
 
     online_nodes = 0
     offline_nodes = 0
@@ -66,8 +66,8 @@ def get_cluster_target_status(cluster_id):
     logger.debug(f"online_devices: {online_devices}")
     logger.debug(f"offline_devices: {offline_devices}")
 
-    # if more than two affected
-    if affected_nodes > 2
+    # if more than two affected modes then cluster is suspended
+    if affected_nodes > 2:
         return Cluster.STATUS_SUSPENDED
 
     # if any device goes offline then cluster is degraded
@@ -85,7 +85,7 @@ def update_cluster_status(cluster_id):
     cluster = db_controller.get_cluster_by_id(cluster_id)
 
     if cluster.ha_type == "ha":
-        cluster_target_status = get_cluster_target_status(
+        cluster_target_status = get_cluster_target_status(cluster)
         logger.info(f"Target cluster status {cluster_target_status}, current status: {cluster.status}")
         if cluster.status == cluster_target_status:
             return
@@ -111,53 +111,48 @@ def set_node_online(node):
 def set_node_offline(node):
     if node.status != StorageNode.STATUS_UNREACHABLE:
         storage_node_ops.set_node_status(snode.get_id(), StorageNode.STATUS_UNREACHABLE)
-        # add node to auto restart
-        tasks_controller.add_node_to_auto_restart(node)
 
 
 logger.info("Starting node monitor")
 while True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            device_controller.device_set_unavailable(dev.get_id())
-
-        update_cluster_status(cluster_id)
+    # get storage nodes
+    nodes = db_controller.get_storage_nodes()
+    for snode in nodes:
+        if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+            logger.info(f"Node status is: {snode.status}, skipping")
+            continue
+
+        logger.info(f"Checking node {snode.hostname}")
+
+        # 1- check node ping
+        ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+        logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+        # 2- check node API
+        node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+        logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+        # 3- check node RPC
+        node_rpc_check = health_controller._check_node_rpc(
+            snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+        logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+        # 4- docker API
+        node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+        logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+        is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+        if is_node_online:
+            set_node_online(snode)
+        else:
+            set_node_offline(snode)
+
+        if not ping_check and not node_rpc_check:
+            # node is dead, set devices offline
+            for dev in snode.nvme_devices:
+                device_controller.device_set_unavailable(dev.get_id())
+
+        update_cluster_status(snode.cluster_id)
 
     logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} seconds")
     time.sleep(constants.NODE_MONITOR_INTERVAL_SEC)
```
{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/snode_client.py

```diff
@@ -95,15 +95,3 @@ class SNodeClient:
 
     def leave_swarm(self):
         return self._request("GET", "leave_swarm")
-
-    def make_gpt_partitions(self, nbd_device, jm_percent, num_partitions):
-        params = {
-            "nbd_device": nbd_device,
-            "jm_percent": jm_percent,
-            "num_partitions": num_partitions,
-        }
-        return self._request("POST", "make_gpt_partitions", params)
-
-    def delete_dev_gpt_partitions(self, device_pci):
-        params = {"device_pci": device_pci}
-        return self._request("POST", "delete_dev_gpt_partitions", params)
```