sbcli-pre 1.2.5__zip → 1.2.7__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
- sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
- sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
- sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
- sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
- sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
- sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
- sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
- sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
- sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
- sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
- sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
- sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
simplyblock_core/services/storage_node_monitor.py

@@ -8,7 +8,7 @@ from datetime import datetime


 from simplyblock_core import constants, kv_store, cluster_ops, storage_node_ops, distr_controller
-from simplyblock_core.controllers import
+from simplyblock_core.controllers import health_controller, device_controller, tasks_controller
 from simplyblock_core.models.cluster import Cluster
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
@@ -31,8 +31,8 @@ db_store = kv_store.KVStore()
 db_controller = kv_store.DBController(kv_store=db_store)


-def get_cluster_target_status(cluster):
-    snodes = db_controller.
+def get_cluster_target_status(cluster_id):
+    snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)

     online_nodes = 0
     offline_nodes = 0
@@ -66,8 +66,8 @@ def get_cluster_target_status(cluster):
     logger.debug(f"online_devices: {online_devices}")
     logger.debug(f"offline_devices: {offline_devices}")

-    # if more than two affected
-    if affected_nodes > 2:
+    # if more than two affected nodes then cluster is suspended
+    if affected_nodes > 2 or offline_nodes > 2:
         return Cluster.STATUS_SUSPENDED

     # if any device goes offline then cluster is degraded
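Taken together, the two hunks above mean `get_cluster_target_status` now works from a `cluster_id` and suspends the cluster when either the affected-node or the offline-node count exceeds two. A minimal, self-contained sketch of the resulting decision rule (the suspended rule is verbatim from the diff; the exact degraded condition is an assumption):

```python
# Illustrative mirror of get_cluster_target_status's decision rule.
# Status names follow the Cluster model; the degraded condition below
# is an assumption, only the suspended rule appears verbatim above.
STATUS_ACTIVE = "active"
STATUS_DEGRADED = "degraded"
STATUS_SUSPENDED = "suspended"


def cluster_target_status(affected_nodes: int, offline_nodes: int, offline_devices: int) -> str:
    # if more than two affected nodes then cluster is suspended
    if affected_nodes > 2 or offline_nodes > 2:
        return STATUS_SUSPENDED
    # if any device goes offline then cluster is degraded (assumed rule)
    if offline_devices > 0 or offline_nodes > 0:
        return STATUS_DEGRADED
    return STATUS_ACTIVE


print(cluster_target_status(0, 0, 0))  # active
print(cluster_target_status(1, 1, 2))  # degraded
print(cluster_target_status(3, 0, 0))  # suspended
```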
@@ -85,7 +85,7 @@ def update_cluster_status(cluster_id):
     cluster = db_controller.get_cluster_by_id(cluster_id)

     if cluster.ha_type == "ha":
-        cluster_target_status = get_cluster_target_status(
+        cluster_target_status = get_cluster_target_status(cluster_id)
         logger.info(f"Target cluster status {cluster_target_status}, current status: {cluster.status}")
         if cluster.status == cluster_target_status:
             return
@@ -111,48 +111,53 @@ def set_node_online(node):
 def set_node_offline(node):
     if node.status != StorageNode.STATUS_UNREACHABLE:
         storage_node_ops.set_node_status(snode.get_id(), StorageNode.STATUS_UNREACHABLE)
+        # add node to auto restart
+        tasks_controller.add_node_to_auto_restart(node)


 logger.info("Starting node monitor")
 while True:
-    [old loop body, lines 118-155: content not preserved in this diff view]
+    clusters = db_controller.get_clusters()
+    for cluster in clusters:
+        cluster_id = cluster.get_id()
+        # get storage nodes
+        nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+        for snode in nodes:
+            if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+                logger.info(f"Node status is: {snode.status}, skipping")
+                continue
+
+            logger.info(f"Checking node {snode.hostname}")
+
+            # 1- check node ping
+            ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+            logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+            # 2- check node API
+            node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+            logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+            # 3- check node RPC
+            node_rpc_check = health_controller._check_node_rpc(
+                snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+            logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+            # 4- docker API
+            node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+            logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+            is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+            if is_node_online:
+                set_node_online(snode)
+            else:
+                set_node_offline(snode)
+
+            if not ping_check and not node_rpc_check:
+                # node is dead, set devices offline
+                for dev in snode.nvme_devices:
+                    device_controller.device_set_unavailable(dev.get_id())
+
+        update_cluster_status(cluster_id)

     logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} seconds")
     time.sleep(constants.NODE_MONITOR_INTERVAL_SEC)
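The rewritten monitor loop derives node liveness from four independent probes (ping, HTTP API, SPDK RPC, Docker API): a node counts as online only when all four pass, and its devices are marked unavailable only when both ping and RPC fail. A condensed sketch of that aggregation, with plain callables standing in for the `health_controller._check_node_*` helpers:

```python
from typing import Callable, Dict, Tuple


def evaluate_node(probes: Dict[str, Callable[[], bool]]) -> Tuple[bool, bool]:
    """Return (is_online, is_dead) from named liveness probes.

    The probe names mirror the checks in the loop above; the callables
    are stand-ins for health_controller's _check_node_* helpers.
    """
    results = {name: probe() for name, probe in probes.items()}
    is_online = all(results.values())                      # every check must pass
    is_dead = not results["ping"] and not results["rpc"]   # devices go unavailable
    return is_online, is_dead


online, dead = evaluate_node({
    "ping": lambda: True,
    "api": lambda: False,   # e.g. the node API on :5000 timed out
    "rpc": lambda: True,
    "docker": lambda: True,
})
print(online, dead)  # False False -> node set offline, devices left alone
```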
simplyblock_core/services/tasks_runner_migration.py (new file)

@@ -0,0 +1,61 @@
+# coding=utf-8
+import logging
+import time
+import sys
+
+
+from simplyblock_core import constants, kv_store
+from simplyblock_core.controllers import tasks_events
+from simplyblock_core.models.job_schedule import JobSchedule
+
+
+# Import the GELF logger
+from graypy import GELFUDPHandler
+
+
+def task_runner(task):
+    task.status = JobSchedule.STATUS_RUNNING
+    task.write_to_db(db_controller.kv_store)
+    tasks_events.task_updated(task)
+
+    time.sleep(30)
+
+    task.function_result = "sleep 30"
+    task.status = JobSchedule.STATUS_DONE
+    task.write_to_db(db_controller.kv_store)
+    tasks_events.task_updated(task)
+
+    return True
+
+
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_controller = kv_store.DBController()
+
+logger.info("Starting Tasks runner...")
+while True:
+    time.sleep(3)
+    clusters = db_controller.get_clusters()
+    if not clusters:
+        logger.error("No clusters found!")
+    else:
+        for cl in clusters:
+            tasks = db_controller.get_job_tasks(cl.get_id(), reverse=False)
+            for task in tasks:
+                delay_seconds = constants.TASK_EXEC_INTERVAL_SEC
+                if task.function_name == JobSchedule.FN_DEV_MIG:
+                    while task.status != JobSchedule.STATUS_DONE:
+                        res = task_runner(task)
+                        if res:
+                            tasks_events.task_updated(task)
+                        else:
+                            time.sleep(delay_seconds)
+                            delay_seconds *= 2
simplyblock_core/services/job_tasks.py → simplyblock_core/services/tasks_runner_restart.py

@@ -5,7 +5,7 @@ import sys


 from simplyblock_core import constants, kv_store, storage_node_ops
-from simplyblock_core.controllers import device_controller
+from simplyblock_core.controllers import device_controller, tasks_events
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.nvme_device import NVMeDevice

@@ -15,6 +15,19 @@ from graypy import GELFUDPHandler
 from simplyblock_core.models.storage_node import StorageNode


+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_controller = kv_store.DBController()
+
+
 def _get_node_unavailable_devices_count(node_id):
     node = db_controller.get_storage_node_by_id(node_id)
     devices = []
@@ -31,10 +44,20 @@ def _get_device(task):
     return dev


+def _validate_no_task_node_restart(cluster_id, node_id):
+    tasks = db_controller.get_job_tasks(cluster_id)
+    for task in tasks:
+        if task.function_name == JobSchedule.FN_NODE_RESTART and task.node_id == node_id:
+            if task.status != JobSchedule.STATUS_DONE:
+                logger.info(f"Task found, skip adding new task: {task.get_id()}")
+                return False
+    return True
+
+
 def task_runner(task):
-    if task.function_name ==
+    if task.function_name == JobSchedule.FN_DEV_RESTART:
         return task_runner_device(task)
-    if task.function_name ==
+    if task.function_name == JobSchedule.FN_NODE_RESTART:
         return task_runner_node(task)


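The new `_validate_no_task_node_restart` guard keeps a device-restart task from racing a pending node-restart task on the same node: if an unfinished node restart exists, the device task cancels itself (as the next hunk shows). A self-contained sketch of that check (constants mirror `JobSchedule`; the task objects are simple stand-ins):

```python
from dataclasses import dataclass

FN_NODE_RESTART = "node_restart"   # mirrors JobSchedule.FN_NODE_RESTART
STATUS_DONE = "done"               # mirrors JobSchedule.STATUS_DONE


@dataclass
class Task:
    function_name: str
    node_id: str
    status: str


def no_pending_node_restart(tasks, node_id: str) -> bool:
    # False as soon as an unfinished node-restart task targets this node
    for t in tasks:
        if t.function_name == FN_NODE_RESTART and t.node_id == node_id:
            if t.status != STATUS_DONE:
                return False
    return True


tasks = [Task(FN_NODE_RESTART, "node-1", "running")]
print(no_pending_node_restart(tasks, "node-1"))  # False -> device task cancels itself
print(no_pending_node_restart(tasks, "node-2"))  # True  -> device task may proceed
```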
@@ -49,38 +72,55 @@ def task_runner_device(task):
         device_controller.device_set_retries_exhausted(device.get_id(), True)
         return True

+    if not _validate_no_task_node_restart(task.cluster_id, task.node_id):
+        task.function_result = "canceled: node restart found"
+        task.status = JobSchedule.STATUS_DONE
+        task.write_to_db(db_controller.kv_store)
+        device_controller.device_set_unavailable(device.get_id())
+        return True
+
+    if task.canceled:
+        task.function_result = "canceled"
+        task.status = JobSchedule.STATUS_DONE
+        task.write_to_db(db_controller.kv_store)
+        return True
+
     node = db_controller.get_storage_node_by_id(task.node_id)
     if node.status != StorageNode.STATUS_ONLINE:
-        logger.error(f"Node is not online: {node.get_id()}
+        logger.error(f"Node is not online: {node.get_id()}, retry")
         task.function_result = "Node is offline"
         task.retry += 1
        task.write_to_db(db_controller.kv_store)
         return False

     if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
-        logger.info(f"Device is online: {device.get_id()}
-        task.function_result = "
+        logger.info(f"Device is online: {device.get_id()}")
+        task.function_result = "Device is online"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True

-
-
-
-    # resetting device
-    logger.info(f"Resetting device {device.get_id()}")
-    device_controller.reset_storage_device(device.get_id())
-    time.sleep(5)
-    device = _get_device(task)
-    if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
-        logger.info(f"Device is online: {device.get_id()}")
-        task.function_result = "done"
+    if device.status in [NVMeDevice.STATUS_REMOVED, NVMeDevice.STATUS_FAILED]:
+        logger.info(f"Device is not unavailable: {device.get_id()}, {device.status} , stopping task")
+        task.function_result = f"stopped because dev is {device.status}"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True

-
-
+    if task.status != JobSchedule.STATUS_RUNNING:
+        task.status = JobSchedule.STATUS_RUNNING
+        task.write_to_db(db_controller.kv_store)
+        tasks_events.task_updated(task)
+
+    # set device online for the first 3 retries
+    if task.retry < 3:
+        logger.info(f"Set device online {device.get_id()}")
+        device_controller.device_set_online(device.get_id())
+    else:
+        logger.info(f"Restarting device {device.get_id()}")
+        device_controller.restart_device(device.get_id(), force=True)
+
+    # check device status
     time.sleep(5)
     device = _get_device(task)
     if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
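The device runner also escalates instead of always resetting the device: for the first three retries it merely flips the device back online, then falls back to a forced device restart. A compact sketch of that policy (the two callables stand in for `device_controller.device_set_online` and `device_controller.restart_device`):

```python
def recover_device(device_id: str, retry_count: int, set_online, restart_device) -> str:
    """Escalating recovery: cheap action first, forceful restart after 3 retries."""
    if retry_count < 3:
        set_online(device_id)                  # cheap: just mark it online
        return "set_online"
    restart_device(device_id, force=True)      # forceful: full device restart
    return "restart"


actions = [
    recover_device("dev-1", retry,
                   set_online=lambda d: None,
                   restart_device=lambda d, force: None)
    for retry in range(5)
]
print(actions)  # ['set_online', 'set_online', 'set_online', 'restart', 'restart']
```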
@@ -104,22 +144,37 @@ def task_runner_node(task):
         storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_UNREACHABLE)
         return True

-    if
-    logger.info(f"Node is
-    task.function_result = "
+    if node.status == StorageNode.STATUS_REMOVED:
+        logger.info(f"Node is removed: {task.node_id}, stopping task")
+        task.function_result = f"Node is removed"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True

-
-
+    if _get_node_unavailable_devices_count(node.get_id()) == 0 and node.status == StorageNode.STATUS_ONLINE:
+        logger.info(f"Node is online: {node.get_id()}")
+        task.function_result = "Node is online"
+        task.status = JobSchedule.STATUS_DONE
+        task.write_to_db(db_controller.kv_store)
+        return True
+
+    if task.canceled:
+        task.function_result = "canceled"
+        task.status = JobSchedule.STATUS_DONE
+        task.write_to_db(db_controller.kv_store)
+        return True
+
+    if task.status != JobSchedule.STATUS_RUNNING:
+        task.status = JobSchedule.STATUS_RUNNING
+        task.write_to_db(db_controller.kv_store)
+        tasks_events.task_updated(task)

     # shutting down node
     logger.info(f"Shutdown node {node.get_id()}")
     ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True)
     if ret:
         logger.info(f"Node shutdown succeeded")
-        time.sleep(
+        time.sleep(3)

     # resetting node
     logger.info(f"Restart node {node.get_id()}")
@@ -127,8 +182,9 @@ def task_runner_node(task):
     if ret:
         logger.info(f"Node restart succeeded")

-
-
+    time.sleep(5)
+    if _get_node_unavailable_devices_count(node.get_id()) == 0 and node.status == StorageNode.STATUS_ONLINE:
+        logger.info(f"Node is online: {node.get_id()}")
         task.function_result = "done"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
@@ -139,19 +195,7 @@ def task_runner_node(task):
     return False


-
-logger_handler = logging.StreamHandler(stream=sys.stdout)
-logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
-gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
-logger = logging.getLogger()
-logger.addHandler(gelf_handler)
-logger.addHandler(logger_handler)
-logger.setLevel(logging.DEBUG)
-
-# get DB controller
-db_controller = kv_store.DBController()
-
-logger.info("Starting Jobs runner...")
+logger.info("Starting Tasks runner...")
 while True:
     time.sleep(3)
     clusters = db_controller.get_clusters()
@@ -159,11 +203,16 @@ while True:
         logger.error("No clusters found!")
     else:
         for cl in clusters:
-            tasks = db_controller.get_job_tasks(cl.get_id())
+            tasks = db_controller.get_job_tasks(cl.get_id(), reverse=False)
             for task in tasks:
                 delay_seconds = constants.TASK_EXEC_INTERVAL_SEC
-                [old lines 165-169: content not preserved in this diff view]
+                if task.function_name in [JobSchedule.FN_DEV_RESTART, JobSchedule.FN_NODE_RESTART]:
+                    while task.status != JobSchedule.STATUS_DONE:
+                        # get new task object because it could be changed from cancel task
+                        task = db_controller.get_task_by_id(task.uuid)
+                        res = task_runner(task)
+                        if res:
+                            tasks_events.task_updated(task)
+                        else:
+                            time.sleep(delay_seconds)
+                            delay_seconds *= 2
simplyblock_core/snode_client.py

@@ -95,3 +95,15 @@ class SNodeClient:

     def leave_swarm(self):
         return self._request("GET", "leave_swarm")
+
+    def make_gpt_partitions(self, nbd_device, jm_percent, num_partitions):
+        params = {
+            "nbd_device": nbd_device,
+            "jm_percent": jm_percent,
+            "num_partitions": num_partitions,
+        }
+        return self._request("POST", "make_gpt_partitions", params)
+
+    def delete_dev_gpt_partitions(self, device_pci):
+        params = {"device_pci": device_pci}
+        return self._request("POST", "delete_dev_gpt_partitions", params)