sbcli-pre 1.2.5__zip → 1.2.6__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/env_var +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/SOURCES.txt +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_cli/cli.py +113 -115
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/cluster_ops.py +138 -235
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/constants.py +5 -7
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/caching_node_controller.py +8 -6
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/cluster_events.py +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/device_controller.py +56 -63
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/events_controller.py +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/health_controller.py +30 -40
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/lvol_controller.py +51 -38
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/pool_controller.py +8 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/snapshot_controller.py +9 -3
- sbcli_pre-1.2.6/simplyblock_core/controllers/tasks_controller.py +103 -0
- sbcli_pre-1.2.6/simplyblock_core/controllers/tasks_events.py +37 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/distr_controller.py +13 -9
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/kv_store.py +47 -20
- sbcli_pre-1.2.6/simplyblock_core/mgmt_node_ops.py +205 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/events.py +9 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/job_schedule.py +6 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/nvme_device.py +42 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/storage_node.py +9 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/rpc_client.py +55 -10
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/__init__.py +0 -4
- sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.6/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/deploy_stack.sh +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/haproxy.cfg +15 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/install_deps.sh +3 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/device_monitor.py +5 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/distr_event_collector.py +10 -11
- sbcli_pre-1.2.6/simplyblock_core/services/health_check_service.py +134 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/lvol_monitor.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/lvol_stat_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/port_stat_collector.py +0 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/storage_node_monitor.py +49 -44
- sbcli_pre-1.2.6/simplyblock_core/services/tasks_runner_migration.py +61 -0
- sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.6/simplyblock_core/services/tasks_runner_restart.py +95 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/snode_client.py +12 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/storage_node_ops.py +525 -336
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/utils.py +46 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/snode_ops.py +103 -25
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_device.py +10 -7
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_pool.py +14 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_storage_node.py +3 -10
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/node_utils.py +0 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/utils.py +8 -0
- sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
- sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
- sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/README.md +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/pyproject.toml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/dependency_links.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/entry_points.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/requires.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/top_level.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/setup.cfg +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/setup.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_cli/main.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/cnode_client.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/compute_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/device_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/lvol_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/mgmt_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/pool_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/snapshot_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/storage_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/base_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/cluster.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/compute_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/global_settings.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/iface.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/lvol_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/pool.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/port_stat.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/stats.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/pci_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/config_docker.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/devices.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/pools.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/datasource.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/db_config_double.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/db_config_single.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/prometheus.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/run_ssh.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/set_db_config.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/caching_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/cap_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/install_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/log_agg_service.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/remove_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/service_template.service +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/shell_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/auth_middleware.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/node_api_basic.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/caching_node_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/caching_node_app_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/node_webapp.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/snode_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/delete.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/deploy.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/deploy_cnode.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/deploy_spdk.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/is_up.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/list_deps.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/rpac.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/tst.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
@@ -14,11 +14,11 @@ import docker
 from simplyblock_core import constants, scripts, distr_controller
 from simplyblock_core import utils
 from simplyblock_core.controllers import lvol_controller, storage_events, snapshot_controller, device_events, \
-    device_controller
+    device_controller, tasks_controller
 from simplyblock_core.kv_store import DBController
 from simplyblock_core import shell_utils
 from simplyblock_core.models.iface import IFace
-from simplyblock_core.models.nvme_device import NVMeDevice
+from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.pci_utils import get_nvme_devices, bind_spdk_driver
 from simplyblock_core.rpc_client import RPCClient
@@ -81,55 +81,50 @@ def _get_if_ip_address(ifname):
 
 
 def addNvmeDevices(cluster, rpc_client, devs, snode):
-    sequential_number = 0
     devices = []
     ret = rpc_client.bdev_nvme_controller_list()
-
-
-
-
+    ctr_map = {}
+    try:
+        if ret:
+            ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
+    except:
+        pass
 
+    next_physical_label = get_next_physical_device_order()
     for index, pcie in enumerate(devs):
 
         if pcie in ctr_map:
-
+            nvme_controller = ctr_map[pcie]
         else:
-
-            ret, err = rpc_client.bdev_nvme_controller_attach(
+            nvme_controller = "nvme_%s" % index
+            ret, err = rpc_client.bdev_nvme_controller_attach(nvme_controller, pcie)
             time.sleep(2)
-        nvme_bdev = f"{name}n1"
 
+        nvme_bdev = f"{nvme_controller}n1"
+        rpc_client.bdev_examine(nvme_bdev)
+        time.sleep(5)
         ret = rpc_client.get_bdevs(nvme_bdev)
-
-
-
-
+        nvme_dict = ret[0]
+        nvme_driver_data = nvme_dict['driver_specific']['nvme'][0]
+        model_number = nvme_driver_data['ctrlr_data']['model_number']
+        total_size = nvme_dict['block_size'] * nvme_dict['num_blocks']
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            'cluster_id': snode.cluster_id,
-
-            # 'nvmf_nqn': subsystem_nqn,
-            # 'nvmf_ip': IP,
-            # 'nvmf_port': 4420,
-
-            'status': 'online'
-        }))
-        sequential_number += device_partitions_count
+        devices.append(
+            NVMeDevice({
+                'uuid': str(uuid.uuid4()),
+                'device_name': nvme_dict['name'],
+                'size': total_size,
+                'physical_label': next_physical_label,
+                'pcie_address': nvme_driver_data['pci_address'],
+                'model_id': model_number,
+                'serial_number': nvme_driver_data['ctrlr_data']['serial_number'],
+                'nvme_bdev': nvme_bdev,
+                'nvme_controller': nvme_controller,
+                'node_id': snode.get_id(),
+                'cluster_id': snode.cluster_id,
+                'status': NVMeDevice.STATUS_ONLINE
+            }))
+        next_physical_label += 1
     return devices
 
 
@@ -197,10 +192,10 @@ def _run_nvme_smart_log_add(dev_name):
     return data
 
 
-def get_next_cluster_device_order(db_controller):
+def get_next_cluster_device_order(db_controller, cluster_id):
     max_order = 0
     found = False
-    for node in db_controller.
+    for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id):
         for dev in node.nvme_devices:
             found = True
             max_order = max(max_order, dev.cluster_device_order)
@@ -209,91 +204,319 @@ get_next_cluster_device_order(db_controller):
     return 0
 
 
-def
+def get_next_physical_device_order():
     db_controller = DBController()
+    max_order = 0
+    found = False
+    for node in db_controller.get_storage_nodes():
+        for dev in node.nvme_devices:
+            found = True
+            max_order = max(max_order, dev.physical_label)
+    if found:
+        return max_order + 1
+    return 0
+
+
+def _search_for_partitions(rpc_client, nvme_device):
+    partitioned_devices = []
+    for bdev in rpc_client.get_bdevs():
+        name = bdev['name']
+        if name.startswith(f"{nvme_device.nvme_bdev}p"):
+            new_dev = NVMeDevice(nvme_device.to_dict())
+            new_dev.uuid = str(uuid.uuid4())
+            new_dev.device_name = name
+            new_dev.nvme_bdev = name
+            new_dev.size = bdev['block_size'] * bdev['num_blocks']
+            partitioned_devices.append(new_dev)
+    return partitioned_devices
+
 
+def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart):
+    raid_bdev = f"raid_jm_{snode.get_id()}"
+    ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs)
+    if not ret:
+        logger.error(f"Failed to create raid_jm_{snode.get_id()}")
+        return False
+    alceml_name = f"alceml_jm_{snode.get_id()}"
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, raid_bdev, str(uuid.uuid4()), pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
+    ret = rpc_client.get_bdevs(raid_bdev)
+
+    return JMDevice({
+        'uuid': str(uuid.uuid4()),
+        'device_name': jm_bdev,
+        'size': ret[0]["block_size"] * ret[0]["num_blocks"],
+        'status': JMDevice.STATUS_ONLINE,
+        'jm_nvme_bdev_list': jm_nvme_bdevs,
+        'raid_bdev': raid_bdev,
+        'alceml_bdev': alceml_name,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart):
+
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, nvme.nvme_bdev, alceml_id, pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
+
+    return JMDevice({
+        'uuid': alceml_id,
+        'device_name': jm_bdev,
+        'size': nvme.size,
+        'status': JMDevice.STATUS_ONLINE,
+        'alceml_bdev': alceml_name,
+        'nvme_bdev': nvme.nvme_bdev,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_storage_device_stack(rpc_client, nvme, snode, after_restart):
+    test_name = f"{nvme.nvme_bdev}_test"
+    ret = rpc_client.bdev_passtest_create(test_name, nvme.nvme_bdev)
+    if not ret:
+        logger.error(f"Failed to create passtest bdev {test_name}")
+        return False
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode,
+                                        dev_cpu_mask=snode.dev_cpu_mask)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    # add pass through
+    pt_name = f"{alceml_name}_PT"
+    ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
+        return False
+
+    subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
+    logger.info("creating subsystem %s", subsystem_nqn)
+    ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
+    IP = None
+    for iface in snode.data_nics:
+        if iface.ip4_address:
+            tr_type = iface.get_transport_type()
+            ret = rpc_client.transport_list()
+            found = False
+            if ret:
+                for ty in ret:
+                    if ty['trtype'] == tr_type:
+                        found = True
+            if found is False:
+                ret = rpc_client.transport_create(tr_type)
+            logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
+            ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
+            IP = iface.ip4_address
+            break
+    logger.info(f"add {pt_name} to subsystem")
+    ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+    if not ret:
+        logger.error(f"Failed to add: {pt_name} to the subsystem: {subsystem_nqn}")
+        return False
+
+    nvme.testing_bdev = test_name
+    nvme.alceml_bdev = alceml_name
+    nvme.pt_bdev = pt_name
+    nvme.nvmf_nqn = subsystem_nqn
+    nvme.nvmf_ip = IP
+    nvme.nvmf_port = 4420
+    nvme.io_error = False
+    nvme.status = NVMeDevice.STATUS_ONLINE
+    return nvme
+
+
+def _create_device_partitions(rpc_client, nvme, snode):
+    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev)
+    time.sleep(3)
+    if not nbd_device:
+        logger.error(f"Failed to start nbd dev")
+        return False
+    snode_api = SNodeClient(snode.api_endpoint)
+    result, error = snode_api.make_gpt_partitions(
+        nbd_device, snode.jm_percent, snode.num_partitions_per_dev)
+    if error:
+        logger.error(f"Failed to make partitions")
+        logger.error(error)
+        return False
+    time.sleep(3)
+    rpc_client.nbd_stop_disk(nbd_device)
+    time.sleep(1)
+    rpc_client.bdev_nvme_detach_controller(nvme.nvme_controller)
+    time.sleep(1)
+    rpc_client.bdev_nvme_controller_attach(nvme.nvme_controller, nvme.pcie_address)
+    time.sleep(1)
+    rpc_client.bdev_examine(nvme.nvme_bdev)
+    time.sleep(1)
+    return True
+
+
+def _prepare_cluster_devices_partitions(snode, devices):
+    db_controller = DBController()
     rpc_client = RPCClient(
         snode.mgmt_ip, snode.rpc_port,
         snode.rpc_username, snode.rpc_password)
 
-
+    new_devices = []
+    jm_devices = []
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
+            continue
+
+        # look for partitions
+        partitioned_devices = _search_for_partitions(rpc_client, nvme)
+        logger.debug("partitioned_devices")
+        logger.debug(partitioned_devices)
+        if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+            logger.info("Partitioned devices found")
+        else:
+            logger.info(f"Creating partitions for {nvme.nvme_bdev}")
+            _create_device_partitions(rpc_client, nvme, snode)
+            partitioned_devices = _search_for_partitions(rpc_client, nvme)
+            if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+                logger.info("Device partitions created")
+            else:
+                logger.error("Failed to create partitions")
+                return False
+
+        jm_devices.append(partitioned_devices.pop(0))
+
+        for dev in partitioned_devices:
+            new_device = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+
+    if jm_devices:
+        jm_nvme_bdevs = [dev.nvme_bdev for dev in jm_devices]
+        jm_device = _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart=False)
+        if not jm_device:
+            logger.error(f"Failed to create JM device")
+            return False
+        snode.jm_device = jm_device
+
+    return True
+
+
+def _prepare_cluster_devices_jm_on_dev(snode, devices):
+    db_controller = DBController()
+
+    jm_device = devices[0]
+    # Set device cluster order
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        nvme.cluster_device_order = dev_order
+        dev_order += 1
+        if nvme.size < jm_device.size:
+            jm_device = nvme
+        device_events.device_create(nvme)
+    jm_device.status = NVMeDevice.STATUS_JM
+
+    rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+
+    new_devices = []
+    for index, nvme in enumerate(devices):
         if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE,
                                NVMeDevice.STATUS_JM, NVMeDevice.STATUS_READONLY]:
             logger.debug(f"Device is not online or unavailable: {nvme.get_id()}, status: {nvme.status}")
             continue
 
-
-
-
-
-            logger.error(f"Failed to create bdev: {test_name}")
-            return False
-        alceml_id = nvme.get_id()
-        alceml_name = device_controller.get_alceml_name(alceml_id)
-        logger.info(f"adding {alceml_name}")
-        pba_init_mode = 3
-        if after_restart:
-            pba_init_mode = 2
-        ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode)
-        if not ret:
-            logger.error(f"Failed to create alceml bdev: {alceml_name}")
-            return False
-
-        # create jm
-        if nvme.jm_bdev:
-            ret = rpc_client.bdev_jm_create(nvme.jm_bdev, alceml_name)
-            if not ret:
-                logger.error(f"Failed to create JM bdev: {nvme.jm_bdev}")
+        if nvme.status == NVMeDevice.STATUS_JM:
+            jm_device = _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart=False)
+            if not jm_device:
+                logger.error(f"Failed to create JM device")
                 return False
-
-
-            nvme
-
+            snode.jm_device = jm_device
+        else:
+            new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+    return True
+
+
+def _prepare_cluster_devices_on_restart(snode):
+    db_controller = DBController()
+
+    rpc_client = RPCClient(
+        snode.mgmt_ip, snode.rpc_port,
+        snode.rpc_username, snode.rpc_password)
+
+    for index, nvme in enumerate(snode.nvme_devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
            continue
 
-
-
-
+        dev = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=True)
+        if not dev:
+            logger.error(f"Failed to create dev stack {nvme.get_id()}")
+            return False
+        device_events.device_restarted(dev)
+
+    # prepare JM device
+    jm_device = snode.jm_device
+    if jm_device.jm_nvme_bdev_list:
+        ret = _create_jm_stack_on_raid(rpc_client, jm_device.jm_nvme_bdev_list, snode, after_restart=False)
         if not ret:
-            logger.error(f"Failed to create
+            logger.error(f"Failed to create JM device")
             return False
+    else:
 
-
-        logger.info("creating subsystem %s", subsystem_nqn)
-        ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
-        IP = None
-        for iface in snode.data_nics:
-            if iface.ip4_address:
-                tr_type = iface.get_transport_type()
-                ret = rpc_client.transport_list()
-                found = False
-                if ret:
-                    for ty in ret:
-                        if ty['trtype'] == tr_type:
-                            found = True
-                if found is False:
-                    ret = rpc_client.transport_create(tr_type)
-                logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
-                ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
-                IP = iface.ip4_address
-                break
-        logger.info(f"add {pt_name} to subsystem")
-        ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+        ret = rpc_client.bdev_alceml_create(jm_device.alceml_bdev, jm_device.nvme_bdev, jm_device.get_id(), pba_init_mode=2)
         if not ret:
-            logger.error(f"Failed to
+            logger.error(f"Failed to create alceml bdev: {jm_device.alceml_bdev}")
             return False
 
-
-
-
-
-
-        nvme.nvmf_port = 4420
-        nvme.io_error = False
-        old_status = nvme.status
-        nvme.status = NVMeDevice.STATUS_ONLINE
-        device_events.device_status_change(nvme, nvme.status, old_status)
-        snode.write_to_db(db_controller.kv_store)
+        jm_bdev = f"jm_{snode.get_id()}"
+        ret = rpc_client.bdev_jm_create(jm_bdev, jm_device.alceml_bdev)
+        if not ret:
+            logger.error(f"Failed to create {jm_bdev}")
+            return False
 
     return True
 
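Note on the partition helpers added above: `_search_for_partitions` identifies a device's partitions purely by bdev name prefix, and `_prepare_cluster_devices_partitions` reserves the first partition of each device for the JM stack while the rest become storage devices. A minimal sketch of the prefix rule, with invented bdev names:

```python
# Minimal sketch of the name-prefix rule used by _search_for_partitions.
# Assumes SPDK reports GPT partition bdevs as "<nvme_bdev>p<N>" after bdev_examine;
# the bdev names below are made up for illustration.
def partitions_of(nvme_bdev, all_bdev_names):
    return [name for name in all_bdev_names if name.startswith(f"{nvme_bdev}p")]

bdevs = ["nvme_0n1", "nvme_1n1", "nvme_1n1p1", "nvme_1n1p2", "nvme_1n1p3"]
parts = partitions_of("nvme_1n1", bdevs)
print(parts)            # ['nvme_1n1p1', 'nvme_1n1p2', 'nvme_1n1p3']
jm_part = parts.pop(0)  # first partition feeds the JM raid, the rest become storage devices
```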
@@ -307,7 +530,7 @@ def _connect_to_remote_devs(this_node):
 
     remote_devices = []
     # connect to remote devs
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == this_node.get_id() or node.status == node.STATUS_OFFLINE:
             continue
@@ -326,9 +549,10 @@ def _connect_to_remote_devs(this_node):
     return remote_devices
 
 
-def add_node(cluster_id, node_ip, iface_name, data_nics_list,
+def add_node(cluster_id, node_ip, iface_name, data_nics_list,
              spdk_mem, spdk_image=None, spdk_debug=False,
-             small_pool_count=0, large_pool_count=0, small_bufsize=0, large_bufsize=0,
+             small_pool_count=0, large_pool_count=0, small_bufsize=0, large_bufsize=0,
+             num_partitions_per_dev=0, jm_percent=0):
     db_controller = DBController()
     kv_store = db_controller.kv_store
 
@@ -399,6 +623,27 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         spdk_mem = huge_free
         logger.info(f"Using the free hugepages for spdk memory: {utils.humanbytes(huge_free)}")
 
+    # Tune cpu maks parameters
+    cpu_count = node_info["cpu_count"]
+    pollers_mask = ""
+    app_thread_mask = ""
+    dev_cpu_mask = ""
+    if cpu_count < 8:
+        mask = (1 << (cpu_count - 1)) - 1
+        mask <<= 1
+        spdk_cpu_mask = f'0x{mask:X}'
+        os_cores = [0]
+    else:
+        os_cores, nvme_pollers_cores, app_thread_core, dev_cpu_cores = \
+            utils.calculate_core_allocation(cpu_count)
+        spdk_cores = nvme_pollers_cores + app_thread_core + dev_cpu_cores
+
+        pollers_mask = utils.generate_mask(nvme_pollers_cores)
+        app_thread_mask = utils.generate_mask(app_thread_core)
+        spdk_cpu_mask = utils.generate_mask(spdk_cores)
+        dev_cpu_mask = utils.generate_mask(dev_cpu_cores)
+
+
     logger.info("Joining docker swarm...")
     cluster_docker = utils.get_docker_client(cluster_id)
     cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"]
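The core-allocation block above turns lists of core indices into hex CPU masks via `utils.generate_mask`, whose implementation is not shown in this diff. A sketch of the arithmetic, with `cores_to_mask` standing in for that helper:

```python
# cores_to_mask stands in for utils.generate_mask (not shown in this diff):
# set one bit per core index and render the result as a hex mask string.
def cores_to_mask(cores):
    mask = 0
    for core in cores:
        mask |= 1 << core
    return f"0x{mask:X}"

# Small host (< 8 CPUs), mirroring the branch above: all cores except core 0.
cpu_count = 4
mask = ((1 << (cpu_count - 1)) - 1) << 1
print(f"0x{mask:X}")             # 0xE -> cores 1-3, core 0 left for the OS

# Larger host: separate masks are built for pollers, app_thread and device CPUs.
print(cores_to_mask([1, 2, 3]))  # 0xE
print(cores_to_mask([4]))        # 0x10
```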
@@ -476,12 +721,19 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     snode.spdk_image = spdk_image or ""
     snode.spdk_debug = spdk_debug or 0
     snode.write_to_db(kv_store)
+    snode.app_thread_mask = app_thread_mask or ""
+    snode.pollers_mask = pollers_mask or ""
+    snode.dev_cpu_mask = dev_cpu_mask or ""
+    snode.os_cores = os_cores or []
 
     snode.iobuf_small_pool_count = small_pool_count or 0
     snode.iobuf_large_pool_count = large_pool_count or 0
     snode.iobuf_small_bufsize = small_bufsize or 0
     snode.iobuf_large_bufsize = large_bufsize or 0
 
+    snode.num_partitions_per_dev = num_partitions_per_dev
+    snode.jm_percent = jm_percent
+
     snode.write_to_db(kv_store)
 
     # creating RPCClient instance
@@ -499,13 +751,41 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         logger.error("Failed to set iobuf options")
         return False
 
-    # 2-
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False
 
-    #
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
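For reference, the SPDK bring-up sequence added above boils down to the following order of RPC calls; this is a condensed sketch using only the client methods visible in this hunk (error handling omitted, and step 1, the iobuf options, happens just before this hunk):

```python
# Condensed sketch of the bring-up order shown above; not the actual function.
def bring_up_spdk(rpc_client, snode):
    rpc_client.sock_impl_set_options()                   # 2- socket implementation options
    if snode.pollers_mask:
        rpc_client.nvmf_set_config(snode.pollers_mask)   # 3- pin NVMf pollers
    rpc_client.framework_start_init()                    # 4- start the SPDK framework
    if snode.app_thread_mask:
        # 5- find app_thread after the framework is up, then pin it
        threads = rpc_client.thread_get_stats().get("threads", [])
        app_id = next((t["id"] for t in threads if t["name"] == "app_thread"), 0)
        rpc_client.thread_set_cpumask(app_id, snode.app_thread_mask)
    rpc_client.bdev_nvme_set_options()                   # 6- NVMe bdev options
```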
@@ -513,36 +793,18 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
 
     # get new node info after starting spdk
     node_info, _ = snode_api.info()
-
+
+    # discover devices
     nvme_devs = addNvmeDevices(cluster, rpc_client, node_info['spdk_pcie_list'], snode)
     if not nvme_devs:
         logger.error("No NVMe devices was found!")
         return False
 
-    snode.nvme_devices = nvme_devs
-
-    jm_device = snode.nvme_devices[0]
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-        if jm_device_pcie:
-            if nvme.pcie_address == jm_device_pcie:
-                jm_device = nvme
-        elif nvme.size < jm_device.size:
-            jm_device = nvme
-        device_events.device_create(nvme)
-
-    # create jm
-    logger.info(f"Using device for JM: {jm_device.get_id()}")
-    jm_device.jm_bdev = f"jm_{snode.get_id()}"
-
-    # save object
-    snode.write_to_db(db_controller.kv_store)
-
     # prepare devices
-
+    if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0:
+        ret = _prepare_cluster_devices_jm_on_dev(snode, nvme_devs)
+    else:
+        ret = _prepare_cluster_devices_partitions(snode, nvme_devs)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
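The branch above is what enables the new partition-based layout: it is used only when both `num_partitions_per_dev` and `jm_percent` are non-zero, otherwise the whole smallest device becomes the JM device as before. An illustrative call of the new `add_node` signature (argument values are invented; the other parameters keep their defaults):

```python
# Illustrative only: enabling the partition-based device layout in add_node.
# Values are made up; jm_percent is forwarded to make_gpt_partitions and appears
# to control the share of each device reserved for the JM partition.
ret = add_node(
    cluster_id="7a4e...", node_ip="10.0.0.5", iface_name="eth0",
    data_nics_list=[], spdk_mem=None,
    num_partitions_per_dev=1,   # > 0 -> partition each NVMe device
    jm_percent=3,               # > 0 -> carve out a JM partition per device
)
```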
@@ -557,7 +819,7 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
 
     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -599,150 +861,16 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     time.sleep(3)
 
     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
 
     storage_events.snode_add(snode)
     logger.info("Done")
     return "Success"
 
 
-# Deprecated
-def add_storage_node(cluster_id, iface_name, data_nics):
-    db_controller = DBController()
-    kv_store = db_controller.kv_store
-
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-    if not cluster:
-        logger.error("Cluster not found: %s", cluster_id)
-        return False
-
-    logger.info("Add Storage node")
-
-    hostname = utils.get_hostname()
-    snode = db_controller.get_storage_node_by_hostname(hostname)
-    if snode:
-        logger.error("Node already exists, try remove it first.")
-        exit(1)
-    else:
-        snode = StorageNode()
-        snode.uuid = str(uuid.uuid4())
-
-    mgmt_ip = _get_if_ip_address(iface_name)
-    system_id = utils.get_system_id()
-
-    BASE_NQN = cluster.nqn.split(":")[0]
-    subsystem_nqn = f"{BASE_NQN}:{hostname}"
-
-    if data_nics:
-        data_nics = _get_data_nics(data_nics)
-    else:
-        data_nics = _get_data_nics([iface_name])
-
-    rpc_user, rpc_pass = utils.generate_rpc_user_and_pass()
-
-    # creating storage node object
-    snode.status = StorageNode.STATUS_IN_CREATION
-    snode.baseboard_sn = utils.get_baseboard_sn()
-    snode.system_uuid = system_id
-    snode.hostname = hostname
-    snode.host_nqn = subsystem_nqn
-    snode.subsystem = subsystem_nqn
-    snode.data_nics = data_nics
-    snode.mgmt_ip = mgmt_ip
-    snode.rpc_port = constants.RPC_HTTP_PROXY_PORT
-    snode.rpc_username = rpc_user
-    snode.rpc_password = rpc_pass
-    snode.cluster_id = cluster_id
-    snode.write_to_db(kv_store)
-
-    # creating RPCClient instance
-    rpc_client = RPCClient(
-        snode.mgmt_ip,
-        snode.rpc_port,
-        snode.rpc_username,
-        snode.rpc_password)
-
-    logger.info("Getting nvme devices")
-    devs = get_nvme_devices()
-    logger.debug(devs)
-    pcies = [d[0] for d in devs]
-    nvme_devs = addNvmeDevices(cluster, rpc_client, pcies, snode)
-    if not nvme_devs:
-        logger.error("No NVMe devices was found!")
-
-    logger.debug(nvme_devs)
-    snode.nvme_devices = nvme_devs
-
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-    snode.write_to_db(db_controller.kv_store)
-
-    # prepare devices
-    _prepare_cluster_devices(snode)
-
-    logger.info("Connecting to remote devices")
-    remote_devices = _connect_to_remote_devs(snode)
-    snode.remote_devices = remote_devices
-
-    logger.info("Setting node status to Active")
-    snode.status = StorageNode.STATUS_ONLINE
-    snode.write_to_db(kv_store)
-
-    # make other nodes connect to the new devices
-    logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.get_storage_nodes()
-    for node_index, node in enumerate(snodes):
-        if node.get_id() == snode.get_id():
-            continue
-        logger.info(f"Connecting to node: {node.get_id()}")
-        rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password)
-        count = 0
-        for dev in snode.nvme_devices:
-            name = f"remote_{dev.alceml_bdev}"
-            ret = rpc_client.bdev_nvme_attach_controller_tcp(name, dev.nvmf_nqn, dev.nvmf_ip, dev.nvmf_port)
-            if not ret:
-                logger.error(f"Failed to connect to device: {name}")
-                continue
-
-            dev.remote_bdev = f"{name}n1"
-            idx = -1
-            for i, d in enumerate(node.remote_devices):
-                if d.get_id() == dev.get_id():
-                    idx = i
-                    break
-            if idx >= 0:
-                node.remote_devices[idx] = dev
-            else:
-                node.remote_devices.append(dev)
-            count += 1
-        node.write_to_db(kv_store)
-        logger.info(f"connected to devices count: {count}")
-
-    logger.info("Sending cluster map")
-    ret = distr_controller.send_cluster_map_to_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map"
-    ret = distr_controller.send_cluster_map_add_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map add node"
-    time.sleep(3)
-
-    logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
-
-    for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev.cluster_device_order, "online")
-
-    logger.info("Done")
-    return "Success"
-
-
 def delete_storage_node(node_id):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -756,7 +884,7 @@ def delete_storage_node(node_id):
 
     snode.remove(db_controller.kv_store)
 
-    for lvol in db_controller.get_lvols():
+    for lvol in db_controller.get_lvols(snode.cluster_id):
         logger.info(f"Sending cluster map to LVol: {lvol.get_id()}")
         lvol_controller.send_cluster_map(lvol.get_id())
 
@@ -764,7 +892,7 @@ def delete_storage_node(node_id):
     logger.info("done")
 
 
-def remove_storage_node(node_id, force_remove=False):
+def remove_storage_node(node_id, force_remove=False, force_migrate=False):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
     if not snode:
@@ -811,7 +939,7 @@ def remove_storage_node(node_id, force_remove=False):
             distr_controller.disconnect_device(dev)
             old_status = dev.status
             dev.status = NVMeDevice.STATUS_FAILED
-            distr_controller.send_dev_status_event(dev
+            distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_FAILED)
             device_events.device_status_change(dev, NVMeDevice.STATUS_FAILED, old_status)
 
     logger.info("Removing storage node")
@@ -825,24 +953,29 @@ def remove_storage_node(node_id, force_remove=False):
         pass
 
     try:
-        snode_api = SNodeClient(snode.api_endpoint)
+        snode_api = SNodeClient(snode.api_endpoint, timeout=20)
         snode_api.spdk_process_kill()
         snode_api.leave_swarm()
+        pci_address = []
+        for dev in snode.nvme_devices:
+            if dev.pcie_address not in pci_address:
+                ret = snode_api.delete_dev_gpt_partitions(dev.pcie_address)
+                logger.debug(ret)
+                pci_address.append(dev.pcie_address)
     except Exception as e:
-        logger.
+        logger.exception(e)
 
     old_status = snode.status
     snode.status = StorageNode.STATUS_REMOVED
     snode.write_to_db(db_controller.kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, StorageNode.STATUS_REMOVED, old_status)
     logger.info("done")
 
 
 def restart_storage_node(
         node_id,
-        spdk_cpu_mask=None,
         spdk_mem=None,
         spdk_image=None,
         set_spdk_debug=None,
@@ -868,7 +1001,7 @@ def restart_storage_node(
     snode.status = StorageNode.STATUS_RESTARTING
     snode.write_to_db(kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, snode.status, old_status)
 
     logger.info(f"Restarting Storage node: {snode.mgmt_ip}")
@@ -878,10 +1011,6 @@ def restart_storage_node(
     logger.info(f"Node info: {node_info}")
 
     logger.info("Restarting SPDK")
-    cpu = snode.spdk_cpu_mask
-    if spdk_cpu_mask:
-        cpu = spdk_cpu_mask
-    snode.spdk_cpu_mask = cpu
     mem = snode.spdk_mem
     if spdk_mem:
         mem = spdk_mem
@@ -897,7 +1026,7 @@ def restart_storage_node(
 
     cluster_docker = utils.get_docker_client(snode.cluster_id)
     cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"]
-    results, err = snode_api.spdk_process_start(
+    results, err = snode_api.spdk_process_start(snode.spdk_cpu_mask, mem, img, spdk_debug, cluster_ip)
 
     if not results:
         logger.error(f"Failed to start spdk: {err}")
@@ -931,13 +1060,41 @@ def restart_storage_node(
         logger.error("Failed to set iobuf options")
         return False
 
-    # 2-
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False
 
-    #
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
@@ -970,22 +1127,23 @@ def restart_storage_node(
         else:
             logger.info(f"Device not found: {db_dev.get_id()}")
             db_dev.status = NVMeDevice.STATUS_REMOVED
-            distr_controller.send_dev_status_event(db_dev
+            distr_controller.send_dev_status_event(db_dev, db_dev.status)
 
-
-
-
-
-
-
+    # todo: handle new devices
+    # for dev in nvme_devs:
+    #     if dev.serial_number not in known_devices_sn:
+    #         logger.info(f"New device found: {dev.get_id()}")
+    #         dev.status = NVMeDevice.STATUS_NEW
+    #         new_devices.append(dev)
+    #         snode.nvme_devices.append(dev)
 
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(new_devices):
-
-
+    # dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    # for index, nvme in enumerate(new_devices):
+    #     nvme.cluster_device_order = dev_order
+    #     dev_order += 1
 
     # prepare devices
-    ret =
+    ret = _prepare_cluster_devices_on_restart(snode)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
@@ -996,7 +1154,7 @@ def restart_storage_node(
 
     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the node devices")
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -1034,20 +1192,23 @@ def restart_storage_node(
     storage_events.snode_status_change(snode, snode.status, old_status)
 
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     logger.info("Sending devices event updates")
+    logger.info("Starting migration tasks")
     for dev in snode.nvme_devices:
         if dev.status != NVMeDevice.STATUS_ONLINE:
-            logger.
+            logger.info(f"Device is not online: {dev.get_id()}, status: {dev.status}")
             continue
-        distr_controller.send_dev_status_event(dev.cluster_device_order, NVMeDevice.STATUS_ONLINE)
 
-
-
-
-
-
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
+        tasks_controller.add_device_mig_task(dev.get_id())
+
+    # logger.info("Sending cluster map to current node")
+    # ret = distr_controller.send_cluster_map_to_node(snode)
+    # if not ret:
+    #     return False, "Failed to send cluster map"
+    # time.sleep(3)
 
     for lvol_id in snode.lvols:
         lvol = lvol_controller.recreate_lvol(lvol_id, snode)
@@ -1062,9 +1223,12 @@ def restart_storage_node(
     return "Success"
 
 
-def list_storage_nodes(
-    db_controller = DBController(
-
+def list_storage_nodes(is_json, cluster_id=None):
+    db_controller = DBController()
+    if cluster_id:
+        nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+    else:
+        nodes = db_controller.get_storage_nodes()
     data = []
     output = ""
 
@@ -1111,26 +1275,43 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         logger.error("This storage node is not part of the cluster")
         return False
 
-
+    storage_devices = []
+    jm_devices = []
+    remote_devices = []
     for device in snode.nvme_devices:
         logger.debug(device)
         logger.debug("*" * 20)
-
+        storage_devices.append({
             "UUID": device.uuid,
             "Name": device.device_name,
-            "Hostname": snode.hostname,
             "Size": utils.humanbytes(device.size),
-            # "Sequential Number": device.sequential_number,
-            # "Partitions Count": device.partitions_count,
-            # "Model ID": device.model_id,
             "Serial Number": device.serial_number,
             "PCIe": device.pcie_address,
             "Status": device.status,
             "IO Err": device.io_error,
-            "Health": device.health_check
+            "Health": device.health_check
+        })
 
+    if snode.jm_device:
+        jm_devices.append({
+            "UUID": snode.jm_device.uuid,
+            "Name": snode.jm_device.device_name,
+            "Size": utils.humanbytes(snode.jm_device.size),
+            "Status": snode.jm_device.status,
+            "IO Err": snode.jm_device.io_error,
+            "Health": snode.jm_device.health_check
         })
 
+    for device in snode.remote_devices:
+        logger.debug(device)
+        logger.debug("*" * 20)
+        remote_devices.append({
+            "UUID": device.uuid,
+            "Name": device.device_name,
+            "Size": utils.humanbytes(device.size),
+            "Serial Number": device.serial_number,
+            "Node ID": device.node_id,
+        })
     if sort and sort in ['node-seq', 'dev-seq', 'serial']:
         if sort == 'serial':
             sort_key = "Serial Number"
@@ -1139,13 +1320,20 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         elif sort == 'node-seq':
             # TODO: check this key
             sort_key = "Sequential Number"
-
-        data = sorted_data
+        storage_devices = sorted(storage_devices, key=lambda d: d[sort_key])
 
+    data = {
+        "Storage Devices": storage_devices,
+        "JM Devices": jm_devices,
+        "Remote Devices": remote_devices,
+    }
     if is_json:
         return json.dumps(data, indent=2)
     else:
-
+        out = ""
+        for d in data:
+            out += f"{d}\n{utils.print_table(data[d])}\n\n"
+        return out
 
 
 def shutdown_storage_node(node_id, force=False):
@@ -1186,7 +1374,7 @@ def shutdown_storage_node(node_id, force=False):
     for dev in snode.nvme_devices:
         if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY]:
             device_controller.device_set_unavailable(dev.get_id())
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_IN_SHUTDOWN)
 
     # shutdown node
     # make other nodes disconnect from this node
@@ -1206,7 +1394,7 @@ def shutdown_storage_node(node_id, force=False):
     snode_api = SNodeClient(snode.api_endpoint)
     results, err = snode_api.spdk_process_kill()
 
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_OFFLINE)
 
     logger.info("Setting node status to offline")
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -1233,22 +1421,24 @@ def suspend_storage_node(node_id, force=False):
         return False
 
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     online_nodes = 0
     for node in snodes:
         if node.status == node.STATUS_ONLINE:
             online_nodes += 1
-    if cluster.ha_type == "ha" and online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
-        logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
-        if force is False:
-            return False
 
-    if cluster.ha_type == "ha"
-
-
+    if cluster.ha_type == "ha":
+        if online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
+            logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
+            if force is False:
+                return False
+
+        if cluster.status == cluster.STATUS_DEGRADED and force is False:
+            logger.warning(f"Cluster status is degraded, use --force but this will suspend the cluster")
+            return False
 
     logger.info("Suspending node")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_SUSPENDED)
     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_ONLINE:
             device_controller.device_set_unavailable(dev.get_id())
@@ -1292,7 +1482,7 @@ def resume_storage_node(node_id):
     logger.info("Resuming node")
 
     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_UNAVAILABLE:
@@ -1668,7 +1858,6 @@ def deploy_cleaner():
     return True
 
 
-
 def get_host_secret(node_id):
     db_controller = DBController()
     node = db_controller.get_storage_node_by_id(node_id)
@@ -1831,7 +2020,7 @@ def set_node_status(node_id, status):
     snode.updated_at = str(datetime.datetime.now())
     snode.write_to_db(db_controller.kv_store)
     storage_events.snode_status_change(snode, snode.status, old_status, caused_by="monitor")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, status)
 
     if snode.status == StorageNode.STATUS_ONLINE:
         logger.info("Connecting to remote devices")