sbcli-pre 1.2.5__zip → 1.2.7__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
- sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
- sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
- sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
- sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
- sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
- sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
- sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
- sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
- sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
- sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
- sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
- sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
@@ -14,11 +14,11 @@ import docker
 from simplyblock_core import constants, scripts, distr_controller
 from simplyblock_core import utils
 from simplyblock_core.controllers import lvol_controller, storage_events, snapshot_controller, device_events, \
-    device_controller
+    device_controller, tasks_controller
 from simplyblock_core.kv_store import DBController
 from simplyblock_core import shell_utils
 from simplyblock_core.models.iface import IFace
-from simplyblock_core.models.nvme_device import NVMeDevice
+from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.pci_utils import get_nvme_devices, bind_spdk_driver
 from simplyblock_core.rpc_client import RPCClient
@@ -81,55 +81,50 @@ def _get_if_ip_address(ifname):


 def addNvmeDevices(cluster, rpc_client, devs, snode):
-    sequential_number = 0
     devices = []
     ret = rpc_client.bdev_nvme_controller_list()
-
-
-
-
+    ctr_map = {}
+    try:
+        if ret:
+            ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
+    except:
+        pass

+    next_physical_label = get_next_physical_device_order()
     for index, pcie in enumerate(devs):

         if pcie in ctr_map:
-
+            nvme_controller = ctr_map[pcie]
         else:
-
-            ret, err = rpc_client.bdev_nvme_controller_attach(
+            nvme_controller = "nvme_%s" % index
+            ret, err = rpc_client.bdev_nvme_controller_attach(nvme_controller, pcie)
             time.sleep(2)
-        nvme_bdev = f"{name}n1"

+        nvme_bdev = f"{nvme_controller}n1"
+        rpc_client.bdev_examine(nvme_bdev)
+        time.sleep(5)
         ret = rpc_client.get_bdevs(nvme_bdev)
-
-
-
-
+        nvme_dict = ret[0]
+        nvme_driver_data = nvme_dict['driver_specific']['nvme'][0]
+        model_number = nvme_driver_data['ctrlr_data']['model_number']
+        total_size = nvme_dict['block_size'] * nvme_dict['num_blocks']

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            'cluster_id': snode.cluster_id,
-
-            # 'nvmf_nqn': subsystem_nqn,
-            # 'nvmf_ip': IP,
-            # 'nvmf_port': 4420,
-
-            'status': 'online'
-        }))
-        sequential_number += device_partitions_count
+        devices.append(
+            NVMeDevice({
+                'uuid': str(uuid.uuid4()),
+                'device_name': nvme_dict['name'],
+                'size': total_size,
+                'physical_label': next_physical_label,
+                'pcie_address': nvme_driver_data['pci_address'],
+                'model_id': model_number,
+                'serial_number': nvme_driver_data['ctrlr_data']['serial_number'],
+                'nvme_bdev': nvme_bdev,
+                'nvme_controller': nvme_controller,
+                'node_id': snode.get_id(),
+                'cluster_id': snode.cluster_id,
+                'status': NVMeDevice.STATUS_ONLINE
+            }))
+        next_physical_label += 1
     return devices


@@ -197,10 +192,10 @@ def _run_nvme_smart_log_add(dev_name):
     return data


-def get_next_cluster_device_order(db_controller):
+def get_next_cluster_device_order(db_controller, cluster_id):
     max_order = 0
     found = False
-    for node in db_controller.
+    for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id):
         for dev in node.nvme_devices:
             found = True
             max_order = max(max_order, dev.cluster_device_order)
@@ -209,91 +204,319 @@ def get_next_cluster_device_order(db_controller):
     return 0


-def
+def get_next_physical_device_order():
     db_controller = DBController()
+    max_order = 0
+    found = False
+    for node in db_controller.get_storage_nodes():
+        for dev in node.nvme_devices:
+            found = True
+            max_order = max(max_order, dev.physical_label)
+    if found:
+        return max_order + 1
+    return 0
+
+
+def _search_for_partitions(rpc_client, nvme_device):
+    partitioned_devices = []
+    for bdev in rpc_client.get_bdevs():
+        name = bdev['name']
+        if name.startswith(f"{nvme_device.nvme_bdev}p"):
+            new_dev = NVMeDevice(nvme_device.to_dict())
+            new_dev.uuid = str(uuid.uuid4())
+            new_dev.device_name = name
+            new_dev.nvme_bdev = name
+            new_dev.size = bdev['block_size'] * bdev['num_blocks']
+            partitioned_devices.append(new_dev)
+    return partitioned_devices
+
+
+def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart):
+    raid_bdev = f"raid_jm_{snode.get_id()}"
+    ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs)
+    if not ret:
+        logger.error(f"Failed to create raid_jm_{snode.get_id()}")
+        return False
+    alceml_name = f"alceml_jm_{snode.get_id()}"
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, raid_bdev, str(uuid.uuid4()), pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
+    ret = rpc_client.get_bdevs(raid_bdev)
+
+    return JMDevice({
+        'uuid': str(uuid.uuid4()),
+        'device_name': jm_bdev,
+        'size': ret[0]["block_size"] * ret[0]["num_blocks"],
+        'status': JMDevice.STATUS_ONLINE,
+        'jm_nvme_bdev_list': jm_nvme_bdevs,
+        'raid_bdev': raid_bdev,
+        'alceml_bdev': alceml_name,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart):
+
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, nvme.nvme_bdev, alceml_id, pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False

+    return JMDevice({
+        'uuid': alceml_id,
+        'device_name': jm_bdev,
+        'size': nvme.size,
+        'status': JMDevice.STATUS_ONLINE,
+        'alceml_bdev': alceml_name,
+        'nvme_bdev': nvme.nvme_bdev,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_storage_device_stack(rpc_client, nvme, snode, after_restart):
+    test_name = f"{nvme.nvme_bdev}_test"
+    ret = rpc_client.bdev_passtest_create(test_name, nvme.nvme_bdev)
+    if not ret:
+        logger.error(f"Failed to create passtest bdev {test_name}")
+        return False
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode,
+                                        dev_cpu_mask=snode.dev_cpu_mask)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    # add pass through
+    pt_name = f"{alceml_name}_PT"
+    ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
+        return False
+
+    subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
+    logger.info("creating subsystem %s", subsystem_nqn)
+    ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
+    IP = None
+    for iface in snode.data_nics:
+        if iface.ip4_address:
+            tr_type = iface.get_transport_type()
+            ret = rpc_client.transport_list()
+            found = False
+            if ret:
+                for ty in ret:
+                    if ty['trtype'] == tr_type:
+                        found = True
+            if found is False:
+                ret = rpc_client.transport_create(tr_type)
+            logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
+            ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
+            IP = iface.ip4_address
+            break
+    logger.info(f"add {pt_name} to subsystem")
+    ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+    if not ret:
+        logger.error(f"Failed to add: {pt_name} to the subsystem: {subsystem_nqn}")
+        return False
+
+    nvme.testing_bdev = test_name
+    nvme.alceml_bdev = alceml_name
+    nvme.pt_bdev = pt_name
+    nvme.nvmf_nqn = subsystem_nqn
+    nvme.nvmf_ip = IP
+    nvme.nvmf_port = 4420
+    nvme.io_error = False
+    nvme.status = NVMeDevice.STATUS_ONLINE
+    return nvme
+
+
+def _create_device_partitions(rpc_client, nvme, snode):
+    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev)
+    time.sleep(3)
+    if not nbd_device:
+        logger.error(f"Failed to start nbd dev")
+        return False
+    snode_api = SNodeClient(snode.api_endpoint)
+    result, error = snode_api.make_gpt_partitions(
+        nbd_device, snode.jm_percent, snode.num_partitions_per_dev)
+    if error:
+        logger.error(f"Failed to make partitions")
+        logger.error(error)
+        return False
+    time.sleep(3)
+    rpc_client.nbd_stop_disk(nbd_device)
+    time.sleep(1)
+    rpc_client.bdev_nvme_detach_controller(nvme.nvme_controller)
+    time.sleep(1)
+    rpc_client.bdev_nvme_controller_attach(nvme.nvme_controller, nvme.pcie_address)
+    time.sleep(1)
+    rpc_client.bdev_examine(nvme.nvme_bdev)
+    time.sleep(1)
+    return True
+
+
+def _prepare_cluster_devices_partitions(snode, devices):
+    db_controller = DBController()
     rpc_client = RPCClient(
         snode.mgmt_ip, snode.rpc_port,
         snode.rpc_username, snode.rpc_password)

-
+    new_devices = []
+    jm_devices = []
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
+            continue
+
+        # look for partitions
+        partitioned_devices = _search_for_partitions(rpc_client, nvme)
+        logger.debug("partitioned_devices")
+        logger.debug(partitioned_devices)
+        if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+            logger.info("Partitioned devices found")
+        else:
+            logger.info(f"Creating partitions for {nvme.nvme_bdev}")
+            _create_device_partitions(rpc_client, nvme, snode)
+            partitioned_devices = _search_for_partitions(rpc_client, nvme)
+            if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+                logger.info("Device partitions created")
+            else:
+                logger.error("Failed to create partitions")
+                return False
+
+        jm_devices.append(partitioned_devices.pop(0))
+
+        for dev in partitioned_devices:
+            new_device = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+
+    if jm_devices:
+        jm_nvme_bdevs = [dev.nvme_bdev for dev in jm_devices]
+        jm_device = _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart=False)
+        if not jm_device:
+            logger.error(f"Failed to create JM device")
+            return False
+        snode.jm_device = jm_device
+
+    return True
+
+
+def _prepare_cluster_devices_jm_on_dev(snode, devices):
+    db_controller = DBController()
+
+    jm_device = devices[0]
+    # Set device cluster order
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        nvme.cluster_device_order = dev_order
+        dev_order += 1
+        if nvme.size < jm_device.size:
+            jm_device = nvme
+        device_events.device_create(nvme)
+    jm_device.status = NVMeDevice.STATUS_JM
+
+    rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+
+    new_devices = []
+    for index, nvme in enumerate(devices):
         if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE,
                                NVMeDevice.STATUS_JM, NVMeDevice.STATUS_READONLY]:
             logger.debug(f"Device is not online or unavailable: {nvme.get_id()}, status: {nvme.status}")
             continue

-
-
-
-
-            logger.error(f"Failed to create bdev: {test_name}")
-            return False
-        alceml_id = nvme.get_id()
-        alceml_name = device_controller.get_alceml_name(alceml_id)
-        logger.info(f"adding {alceml_name}")
-        pba_init_mode = 3
-        if after_restart:
-            pba_init_mode = 2
-        ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode)
-        if not ret:
-            logger.error(f"Failed to create alceml bdev: {alceml_name}")
-            return False
-
-        # create jm
-        if nvme.jm_bdev:
-            ret = rpc_client.bdev_jm_create(nvme.jm_bdev, alceml_name)
-            if not ret:
-                logger.error(f"Failed to create JM bdev: {nvme.jm_bdev}")
+        if nvme.status == NVMeDevice.STATUS_JM:
+            jm_device = _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart=False)
+            if not jm_device:
+                logger.error(f"Failed to create JM device")
                 return False
-
-
-            nvme
-
+            snode.jm_device = jm_device
+        else:
+            new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+    return True
+
+
+def _prepare_cluster_devices_on_restart(snode):
+    db_controller = DBController()
+
+    rpc_client = RPCClient(
+        snode.mgmt_ip, snode.rpc_port,
+        snode.rpc_username, snode.rpc_password)
+
+    for index, nvme in enumerate(snode.nvme_devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
             continue

-
-
-
+        dev = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=True)
+        if not dev:
+            logger.error(f"Failed to create dev stack {nvme.get_id()}")
+            return False
+        device_events.device_restarted(dev)
+
+    # prepare JM device
+    jm_device = snode.jm_device
+    if jm_device.jm_nvme_bdev_list:
+        ret = _create_jm_stack_on_raid(rpc_client, jm_device.jm_nvme_bdev_list, snode, after_restart=False)
         if not ret:
-            logger.error(f"Failed to create
+            logger.error(f"Failed to create JM device")
             return False
+    else:

-
-        logger.info("creating subsystem %s", subsystem_nqn)
-        ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
-        IP = None
-        for iface in snode.data_nics:
-            if iface.ip4_address:
-                tr_type = iface.get_transport_type()
-                ret = rpc_client.transport_list()
-                found = False
-                if ret:
-                    for ty in ret:
-                        if ty['trtype'] == tr_type:
-                            found = True
-                if found is False:
-                    ret = rpc_client.transport_create(tr_type)
-                logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
-                ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
-                IP = iface.ip4_address
-                break
-        logger.info(f"add {pt_name} to subsystem")
-        ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+        ret = rpc_client.bdev_alceml_create(jm_device.alceml_bdev, jm_device.nvme_bdev, jm_device.get_id(), pba_init_mode=2)
         if not ret:
-            logger.error(f"Failed to
+            logger.error(f"Failed to create alceml bdev: {jm_device.alceml_bdev}")
             return False

-
-
-
-
-
-        nvme.nvmf_port = 4420
-        nvme.io_error = False
-        old_status = nvme.status
-        nvme.status = NVMeDevice.STATUS_ONLINE
-        device_events.device_status_change(nvme, nvme.status, old_status)
-        snode.write_to_db(db_controller.kv_store)
+        jm_bdev = f"jm_{snode.get_id()}"
+        ret = rpc_client.bdev_jm_create(jm_bdev, jm_device.alceml_bdev)
+        if not ret:
+            logger.error(f"Failed to create {jm_bdev}")
+            return False

     return True

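The partition discovery added above relies only on SPDK's bdev naming: GPT partitions of a bdev `X` are exposed as `Xp1`, `Xp2`, and so on. A minimal sketch of that prefix rule (the device names below are made up for illustration):

```python
def partitions_of(nvme_bdev, all_bdev_names):
    # Same prefix match used by _search_for_partitions() in the hunk above:
    # SPDK lists GPT partitions of bdev "nvme_1n1" as "nvme_1n1p1", "nvme_1n1p2", ...
    return [name for name in all_bdev_names if name.startswith(f"{nvme_bdev}p")]


print(partitions_of("nvme_1n1", ["nvme_1n1", "nvme_1n1p1", "nvme_1n1p2", "nvme_2n1"]))
# ['nvme_1n1p1', 'nvme_1n1p2']
```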
@@ -307,7 +530,7 @@ def _connect_to_remote_devs(this_node):

     remote_devices = []
     # connect to remote devs
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == this_node.get_id() or node.status == node.STATUS_OFFLINE:
             continue
@@ -326,9 +549,10 @@ def _connect_to_remote_devs(this_node):
     return remote_devices


-def add_node(cluster_id, node_ip, iface_name, data_nics_list,
-
-
+def add_node(cluster_id, node_ip, iface_name, data_nics_list,
+             max_lvol, max_snap, max_prov, spdk_image=None, spdk_debug=False,
+             small_bufsize=0, large_bufsize=0,
+             num_partitions_per_dev=0, jm_percent=0, number_of_devices=0):
     db_controller = DBController()
     kv_store = db_controller.kv_store

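For reference, a hypothetical call against the new `add_node` signature shown above; every value below is a placeholder chosen for illustration, not taken from the package:

```python
from simplyblock_core import storage_node_ops

storage_node_ops.add_node(
    "cluster-uuid", "10.0.0.12", "eth0", [],   # cluster_id, node_ip, iface_name, data_nics_list
    max_lvol=10, max_snap=10, max_prov="1T",   # new capacity limits used for memory sizing
    spdk_image=None, spdk_debug=False,
    small_bufsize=0, large_bufsize=0,
    num_partitions_per_dev=1, jm_percent=3,    # enables the partition-based JM layout
    number_of_devices=0,                       # 0 lets the EC2 instance type decide
)
```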
@@ -384,20 +608,71 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         logger.error(f"Node already exists, try remove it first: {ec2_metadata['instanceId']}")
         return False

+    # Tune cpu maks parameters
+    cpu_count = node_info["cpu_count"]
+    pollers_mask = ""
+    app_thread_mask = ""
+    dev_cpu_mask = ""
+    nvme_pollers_cores = []
+    if cpu_count < 8:
+        mask = (1 << (cpu_count - 1)) - 1
+        mask <<= 1
+        spdk_cpu_mask = f'0x{mask:X}'
+        os_cores = [0]
+    else:
+        os_cores, nvme_pollers_cores, app_thread_core, dev_cpu_cores = \
+            utils.calculate_core_allocation(cpu_count)
+        spdk_cores = nvme_pollers_cores + app_thread_core + dev_cpu_cores
+
+        pollers_mask = utils.generate_mask(nvme_pollers_cores)
+        app_thread_mask = utils.generate_mask(app_thread_core)
+        spdk_cpu_mask = utils.generate_mask(spdk_cores)
+        dev_cpu_mask = utils.generate_mask(dev_cpu_cores)
+
+    # Calculate pool count
+    if ec2_metadata and ec2_metadata.get('instanceType'):
+        supported_type, storage_devices, device_size = utils.get_total_size_per_instance_type(ec2_metadata["instanceType"])
+        if not supported_type:
+            logger.warning(f"Unsupported ec2 instance-type {ec2_metadata['instanceType']} for deployment")
+            if not number_of_devices:
+                logger.error(f"Unsupported ec2 instance-type {ec2_metadata['instanceType']} "
+                             "for deployment, please specify --number-of-devices")
+                return False
+        number_of_devices = storage_devices
+    else:
+        logger.warning("Can not get ec2 instance type for this instance.")
+        if not number_of_devices:
+            logger.error("Unsupported instance type please specify --number-of-devices.")
+            return False
+
+    number_of_split = num_partitions_per_dev if num_partitions_per_dev else num_partitions_per_dev + 1
+    number_of_alceml_devices = number_of_devices * number_of_split
+    small_pool_count, large_pool_count = utils.calculate_pool_count(
+        number_of_alceml_devices, max_lvol, max_snap, cpu_count, len(nvme_pollers_cores) or cpu_count)
+
+    # Calculate minimum huge page memory
+    minimum_hp_memory = utils.calculate_minimum_hp_memory(small_pool_count, large_pool_count, max_lvol, max_snap, cpu_count)
+
+    # Calculate minimum sys memory
+    minimum_sys_memory = utils.calculate_minimum_sys_memory(max_prov)
+
     # check for memory
     if "memory_details" in node_info and node_info['memory_details']:
         memory_details = node_info['memory_details']
         logger.info("Node Memory info")
         logger.info(f"Total: {utils.humanbytes(memory_details['total'])}")
         logger.info(f"Free: {utils.humanbytes(memory_details['free'])}")
-
-
-
-
-
-
-
-
+    else:
+        logger.error(f"Cannot get memory info from the ec2 instance.. Exiting")
+        return False
+
+    satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory,
+                                                      minimum_sys_memory,
+                                                      int(memory_details['free']),
+                                                      int(memory_details['huge_total']))
+    if not satisfied:
+        logger.error(f"Not enough memory for the provided max_lvo: {max_lvol}, max_snap: {max_snap}, max_prov: {max_prov}.. Exiting")
+        return False

     logger.info("Joining docker swarm...")
     cluster_docker = utils.get_docker_client(cluster_id)
@@ -472,16 +747,28 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     snode.hugepages = node_info['hugepages']

     snode.spdk_cpu_mask = spdk_cpu_mask or ""
-    snode.spdk_mem = spdk_mem
+    snode.spdk_mem = spdk_mem
+    snode.max_lvol = max_lvol
+    snode.max_snap = max_snap
+    snode.max_prov = max_prov
+    snode.number_of_devices = number_of_devices
     snode.spdk_image = spdk_image or ""
     snode.spdk_debug = spdk_debug or 0
     snode.write_to_db(kv_store)
+    snode.app_thread_mask = app_thread_mask or ""
+    snode.pollers_mask = pollers_mask or ""
+    snode.nvme_pollers_cores = nvme_pollers_cores or []
+    snode.dev_cpu_mask = dev_cpu_mask or ""
+    snode.os_cores = os_cores or []

     snode.iobuf_small_pool_count = small_pool_count or 0
     snode.iobuf_large_pool_count = large_pool_count or 0
     snode.iobuf_small_bufsize = small_bufsize or 0
     snode.iobuf_large_bufsize = large_bufsize or 0

+    snode.num_partitions_per_dev = num_partitions_per_dev
+    snode.jm_percent = jm_percent
+
     snode.write_to_db(kv_store)

     # creating RPCClient instance
@@ -499,13 +786,41 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         logger.error("Failed to set iobuf options")
         return False

-    # 2-
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False

-    #
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
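The numbered comments above describe the SPDK bring-up order used in this release; the list below only restates that order for quick reference (step 1, the iobuf options call, sits in the context just before this hunk):

```python
# Illustrative summary only -- the real calls are the rpc_client methods shown above.
SPDK_INIT_STEPS = [
    "iobuf_set_options",        # 1- iobuf pool counts / buffer sizes
    "sock_impl_set_options",    # 2- socket implementation options
    "nvmf_set_config",          # 3- only when a pollers_mask was calculated
    "framework_start_init",     # 4- start the SPDK framework
    "thread_set_cpumask",       # 5- pin app_thread when app_thread_mask is set
    "bdev_nvme_set_options",    # 6- nvme bdev options
]
```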
@@ -513,36 +828,18 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,

     # get new node info after starting spdk
     node_info, _ = snode_api.info()
-
+
+    # discover devices
     nvme_devs = addNvmeDevices(cluster, rpc_client, node_info['spdk_pcie_list'], snode)
     if not nvme_devs:
         logger.error("No NVMe devices was found!")
         return False

-    snode.nvme_devices = nvme_devs
-
-    jm_device = snode.nvme_devices[0]
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-        if jm_device_pcie:
-            if nvme.pcie_address == jm_device_pcie:
-                jm_device = nvme
-        elif nvme.size < jm_device.size:
-            jm_device = nvme
-        device_events.device_create(nvme)
-
-    # create jm
-    logger.info(f"Using device for JM: {jm_device.get_id()}")
-    jm_device.jm_bdev = f"jm_{snode.get_id()}"
-
-    # save object
-    snode.write_to_db(db_controller.kv_store)
-
     # prepare devices
-
+    if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0:
+        ret = _prepare_cluster_devices_jm_on_dev(snode, nvme_devs)
+    else:
+        ret = _prepare_cluster_devices_partitions(snode, nvme_devs)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
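The branch above picks between the two new preparation paths purely from the node's partitioning settings; a tiny sketch of that decision:

```python
def preparation_path(num_partitions_per_dev, jm_percent):
    # Same condition as in add_node(): partitioning disabled -> whole-device JM,
    # otherwise GPT partitions, with the first partition of each device going to JM.
    if num_partitions_per_dev == 0 or jm_percent == 0:
        return "_prepare_cluster_devices_jm_on_dev"
    return "_prepare_cluster_devices_partitions"


print(preparation_path(0, 0))  # _prepare_cluster_devices_jm_on_dev
print(preparation_path(1, 3))  # _prepare_cluster_devices_partitions
```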
@@ -557,7 +854,7 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,

     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -599,150 +896,16 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     time.sleep(3)

     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)

     for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)

     storage_events.snode_add(snode)
     logger.info("Done")
     return "Success"


-# Deprecated
-def add_storage_node(cluster_id, iface_name, data_nics):
-    db_controller = DBController()
-    kv_store = db_controller.kv_store
-
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-    if not cluster:
-        logger.error("Cluster not found: %s", cluster_id)
-        return False
-
-    logger.info("Add Storage node")
-
-    hostname = utils.get_hostname()
-    snode = db_controller.get_storage_node_by_hostname(hostname)
-    if snode:
-        logger.error("Node already exists, try remove it first.")
-        exit(1)
-    else:
-        snode = StorageNode()
-        snode.uuid = str(uuid.uuid4())
-
-    mgmt_ip = _get_if_ip_address(iface_name)
-    system_id = utils.get_system_id()
-
-    BASE_NQN = cluster.nqn.split(":")[0]
-    subsystem_nqn = f"{BASE_NQN}:{hostname}"
-
-    if data_nics:
-        data_nics = _get_data_nics(data_nics)
-    else:
-        data_nics = _get_data_nics([iface_name])
-
-    rpc_user, rpc_pass = utils.generate_rpc_user_and_pass()
-
-    # creating storage node object
-    snode.status = StorageNode.STATUS_IN_CREATION
-    snode.baseboard_sn = utils.get_baseboard_sn()
-    snode.system_uuid = system_id
-    snode.hostname = hostname
-    snode.host_nqn = subsystem_nqn
-    snode.subsystem = subsystem_nqn
-    snode.data_nics = data_nics
-    snode.mgmt_ip = mgmt_ip
-    snode.rpc_port = constants.RPC_HTTP_PROXY_PORT
-    snode.rpc_username = rpc_user
-    snode.rpc_password = rpc_pass
-    snode.cluster_id = cluster_id
-    snode.write_to_db(kv_store)
-
-    # creating RPCClient instance
-    rpc_client = RPCClient(
-        snode.mgmt_ip,
-        snode.rpc_port,
-        snode.rpc_username,
-        snode.rpc_password)
-
-    logger.info("Getting nvme devices")
-    devs = get_nvme_devices()
-    logger.debug(devs)
-    pcies = [d[0] for d in devs]
-    nvme_devs = addNvmeDevices(cluster, rpc_client, pcies, snode)
-    if not nvme_devs:
-        logger.error("No NVMe devices was found!")
-
-    logger.debug(nvme_devs)
-    snode.nvme_devices = nvme_devs
-
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-    snode.write_to_db(db_controller.kv_store)
-
-    # prepare devices
-    _prepare_cluster_devices(snode)
-
-    logger.info("Connecting to remote devices")
-    remote_devices = _connect_to_remote_devs(snode)
-    snode.remote_devices = remote_devices
-
-    logger.info("Setting node status to Active")
-    snode.status = StorageNode.STATUS_ONLINE
-    snode.write_to_db(kv_store)
-
-    # make other nodes connect to the new devices
-    logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.get_storage_nodes()
-    for node_index, node in enumerate(snodes):
-        if node.get_id() == snode.get_id():
-            continue
-        logger.info(f"Connecting to node: {node.get_id()}")
-        rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password)
-        count = 0
-        for dev in snode.nvme_devices:
-            name = f"remote_{dev.alceml_bdev}"
-            ret = rpc_client.bdev_nvme_attach_controller_tcp(name, dev.nvmf_nqn, dev.nvmf_ip, dev.nvmf_port)
-            if not ret:
-                logger.error(f"Failed to connect to device: {name}")
-                continue
-
-            dev.remote_bdev = f"{name}n1"
-            idx = -1
-            for i, d in enumerate(node.remote_devices):
-                if d.get_id() == dev.get_id():
-                    idx = i
-                    break
-            if idx >= 0:
-                node.remote_devices[idx] = dev
-            else:
-                node.remote_devices.append(dev)
-            count += 1
-        node.write_to_db(kv_store)
-        logger.info(f"connected to devices count: {count}")
-
-    logger.info("Sending cluster map")
-    ret = distr_controller.send_cluster_map_to_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map"
-    ret = distr_controller.send_cluster_map_add_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map add node"
-    time.sleep(3)
-
-    logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
-
-    for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev.cluster_device_order, "online")
-
-    logger.info("Done")
-    return "Success"
-
-
 def delete_storage_node(node_id):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -756,7 +919,7 @@ def delete_storage_node(node_id):

     snode.remove(db_controller.kv_store)

-    for lvol in db_controller.get_lvols():
+    for lvol in db_controller.get_lvols(snode.cluster_id):
         logger.info(f"Sending cluster map to LVol: {lvol.get_id()}")
         lvol_controller.send_cluster_map(lvol.get_id())

@@ -764,7 +927,7 @@ def delete_storage_node(node_id):
     logger.info("done")


-def remove_storage_node(node_id, force_remove=False):
+def remove_storage_node(node_id, force_remove=False, force_migrate=False):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
     if not snode:
@@ -811,7 +974,7 @@ def remove_storage_node(node_id, force_remove=False):
         distr_controller.disconnect_device(dev)
         old_status = dev.status
         dev.status = NVMeDevice.STATUS_FAILED
-        distr_controller.send_dev_status_event(dev
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_FAILED)
         device_events.device_status_change(dev, NVMeDevice.STATUS_FAILED, old_status)

     logger.info("Removing storage node")
@@ -825,29 +988,32 @@ def remove_storage_node(node_id, force_remove=False):
         pass

     try:
-        snode_api = SNodeClient(snode.api_endpoint)
+        snode_api = SNodeClient(snode.api_endpoint, timeout=20)
         snode_api.spdk_process_kill()
         snode_api.leave_swarm()
+        pci_address = []
+        for dev in snode.nvme_devices:
+            if dev.pcie_address not in pci_address:
+                ret = snode_api.delete_dev_gpt_partitions(dev.pcie_address)
+                logger.debug(ret)
+                pci_address.append(dev.pcie_address)
     except Exception as e:
-        logger.
+        logger.exception(e)

     old_status = snode.status
     snode.status = StorageNode.STATUS_REMOVED
     snode.write_to_db(db_controller.kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, StorageNode.STATUS_REMOVED, old_status)
     logger.info("done")


 def restart_storage_node(
-        node_id,
-        spdk_cpu_mask=None,
-        spdk_mem=None,
+        node_id, max_lvol=0, max_snap=0, max_prov="",
         spdk_image=None,
         set_spdk_debug=None,
-
-        small_bufsize=0, large_bufsize=0):
+        small_bufsize=0, large_bufsize=0, number_of_devices=0):

     db_controller = DBController()
     kv_store = db_controller.kv_store
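Hypothetical calls matching the updated `remove_storage_node` and `restart_storage_node` signatures above; the node ID and size values are placeholders:

```python
from simplyblock_core import storage_node_ops

storage_node_ops.remove_storage_node("node-uuid", force_remove=False, force_migrate=False)

storage_node_ops.restart_storage_node(
    "node-uuid",
    max_lvol=10, max_snap=10, max_prov="1T",   # replace the old spdk_cpu_mask/spdk_mem knobs
    spdk_image=None, set_spdk_debug=None,
    small_bufsize=0, large_bufsize=0,
    number_of_devices=4,
)
```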
@@ -868,7 +1034,7 @@ def restart_storage_node(
     snode.status = StorageNode.STATUS_RESTARTING
     snode.write_to_db(kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, snode.status, old_status)

     logger.info(f"Restarting Storage node: {snode.mgmt_ip}")
@@ -878,18 +1044,67 @@ def restart_storage_node(
     logger.info(f"Node info: {node_info}")

     logger.info("Restarting SPDK")
-
-    if spdk_cpu_mask:
-        cpu = spdk_cpu_mask
-        snode.spdk_cpu_mask = cpu
-    mem = snode.spdk_mem
-    if spdk_mem:
-        mem = spdk_mem
-    snode.spdk_mem = mem
+
     img = snode.spdk_image
+    if max_lvol:
+        snode.max_lvol = max_lvol
+    if max_snap:
+        snode.max_snap = max_snap
+    if max_prov:
+        snode.max_prov = max_prov
     if spdk_image:
         img = spdk_image
         snode.spdk_image = img
+
+    # Calculate pool count
+    if snode.ec2_metadata and snode.ec2_metadata.get('instanceType'):
+        supported_type, storage_devices, device_size = utils.get_total_size_per_instance_type(snode.ec2_metadata["instanceType"])
+        if not supported_type:
+            logger.warning(f"Unsupported ec2 instance-type {snode.ec2_metadata['instanceType']} for deployment")
+            if not number_of_devices:
+                logger.error(f"Unsupported ec2 instance-type {snode.ec2_metadata['instanceType']} "
+                             "for deployment, please specify --number-of-devices")
+                return False
+        number_of_devices = storage_devices
+    else:
+        logger.warning("Can not get ec2 instance type for this instance..")
+        if not number_of_devices:
+            if snode.number_of_devices:
+                number_of_devices = snode.number_of_devices
+            else:
+                logger.error("Unsupported instance type please specify --number-of-devices")
+                return False
+
+    snode.number_of_devices = number_of_devices
+
+    number_of_split = snode.num_partitions_per_dev if snode.num_partitions_per_dev else snode.num_partitions_per_dev + 1
+    number_of_alceml_devices = number_of_devices * number_of_split
+    small_pool_count, large_pool_count = utils.calculate_pool_count(
+        number_of_alceml_devices, snode.max_lvol, snode.max_snap, snode.cpu, len(snode.nvme_pollers_cores) or snode.cpu)
+
+    # Calculate minimum huge page memory
+    minimum_hp_memory = utils.calculate_minimum_hp_memory(small_pool_count, large_pool_count, snode.max_lvol, snode.max_snap, snode.cpu)
+
+    # Calculate minimum sys memory
+    minimum_sys_memory = utils.calculate_minimum_sys_memory(snode.max_prov)
+
+    # check for memory
+    if "memory_details" in node_info and node_info['memory_details']:
+        memory_details = node_info['memory_details']
+        logger.info("Node Memory info")
+        logger.info(f"Total: {utils.humanbytes(memory_details['total'])}")
+        logger.info(f"Free: {utils.humanbytes(memory_details['free'])}")
+    else:
+        logger.error(f"Cannot get memory info from the ec2 instance.. Exiting")
+
+    satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory,
+                                                      minimum_sys_memory,
+                                                      int(memory_details['free']),
+                                                      int(memory_details['huge_total']))
+    if not satisfied:
+        logger.error(f"Not enough memory for the provided max_lvo: {snode.max_lvol}, max_snap: {snode.max_snap}, max_prov: {utils.humanbytes(snode.max_prov)}.. Exiting")
+
+
     spdk_debug = snode.spdk_debug
     if set_spdk_debug:
         spdk_debug = spdk_debug
@@ -897,17 +1112,14 @@ def restart_storage_node(

     cluster_docker = utils.get_docker_client(snode.cluster_id)
     cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"]
-    results, err = snode_api.spdk_process_start(
+    results, err = snode_api.spdk_process_start(snode.spdk_cpu_mask, spdk_mem, img, spdk_debug, cluster_ip)

     if not results:
         logger.error(f"Failed to start spdk: {err}")
         return False
     time.sleep(3)

-
-    snode.iobuf_small_pool_count = small_pool_count
-    if large_pool_count:
-        snode.iobuf_large_pool_count = large_pool_count
+
     if small_bufsize:
         snode.iobuf_small_bufsize = small_bufsize
     if large_bufsize:
@@ -931,13 +1143,41 @@ def restart_storage_node(
         logger.error("Failed to set iobuf options")
         return False

-    # 2-
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False

-    #
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
@@ -970,22 +1210,23 @@ def restart_storage_node(
         else:
             logger.info(f"Device not found: {db_dev.get_id()}")
             db_dev.status = NVMeDevice.STATUS_REMOVED
-            distr_controller.send_dev_status_event(db_dev
+            distr_controller.send_dev_status_event(db_dev, db_dev.status)

-
-
-
-
-
-
+    # todo: handle new devices
+    # for dev in nvme_devs:
+    #     if dev.serial_number not in known_devices_sn:
+    #         logger.info(f"New device found: {dev.get_id()}")
+    #         dev.status = NVMeDevice.STATUS_NEW
+    #         new_devices.append(dev)
+    #         snode.nvme_devices.append(dev)

-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(new_devices):
-
-
+    # dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    # for index, nvme in enumerate(new_devices):
+    #     nvme.cluster_device_order = dev_order
+    #     dev_order += 1

     # prepare devices
-    ret =
+    ret = _prepare_cluster_devices_on_restart(snode)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
@@ -996,7 +1237,7 @@ def restart_storage_node(

     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the node devices")
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -1034,20 +1275,23 @@ def restart_storage_node(
     storage_events.snode_status_change(snode, snode.status, old_status)

     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)

     logger.info("Sending devices event updates")
+    logger.info("Starting migration tasks")
     for dev in snode.nvme_devices:
         if dev.status != NVMeDevice.STATUS_ONLINE:
-            logger.
+            logger.info(f"Device is not online: {dev.get_id()}, status: {dev.status}")
             continue
-        distr_controller.send_dev_status_event(dev.cluster_device_order, NVMeDevice.STATUS_ONLINE)

-
-
-
-
-
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
+        tasks_controller.add_device_mig_task(dev.get_id())
+
+    # logger.info("Sending cluster map to current node")
+    # ret = distr_controller.send_cluster_map_to_node(snode)
+    # if not ret:
+    #     return False, "Failed to send cluster map"
+    # time.sleep(3)

     for lvol_id in snode.lvols:
         lvol = lvol_controller.recreate_lvol(lvol_id, snode)
@@ -1062,9 +1306,12 @@ def restart_storage_node(
     return "Success"


-def list_storage_nodes(
-    db_controller = DBController(
-
+def list_storage_nodes(is_json, cluster_id=None):
+    db_controller = DBController()
+    if cluster_id:
+        nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+    else:
+        nodes = db_controller.get_storage_nodes()
     data = []
     output = ""

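`list_storage_nodes` gains an optional `cluster_id` filter in 1.2.7. A usage sketch follows; the module path is assumed from the function names in this diff, and the cluster id is a placeholder.

```python
# Hypothetical usage of the new signature; "cl-123" is a placeholder cluster id.
from simplyblock_core import storage_node_ops  # module path assumed

print(storage_node_ops.list_storage_nodes(is_json=True))                        # all nodes, JSON
print(storage_node_ops.list_storage_nodes(is_json=False, cluster_id="cl-123"))  # one cluster, table
```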
@@ -1111,26 +1358,43 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         logger.error("This storage node is not part of the cluster")
         return False

-
+    storage_devices = []
+    jm_devices = []
+    remote_devices = []
     for device in snode.nvme_devices:
         logger.debug(device)
         logger.debug("*" * 20)
-
+        storage_devices.append({
             "UUID": device.uuid,
             "Name": device.device_name,
-            "Hostname": snode.hostname,
             "Size": utils.humanbytes(device.size),
-            # "Sequential Number": device.sequential_number,
-            # "Partitions Count": device.partitions_count,
-            # "Model ID": device.model_id,
             "Serial Number": device.serial_number,
             "PCIe": device.pcie_address,
             "Status": device.status,
             "IO Err": device.io_error,
-            "Health": device.health_check
+            "Health": device.health_check
+        })

+    if snode.jm_device:
+        jm_devices.append({
+            "UUID": snode.jm_device.uuid,
+            "Name": snode.jm_device.device_name,
+            "Size": utils.humanbytes(snode.jm_device.size),
+            "Status": snode.jm_device.status,
+            "IO Err": snode.jm_device.io_error,
+            "Health": snode.jm_device.health_check
         })

+    for device in snode.remote_devices:
+        logger.debug(device)
+        logger.debug("*" * 20)
+        remote_devices.append({
+            "UUID": device.uuid,
+            "Name": device.device_name,
+            "Size": utils.humanbytes(device.size),
+            "Serial Number": device.serial_number,
+            "Node ID": device.node_id,
+        })
     if sort and sort in ['node-seq', 'dev-seq', 'serial']:
         if sort == 'serial':
             sort_key = "Serial Number"
@@ -1139,13 +1403,20 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         elif sort == 'node-seq':
             # TODO: check this key
             sort_key = "Sequential Number"
-
-        data = sorted_data
+        storage_devices = sorted(storage_devices, key=lambda d: d[sort_key])

+    data = {
+        "Storage Devices": storage_devices,
+        "JM Devices": jm_devices,
+        "Remote Devices": remote_devices,
+    }
     if is_json:
         return json.dumps(data, indent=2)
     else:
-
+        out = ""
+        for d in data:
+            out += f"{d}\n{utils.print_table(data[d])}\n\n"
+        return out


 def shutdown_storage_node(node_id, force=False):
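With this change `list_storage_devices` reports three groups instead of one flat table. When `is_json` is set, the payload has roughly the shape below; the keys come from the hunk above, while the values are invented for illustration. In the non-JSON path, each group is rendered as its own table via `utils.print_table`.

```python
# Illustrative payload shape only -- values are made up, keys match the hunk above.
example = {
    "Storage Devices": [
        {"UUID": "...", "Name": "nvme_1", "Size": "1.9 TB", "Serial Number": "...",
         "PCIe": "0000:00:1e.0", "Status": "online", "IO Err": 0, "Health": True},
    ],
    "JM Devices": [
        {"UUID": "...", "Name": "jm_1", "Size": "4.0 GB", "Status": "online",
         "IO Err": 0, "Health": True},
    ],
    "Remote Devices": [
        {"UUID": "...", "Name": "remote_nvme_1", "Size": "1.9 TB",
         "Serial Number": "...", "Node ID": "..."},
    ],
}
```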
@@ -1186,7 +1457,7 @@ def shutdown_storage_node(node_id, force=False):
     for dev in snode.nvme_devices:
         if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY]:
             device_controller.device_set_unavailable(dev.get_id())
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_IN_SHUTDOWN)

     # shutdown node
     # make other nodes disconnect from this node
@@ -1206,7 +1477,7 @@ def shutdown_storage_node(node_id, force=False):
     snode_api = SNodeClient(snode.api_endpoint)
     results, err = snode_api.spdk_process_kill()

-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_OFFLINE)

     logger.info("Setting node status to offline")
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -1233,22 +1504,24 @@ def suspend_storage_node(node_id, force=False):
         return False

     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
-    snodes = db_controller.
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     online_nodes = 0
     for node in snodes:
         if node.status == node.STATUS_ONLINE:
             online_nodes += 1
-    if cluster.ha_type == "ha" and online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
-        logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
-        if force is False:
-            return False

-    if cluster.ha_type == "ha"
-
-
+    if cluster.ha_type == "ha":
+        if online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
+            logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
+            if force is False:
+                return False
+
+        if cluster.status == cluster.STATUS_DEGRADED and force is False:
+            logger.warning(f"Cluster status is degraded, use --force but this will suspend the cluster")
+            return False

     logger.info("Suspending node")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_SUSPENDED)
     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_ONLINE:
             device_controller.device_set_unavailable(dev.get_id())
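The suspend guard is restructured in 1.2.7: the HA branch now nests the `online_nodes <= 3` rule and adds a separate refusal for degraded clusters. A standalone restatement of that decision is sketched below; the status strings stand in for the cluster `STATUS_*` constants, which are defined elsewhere in the package.

```python
# Hedged restatement of the guard above; "active"/"degraded" are assumed stand-ins
# for cluster.STATUS_ACTIVE and cluster.STATUS_DEGRADED.
def suspend_allowed(ha_type, online_nodes, cluster_status, force=False):
    if ha_type == "ha":
        if online_nodes <= 3 and cluster_status == "active" and not force:
            return False  # would drop an active HA cluster below 3 online nodes
        if cluster_status == "degraded" and not force:
            return False  # a degraded cluster is only suspended with --force
    return True
```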
@@ -1292,7 +1565,7 @@ def resume_storage_node(node_id):
     logger.info("Resuming node")

     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)

     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_UNAVAILABLE:
@@ -1668,7 +1941,6 @@ def deploy_cleaner():
     return True


-
 def get_host_secret(node_id):
     db_controller = DBController()
     node = db_controller.get_storage_node_by_id(node_id)
@@ -1831,7 +2103,7 @@ def set_node_status(node_id, status):
     snode.updated_at = str(datetime.datetime.now())
     snode.write_to_db(db_controller.kv_store)
     storage_events.snode_status_change(snode, snode.status, old_status, caused_by="monitor")
-    distr_controller.send_node_status_event(snode
+    distr_controller.send_node_status_event(snode, status)

     if snode.status == StorageNode.STATUS_ONLINE:
         logger.info("Connecting to remote devices")
|