sbcli-pre 1.2.5__zip → 1.2.6__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/PKG-INFO +1 -1
  2. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/env_var +1 -1
  3. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/PKG-INFO +1 -1
  4. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/SOURCES.txt +5 -3
  5. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_cli/cli.py +113 -115
  6. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/cluster_ops.py +138 -235
  7. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/constants.py +5 -7
  8. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/caching_node_controller.py +8 -6
  9. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/cluster_events.py +9 -0
  10. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/device_controller.py +56 -63
  11. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/events_controller.py +5 -3
  12. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/health_controller.py +30 -40
  13. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/lvol_controller.py +51 -38
  14. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/pool_controller.py +8 -4
  15. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/snapshot_controller.py +9 -3
  16. sbcli_pre-1.2.6/simplyblock_core/controllers/tasks_controller.py +103 -0
  17. sbcli_pre-1.2.6/simplyblock_core/controllers/tasks_events.py +37 -0
  18. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/distr_controller.py +13 -9
  19. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/kv_store.py +47 -20
  20. sbcli_pre-1.2.6/simplyblock_core/mgmt_node_ops.py +205 -0
  21. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/events.py +9 -1
  22. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/job_schedule.py +6 -0
  23. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/nvme_device.py +42 -4
  24. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/storage_node.py +9 -1
  25. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/rpc_client.py +55 -10
  26. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/__init__.py +0 -4
  27. sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.6/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
  28. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
  29. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/deploy_stack.sh +9 -0
  30. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
  31. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
  32. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/haproxy.cfg +15 -0
  33. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/install_deps.sh +3 -0
  34. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
  35. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
  36. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/device_monitor.py +5 -46
  37. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/distr_event_collector.py +10 -11
  38. sbcli_pre-1.2.6/simplyblock_core/services/health_check_service.py +134 -0
  39. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/lvol_monitor.py +1 -1
  40. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/lvol_stat_collector.py +1 -1
  41. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/port_stat_collector.py +0 -1
  42. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/storage_node_monitor.py +49 -44
  43. sbcli_pre-1.2.6/simplyblock_core/services/tasks_runner_migration.py +61 -0
  44. sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.6/simplyblock_core/services/tasks_runner_restart.py +95 -46
  45. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/snode_client.py +12 -0
  46. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/storage_node_ops.py +525 -336
  47. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/utils.py +46 -1
  48. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/snode_ops.py +103 -25
  49. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
  50. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_device.py +10 -7
  51. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
  52. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_pool.py +14 -5
  53. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_storage_node.py +3 -10
  54. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/node_utils.py +0 -2
  55. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/utils.py +8 -0
  56. sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
  57. sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
  58. sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
  59. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/README.md +0 -0
  60. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/pyproject.toml +0 -0
  61. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  62. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/entry_points.txt +0 -0
  63. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/requires.txt +0 -0
  64. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/sbcli_pre.egg-info/top_level.txt +0 -0
  65. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/setup.cfg +0 -0
  66. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/setup.py +0 -0
  67. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_cli/main.py +0 -0
  68. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/__init__.py +0 -0
  69. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/cnode_client.py +0 -0
  70. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/compute_node_ops.py +0 -0
  71. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/__init__.py +0 -0
  72. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/device_events.py +0 -0
  73. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/lvol_events.py +0 -0
  74. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/mgmt_events.py +0 -0
  75. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/pool_events.py +0 -0
  76. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/snapshot_events.py +0 -0
  77. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/controllers/storage_events.py +0 -0
  78. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/__init__.py +0 -0
  79. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/base_model.py +0 -0
  80. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/caching_node.py +0 -0
  81. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/cluster.py +0 -0
  82. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/compute_node.py +0 -0
  83. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/deployer.py +0 -0
  84. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/global_settings.py +0 -0
  85. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/iface.py +0 -0
  86. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/lvol_model.py +0 -0
  87. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/mgmt_node.py +0 -0
  88. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/pool.py +0 -0
  89. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/port_stat.py +0 -0
  90. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/snapshot.py +0 -0
  91. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/models/stats.py +0 -0
  92. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/pci_utils.py +0 -0
  93. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  94. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  95. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/config_docker.sh +0 -0
  96. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/devices.json +0 -0
  97. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
  98. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
  99. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
  100. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/dashboards/pools.json +0 -0
  101. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/datasource.yml +0 -0
  102. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/db_config_double.sh +0 -0
  103. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/db_config_single.sh +0 -0
  104. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/prometheus.yml +0 -0
  105. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/run_ssh.sh +0 -0
  106. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/scripts/set_db_config.sh +0 -0
  107. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/__init__.py +0 -0
  108. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/caching_node_monitor.py +0 -0
  109. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/cap_monitor.py +0 -0
  110. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/install_service.sh +0 -0
  111. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/log_agg_service.py +0 -0
  112. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  113. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/remove_service.sh +0 -0
  114. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/services/service_template.service +0 -0
  115. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_core/shell_utils.py +0 -0
  116. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/__init__.py +0 -0
  117. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/app.py +0 -0
  118. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/auth_middleware.py +0 -0
  119. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/__init__.py +0 -0
  120. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  121. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  122. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  123. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  124. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  125. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  126. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
  127. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  128. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  129. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/caching_node_app.py +0 -0
  130. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/caching_node_app_k8s.py +0 -0
  131. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/node_webapp.py +0 -0
  132. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/snode_app.py +0 -0
  133. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/delete.py +0 -0
  134. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/deploy.py +0 -0
  135. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  136. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  137. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/is_up.py +0 -0
  138. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/list_deps.py +0 -0
  139. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/rpac.yaml +0 -0
  140. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/static/tst.py +0 -0
  141. {sbcli_pre-1.2.5 → sbcli_pre-1.2.6}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
simplyblock_core/storage_node_ops.py:

@@ -14,11 +14,11 @@ import docker
 from simplyblock_core import constants, scripts, distr_controller
 from simplyblock_core import utils
 from simplyblock_core.controllers import lvol_controller, storage_events, snapshot_controller, device_events, \
-    device_controller
+    device_controller, tasks_controller
 from simplyblock_core.kv_store import DBController
 from simplyblock_core import shell_utils
 from simplyblock_core.models.iface import IFace
-from simplyblock_core.models.nvme_device import NVMeDevice
+from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.pci_utils import get_nvme_devices, bind_spdk_driver
 from simplyblock_core.rpc_client import RPCClient
@@ -81,55 +81,50 @@ def _get_if_ip_address(ifname):
 
 
 def addNvmeDevices(cluster, rpc_client, devs, snode):
-    sequential_number = 0
     devices = []
     ret = rpc_client.bdev_nvme_controller_list()
-    if ret:
-        ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
-    else:
-        ctr_map = {}
+    ctr_map = {}
+    try:
+        if ret:
+            ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
+    except:
+        pass
 
+    next_physical_label = get_next_physical_device_order()
     for index, pcie in enumerate(devs):
 
         if pcie in ctr_map:
-            nvme_bdev = ctr_map[pcie] + "n1"
+            nvme_controller = ctr_map[pcie]
         else:
-            name = "nvme_%s" % index
-            ret, err = rpc_client.bdev_nvme_controller_attach(name, pcie)
+            nvme_controller = "nvme_%s" % index
+            ret, err = rpc_client.bdev_nvme_controller_attach(nvme_controller, pcie)
             time.sleep(2)
-            nvme_bdev = f"{name}n1"
 
+        nvme_bdev = f"{nvme_controller}n1"
+        rpc_client.bdev_examine(nvme_bdev)
+        time.sleep(5)
         ret = rpc_client.get_bdevs(nvme_bdev)
-        if ret:
-            nvme_dict = ret[0]
-            nvme_driver_data = nvme_dict['driver_specific']['nvme'][0]
-            model_number = nvme_driver_data['ctrlr_data']['model_number']
+        nvme_dict = ret[0]
+        nvme_driver_data = nvme_dict['driver_specific']['nvme'][0]
+        model_number = nvme_driver_data['ctrlr_data']['model_number']
+        total_size = nvme_dict['block_size'] * nvme_dict['num_blocks']
 
-            size = nvme_dict['block_size'] * nvme_dict['num_blocks']
-            device_partitions_count = int(size / (cluster.blk_size * cluster.page_size_in_blocks))
-            devices.append(
-                NVMeDevice({
-                    'uuid': str(uuid.uuid4()),
-                    'device_name': nvme_dict['name'],
-                    'sequential_number': sequential_number,
-                    'partitions_count': device_partitions_count,
-                    'capacity': size,
-                    'size': size,
-                    'pcie_address': nvme_driver_data['pci_address'],
-                    'model_id': model_number,
-                    'serial_number': nvme_driver_data['ctrlr_data']['serial_number'],
-                    'nvme_bdev': nvme_bdev,
-                    'alloc_bdev': nvme_bdev,
-                    'node_id': snode.get_id(),
-                    'cluster_id': snode.cluster_id,
-
-                    # 'nvmf_nqn': subsystem_nqn,
-                    # 'nvmf_ip': IP,
-                    # 'nvmf_port': 4420,
-
-                    'status': 'online'
-                }))
-            sequential_number += device_partitions_count
+        devices.append(
+            NVMeDevice({
+                'uuid': str(uuid.uuid4()),
+                'device_name': nvme_dict['name'],
+                'size': total_size,
+                'physical_label': next_physical_label,
+                'pcie_address': nvme_driver_data['pci_address'],
+                'model_id': model_number,
+                'serial_number': nvme_driver_data['ctrlr_data']['serial_number'],
+                'nvme_bdev': nvme_bdev,
+                'nvme_controller': nvme_controller,
+                'node_id': snode.get_id(),
+                'cluster_id': snode.cluster_id,
+                'status': NVMeDevice.STATUS_ONLINE
+            }))
+        next_physical_label += 1
     return devices
 
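
Note: the rewritten addNvmeDevices() keeps the NVMe controller name on the device record and derives the namespace bdev from it, replacing the old sequential_number/partitions_count bookkeeping with a cluster-wide physical_label. A minimal sketch of the traddr-to-controller mapping used above; the sample RPC response below is hypothetical, not captured output:

    # Build the traddr -> controller-name map from bdev_nvme_controller_list();
    # the response shape here is an illustrative sample.
    ret = [{"name": "nvme_0", "ctrlrs": [{"trid": {"traddr": "0000:00:1e.0"}}]}]
    ctr_map = {}
    try:
        if ret:
            ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
    except Exception:
        pass

    # SPDK exposes the first namespace of a controller as "<controller>n1".
    for pcie in ["0000:00:1e.0", "0000:00:1f.0"]:
        controller = ctr_map.get(pcie, "nvme_1")  # fall back to a new name
        print(pcie, "->", f"{controller}n1")      # 0000:00:1e.0 -> nvme_0n1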
 
@@ -197,10 +192,10 @@ def _run_nvme_smart_log_add(dev_name):
     return data
 
 
-def get_next_cluster_device_order(db_controller):
+def get_next_cluster_device_order(db_controller, cluster_id):
     max_order = 0
     found = False
-    for node in db_controller.get_storage_nodes():
+    for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id):
        for dev in node.nvme_devices:
            found = True
            max_order = max(max_order, dev.cluster_device_order)
@@ -209,91 +204,319 @@ def get_next_cluster_device_order(db_controller):
     return 0
 
 
-def _prepare_cluster_devices(snode, after_restart=False):
+def get_next_physical_device_order():
     db_controller = DBController()
+    max_order = 0
+    found = False
+    for node in db_controller.get_storage_nodes():
+        for dev in node.nvme_devices:
+            found = True
+            max_order = max(max_order, dev.physical_label)
+    if found:
+        return max_order + 1
+    return 0
+
+
+def _search_for_partitions(rpc_client, nvme_device):
+    partitioned_devices = []
+    for bdev in rpc_client.get_bdevs():
+        name = bdev['name']
+        if name.startswith(f"{nvme_device.nvme_bdev}p"):
+            new_dev = NVMeDevice(nvme_device.to_dict())
+            new_dev.uuid = str(uuid.uuid4())
+            new_dev.device_name = name
+            new_dev.nvme_bdev = name
+            new_dev.size = bdev['block_size'] * bdev['num_blocks']
+            partitioned_devices.append(new_dev)
+    return partitioned_devices
+
 
+def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart):
+    raid_bdev = f"raid_jm_{snode.get_id()}"
+    ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs)
+    if not ret:
+        logger.error(f"Failed to create raid_jm_{snode.get_id()}")
+        return False
+    alceml_name = f"alceml_jm_{snode.get_id()}"
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, raid_bdev, str(uuid.uuid4()), pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
+    ret = rpc_client.get_bdevs(raid_bdev)
+
+    return JMDevice({
+        'uuid': str(uuid.uuid4()),
+        'device_name': jm_bdev,
+        'size': ret[0]["block_size"] * ret[0]["num_blocks"],
+        'status': JMDevice.STATUS_ONLINE,
+        'jm_nvme_bdev_list': jm_nvme_bdevs,
+        'raid_bdev': raid_bdev,
+        'alceml_bdev': alceml_name,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart):
+
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, nvme.nvme_bdev, alceml_id, pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
+
+    return JMDevice({
+        'uuid': alceml_id,
+        'device_name': jm_bdev,
+        'size': nvme.size,
+        'status': JMDevice.STATUS_ONLINE,
+        'alceml_bdev': alceml_name,
+        'nvme_bdev': nvme.nvme_bdev,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_storage_device_stack(rpc_client, nvme, snode, after_restart):
+    test_name = f"{nvme.nvme_bdev}_test"
+    ret = rpc_client.bdev_passtest_create(test_name, nvme.nvme_bdev)
+    if not ret:
+        logger.error(f"Failed to create passtest bdev {test_name}")
+        return False
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode,
+                                        dev_cpu_mask=snode.dev_cpu_mask)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    # add pass through
+    pt_name = f"{alceml_name}_PT"
+    ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
+        return False
+
+    subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
+    logger.info("creating subsystem %s", subsystem_nqn)
+    ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
+    IP = None
+    for iface in snode.data_nics:
+        if iface.ip4_address:
+            tr_type = iface.get_transport_type()
+            ret = rpc_client.transport_list()
+            found = False
+            if ret:
+                for ty in ret:
+                    if ty['trtype'] == tr_type:
+                        found = True
+            if found is False:
+                ret = rpc_client.transport_create(tr_type)
+            logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
+            ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
+            IP = iface.ip4_address
+            break
+    logger.info(f"add {pt_name} to subsystem")
+    ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+    if not ret:
+        logger.error(f"Failed to add: {pt_name} to the subsystem: {subsystem_nqn}")
+        return False
+
+    nvme.testing_bdev = test_name
+    nvme.alceml_bdev = alceml_name
+    nvme.pt_bdev = pt_name
+    nvme.nvmf_nqn = subsystem_nqn
+    nvme.nvmf_ip = IP
+    nvme.nvmf_port = 4420
+    nvme.io_error = False
+    nvme.status = NVMeDevice.STATUS_ONLINE
+    return nvme
+
+
+def _create_device_partitions(rpc_client, nvme, snode):
+    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev)
+    time.sleep(3)
+    if not nbd_device:
+        logger.error(f"Failed to start nbd dev")
+        return False
+    snode_api = SNodeClient(snode.api_endpoint)
+    result, error = snode_api.make_gpt_partitions(
+        nbd_device, snode.jm_percent, snode.num_partitions_per_dev)
+    if error:
+        logger.error(f"Failed to make partitions")
+        logger.error(error)
+        return False
+    time.sleep(3)
+    rpc_client.nbd_stop_disk(nbd_device)
+    time.sleep(1)
+    rpc_client.bdev_nvme_detach_controller(nvme.nvme_controller)
+    time.sleep(1)
+    rpc_client.bdev_nvme_controller_attach(nvme.nvme_controller, nvme.pcie_address)
+    time.sleep(1)
+    rpc_client.bdev_examine(nvme.nvme_bdev)
+    time.sleep(1)
+    return True
+
+
+def _prepare_cluster_devices_partitions(snode, devices):
+    db_controller = DBController()
     rpc_client = RPCClient(
         snode.mgmt_ip, snode.rpc_port,
         snode.rpc_username, snode.rpc_password)
 
-    for index, nvme in enumerate(snode.nvme_devices):
+    new_devices = []
+    jm_devices = []
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
+            continue
+
+        # look for partitions
+        partitioned_devices = _search_for_partitions(rpc_client, nvme)
+        logger.debug("partitioned_devices")
+        logger.debug(partitioned_devices)
+        if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+            logger.info("Partitioned devices found")
+        else:
+            logger.info(f"Creating partitions for {nvme.nvme_bdev}")
+            _create_device_partitions(rpc_client, nvme, snode)
+            partitioned_devices = _search_for_partitions(rpc_client, nvme)
+            if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+                logger.info("Device partitions created")
+            else:
+                logger.error("Failed to create partitions")
+                return False
+
+        jm_devices.append(partitioned_devices.pop(0))
+
+        for dev in partitioned_devices:
+            new_device = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+
+    if jm_devices:
+        jm_nvme_bdevs = [dev.nvme_bdev for dev in jm_devices]
+        jm_device = _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart=False)
+        if not jm_device:
+            logger.error(f"Failed to create JM device")
+            return False
+        snode.jm_device = jm_device
+
+    return True
+
+
+def _prepare_cluster_devices_jm_on_dev(snode, devices):
+    db_controller = DBController()
+
+    jm_device = devices[0]
+    # Set device cluster order
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        nvme.cluster_device_order = dev_order
+        dev_order += 1
+        if nvme.size < jm_device.size:
+            jm_device = nvme
+        device_events.device_create(nvme)
+    jm_device.status = NVMeDevice.STATUS_JM
+
+    rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+
+    new_devices = []
+    for index, nvme in enumerate(devices):
         if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE,
                                NVMeDevice.STATUS_JM, NVMeDevice.STATUS_READONLY]:
             logger.debug(f"Device is not online or unavailable: {nvme.get_id()}, status: {nvme.status}")
             continue
 
-        test_name = f"{nvme.nvme_bdev}_test"
-        # create testing bdev
-        ret = rpc_client.bdev_passtest_create(test_name, nvme.nvme_bdev)
-        if not ret:
-            logger.error(f"Failed to create bdev: {test_name}")
-            return False
-        alceml_id = nvme.get_id()
-        alceml_name = device_controller.get_alceml_name(alceml_id)
-        logger.info(f"adding {alceml_name}")
-        pba_init_mode = 3
-        if after_restart:
-            pba_init_mode = 2
-        ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode)
-        if not ret:
-            logger.error(f"Failed to create alceml bdev: {alceml_name}")
-            return False
-
-        # create jm
-        if nvme.jm_bdev:
-            ret = rpc_client.bdev_jm_create(nvme.jm_bdev, alceml_name)
-            if not ret:
-                logger.error(f"Failed to create JM bdev: {nvme.jm_bdev}")
+        if nvme.status == NVMeDevice.STATUS_JM:
+            jm_device = _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart=False)
+            if not jm_device:
+                logger.error(f"Failed to create JM device")
                 return False
-            nvme.testing_bdev = test_name
-            nvme.alceml_bdev = alceml_name
-            nvme.io_error = True
-            nvme.status = NVMeDevice.STATUS_JM
+            snode.jm_device = jm_device
+        else:
+            new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+    return True
+
+
+def _prepare_cluster_devices_on_restart(snode):
+    db_controller = DBController()
+
+    rpc_client = RPCClient(
+        snode.mgmt_ip, snode.rpc_port,
+        snode.rpc_username, snode.rpc_password)
+
+    for index, nvme in enumerate(snode.nvme_devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
             continue
 
-        # add pass through
-        pt_name = f"{alceml_name}_PT"
-        ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
+        dev = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=True)
+        if not dev:
+            logger.error(f"Failed to create dev stack {nvme.get_id()}")
+            return False
+        device_events.device_restarted(dev)
+
+    # prepare JM device
+    jm_device = snode.jm_device
+    if jm_device.jm_nvme_bdev_list:
+        ret = _create_jm_stack_on_raid(rpc_client, jm_device.jm_nvme_bdev_list, snode, after_restart=False)
         if not ret:
-            logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
+            logger.error(f"Failed to create JM device")
             return False
+    else:
 
-        subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
-        logger.info("creating subsystem %s", subsystem_nqn)
-        ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
-        IP = None
-        for iface in snode.data_nics:
-            if iface.ip4_address:
-                tr_type = iface.get_transport_type()
-                ret = rpc_client.transport_list()
-                found = False
-                if ret:
-                    for ty in ret:
-                        if ty['trtype'] == tr_type:
-                            found = True
-                if found is False:
-                    ret = rpc_client.transport_create(tr_type)
-                logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
-                ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
-                IP = iface.ip4_address
-                break
-        logger.info(f"add {pt_name} to subsystem")
-        ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+        ret = rpc_client.bdev_alceml_create(jm_device.alceml_bdev, jm_device.nvme_bdev, jm_device.get_id(), pba_init_mode=2)
         if not ret:
-            logger.error(f"Failed to add: {pt_name} to the subsystem: {subsystem_nqn}")
+            logger.error(f"Failed to create alceml bdev: {jm_device.alceml_bdev}")
             return False
 
-        nvme.testing_bdev = test_name
-        nvme.alceml_bdev = alceml_name
-        nvme.pt_bdev = pt_name
-        nvme.nvmf_nqn = subsystem_nqn
-        nvme.nvmf_ip = IP
-        nvme.nvmf_port = 4420
-        nvme.io_error = False
-        old_status = nvme.status
-        nvme.status = NVMeDevice.STATUS_ONLINE
-        device_events.device_status_change(nvme, nvme.status, old_status)
-        snode.write_to_db(db_controller.kv_store)
+        jm_bdev = f"jm_{snode.get_id()}"
+        ret = rpc_client.bdev_jm_create(jm_bdev, jm_device.alceml_bdev)
+        if not ret:
+            logger.error(f"Failed to create {jm_bdev}")
+            return False
 
     return True
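
Note: the new device-preparation helpers split responsibilities: _search_for_partitions() finds partition bdevs by the "<nvme_bdev>p" name prefix, and _prepare_cluster_devices_partitions() treats a device as ready when exactly one JM partition plus num_partitions_per_dev data partitions exist, reserving the first partition for the JM RAID. A short sketch of that check; the bdev names are illustrative:

    # A device is fully partitioned when 1 JM partition plus
    # num_partitions_per_dev data partitions are present.
    num_partitions_per_dev = 3
    bdev_names = ["nvme_0n1", "nvme_0n1p1", "nvme_0n1p2", "nvme_0n1p3", "nvme_0n1p4"]
    partitions = [n for n in bdev_names if n.startswith("nvme_0n1p")]
    if len(partitions) == 1 + num_partitions_per_dev:
        jm_part, data_parts = partitions[0], partitions[1:]  # p1 goes to the JM raid
    else:
        print("Creating partitions")  # _create_device_partitions() would run here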
 
@@ -307,7 +530,7 @@ def _connect_to_remote_devs(this_node):
 
     remote_devices = []
     # connect to remote devs
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == this_node.get_id() or node.status == node.STATUS_OFFLINE:
             continue
@@ -326,9 +549,10 @@ def _connect_to_remote_devs(this_node):
     return remote_devices
 
 
-def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
+def add_node(cluster_id, node_ip, iface_name, data_nics_list,
              spdk_mem, spdk_image=None, spdk_debug=False,
-             small_pool_count=0, large_pool_count=0, small_bufsize=0, large_bufsize=0, jm_device_pcie=None):
+             small_pool_count=0, large_pool_count=0, small_bufsize=0, large_bufsize=0,
+             num_partitions_per_dev=0, jm_percent=0):
     db_controller = DBController()
     kv_store = db_controller.kv_store
 
@@ -399,6 +623,27 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         spdk_mem = huge_free
         logger.info(f"Using the free hugepages for spdk memory: {utils.humanbytes(huge_free)}")
 
+    # Tune cpu maks parameters
+    cpu_count = node_info["cpu_count"]
+    pollers_mask = ""
+    app_thread_mask = ""
+    dev_cpu_mask = ""
+    if cpu_count < 8:
+        mask = (1 << (cpu_count - 1)) - 1
+        mask <<= 1
+        spdk_cpu_mask = f'0x{mask:X}'
+        os_cores = [0]
+    else:
+        os_cores, nvme_pollers_cores, app_thread_core, dev_cpu_cores = \
+            utils.calculate_core_allocation(cpu_count)
+        spdk_cores = nvme_pollers_cores + app_thread_core + dev_cpu_cores
+
+        pollers_mask = utils.generate_mask(nvme_pollers_cores)
+        app_thread_mask = utils.generate_mask(app_thread_core)
+        spdk_cpu_mask = utils.generate_mask(spdk_cores)
+        dev_cpu_mask = utils.generate_mask(dev_cpu_cores)
+
+
     logger.info("Joining docker swarm...")
     cluster_docker = utils.get_docker_client(cluster_id)
     cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"]
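
Note: for hosts with fewer than 8 CPUs, the mask arithmetic above reserves core 0 for the OS and hands every other core to SPDK; larger hosts delegate to utils.calculate_core_allocation(). A worked example of the small-host branch:

    # With cpu_count = 4: cores 1..3 go to SPDK, core 0 stays with the OS.
    cpu_count = 4
    mask = (1 << (cpu_count - 1)) - 1   # 0b0111 (three cores)
    mask <<= 1                          # 0b1110 (shifted off core 0)
    spdk_cpu_mask = f'0x{mask:X}'
    print(spdk_cpu_mask)                # 0xE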
@@ -476,12 +721,19 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     snode.spdk_image = spdk_image or ""
     snode.spdk_debug = spdk_debug or 0
     snode.write_to_db(kv_store)
+    snode.app_thread_mask = app_thread_mask or ""
+    snode.pollers_mask = pollers_mask or ""
+    snode.dev_cpu_mask = dev_cpu_mask or ""
+    snode.os_cores = os_cores or []
 
     snode.iobuf_small_pool_count = small_pool_count or 0
     snode.iobuf_large_pool_count = large_pool_count or 0
     snode.iobuf_small_bufsize = small_bufsize or 0
     snode.iobuf_large_bufsize = large_bufsize or 0
 
+    snode.num_partitions_per_dev = num_partitions_per_dev
+    snode.jm_percent = jm_percent
+
     snode.write_to_db(kv_store)
 
     # creating RPCClient instance
@@ -499,13 +751,41 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     logger.error("Failed to set iobuf options")
     return False
 
-    # 2- start spdk framework
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False
 
-    # 3- set nvme bdev options
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
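
Note: SPDK initialization is now a six-step sequence (iobuf options, socket options, poller config, framework init, app_thread pinning, nvme bdev options). The app_thread pinning step looks up the thread id from thread_get_stats() before re-pinning it; a sketch against an assumed response shaped like SPDK's {"threads": [{"id": ..., "name": ...}]}:

    # Find the id reported for "app_thread", then re-pin it with
    # thread_set_cpumask(id, mask). The stats dict below is a mock.
    stats = {"threads": [{"id": 1, "name": "app_thread"},
                         {"id": 2, "name": "nvmf_tgt_poll_group_0"}]}
    app_thread_process_id = 0
    for entry in stats.get("threads", []):
        if entry['name'] == 'app_thread':
            app_thread_process_id = entry['id']
            break
    print(app_thread_process_id)  # 1 -> rpc_client.thread_set_cpumask(1, app_thread_mask)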
@@ -513,36 +793,18 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
 
     # get new node info after starting spdk
     node_info, _ = snode_api.info()
-    # adding devices
+
+    # discover devices
     nvme_devs = addNvmeDevices(cluster, rpc_client, node_info['spdk_pcie_list'], snode)
     if not nvme_devs:
         logger.error("No NVMe devices was found!")
         return False
 
-    snode.nvme_devices = nvme_devs
-
-    jm_device = snode.nvme_devices[0]
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-        if jm_device_pcie:
-            if nvme.pcie_address == jm_device_pcie:
-                jm_device = nvme
-        elif nvme.size < jm_device.size:
-            jm_device = nvme
-        device_events.device_create(nvme)
-
-    # create jm
-    logger.info(f"Using device for JM: {jm_device.get_id()}")
-    jm_device.jm_bdev = f"jm_{snode.get_id()}"
-
-    # save object
-    snode.write_to_db(db_controller.kv_store)
-
     # prepare devices
-    ret = _prepare_cluster_devices(snode)
+    if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0:
+        ret = _prepare_cluster_devices_jm_on_dev(snode, nvme_devs)
+    else:
+        ret = _prepare_cluster_devices_partitions(snode, nvme_devs)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
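
Note: device preparation now dispatches on the two new add_node parameters; partitioning is used only when both num_partitions_per_dev and jm_percent are set, otherwise the smallest whole device becomes the JM device. A sketch of the branch with a hypothetical helper name:

    def pick_prepare_strategy(num_partitions_per_dev, jm_percent):
        if num_partitions_per_dev == 0 or jm_percent == 0:
            return "jm_on_dev"      # _prepare_cluster_devices_jm_on_dev
        return "partitions"         # _prepare_cluster_devices_partitions

    print(pick_prepare_strategy(0, 0))  # jm_on_dev
    print(pick_prepare_strategy(3, 3))  # partitions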
@@ -557,7 +819,7 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
 
     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -599,150 +861,16 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     time.sleep(3)
 
     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev.cluster_device_order, "online")
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
 
     storage_events.snode_add(snode)
     logger.info("Done")
     return "Success"
 
 
-# Deprecated
-def add_storage_node(cluster_id, iface_name, data_nics):
-    db_controller = DBController()
-    kv_store = db_controller.kv_store
-
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-    if not cluster:
-        logger.error("Cluster not found: %s", cluster_id)
-        return False
-
-    logger.info("Add Storage node")
-
-    hostname = utils.get_hostname()
-    snode = db_controller.get_storage_node_by_hostname(hostname)
-    if snode:
-        logger.error("Node already exists, try remove it first.")
-        exit(1)
-    else:
-        snode = StorageNode()
-        snode.uuid = str(uuid.uuid4())
-
-    mgmt_ip = _get_if_ip_address(iface_name)
-    system_id = utils.get_system_id()
-
-    BASE_NQN = cluster.nqn.split(":")[0]
-    subsystem_nqn = f"{BASE_NQN}:{hostname}"
-
-    if data_nics:
-        data_nics = _get_data_nics(data_nics)
-    else:
-        data_nics = _get_data_nics([iface_name])
-
-    rpc_user, rpc_pass = utils.generate_rpc_user_and_pass()
-
-    # creating storage node object
-    snode.status = StorageNode.STATUS_IN_CREATION
-    snode.baseboard_sn = utils.get_baseboard_sn()
-    snode.system_uuid = system_id
-    snode.hostname = hostname
-    snode.host_nqn = subsystem_nqn
-    snode.subsystem = subsystem_nqn
-    snode.data_nics = data_nics
-    snode.mgmt_ip = mgmt_ip
-    snode.rpc_port = constants.RPC_HTTP_PROXY_PORT
-    snode.rpc_username = rpc_user
-    snode.rpc_password = rpc_pass
-    snode.cluster_id = cluster_id
-    snode.write_to_db(kv_store)
-
-    # creating RPCClient instance
-    rpc_client = RPCClient(
-        snode.mgmt_ip,
-        snode.rpc_port,
-        snode.rpc_username,
-        snode.rpc_password)
-
-    logger.info("Getting nvme devices")
-    devs = get_nvme_devices()
-    logger.debug(devs)
-    pcies = [d[0] for d in devs]
-    nvme_devs = addNvmeDevices(cluster, rpc_client, pcies, snode)
-    if not nvme_devs:
-        logger.error("No NVMe devices was found!")
-
-    logger.debug(nvme_devs)
-    snode.nvme_devices = nvme_devs
-
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-    snode.write_to_db(db_controller.kv_store)
-
-    # prepare devices
-    _prepare_cluster_devices(snode)
-
-    logger.info("Connecting to remote devices")
-    remote_devices = _connect_to_remote_devs(snode)
-    snode.remote_devices = remote_devices
-
-    logger.info("Setting node status to Active")
-    snode.status = StorageNode.STATUS_ONLINE
-    snode.write_to_db(kv_store)
-
-    # make other nodes connect to the new devices
-    logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.get_storage_nodes()
-    for node_index, node in enumerate(snodes):
-        if node.get_id() == snode.get_id():
-            continue
-        logger.info(f"Connecting to node: {node.get_id()}")
-        rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password)
-        count = 0
-        for dev in snode.nvme_devices:
-            name = f"remote_{dev.alceml_bdev}"
-            ret = rpc_client.bdev_nvme_attach_controller_tcp(name, dev.nvmf_nqn, dev.nvmf_ip, dev.nvmf_port)
-            if not ret:
-                logger.error(f"Failed to connect to device: {name}")
-                continue
-
-            dev.remote_bdev = f"{name}n1"
-            idx = -1
-            for i, d in enumerate(node.remote_devices):
-                if d.get_id() == dev.get_id():
-                    idx = i
-                    break
-            if idx >= 0:
-                node.remote_devices[idx] = dev
-            else:
-                node.remote_devices.append(dev)
-            count += 1
-        node.write_to_db(kv_store)
-        logger.info(f"connected to devices count: {count}")
-
-    logger.info("Sending cluster map")
-    ret = distr_controller.send_cluster_map_to_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map"
-    ret = distr_controller.send_cluster_map_add_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map add node"
-    time.sleep(3)
-
-    logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
-
-    for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev.cluster_device_order, "online")
-
-    logger.info("Done")
-    return "Success"
-
-
 def delete_storage_node(node_id):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -756,7 +884,7 @@ def delete_storage_node(node_id):
 
     snode.remove(db_controller.kv_store)
 
-    for lvol in db_controller.get_lvols():
+    for lvol in db_controller.get_lvols(snode.cluster_id):
         logger.info(f"Sending cluster map to LVol: {lvol.get_id()}")
         lvol_controller.send_cluster_map(lvol.get_id())
 
@@ -764,7 +892,7 @@ def delete_storage_node(node_id):
     logger.info("done")
 
 
-def remove_storage_node(node_id, force_remove=False):
+def remove_storage_node(node_id, force_remove=False, force_migrate=False):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
     if not snode:
@@ -811,7 +939,7 @@ def remove_storage_node(node_id, force_remove=False):
         distr_controller.disconnect_device(dev)
         old_status = dev.status
         dev.status = NVMeDevice.STATUS_FAILED
-        distr_controller.send_dev_status_event(dev.cluster_device_order, NVMeDevice.STATUS_FAILED)
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_FAILED)
         device_events.device_status_change(dev, NVMeDevice.STATUS_FAILED, old_status)
 
     logger.info("Removing storage node")
@@ -825,24 +953,29 @@
         pass
 
     try:
-        snode_api = SNodeClient(snode.api_endpoint)
+        snode_api = SNodeClient(snode.api_endpoint, timeout=20)
         snode_api.spdk_process_kill()
         snode_api.leave_swarm()
+        pci_address = []
+        for dev in snode.nvme_devices:
+            if dev.pcie_address not in pci_address:
+                ret = snode_api.delete_dev_gpt_partitions(dev.pcie_address)
+                logger.debug(ret)
+                pci_address.append(dev.pcie_address)
     except Exception as e:
-        logger.warning(f"Failed to remove SPDK process: {e}")
+        logger.exception(e)
 
     old_status = snode.status
     snode.status = StorageNode.STATUS_REMOVED
     snode.write_to_db(db_controller.kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode.get_id(), snode.status)
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, StorageNode.STATUS_REMOVED, old_status)
     logger.info("done")
 
 
 def restart_storage_node(
         node_id,
-        spdk_cpu_mask=None,
         spdk_mem=None,
         spdk_image=None,
         set_spdk_debug=None,
@@ -868,7 +1001,7 @@
     snode.status = StorageNode.STATUS_RESTARTING
     snode.write_to_db(kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode.get_id(), snode.status)
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, snode.status, old_status)
 
     logger.info(f"Restarting Storage node: {snode.mgmt_ip}")
@@ -878,10 +1011,6 @@
     logger.info(f"Node info: {node_info}")
 
     logger.info("Restarting SPDK")
-    cpu = snode.spdk_cpu_mask
-    if spdk_cpu_mask:
-        cpu = spdk_cpu_mask
-        snode.spdk_cpu_mask = cpu
     mem = snode.spdk_mem
     if spdk_mem:
         mem = spdk_mem
@@ -897,7 +1026,7 @@
 
     cluster_docker = utils.get_docker_client(snode.cluster_id)
     cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"]
-    results, err = snode_api.spdk_process_start(cpu, mem, img, spdk_debug, cluster_ip)
+    results, err = snode_api.spdk_process_start(snode.spdk_cpu_mask, mem, img, spdk_debug, cluster_ip)
 
     if not results:
         logger.error(f"Failed to start spdk: {err}")
@@ -931,13 +1060,41 @@
     logger.error("Failed to set iobuf options")
     return False
 
-    # 2- start spdk framework
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False
 
-    # 3- set nvme bdev options
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
@@ -970,22 +1127,23 @@
         else:
             logger.info(f"Device not found: {db_dev.get_id()}")
             db_dev.status = NVMeDevice.STATUS_REMOVED
-            distr_controller.send_dev_status_event(db_dev.cluster_device_order, "offline")
+            distr_controller.send_dev_status_event(db_dev, db_dev.status)
 
-    for dev in nvme_devs:
-        if dev.serial_number not in known_devices_sn:
-            logger.info(f"New device found: {dev.get_id()}")
-            dev.status = 'new'
-            new_devices.append(dev)
-            snode.nvme_devices.append(dev)
+    # todo: handle new devices
+    # for dev in nvme_devs:
+    #     if dev.serial_number not in known_devices_sn:
+    #         logger.info(f"New device found: {dev.get_id()}")
+    #         dev.status = NVMeDevice.STATUS_NEW
+    #         new_devices.append(dev)
+    #         snode.nvme_devices.append(dev)
 
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(new_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
+    # dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    # for index, nvme in enumerate(new_devices):
+    #     nvme.cluster_device_order = dev_order
+    #     dev_order += 1
 
     # prepare devices
-    ret = _prepare_cluster_devices(snode, after_restart=True)
+    ret = _prepare_cluster_devices_on_restart(snode)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
@@ -996,7 +1154,7 @@
 
     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the node devices")
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -1034,20 +1192,23 @@
     storage_events.snode_status_change(snode, snode.status, old_status)
 
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode.get_id(), NVMeDevice.STATUS_ONLINE)
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     logger.info("Sending devices event updates")
+    logger.info("Starting migration tasks")
     for dev in snode.nvme_devices:
         if dev.status != NVMeDevice.STATUS_ONLINE:
-            logger.debug(f"Device is not online: {dev.get_id()}, status: {dev.status}")
+            logger.info(f"Device is not online: {dev.get_id()}, status: {dev.status}")
             continue
-        distr_controller.send_dev_status_event(dev.cluster_device_order, NVMeDevice.STATUS_ONLINE)
 
-    logger.info("Sending cluster map to current node")
-    ret = distr_controller.send_cluster_map_to_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map"
-    time.sleep(3)
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
+        tasks_controller.add_device_mig_task(dev.get_id())
+
+    # logger.info("Sending cluster map to current node")
+    # ret = distr_controller.send_cluster_map_to_node(snode)
+    # if not ret:
+    #     return False, "Failed to send cluster map"
+    # time.sleep(3)
 
     for lvol_id in snode.lvols:
         lvol = lvol_controller.recreate_lvol(lvol_id, snode)
@@ -1062,9 +1223,12 @@
     return "Success"
 
 
-def list_storage_nodes(kv_store, is_json):
-    db_controller = DBController(kv_store)
-    nodes = db_controller.get_storage_nodes()
+def list_storage_nodes(is_json, cluster_id=None):
+    db_controller = DBController()
+    if cluster_id:
+        nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+    else:
+        nodes = db_controller.get_storage_nodes()
     data = []
     output = ""
 
@@ -1111,26 +1275,43 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         logger.error("This storage node is not part of the cluster")
         return False
 
-    data = []
+    storage_devices = []
+    jm_devices = []
+    remote_devices = []
     for device in snode.nvme_devices:
         logger.debug(device)
         logger.debug("*" * 20)
-        data.append({
+        storage_devices.append({
             "UUID": device.uuid,
             "Name": device.device_name,
-            "Hostname": snode.hostname,
             "Size": utils.humanbytes(device.size),
-            # "Sequential Number": device.sequential_number,
-            # "Partitions Count": device.partitions_count,
-            # "Model ID": device.model_id,
             "Serial Number": device.serial_number,
             "PCIe": device.pcie_address,
             "Status": device.status,
             "IO Err": device.io_error,
-            "Health": device.health_check,
+            "Health": device.health_check
+        })
 
+    if snode.jm_device:
+        jm_devices.append({
+            "UUID": snode.jm_device.uuid,
+            "Name": snode.jm_device.device_name,
+            "Size": utils.humanbytes(snode.jm_device.size),
+            "Status": snode.jm_device.status,
+            "IO Err": snode.jm_device.io_error,
+            "Health": snode.jm_device.health_check
         })
 
+    for device in snode.remote_devices:
+        logger.debug(device)
+        logger.debug("*" * 20)
+        remote_devices.append({
+            "UUID": device.uuid,
+            "Name": device.device_name,
+            "Size": utils.humanbytes(device.size),
+            "Serial Number": device.serial_number,
+            "Node ID": device.node_id,
+        })
     if sort and sort in ['node-seq', 'dev-seq', 'serial']:
         if sort == 'serial':
             sort_key = "Serial Number"
@@ -1139,13 +1320,20 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         elif sort == 'node-seq':
             # TODO: check this key
             sort_key = "Sequential Number"
-        sorted_data = sorted(data, key=lambda d: d[sort_key])
-        data = sorted_data
+        storage_devices = sorted(storage_devices, key=lambda d: d[sort_key])
 
+    data = {
+        "Storage Devices": storage_devices,
+        "JM Devices": jm_devices,
+        "Remote Devices": remote_devices,
+    }
     if is_json:
         return json.dumps(data, indent=2)
     else:
-        return utils.print_table(data)
+        out = ""
+        for d in data:
+            out += f"{d}\n{utils.print_table(data[d])}\n\n"
+        return out
 
 
 def shutdown_storage_node(node_id, force=False):
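
Note: list_storage_devices() now returns three titled tables (storage, JM, and remote devices) instead of one flat list; the non-JSON path renders each group with utils.print_table. A sketch of the grouped rendering, with print_table stubbed for illustration:

    def print_table(rows):  # stand-in for utils.print_table
        return "\n".join(str(r) for r in rows)

    data = {
        "Storage Devices": [{"UUID": "dev-1", "Status": "online"}],
        "JM Devices": [{"UUID": "jm-1", "Status": "online"}],
        "Remote Devices": [],
    }
    out = ""
    for d in data:
        out += f"{d}\n{print_table(data[d])}\n\n"
    print(out)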
@@ -1186,7 +1374,7 @@ def shutdown_storage_node(node_id, force=False):
     for dev in snode.nvme_devices:
         if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY]:
             device_controller.device_set_unavailable(dev.get_id())
-    distr_controller.send_node_status_event(snode.get_id(), "in_shutdown")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_IN_SHUTDOWN)
 
     # shutdown node
     # make other nodes disconnect from this node
@@ -1206,7 +1394,7 @@
     snode_api = SNodeClient(snode.api_endpoint)
     results, err = snode_api.spdk_process_kill()
 
-    distr_controller.send_node_status_event(snode.get_id(), StorageNode.STATUS_OFFLINE)
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_OFFLINE)
 
     logger.info("Setting node status to offline")
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -1233,22 +1421,24 @@ def suspend_storage_node(node_id, force=False):
         return False
 
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     online_nodes = 0
     for node in snodes:
         if node.status == node.STATUS_ONLINE:
             online_nodes += 1
-    if cluster.ha_type == "ha" and online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
-        logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
-        if force is False:
-            return False
 
-    if cluster.ha_type == "ha" and cluster.status == cluster.STATUS_DEGRADED and force is False:
-        logger.warning(f"Cluster status is degraded, use --force but this will suspend the cluster")
-        return False
+    if cluster.ha_type == "ha":
+        if online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
+            logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
+            if force is False:
+                return False
+
+        if cluster.status == cluster.STATUS_DEGRADED and force is False:
+            logger.warning(f"Cluster status is degraded, use --force but this will suspend the cluster")
+            return False
 
     logger.info("Suspending node")
-    distr_controller.send_node_status_event(snode.get_id(), "suspended")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_SUSPENDED)
     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_ONLINE:
             device_controller.device_set_unavailable(dev.get_id())
@@ -1292,7 +1482,7 @@ def resume_storage_node(node_id):
     logger.info("Resuming node")
 
     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_UNAVAILABLE:
@@ -1668,7 +1858,6 @@ def deploy_cleaner():
     return True
 
 
-
 def get_host_secret(node_id):
     db_controller = DBController()
     node = db_controller.get_storage_node_by_id(node_id)
@@ -1831,7 +2020,7 @@ def set_node_status(node_id, status):
     snode.updated_at = str(datetime.datetime.now())
     snode.write_to_db(db_controller.kv_store)
     storage_events.snode_status_change(snode, snode.status, old_status, caused_by="monitor")
-    distr_controller.send_node_status_event(snode.get_id(), status)
+    distr_controller.send_node_status_event(snode, status)
 
     if snode.status == StorageNode.STATUS_ONLINE:
         logger.info("Connecting to remote devices")