sbcli-pre 1.2.5 (zip) → 1.2.7 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
  2. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
  3. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
  4. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
  5. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
  6. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
  7. sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
  8. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
  9. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
  10. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
  11. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
  12. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
  13. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
  14. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
  15. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
  16. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
  17. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
  18. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
  19. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
  20. sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
  21. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
  22. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
  23. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
  24. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
  25. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
  26. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
  27. sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
  28. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
  29. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
  30. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
  31. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
  32. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
  33. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
  34. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
  35. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
  36. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
  37. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
  38. sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
  39. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
  40. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
  41. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
  42. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
  43. sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
  44. sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
  45. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
  46. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
  47. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
  48. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
  49. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
  50. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
  51. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
  52. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
  53. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
  54. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
  55. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
  56. sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
  57. sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
  58. sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
  59. sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
  60. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
  61. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
  62. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  63. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
  64. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
  65. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
  66. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
  67. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
  68. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
  69. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
  70. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
  71. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
  72. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
  73. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
  74. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
  75. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
  76. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
  77. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
  78. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
  79. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
  80. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
  81. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
  82. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
  83. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
  84. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
  85. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
  86. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
  87. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
  88. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
  89. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
  90. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
  91. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
  92. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
  93. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
  94. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  95. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  96. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
  97. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
  98. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
  99. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
  100. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
  101. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
  102. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
  103. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
  104. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
  105. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
  106. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
  107. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
  108. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
  109. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
  110. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
  111. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
  112. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
  113. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  114. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
  115. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
  116. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
  117. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
  118. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
  119. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
  120. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
  121. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  122. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  123. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  124. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  125. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  126. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  127. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
  128. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  129. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  130. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
  131. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
  132. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
  133. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
  134. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
  135. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
  136. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  137. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  138. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
  139. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
  140. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
  141. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
  142. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
@@ -14,11 +14,11 @@ import docker
 from simplyblock_core import constants, scripts, distr_controller
 from simplyblock_core import utils
 from simplyblock_core.controllers import lvol_controller, storage_events, snapshot_controller, device_events, \
-    device_controller
+    device_controller, tasks_controller
 from simplyblock_core.kv_store import DBController
 from simplyblock_core import shell_utils
 from simplyblock_core.models.iface import IFace
-from simplyblock_core.models.nvme_device import NVMeDevice
+from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.pci_utils import get_nvme_devices, bind_spdk_driver
 from simplyblock_core.rpc_client import RPCClient
@@ -81,55 +81,50 @@ def _get_if_ip_address(ifname):
 
 
 def addNvmeDevices(cluster, rpc_client, devs, snode):
-    sequential_number = 0
     devices = []
     ret = rpc_client.bdev_nvme_controller_list()
-    if ret:
-        ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
-    else:
-        ctr_map = {}
+    ctr_map = {}
+    try:
+        if ret:
+            ctr_map = {i["ctrlrs"][0]['trid']['traddr']: i["name"] for i in ret}
+    except:
+        pass
 
+    next_physical_label = get_next_physical_device_order()
     for index, pcie in enumerate(devs):
 
         if pcie in ctr_map:
-            nvme_bdev = ctr_map[pcie] + "n1"
+            nvme_controller = ctr_map[pcie]
         else:
-            name = "nvme_%s" % index
-            ret, err = rpc_client.bdev_nvme_controller_attach(name, pcie)
+            nvme_controller = "nvme_%s" % index
+            ret, err = rpc_client.bdev_nvme_controller_attach(nvme_controller, pcie)
             time.sleep(2)
-            nvme_bdev = f"{name}n1"
 
+        nvme_bdev = f"{nvme_controller}n1"
+        rpc_client.bdev_examine(nvme_bdev)
+        time.sleep(5)
         ret = rpc_client.get_bdevs(nvme_bdev)
-        if ret:
-            nvme_dict = ret[0]
-            nvme_driver_data = nvme_dict['driver_specific']['nvme'][0]
-            model_number = nvme_driver_data['ctrlr_data']['model_number']
+        nvme_dict = ret[0]
+        nvme_driver_data = nvme_dict['driver_specific']['nvme'][0]
+        model_number = nvme_driver_data['ctrlr_data']['model_number']
+        total_size = nvme_dict['block_size'] * nvme_dict['num_blocks']
 
-            size = nvme_dict['block_size'] * nvme_dict['num_blocks']
-            device_partitions_count = int(size / (cluster.blk_size * cluster.page_size_in_blocks))
-            devices.append(
-                NVMeDevice({
-                    'uuid': str(uuid.uuid4()),
-                    'device_name': nvme_dict['name'],
-                    'sequential_number': sequential_number,
-                    'partitions_count': device_partitions_count,
-                    'capacity': size,
-                    'size': size,
-                    'pcie_address': nvme_driver_data['pci_address'],
-                    'model_id': model_number,
-                    'serial_number': nvme_driver_data['ctrlr_data']['serial_number'],
-                    'nvme_bdev': nvme_bdev,
-                    'alloc_bdev': nvme_bdev,
-                    'node_id': snode.get_id(),
-                    'cluster_id': snode.cluster_id,
-
-                    # 'nvmf_nqn': subsystem_nqn,
-                    # 'nvmf_ip': IP,
-                    # 'nvmf_port': 4420,
-
-                    'status': 'online'
-                }))
-            sequential_number += device_partitions_count
+        devices.append(
+            NVMeDevice({
+                'uuid': str(uuid.uuid4()),
+                'device_name': nvme_dict['name'],
+                'size': total_size,
+                'physical_label': next_physical_label,
+                'pcie_address': nvme_driver_data['pci_address'],
+                'model_id': model_number,
+                'serial_number': nvme_driver_data['ctrlr_data']['serial_number'],
+                'nvme_bdev': nvme_bdev,
+                'nvme_controller': nvme_controller,
+                'node_id': snode.get_id(),
+                'cluster_id': snode.cluster_id,
+                'status': NVMeDevice.STATUS_ONLINE
+            }))
+        next_physical_label += 1
     return devices
 
 
@@ -197,10 +192,10 @@ def _run_nvme_smart_log_add(dev_name):
     return data
 
 
-def get_next_cluster_device_order(db_controller):
+def get_next_cluster_device_order(db_controller, cluster_id):
     max_order = 0
     found = False
-    for node in db_controller.get_storage_nodes():
+    for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id):
         for dev in node.nvme_devices:
             found = True
             max_order = max(max_order, dev.cluster_device_order)
@@ -209,91 +204,319 @@ def get_next_cluster_device_order(db_controller):
     return 0
 
 
-def _prepare_cluster_devices(snode, after_restart=False):
+def get_next_physical_device_order():
     db_controller = DBController()
+    max_order = 0
+    found = False
+    for node in db_controller.get_storage_nodes():
+        for dev in node.nvme_devices:
+            found = True
+            max_order = max(max_order, dev.physical_label)
+    if found:
+        return max_order + 1
+    return 0
+
+
+def _search_for_partitions(rpc_client, nvme_device):
+    partitioned_devices = []
+    for bdev in rpc_client.get_bdevs():
+        name = bdev['name']
+        if name.startswith(f"{nvme_device.nvme_bdev}p"):
+            new_dev = NVMeDevice(nvme_device.to_dict())
+            new_dev.uuid = str(uuid.uuid4())
+            new_dev.device_name = name
+            new_dev.nvme_bdev = name
+            new_dev.size = bdev['block_size'] * bdev['num_blocks']
+            partitioned_devices.append(new_dev)
+    return partitioned_devices
+
+
+def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart):
+    raid_bdev = f"raid_jm_{snode.get_id()}"
+    ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs)
+    if not ret:
+        logger.error(f"Failed to create raid_jm_{snode.get_id()}")
+        return False
+    alceml_name = f"alceml_jm_{snode.get_id()}"
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, raid_bdev, str(uuid.uuid4()), pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
+    ret = rpc_client.get_bdevs(raid_bdev)
+
+    return JMDevice({
+        'uuid': str(uuid.uuid4()),
+        'device_name': jm_bdev,
+        'size': ret[0]["block_size"] * ret[0]["num_blocks"],
+        'status': JMDevice.STATUS_ONLINE,
+        'jm_nvme_bdev_list': jm_nvme_bdevs,
+        'raid_bdev': raid_bdev,
+        'alceml_bdev': alceml_name,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart):
+
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, nvme.nvme_bdev, alceml_id, pba_init_mode=pba_init_mode)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    jm_bdev = f"jm_{snode.get_id()}"
+    ret = rpc_client.bdev_jm_create(jm_bdev, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create {jm_bdev}")
+        return False
 
+    return JMDevice({
+        'uuid': alceml_id,
+        'device_name': jm_bdev,
+        'size': nvme.size,
+        'status': JMDevice.STATUS_ONLINE,
+        'alceml_bdev': alceml_name,
+        'nvme_bdev': nvme.nvme_bdev,
+        'jm_bdev': jm_bdev
+    })
+
+
+def _create_storage_device_stack(rpc_client, nvme, snode, after_restart):
+    test_name = f"{nvme.nvme_bdev}_test"
+    ret = rpc_client.bdev_passtest_create(test_name, nvme.nvme_bdev)
+    if not ret:
+        logger.error(f"Failed to create passtest bdev {test_name}")
+        return False
+    alceml_id = nvme.get_id()
+    alceml_name = device_controller.get_alceml_name(alceml_id)
+    logger.info(f"adding {alceml_name}")
+    pba_init_mode = 3
+    if after_restart:
+        pba_init_mode = 2
+    ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode,
+                                        dev_cpu_mask=snode.dev_cpu_mask)
+    if not ret:
+        logger.error(f"Failed to create alceml bdev: {alceml_name}")
+        return False
+
+    # add pass through
+    pt_name = f"{alceml_name}_PT"
+    ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
+    if not ret:
+        logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
+        return False
+
+    subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
+    logger.info("creating subsystem %s", subsystem_nqn)
+    ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
+    IP = None
+    for iface in snode.data_nics:
+        if iface.ip4_address:
+            tr_type = iface.get_transport_type()
+            ret = rpc_client.transport_list()
+            found = False
+            if ret:
+                for ty in ret:
+                    if ty['trtype'] == tr_type:
+                        found = True
+            if found is False:
+                ret = rpc_client.transport_create(tr_type)
+            logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
+            ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
+            IP = iface.ip4_address
+            break
+    logger.info(f"add {pt_name} to subsystem")
+    ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+    if not ret:
+        logger.error(f"Failed to add: {pt_name} to the subsystem: {subsystem_nqn}")
+        return False
+
+    nvme.testing_bdev = test_name
+    nvme.alceml_bdev = alceml_name
+    nvme.pt_bdev = pt_name
+    nvme.nvmf_nqn = subsystem_nqn
+    nvme.nvmf_ip = IP
+    nvme.nvmf_port = 4420
+    nvme.io_error = False
+    nvme.status = NVMeDevice.STATUS_ONLINE
+    return nvme
+
+
+def _create_device_partitions(rpc_client, nvme, snode):
+    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev)
+    time.sleep(3)
+    if not nbd_device:
+        logger.error(f"Failed to start nbd dev")
+        return False
+    snode_api = SNodeClient(snode.api_endpoint)
+    result, error = snode_api.make_gpt_partitions(
+        nbd_device, snode.jm_percent, snode.num_partitions_per_dev)
+    if error:
+        logger.error(f"Failed to make partitions")
+        logger.error(error)
+        return False
+    time.sleep(3)
+    rpc_client.nbd_stop_disk(nbd_device)
+    time.sleep(1)
+    rpc_client.bdev_nvme_detach_controller(nvme.nvme_controller)
+    time.sleep(1)
+    rpc_client.bdev_nvme_controller_attach(nvme.nvme_controller, nvme.pcie_address)
+    time.sleep(1)
+    rpc_client.bdev_examine(nvme.nvme_bdev)
+    time.sleep(1)
+    return True
+
+
+def _prepare_cluster_devices_partitions(snode, devices):
+    db_controller = DBController()
     rpc_client = RPCClient(
         snode.mgmt_ip, snode.rpc_port,
         snode.rpc_username, snode.rpc_password)
 
-    for index, nvme in enumerate(snode.nvme_devices):
+    new_devices = []
+    jm_devices = []
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
+            continue
+
+        # look for partitions
+        partitioned_devices = _search_for_partitions(rpc_client, nvme)
+        logger.debug("partitioned_devices")
+        logger.debug(partitioned_devices)
+        if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+            logger.info("Partitioned devices found")
+        else:
+            logger.info(f"Creating partitions for {nvme.nvme_bdev}")
+            _create_device_partitions(rpc_client, nvme, snode)
+            partitioned_devices = _search_for_partitions(rpc_client, nvme)
+            if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+                logger.info("Device partitions created")
+            else:
+                logger.error("Failed to create partitions")
+                return False
+
+        jm_devices.append(partitioned_devices.pop(0))
+
+        for dev in partitioned_devices:
+            new_device = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+
+    if jm_devices:
+        jm_nvme_bdevs = [dev.nvme_bdev for dev in jm_devices]
+        jm_device = _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart=False)
+        if not jm_device:
+            logger.error(f"Failed to create JM device")
+            return False
+        snode.jm_device = jm_device
+
+    return True
+
+
+def _prepare_cluster_devices_jm_on_dev(snode, devices):
+    db_controller = DBController()
+
+    jm_device = devices[0]
+    # Set device cluster order
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for index, nvme in enumerate(devices):
+        nvme.cluster_device_order = dev_order
+        dev_order += 1
+        if nvme.size < jm_device.size:
+            jm_device = nvme
+        device_events.device_create(nvme)
+    jm_device.status = NVMeDevice.STATUS_JM
+
+    rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+
+    new_devices = []
+    for index, nvme in enumerate(devices):
         if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE,
                                NVMeDevice.STATUS_JM, NVMeDevice.STATUS_READONLY]:
             logger.debug(f"Device is not online or unavailable: {nvme.get_id()}, status: {nvme.status}")
             continue
 
-        test_name = f"{nvme.nvme_bdev}_test"
-        # create testing bdev
-        ret = rpc_client.bdev_passtest_create(test_name, nvme.nvme_bdev)
-        if not ret:
-            logger.error(f"Failed to create bdev: {test_name}")
-            return False
-        alceml_id = nvme.get_id()
-        alceml_name = device_controller.get_alceml_name(alceml_id)
-        logger.info(f"adding {alceml_name}")
-        pba_init_mode = 3
-        if after_restart:
-            pba_init_mode = 2
-        ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=pba_init_mode)
-        if not ret:
-            logger.error(f"Failed to create alceml bdev: {alceml_name}")
-            return False
-
-        # create jm
-        if nvme.jm_bdev:
-            ret = rpc_client.bdev_jm_create(nvme.jm_bdev, alceml_name)
-            if not ret:
-                logger.error(f"Failed to create JM bdev: {nvme.jm_bdev}")
+        if nvme.status == NVMeDevice.STATUS_JM:
+            jm_device = _create_jm_stack_on_device(rpc_client, nvme, snode, after_restart=False)
+            if not jm_device:
+                logger.error(f"Failed to create JM device")
                 return False
-            nvme.testing_bdev = test_name
-            nvme.alceml_bdev = alceml_name
-            nvme.io_error = True
-            nvme.status = NVMeDevice.STATUS_JM
+            snode.jm_device = jm_device
+        else:
+            new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False)
+            if not new_device:
+                logger.error("failed to create dev stack")
+                return False
+            new_device.cluster_device_order = dev_order
+            dev_order += 1
+            new_devices.append(new_device)
+            device_events.device_create(new_device)
+
+    snode.nvme_devices = new_devices
+    return True
+
+
+def _prepare_cluster_devices_on_restart(snode):
+    db_controller = DBController()
+
+    rpc_client = RPCClient(
+        snode.mgmt_ip, snode.rpc_port,
+        snode.rpc_username, snode.rpc_password)
+
+    for index, nvme in enumerate(snode.nvme_devices):
+        if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_READONLY]:
+            logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
            continue
 
-        # add pass through
-        pt_name = f"{alceml_name}_PT"
-        ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
+        dev = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=True)
+        if not dev:
+            logger.error(f"Failed to create dev stack {nvme.get_id()}")
+            return False
+        device_events.device_restarted(dev)
+
+    # prepare JM device
+    jm_device = snode.jm_device
+    if jm_device.jm_nvme_bdev_list:
+        ret = _create_jm_stack_on_raid(rpc_client, jm_device.jm_nvme_bdev_list, snode, after_restart=False)
         if not ret:
-            logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
+            logger.error(f"Failed to create JM device")
             return False
+    else:
 
-        subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
-        logger.info("creating subsystem %s", subsystem_nqn)
-        ret = rpc_client.subsystem_create(subsystem_nqn, 'sbcli-cn', alceml_id)
-        IP = None
-        for iface in snode.data_nics:
-            if iface.ip4_address:
-                tr_type = iface.get_transport_type()
-                ret = rpc_client.transport_list()
-                found = False
-                if ret:
-                    for ty in ret:
-                        if ty['trtype'] == tr_type:
-                            found = True
-                if found is False:
-                    ret = rpc_client.transport_create(tr_type)
-                logger.info("adding listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address))
-                ret = rpc_client.listeners_create(subsystem_nqn, tr_type, iface.ip4_address, "4420")
-                IP = iface.ip4_address
-                break
-        logger.info(f"add {pt_name} to subsystem")
-        ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
+        ret = rpc_client.bdev_alceml_create(jm_device.alceml_bdev, jm_device.nvme_bdev, jm_device.get_id(), pba_init_mode=2)
         if not ret:
-            logger.error(f"Failed to add: {pt_name} to the subsystem: {subsystem_nqn}")
+            logger.error(f"Failed to create alceml bdev: {jm_device.alceml_bdev}")
             return False
 
-        nvme.testing_bdev = test_name
-        nvme.alceml_bdev = alceml_name
-        nvme.pt_bdev = pt_name
-        nvme.nvmf_nqn = subsystem_nqn
-        nvme.nvmf_ip = IP
-        nvme.nvmf_port = 4420
-        nvme.io_error = False
-        old_status = nvme.status
-        nvme.status = NVMeDevice.STATUS_ONLINE
-        device_events.device_status_change(nvme, nvme.status, old_status)
-        snode.write_to_db(db_controller.kv_store)
+        jm_bdev = f"jm_{snode.get_id()}"
+        ret = rpc_client.bdev_jm_create(jm_bdev, jm_device.alceml_bdev)
+        if not ret:
+            logger.error(f"Failed to create {jm_bdev}")
+            return False
 
     return True
 
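Note on the partition layout assumed by _prepare_cluster_devices_partitions above (a standalone sketch for illustration, not code from the package): each data NVMe device is split with GPT into one journal (JM) partition sized by jm_percent plus num_partitions_per_dev data partitions, so the helper checks for exactly 1 + num_partitions_per_dev partition bdevs per device; the first partition of every device is pooled into the RAID-backed JM stack and the remaining partitions become storage devices.

    def expected_partition_count(num_partitions_per_dev: int) -> int:
        # one JM partition (the first on the device) + N data partitions
        return 1 + num_partitions_per_dev

    # e.g. with num_partitions_per_dev = 1, a device exposing bdev "nvme_0n1" would be
    # expected to show two partition bdevs, "nvme_0n1p1" (JM) and "nvme_0n1p2" (data);
    # the bdev names here are illustrative only.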
@@ -307,7 +530,7 @@ def _connect_to_remote_devs(this_node):
 
     remote_devices = []
     # connect to remote devs
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == this_node.get_id() or node.status == node.STATUS_OFFLINE:
             continue
@@ -326,9 +549,10 @@ def _connect_to_remote_devs(this_node):
     return remote_devices
 
 
-def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
-             spdk_mem, spdk_image=None, spdk_debug=False,
-             small_pool_count=0, large_pool_count=0, small_bufsize=0, large_bufsize=0, jm_device_pcie=None):
+def add_node(cluster_id, node_ip, iface_name, data_nics_list,
+             max_lvol, max_snap, max_prov, spdk_image=None, spdk_debug=False,
+             small_bufsize=0, large_bufsize=0,
+             num_partitions_per_dev=0, jm_percent=0, number_of_devices=0):
     db_controller = DBController()
     kv_store = db_controller.kv_store
 
@@ -384,20 +608,71 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         logger.error(f"Node already exists, try remove it first: {ec2_metadata['instanceId']}")
         return False
 
+    # Tune cpu maks parameters
+    cpu_count = node_info["cpu_count"]
+    pollers_mask = ""
+    app_thread_mask = ""
+    dev_cpu_mask = ""
+    nvme_pollers_cores = []
+    if cpu_count < 8:
+        mask = (1 << (cpu_count - 1)) - 1
+        mask <<= 1
+        spdk_cpu_mask = f'0x{mask:X}'
+        os_cores = [0]
+    else:
+        os_cores, nvme_pollers_cores, app_thread_core, dev_cpu_cores = \
+            utils.calculate_core_allocation(cpu_count)
+        spdk_cores = nvme_pollers_cores + app_thread_core + dev_cpu_cores
+
+        pollers_mask = utils.generate_mask(nvme_pollers_cores)
+        app_thread_mask = utils.generate_mask(app_thread_core)
+        spdk_cpu_mask = utils.generate_mask(spdk_cores)
+        dev_cpu_mask = utils.generate_mask(dev_cpu_cores)
+
+    # Calculate pool count
+    if ec2_metadata and ec2_metadata.get('instanceType'):
+        supported_type, storage_devices, device_size = utils.get_total_size_per_instance_type(ec2_metadata["instanceType"])
+        if not supported_type:
+            logger.warning(f"Unsupported ec2 instance-type {ec2_metadata['instanceType']} for deployment")
+            if not number_of_devices:
+                logger.error(f"Unsupported ec2 instance-type {ec2_metadata['instanceType']} "
+                             "for deployment, please specify --number-of-devices")
+                return False
+        number_of_devices = storage_devices
+    else:
+        logger.warning("Can not get ec2 instance type for this instance.")
+        if not number_of_devices:
+            logger.error("Unsupported instance type please specify --number-of-devices.")
+            return False
+
+    number_of_split = num_partitions_per_dev if num_partitions_per_dev else num_partitions_per_dev + 1
+    number_of_alceml_devices = number_of_devices * number_of_split
+    small_pool_count, large_pool_count = utils.calculate_pool_count(
+        number_of_alceml_devices, max_lvol, max_snap, cpu_count, len(nvme_pollers_cores) or cpu_count)
+
+    # Calculate minimum huge page memory
+    minimum_hp_memory = utils.calculate_minimum_hp_memory(small_pool_count, large_pool_count, max_lvol, max_snap, cpu_count)
+
+    # Calculate minimum sys memory
+    minimum_sys_memory = utils.calculate_minimum_sys_memory(max_prov)
+
     # check for memory
     if "memory_details" in node_info and node_info['memory_details']:
         memory_details = node_info['memory_details']
         logger.info("Node Memory info")
         logger.info(f"Total: {utils.humanbytes(memory_details['total'])}")
         logger.info(f"Free: {utils.humanbytes(memory_details['free'])}")
-        logger.info(f"Hugepages Total: {utils.humanbytes(memory_details['huge_total'])}")
-        huge_free = memory_details['huge_free']
-        logger.info(f"Hugepages Free: {utils.humanbytes(huge_free)}")
-        if huge_free < 1 * 1024 * 1024:
-            logger.warning(f"Free hugepages are less than 1G: {utils.humanbytes(huge_free)}")
-        if not spdk_mem:
-            spdk_mem = huge_free
-            logger.info(f"Using the free hugepages for spdk memory: {utils.humanbytes(huge_free)}")
+    else:
+        logger.error(f"Cannot get memory info from the ec2 instance.. Exiting")
+        return False
+
+    satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory,
+                                                      minimum_sys_memory,
+                                                      int(memory_details['free']),
+                                                      int(memory_details['huge_total']))
+    if not satisfied:
+        logger.error(f"Not enough memory for the provided max_lvo: {max_lvol}, max_snap: {max_snap}, max_prov: {max_prov}.. Exiting")
+        return False
 
     logger.info("Joining docker swarm...")
     cluster_docker = utils.get_docker_client(cluster_id)
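Note on the CPU-mask arithmetic introduced in the hunk above (a standalone sketch for illustration, not code from the package; the helper name is made up): on nodes with fewer than 8 cores the new code builds an SPDK cpu mask with a bit set for every core except core 0, which is left to the OS.

    def small_node_spdk_mask(cpu_count: int) -> str:
        # cpu_count - 1 low bits set, then shifted left one position
        # so bit 0 (core 0, reserved for the OS) stays clear
        mask = (1 << (cpu_count - 1)) - 1
        mask <<= 1
        return f'0x{mask:X}'

    # e.g. 4 cores -> '0xE' (cores 1-3), 6 cores -> '0x3E' (cores 1-5)

On larger nodes the masks are instead derived from utils.calculate_core_allocation() and utils.generate_mask(), which split the cores between the OS, NVMe pollers, the app thread, and device handling.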
@@ -472,16 +747,28 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     snode.hugepages = node_info['hugepages']
 
     snode.spdk_cpu_mask = spdk_cpu_mask or ""
-    snode.spdk_mem = spdk_mem or 0
+    snode.spdk_mem = spdk_mem
+    snode.max_lvol = max_lvol
+    snode.max_snap = max_snap
+    snode.max_prov = max_prov
+    snode.number_of_devices = number_of_devices
     snode.spdk_image = spdk_image or ""
     snode.spdk_debug = spdk_debug or 0
     snode.write_to_db(kv_store)
+    snode.app_thread_mask = app_thread_mask or ""
+    snode.pollers_mask = pollers_mask or ""
+    snode.nvme_pollers_cores = nvme_pollers_cores or []
+    snode.dev_cpu_mask = dev_cpu_mask or ""
+    snode.os_cores = os_cores or []
 
     snode.iobuf_small_pool_count = small_pool_count or 0
     snode.iobuf_large_pool_count = large_pool_count or 0
     snode.iobuf_small_bufsize = small_bufsize or 0
     snode.iobuf_large_bufsize = large_bufsize or 0
 
+    snode.num_partitions_per_dev = num_partitions_per_dev
+    snode.jm_percent = jm_percent
+
     snode.write_to_db(kv_store)
 
     # creating RPCClient instance
@@ -499,13 +786,41 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
         logger.error("Failed to set iobuf options")
         return False
 
-    # 2- start spdk framework
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False
 
-    # 3- set nvme bdev options
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
@@ -513,36 +828,18 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
 
     # get new node info after starting spdk
     node_info, _ = snode_api.info()
-    # adding devices
+
+    # discover devices
     nvme_devs = addNvmeDevices(cluster, rpc_client, node_info['spdk_pcie_list'], snode)
     if not nvme_devs:
         logger.error("No NVMe devices was found!")
         return False
 
-    snode.nvme_devices = nvme_devs
-
-    jm_device = snode.nvme_devices[0]
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-        if jm_device_pcie:
-            if nvme.pcie_address == jm_device_pcie:
-                jm_device = nvme
-        elif nvme.size < jm_device.size:
-            jm_device = nvme
-        device_events.device_create(nvme)
-
-    # create jm
-    logger.info(f"Using device for JM: {jm_device.get_id()}")
-    jm_device.jm_bdev = f"jm_{snode.get_id()}"
-
-    # save object
-    snode.write_to_db(db_controller.kv_store)
-
     # prepare devices
-    ret = _prepare_cluster_devices(snode)
+    if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0:
+        ret = _prepare_cluster_devices_jm_on_dev(snode, nvme_devs)
+    else:
+        ret = _prepare_cluster_devices_partitions(snode, nvme_devs)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
@@ -557,7 +854,7 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
 
     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -599,150 +896,16 @@ def add_node(cluster_id, node_ip, iface_name, data_nics_list, spdk_cpu_mask,
     time.sleep(3)
 
     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)
 
     for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev.cluster_device_order, "online")
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
 
     storage_events.snode_add(snode)
     logger.info("Done")
     return "Success"
 
 
-# Deprecated
-def add_storage_node(cluster_id, iface_name, data_nics):
-    db_controller = DBController()
-    kv_store = db_controller.kv_store
-
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-    if not cluster:
-        logger.error("Cluster not found: %s", cluster_id)
-        return False
-
-    logger.info("Add Storage node")
-
-    hostname = utils.get_hostname()
-    snode = db_controller.get_storage_node_by_hostname(hostname)
-    if snode:
-        logger.error("Node already exists, try remove it first.")
-        exit(1)
-    else:
-        snode = StorageNode()
-        snode.uuid = str(uuid.uuid4())
-
-    mgmt_ip = _get_if_ip_address(iface_name)
-    system_id = utils.get_system_id()
-
-    BASE_NQN = cluster.nqn.split(":")[0]
-    subsystem_nqn = f"{BASE_NQN}:{hostname}"
-
-    if data_nics:
-        data_nics = _get_data_nics(data_nics)
-    else:
-        data_nics = _get_data_nics([iface_name])
-
-    rpc_user, rpc_pass = utils.generate_rpc_user_and_pass()
-
-    # creating storage node object
-    snode.status = StorageNode.STATUS_IN_CREATION
-    snode.baseboard_sn = utils.get_baseboard_sn()
-    snode.system_uuid = system_id
-    snode.hostname = hostname
-    snode.host_nqn = subsystem_nqn
-    snode.subsystem = subsystem_nqn
-    snode.data_nics = data_nics
-    snode.mgmt_ip = mgmt_ip
-    snode.rpc_port = constants.RPC_HTTP_PROXY_PORT
-    snode.rpc_username = rpc_user
-    snode.rpc_password = rpc_pass
-    snode.cluster_id = cluster_id
-    snode.write_to_db(kv_store)
-
-    # creating RPCClient instance
-    rpc_client = RPCClient(
-        snode.mgmt_ip,
-        snode.rpc_port,
-        snode.rpc_username,
-        snode.rpc_password)
-
-    logger.info("Getting nvme devices")
-    devs = get_nvme_devices()
-    logger.debug(devs)
-    pcies = [d[0] for d in devs]
-    nvme_devs = addNvmeDevices(cluster, rpc_client, pcies, snode)
-    if not nvme_devs:
-        logger.error("No NVMe devices was found!")
-
-    logger.debug(nvme_devs)
-    snode.nvme_devices = nvme_devs
-
-    # Set device cluster order
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(snode.nvme_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
-    snode.write_to_db(db_controller.kv_store)
-
-    # prepare devices
-    _prepare_cluster_devices(snode)
-
-    logger.info("Connecting to remote devices")
-    remote_devices = _connect_to_remote_devs(snode)
-    snode.remote_devices = remote_devices
-
-    logger.info("Setting node status to Active")
-    snode.status = StorageNode.STATUS_ONLINE
-    snode.write_to_db(kv_store)
-
-    # make other nodes connect to the new devices
-    logger.info("Make other nodes connect to the new devices")
-    snodes = db_controller.get_storage_nodes()
-    for node_index, node in enumerate(snodes):
-        if node.get_id() == snode.get_id():
-            continue
-        logger.info(f"Connecting to node: {node.get_id()}")
-        rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password)
-        count = 0
-        for dev in snode.nvme_devices:
-            name = f"remote_{dev.alceml_bdev}"
-            ret = rpc_client.bdev_nvme_attach_controller_tcp(name, dev.nvmf_nqn, dev.nvmf_ip, dev.nvmf_port)
-            if not ret:
-                logger.error(f"Failed to connect to device: {name}")
-                continue
-
-            dev.remote_bdev = f"{name}n1"
-            idx = -1
-            for i, d in enumerate(node.remote_devices):
-                if d.get_id() == dev.get_id():
-                    idx = i
-                    break
-            if idx >= 0:
-                node.remote_devices[idx] = dev
-            else:
-                node.remote_devices.append(dev)
-            count += 1
-        node.write_to_db(kv_store)
-        logger.info(f"connected to devices count: {count}")
-
-    logger.info("Sending cluster map")
-    ret = distr_controller.send_cluster_map_to_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map"
-    ret = distr_controller.send_cluster_map_add_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map add node"
-    time.sleep(3)
-
-    logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
-
-    for dev in snode.nvme_devices:
-        distr_controller.send_dev_status_event(dev.cluster_device_order, "online")
-
-    logger.info("Done")
-    return "Success"
-
-
 def delete_storage_node(node_id):
     db_controller = DBController()
@@ -756,7 +919,7 @@ def delete_storage_node(node_id):
 
     snode.remove(db_controller.kv_store)
 
-    for lvol in db_controller.get_lvols():
+    for lvol in db_controller.get_lvols(snode.cluster_id):
         logger.info(f"Sending cluster map to LVol: {lvol.get_id()}")
         lvol_controller.send_cluster_map(lvol.get_id())
 
@@ -764,7 +927,7 @@ def delete_storage_node(node_id):
     logger.info("done")
 
 
-def remove_storage_node(node_id, force_remove=False):
+def remove_storage_node(node_id, force_remove=False, force_migrate=False):
     db_controller = DBController()
     snode = db_controller.get_storage_node_by_id(node_id)
     if not snode:
@@ -811,7 +974,7 @@ def remove_storage_node(node_id, force_remove=False):
         distr_controller.disconnect_device(dev)
         old_status = dev.status
         dev.status = NVMeDevice.STATUS_FAILED
-        distr_controller.send_dev_status_event(dev.cluster_device_order, NVMeDevice.STATUS_FAILED)
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_FAILED)
         device_events.device_status_change(dev, NVMeDevice.STATUS_FAILED, old_status)
 
     logger.info("Removing storage node")
@@ -825,29 +988,32 @@ def remove_storage_node(node_id, force_remove=False):
         pass
 
     try:
-        snode_api = SNodeClient(snode.api_endpoint)
+        snode_api = SNodeClient(snode.api_endpoint, timeout=20)
         snode_api.spdk_process_kill()
         snode_api.leave_swarm()
+        pci_address = []
+        for dev in snode.nvme_devices:
+            if dev.pcie_address not in pci_address:
+                ret = snode_api.delete_dev_gpt_partitions(dev.pcie_address)
+                logger.debug(ret)
+                pci_address.append(dev.pcie_address)
     except Exception as e:
-        logger.warning(f"Failed to remove SPDK process: {e}")
+        logger.exception(e)
 
     old_status = snode.status
     snode.status = StorageNode.STATUS_REMOVED
     snode.write_to_db(db_controller.kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode.get_id(), snode.status)
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, StorageNode.STATUS_REMOVED, old_status)
     logger.info("done")
 
 
 def restart_storage_node(
-        node_id,
-        spdk_cpu_mask=None,
-        spdk_mem=None,
+        node_id, max_lvol=0, max_snap=0, max_prov="",
         spdk_image=None,
         set_spdk_debug=None,
-        small_pool_count=0, large_pool_count=0,
-        small_bufsize=0, large_bufsize=0):
+        small_bufsize=0, large_bufsize=0, number_of_devices=0):
 
     db_controller = DBController()
     kv_store = db_controller.kv_store
@@ -868,7 +1034,7 @@ def restart_storage_node(
     snode.status = StorageNode.STATUS_RESTARTING
     snode.write_to_db(kv_store)
     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode.get_id(), snode.status)
+    distr_controller.send_node_status_event(snode, snode.status)
     storage_events.snode_status_change(snode, snode.status, old_status)
 
     logger.info(f"Restarting Storage node: {snode.mgmt_ip}")
@@ -878,18 +1044,67 @@ def restart_storage_node(
     logger.info(f"Node info: {node_info}")
 
     logger.info("Restarting SPDK")
-    cpu = snode.spdk_cpu_mask
-    if spdk_cpu_mask:
-        cpu = spdk_cpu_mask
-        snode.spdk_cpu_mask = cpu
-    mem = snode.spdk_mem
-    if spdk_mem:
-        mem = spdk_mem
-        snode.spdk_mem = mem
+
     img = snode.spdk_image
+    if max_lvol:
+        snode.max_lvol = max_lvol
+    if max_snap:
+        snode.max_snap = max_snap
+    if max_prov:
+        snode.max_prov = max_prov
     if spdk_image:
         img = spdk_image
         snode.spdk_image = img
+
+    # Calculate pool count
+    if snode.ec2_metadata and snode.ec2_metadata.get('instanceType'):
+        supported_type, storage_devices, device_size = utils.get_total_size_per_instance_type(snode.ec2_metadata["instanceType"])
+        if not supported_type:
+            logger.warning(f"Unsupported ec2 instance-type {snode.ec2_metadata['instanceType']} for deployment")
+            if not number_of_devices:
+                logger.error(f"Unsupported ec2 instance-type {snode.ec2_metadata['instanceType']} "
+                             "for deployment, please specify --number-of-devices")
+                return False
+        number_of_devices = storage_devices
+    else:
+        logger.warning("Can not get ec2 instance type for this instance..")
+        if not number_of_devices:
+            if snode.number_of_devices:
+                number_of_devices = snode.number_of_devices
+            else:
+                logger.error("Unsupported instance type please specify --number-of-devices")
+                return False
+
+    snode.number_of_devices = number_of_devices
+
+    number_of_split = snode.num_partitions_per_dev if snode.num_partitions_per_dev else snode.num_partitions_per_dev + 1
+    number_of_alceml_devices = number_of_devices * number_of_split
+    small_pool_count, large_pool_count = utils.calculate_pool_count(
+        number_of_alceml_devices, snode.max_lvol, snode.max_snap, snode.cpu, len(snode.nvme_pollers_cores) or snode.cpu)
+
+    # Calculate minimum huge page memory
+    minimum_hp_memory = utils.calculate_minimum_hp_memory(small_pool_count, large_pool_count, snode.max_lvol, snode.max_snap, snode.cpu)
+
+    # Calculate minimum sys memory
+    minimum_sys_memory = utils.calculate_minimum_sys_memory(snode.max_prov)
+
+    # check for memory
+    if "memory_details" in node_info and node_info['memory_details']:
+        memory_details = node_info['memory_details']
+        logger.info("Node Memory info")
+        logger.info(f"Total: {utils.humanbytes(memory_details['total'])}")
+        logger.info(f"Free: {utils.humanbytes(memory_details['free'])}")
+    else:
+        logger.error(f"Cannot get memory info from the ec2 instance.. Exiting")
+
+    satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory,
+                                                      minimum_sys_memory,
+                                                      int(memory_details['free']),
+                                                      int(memory_details['huge_total']))
+    if not satisfied:
+        logger.error(f"Not enough memory for the provided max_lvo: {snode.max_lvol}, max_snap: {snode.max_snap}, max_prov: {utils.humanbytes(snode.max_prov)}.. Exiting")
+
+
     spdk_debug = snode.spdk_debug
     if set_spdk_debug:
         spdk_debug = spdk_debug
@@ -897,17 +1112,14 @@ def restart_storage_node(
 
     cluster_docker = utils.get_docker_client(snode.cluster_id)
     cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"]
-    results, err = snode_api.spdk_process_start(cpu, mem, img, spdk_debug, cluster_ip)
+    results, err = snode_api.spdk_process_start(snode.spdk_cpu_mask, spdk_mem, img, spdk_debug, cluster_ip)
 
     if not results:
         logger.error(f"Failed to start spdk: {err}")
         return False
     time.sleep(3)
 
-    if small_pool_count:
-        snode.iobuf_small_pool_count = small_pool_count
-    if large_pool_count:
-        snode.iobuf_large_pool_count = large_pool_count
+
     if small_bufsize:
         snode.iobuf_small_bufsize = small_bufsize
     if large_bufsize:
@@ -931,13 +1143,41 @@ def restart_storage_node(
         logger.error("Failed to set iobuf options")
         return False
 
-    # 2- start spdk framework
+    # 2- set socket implementation options
+    ret = rpc_client.sock_impl_set_options()
+    if not ret:
+        logger.error("Failed socket implement set options")
+        return False
+
+    # 3- set nvme config
+    if snode.pollers_mask:
+        ret = rpc_client.nvmf_set_config(snode.pollers_mask)
+        if not ret:
+            logger.error("Failed to set pollers mask")
+            return False
+
+    # 4- start spdk framework
     ret = rpc_client.framework_start_init()
     if not ret:
         logger.error("Failed to start framework")
         return False
 
-    # 3- set nvme bdev options
+    # 5- set app_thread cpu mask
+    if snode.app_thread_mask:
+        ret = rpc_client.thread_get_stats()
+        app_thread_process_id = 0
+        if ret.get("threads"):
+            for entry in ret["threads"]:
+                if entry['name'] == 'app_thread':
+                    app_thread_process_id = entry['id']
+                    break
+
+        ret = rpc_client.thread_set_cpumask(app_thread_process_id, snode.app_thread_mask)
+        if not ret:
+            logger.error("Failed to set app thread mask")
+            return False
+
+    # 6- set nvme bdev options
     ret = rpc_client.bdev_nvme_set_options()
     if not ret:
         logger.error("Failed to set nvme options")
@@ -970,22 +1210,23 @@ def restart_storage_node(
         else:
             logger.info(f"Device not found: {db_dev.get_id()}")
             db_dev.status = NVMeDevice.STATUS_REMOVED
-            distr_controller.send_dev_status_event(db_dev.cluster_device_order, "offline")
+            distr_controller.send_dev_status_event(db_dev, db_dev.status)
 
-    for dev in nvme_devs:
-        if dev.serial_number not in known_devices_sn:
-            logger.info(f"New device found: {dev.get_id()}")
-            dev.status = 'new'
-            new_devices.append(dev)
-            snode.nvme_devices.append(dev)
+    # todo: handle new devices
+    # for dev in nvme_devs:
+    #     if dev.serial_number not in known_devices_sn:
+    #         logger.info(f"New device found: {dev.get_id()}")
+    #         dev.status = NVMeDevice.STATUS_NEW
+    #         new_devices.append(dev)
+    #         snode.nvme_devices.append(dev)
 
-    dev_order = get_next_cluster_device_order(db_controller)
-    for index, nvme in enumerate(new_devices):
-        nvme.cluster_device_order = dev_order
-        dev_order += 1
+    # dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    # for index, nvme in enumerate(new_devices):
+    #     nvme.cluster_device_order = dev_order
+    #     dev_order += 1
 
     # prepare devices
-    ret = _prepare_cluster_devices(snode, after_restart=True)
+    ret = _prepare_cluster_devices_on_restart(snode)
     if not ret:
         logger.error("Failed to prepare cluster devices")
         return False
@@ -996,7 +1237,7 @@ def restart_storage_node(

     # make other nodes connect to the new devices
     logger.info("Make other nodes connect to the node devices")
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     for node_index, node in enumerate(snodes):
         if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
             continue
@@ -1034,20 +1275,23 @@ def restart_storage_node(
     storage_events.snode_status_change(snode, snode.status, old_status)

     logger.info("Sending node event update")
-    distr_controller.send_node_status_event(snode.get_id(), NVMeDevice.STATUS_ONLINE)
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)

     logger.info("Sending devices event updates")
+    logger.info("Starting migration tasks")
     for dev in snode.nvme_devices:
         if dev.status != NVMeDevice.STATUS_ONLINE:
-            logger.debug(f"Device is not online: {dev.get_id()}, status: {dev.status}")
+            logger.info(f"Device is not online: {dev.get_id()}, status: {dev.status}")
             continue
-        distr_controller.send_dev_status_event(dev.cluster_device_order, NVMeDevice.STATUS_ONLINE)

-    logger.info("Sending cluster map to current node")
-    ret = distr_controller.send_cluster_map_to_node(snode)
-    if not ret:
-        return False, "Failed to send cluster map"
-    time.sleep(3)
+        distr_controller.send_dev_status_event(dev, NVMeDevice.STATUS_ONLINE)
+        tasks_controller.add_device_mig_task(dev.get_id())
+
+    # logger.info("Sending cluster map to current node")
+    # ret = distr_controller.send_cluster_map_to_node(snode)
+    # if not ret:
+    #     return False, "Failed to send cluster map"
+    # time.sleep(3)

     for lvol_id in snode.lvols:
         lvol = lvol_controller.recreate_lvol(lvol_id, snode)
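Note: the restart no longer pushes a full cluster map to the node; instead, for every online device it emits a status event and queues a data-migration task via `tasks_controller.add_device_mig_task`. A hedged sketch of that per-device loop, with the two controller calls replaced by injected callbacks since only their names appear in the diff:

```python
# Sketch of the per-device loop added in the diff: for every online NVMe
# device, send a status event and queue a data-migration task.
from dataclasses import dataclass
from typing import Callable, List

STATUS_ONLINE = "online"


@dataclass
class Device:
    uuid: str
    status: str


def kick_off_migrations(devices: List[Device],
                        send_event: Callable[[Device, str], None],
                        add_mig_task: Callable[[str], None]) -> None:
    # send_event / add_mig_task stand in for distr_controller.send_dev_status_event
    # and tasks_controller.add_device_mig_task in the real code.
    for dev in devices:
        if dev.status != STATUS_ONLINE:
            continue  # the real code only logs and skips these
        send_event(dev, STATUS_ONLINE)
        add_mig_task(dev.uuid)


if __name__ == "__main__":
    devs = [Device("dev-1", "online"), Device("dev-2", "unavailable")]
    kick_off_migrations(devs,
                        lambda d, s: print("event:", d.uuid, s),
                        lambda uuid: print("migration task:", uuid))
```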
@@ -1062,9 +1306,12 @@ def restart_storage_node(
     return "Success"


-def list_storage_nodes(kv_store, is_json):
-    db_controller = DBController(kv_store)
-    nodes = db_controller.get_storage_nodes()
+def list_storage_nodes(is_json, cluster_id=None):
+    db_controller = DBController()
+    if cluster_id:
+        nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+    else:
+        nodes = db_controller.get_storage_nodes()
     data = []
     output = ""

@@ -1111,26 +1358,43 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         logger.error("This storage node is not part of the cluster")
         return False

-    data = []
+    storage_devices = []
+    jm_devices = []
+    remote_devices = []
     for device in snode.nvme_devices:
         logger.debug(device)
         logger.debug("*" * 20)
-        data.append({
+        storage_devices.append({
             "UUID": device.uuid,
             "Name": device.device_name,
-            "Hostname": snode.hostname,
             "Size": utils.humanbytes(device.size),
-            # "Sequential Number": device.sequential_number,
-            # "Partitions Count": device.partitions_count,
-            # "Model ID": device.model_id,
             "Serial Number": device.serial_number,
             "PCIe": device.pcie_address,
             "Status": device.status,
             "IO Err": device.io_error,
-            "Health": device.health_check,
+            "Health": device.health_check
+        })

+    if snode.jm_device:
+        jm_devices.append({
+            "UUID": snode.jm_device.uuid,
+            "Name": snode.jm_device.device_name,
+            "Size": utils.humanbytes(snode.jm_device.size),
+            "Status": snode.jm_device.status,
+            "IO Err": snode.jm_device.io_error,
+            "Health": snode.jm_device.health_check
         })

+    for device in snode.remote_devices:
+        logger.debug(device)
+        logger.debug("*" * 20)
+        remote_devices.append({
+            "UUID": device.uuid,
+            "Name": device.device_name,
+            "Size": utils.humanbytes(device.size),
+            "Serial Number": device.serial_number,
+            "Node ID": device.node_id,
+        })
     if sort and sort in ['node-seq', 'dev-seq', 'serial']:
         if sort == 'serial':
             sort_key = "Serial Number"
@@ -1139,13 +1403,20 @@ def list_storage_devices(kv_store, node_id, sort, is_json):
         elif sort == 'node-seq':
             # TODO: check this key
             sort_key = "Sequential Number"
-        sorted_data = sorted(data, key=lambda d: d[sort_key])
-        data = sorted_data
+        storage_devices = sorted(storage_devices, key=lambda d: d[sort_key])

+    data = {
+        "Storage Devices": storage_devices,
+        "JM Devices": jm_devices,
+        "Remote Devices": remote_devices,
+    }
     if is_json:
         return json.dumps(data, indent=2)
     else:
-        return utils.print_table(data)
+        out = ""
+        for d in data:
+            out += f"{d}\n{utils.print_table(data[d])}\n\n"
+        return out


 def shutdown_storage_node(node_id, force=False):
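Note: `list_storage_devices` now returns three sections instead of a flat list and renders each one as its own table. A small self-contained sketch of that grouped output, with a toy `print_table` standing in for `utils.print_table`:

```python
# Sketch of the grouped device listing produced by the new code: one table
# per section ("Storage Devices", "JM Devices", "Remote Devices").
from typing import Dict, List


def print_table(rows: List[dict]) -> str:
    # Toy stand-in for utils.print_table: header row plus one line per entry.
    if not rows:
        return "(none)"
    headers = list(rows[0].keys())
    lines = ["  ".join(headers)]
    for row in rows:
        lines.append("  ".join(str(row[h]) for h in headers))
    return "\n".join(lines)


def render_device_report(data: Dict[str, List[dict]]) -> str:
    out = ""
    for section in data:
        out += f"{section}\n{print_table(data[section])}\n\n"
    return out


if __name__ == "__main__":
    # Invented sample rows; field names follow the diff.
    report = {
        "Storage Devices": [{"UUID": "a1", "Size": "1.9 TB", "Status": "online"}],
        "JM Devices": [{"UUID": "jm1", "Size": "16.0 GB", "Status": "online"}],
        "Remote Devices": [],
    }
    print(render_device_report(report))
```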
@@ -1186,7 +1457,7 @@ def shutdown_storage_node(node_id, force=False):
     for dev in snode.nvme_devices:
         if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY]:
             device_controller.device_set_unavailable(dev.get_id())
-    distr_controller.send_node_status_event(snode.get_id(), "in_shutdown")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_IN_SHUTDOWN)

     # shutdown node
     # make other nodes disconnect from this node
@@ -1206,7 +1477,7 @@ def shutdown_storage_node(node_id, force=False):
     snode_api = SNodeClient(snode.api_endpoint)
     results, err = snode_api.spdk_process_kill()

-    distr_controller.send_node_status_event(snode.get_id(), StorageNode.STATUS_OFFLINE)
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_OFFLINE)

     logger.info("Setting node status to offline")
     snode = db_controller.get_storage_node_by_id(node_id)
@@ -1233,22 +1504,24 @@ def suspend_storage_node(node_id, force=False):
         return False

     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
-    snodes = db_controller.get_storage_nodes()
+    snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
     online_nodes = 0
     for node in snodes:
         if node.status == node.STATUS_ONLINE:
             online_nodes += 1
-    if cluster.ha_type == "ha" and online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
-        logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
-        if force is False:
-            return False

-    if cluster.ha_type == "ha" and cluster.status == cluster.STATUS_DEGRADED and force is False:
-        logger.warning(f"Cluster status is degraded, use --force but this will suspend the cluster")
-        return False
+    if cluster.ha_type == "ha":
+        if online_nodes <= 3 and cluster.status == cluster.STATUS_ACTIVE:
+            logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
+            if force is False:
+                return False
+
+        if cluster.status == cluster.STATUS_DEGRADED and force is False:
+            logger.warning(f"Cluster status is degraded, use --force but this will suspend the cluster")
+            return False

     logger.info("Suspending node")
-    distr_controller.send_node_status_event(snode.get_id(), "suspended")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_SUSPENDED)
     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_ONLINE:
             device_controller.device_set_unavailable(dev.get_id())
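Note: the suspend guard is now nested under a single `ha_type == "ha"` branch: too few online nodes on an active cluster, or a degraded cluster, blocks suspension unless `--force` is given. A hedged sketch of that check as a standalone predicate, using plain strings in place of the cluster status constants:

```python
# Sketch of the reworked HA guard in suspend_storage_node: returns True only
# when suspending the node is allowed. "active"/"degraded" stand in for
# cluster.STATUS_ACTIVE / cluster.STATUS_DEGRADED.
def can_suspend(ha_type: str, cluster_status: str, online_nodes: int,
                force: bool = False) -> bool:
    if ha_type == "ha":
        if online_nodes <= 3 and cluster_status == "active" and not force:
            return False
        if cluster_status == "degraded" and not force:
            return False
    return True


if __name__ == "__main__":
    print(can_suspend("ha", "active", 3))              # False: too few online nodes
    print(can_suspend("ha", "active", 3, force=True))  # True: forced
    print(can_suspend("ha", "degraded", 5))            # False: degraded without force
    print(can_suspend("single", "active", 1))          # True: non-HA cluster
```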
@@ -1292,7 +1565,7 @@ def resume_storage_node(node_id):
     logger.info("Resuming node")

     logger.info("Sending cluster event updates")
-    distr_controller.send_node_status_event(snode.get_id(), "online")
+    distr_controller.send_node_status_event(snode, StorageNode.STATUS_ONLINE)

     for dev in snode.nvme_devices:
         if dev.status == NVMeDevice.STATUS_UNAVAILABLE:
@@ -1668,7 +1941,6 @@ def deploy_cleaner():
     return True


-
 def get_host_secret(node_id):
     db_controller = DBController()
     node = db_controller.get_storage_node_by_id(node_id)
@@ -1831,7 +2103,7 @@ def set_node_status(node_id, status):
     snode.updated_at = str(datetime.datetime.now())
     snode.write_to_db(db_controller.kv_store)
     storage_events.snode_status_change(snode, snode.status, old_status, caused_by="monitor")
-    distr_controller.send_node_status_event(snode.get_id(), status)
+    distr_controller.send_node_status_event(snode, status)

     if snode.status == StorageNode.STATUS_ONLINE:
         logger.info("Connecting to remote devices")