sbcli-pre 1.2.5__zip → 1.2.7__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
  2. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
  3. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
  4. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
  5. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
  6. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
  7. sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
  8. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
  9. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
  10. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
  11. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
  12. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
  13. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
  14. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
  15. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
  16. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
  17. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
  18. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
  19. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
  20. sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
  21. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
  22. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
  23. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
  24. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
  25. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
  26. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
  27. sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
  28. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
  29. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
  30. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
  31. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
  32. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
  33. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
  34. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
  35. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
  36. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
  37. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
  38. sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
  39. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
  40. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
  41. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
  42. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
  43. sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
  44. sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
  45. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
  46. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
  47. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
  48. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
  49. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
  50. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
  51. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
  52. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
  53. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
  54. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
  55. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
  56. sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
  57. sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
  58. sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
  59. sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
  60. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
  61. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
  62. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  63. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
  64. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
  65. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
  66. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
  67. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
  68. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
  69. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
  70. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
  71. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
  72. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
  73. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
  74. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
  75. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
  76. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
  77. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
  78. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
  79. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
  80. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
  81. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
  82. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
  83. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
  84. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
  85. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
  86. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
  87. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
  88. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
  89. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
  90. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
  91. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
  92. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
  93. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
  94. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  95. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  96. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
  97. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
  98. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
  99. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
  100. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
  101. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
  102. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
  103. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
  104. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
  105. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
  106. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
  107. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
  108. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
  109. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
  110. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
  111. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
  112. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
  113. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  114. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
  115. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
  116. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
  117. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
  118. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
  119. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
  120. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
  121. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  122. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  123. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  124. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  125. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  126. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  127. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
  128. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  129. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  130. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
  131. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
  132. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
  133. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
  134. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
  135. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
  136. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  137. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  138. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
  139. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
  140. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
  141. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
  142. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
@@ -1,8 +1,8 @@
1
1
  import time
2
2
  import logging
3
3
 
4
- from simplyblock_core import distr_controller, utils
5
- from simplyblock_core.controllers import device_events, lvol_controller
4
+ from simplyblock_core import distr_controller, utils, storage_node_ops
5
+ from simplyblock_core.controllers import device_events, lvol_controller, tasks_controller
6
6
  from simplyblock_core.kv_store import DBController
7
7
  from simplyblock_core.models.nvme_device import NVMeDevice
8
8
  from simplyblock_core.rpc_client import RPCClient
@@ -16,6 +16,7 @@ def device_set_state(device_id, state):
16
16
  dev = db_controller.get_storage_devices(device_id)
17
17
  if not dev:
18
18
  logger.error("device not found")
19
+ return False
19
20
 
20
21
  snode = db_controller.get_storage_node_by_id(dev.node_id)
21
22
  if not snode:
@@ -35,7 +36,7 @@ def device_set_state(device_id, state):
35
36
 
36
37
  old_status = dev.status
37
38
  device.status = state
38
- distr_controller.send_dev_status_event(device.cluster_device_order, device.status)
39
+ distr_controller.send_dev_status_event(device, device.status)
39
40
  snode.write_to_db(db_controller.kv_store)
40
41
  device_events.device_status_change(device, device.status, old_status)
41
42
  return True
@@ -78,14 +79,20 @@ def device_set_read_only(device_id):
78
79
 
79
80
 
80
81
  def device_set_online(device_id):
81
- return device_set_state(device_id, NVMeDevice.STATUS_ONLINE)
82
+ ret = device_set_state(device_id, NVMeDevice.STATUS_ONLINE)
83
+ if ret:
84
+ logger.info("Adding task to device data migration")
85
+ task_id = tasks_controller.add_device_mig_task(device_id)
86
+ if task_id:
87
+ logger.info(f"Task id: {task_id}")
88
+ return ret
82
89
 
83
90
 
84
91
  def get_alceml_name(alceml_id):
85
92
  return f"alceml_{alceml_id}"
86
93
 
87
94
 
88
- def _def_create_device_stack(device_obj, snode):
95
+ def _def_create_device_stack(device_obj, snode, force=False):
89
96
 
90
97
  rpc_client = RPCClient(
91
98
  snode.mgmt_ip, snode.rpc_port,
@@ -98,22 +105,26 @@ def _def_create_device_stack(device_obj, snode):
98
105
  ret = rpc_client.bdev_passtest_create(test_name, device_obj.nvme_bdev)
99
106
  if not ret:
100
107
  logger.error(f"Failed to create bdev: {test_name}")
101
- return False
108
+ if not force:
109
+ return False
102
110
 
103
111
  alceml_id = device_obj.get_id()
104
112
  alceml_name = get_alceml_name(alceml_id)
105
113
  logger.info(f"adding {alceml_name}")
106
- ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=2)
114
+ ret = rpc_client.bdev_alceml_create(alceml_name, test_name, alceml_id, pba_init_mode=2,
115
+ dev_cpu_mask=snode.dev_cpu_mask)
107
116
  if not ret:
108
117
  logger.error(f"Failed to create alceml bdev: {alceml_name}")
109
- return False
118
+ if not force:
119
+ return False
110
120
 
111
121
  # add pass through
112
122
  pt_name = f"{alceml_name}_PT"
113
123
  ret = rpc_client.bdev_PT_NoExcl_create(pt_name, alceml_name)
114
124
  if not ret:
115
125
  logger.error(f"Failed to create pt noexcl bdev: {pt_name}")
116
- return False
126
+ if not force:
127
+ return False
117
128
 
118
129
  subsystem_nqn = snode.subsystem + ":dev:" + alceml_id
119
130
  logger.info("Creating subsystem %s", subsystem_nqn)
@@ -137,11 +148,13 @@ def _def_create_device_stack(device_obj, snode):
137
148
  logger.info(f"Adding {pt_name} to the subsystem")
138
149
  ret = rpc_client.nvmf_subsystem_add_ns(subsystem_nqn, pt_name)
139
150
 
140
- if device_obj.jm_bdev:
141
- ret = rpc_client.bdev_jm_create(device_obj.jm_bdev, device_obj.alceml_bdev)
151
+ if hasattr(device_obj, 'jm_bdev') and device_obj.jm_bdev:
152
+ ret = rpc_client.bdev_jm_create(device_obj.jm_bdev, device_obj.alceml_bdev,
153
+ dev_cpu_mask=snode.dev_cpu_mask)
142
154
  if not ret:
143
- logger.error(f"Failed to create bdev: {device_obj.jm_bdev}")
144
- return False
155
+ logger.error(f"Failed to create jm bdev: {device_obj.jm_bdev}")
156
+ if not force:
157
+ return False
145
158
 
146
159
  device_obj.testing_bdev = test_name
147
160
  device_obj.alceml_bdev = alceml_name
@@ -174,23 +187,15 @@ def restart_device(device_id, force=False):
174
187
  device_obj = dev
175
188
  break
176
189
 
177
- device_obj.status = 'restarting'
178
- snode.write_to_db(db_controller.kv_store)
179
-
180
190
  logger.info(f"Restarting device {device_id}")
191
+ device_set_unavailable(device_id)
181
192
 
182
- ret = _def_create_device_stack(device_obj, snode)
193
+ ret = _def_create_device_stack(device_obj, snode, force=force)
183
194
 
184
195
  if not ret:
185
196
  logger.error("Failed to create device stack")
186
- device_obj.status = NVMeDevice.STATUS_UNAVAILABLE
187
- snode.write_to_db(db_controller.kv_store)
188
- return False
189
-
190
- device_obj.io_error = False
191
- device_obj.retries_exhausted = False
192
- device_obj.status = NVMeDevice.STATUS_ONLINE
193
- snode.write_to_db(db_controller.kv_store)
197
+ if not force:
198
+ return False
194
199
 
195
200
  logger.info("Make other nodes connect to the device")
196
201
  snodes = db_controller.get_storage_nodes()
@@ -221,10 +226,11 @@ def restart_device(device_id, force=False):
221
226
  node.write_to_db(db_controller.kv_store)
222
227
  time.sleep(3)
223
228
 
224
- logger.info("Sending device event")
225
- distr_controller.send_dev_status_event(device_obj.cluster_device_order, "online")
229
+ logger.info("Setting device io_error to False")
230
+ device_set_io_error(device_id, False)
231
+ logger.info("Setting device online")
232
+ device_set_online(device_id)
226
233
  device_events.device_restarted(device_obj)
227
-
228
234
  return "Done"
229
235
 
230
236
 
@@ -267,15 +273,8 @@ def device_remove(device_id, force=True):
267
273
  device = dev
268
274
  break
269
275
 
270
- if device.jm_bdev:
271
- if snode.lvols:
272
- logger.error(f"Failed to remove device: {device.get_id()}, "
273
- f"there are LVols that uses JM from this device, delete LVol to continue")
274
- # if not force:
275
- return False
276
-
277
276
  logger.info("Sending device event")
278
- distr_controller.send_dev_status_event(device.cluster_device_order, "removed")
277
+ distr_controller.send_dev_status_event(device, NVMeDevice.STATUS_REMOVED)
279
278
 
280
279
  logger.info("Disconnecting device from all nodes")
281
280
  distr_controller.disconnect_device(device)
@@ -291,13 +290,6 @@ def device_remove(device_id, force=True):
291
290
  if not force:
292
291
  return False
293
292
 
294
- if device.jm_bdev:
295
- ret = rpc_client.bdev_jm_delete(f"jm_{snode.get_id()}")
296
- if not ret:
297
- logger.error(f"Failed to remove journal manager: jm_{snode.get_id()}")
298
- if not force:
299
- return False
300
-
301
293
  logger.info("Removing device bdevs")
302
294
  ret = rpc_client.bdev_PT_NoExcl_delete(f"{device.alceml_bdev}_PT")
303
295
  if not ret:
@@ -405,19 +397,12 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
405
397
 
406
398
  def reset_storage_device(dev_id):
407
399
  db_controller = DBController()
408
- device = None
409
- snode = None
410
- for node in db_controller.get_storage_nodes():
411
- for dev in node.nvme_devices:
412
- if dev.get_id() == dev_id:
413
- device = dev
414
- snode = node
415
- break
416
-
400
+ device = db_controller.get_storage_devices(dev_id)
417
401
  if not device:
418
402
  logger.error(f"Device not found: {dev_id}")
419
403
  return False
420
404
 
405
+ snode = db_controller.get_storage_node_by_id(device.node_id)
421
406
  if not snode:
422
407
  logger.error(f"Node not found {device.node_id}")
423
408
  return False
@@ -426,30 +411,38 @@ def reset_storage_device(dev_id):
426
411
  logger.error(f"Device status: {device.status} is removed")
427
412
  return False
428
413
 
429
- logger.info("Setting device to unavailable")
430
- old_status = device.status
431
- device.status = NVMeDevice.STATUS_UNAVAILABLE
432
- distr_controller.send_dev_status_event(device.cluster_device_order, device.status)
433
- snode.write_to_db(db_controller.kv_store)
434
- device_events.device_status_change(device, device.status, old_status)
414
+ logger.info("Setting devices to unavailable")
415
+ device_set_unavailable(dev_id)
416
+ devs = []
417
+ for dev in snode.nvme_devices:
418
+ if dev.get_id() == device.get_id():
419
+ continue
420
+ if dev.status == NVMeDevice.STATUS_ONLINE and dev.physical_label == device.physical_label:
421
+ devs.append(dev)
422
+ device_set_unavailable(dev.get_id())
435
423
 
436
424
  logger.info("Resetting device")
437
425
  rpc_client = RPCClient(
438
426
  snode.mgmt_ip, snode.rpc_port,
439
427
  snode.rpc_username, snode.rpc_password)
440
428
 
441
- controller_name = device.nvme_bdev[:-2]
429
+ controller_name = device.nvme_controller
442
430
  response = rpc_client.reset_device(controller_name)
443
431
  if not response:
444
432
  logger.error(f"Failed to reset NVMe BDev {controller_name}")
445
433
  return False
434
+ time.sleep(3)
446
435
 
447
- device.io_error = False
448
- device.retries_exhausted = False
449
- snode.write_to_db(db_controller.kv_store)
436
+ logger.info("Setting devices online")
437
+ for dev in devs:
438
+ device_set_online(dev.get_id())
450
439
 
451
- device_events.device_reset(device)
440
+ # set io_error flag False
441
+ device_set_io_error(dev_id, False)
442
+ device_set_retries_exhausted(dev_id, False)
443
+ # set device to online
452
444
  device_set_online(dev_id)
445
+ device_events.device_reset(device)
453
446
  return True
454
447
 
455
448
 
@@ -40,7 +40,7 @@ def log_distr_event(cluster_id, node_id, event_dict):
40
40
  ds.uuid = str(uuid.uuid4())
41
41
  ds.cluster_uuid = cluster_id
42
42
  ds.node_id = node_id
43
- ds.date = int(time.time())
43
+ ds.date = round(time.time()*1000)
44
44
  ds.domain = DOMAIN_DISTR
45
45
  ds.event_level = EventObj.LEVEL_ERROR
46
46
  ds.caused_by = CAUSED_BY_MONITOR
@@ -66,7 +66,7 @@ def log_distr_event(cluster_id, node_id, event_dict):
66
66
 
67
67
 
68
68
  def log_event_cluster(cluster_id, domain, event, db_object, caused_by, message,
69
- node_id=None, event_level=EventObj.LEVEL_INFO):
69
+ node_id=None, event_level=EventObj.LEVEL_INFO, status=None):
70
70
  """
71
71
  uuid:
72
72
  cluster_uuid: 1234
@@ -83,7 +83,7 @@ def log_event_cluster(cluster_id, domain, event, db_object, caused_by, message,
83
83
  ds = EventObj()
84
84
  ds.uuid = str(uuid.uuid4())
85
85
  ds.cluster_uuid = cluster_id
86
- ds.date = int(time.time())
86
+ ds.date = round(time.time()*1000)
87
87
  ds.node_id = node_id
88
88
  ds.event_level = event_level
89
89
 
@@ -93,12 +93,14 @@ def log_event_cluster(cluster_id, domain, event, db_object, caused_by, message,
93
93
  ds.object_dict = db_object.get_clean_dict()
94
94
  ds.caused_by = caused_by
95
95
  ds.message = message
96
+ ds.status = status
96
97
 
97
98
  log_event_based_on_level(cluster_id, event, db_object.name, message, caused_by, event_level)
98
99
 
99
100
  db_controller = DBController()
100
101
  ds.write_to_db(db_controller.kv_store)
101
102
 
103
+
102
104
  def log_event_based_on_level(cluster_id, event, db_object, message, caused_by, event_level):
103
105
  json_str = json.dumps({
104
106
  "cluster_id": cluster_id,
@@ -16,11 +16,13 @@ logger = log.getLogger()
16
16
 
17
17
  def check_cluster(cluster_id):
18
18
  db_controller = DBController()
19
- st = db_controller.get_storage_nodes()
19
+ st = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
20
20
  data = []
21
+ result = True
21
22
  for node in st:
22
23
  # check if node is online, unavailable, restarting
23
24
  ret = check_node(node.get_id(), with_devices=False)
25
+ result &= ret
24
26
  print("*"*100)
25
27
  data.append({
26
28
  "Kind": "Node",
@@ -28,30 +30,32 @@ def check_cluster(cluster_id):
28
30
  "Status": "ok" if ret else "failed"
29
31
  })
30
32
 
31
- for device in db_controller.get_storage_devices():
32
- ret = check_device(device.get_id())
33
- print("*" * 100)
34
- data.append({
35
- "Kind": "Device",
36
- "UUID": device.get_id(),
37
- "Status": "ok" if ret else "failed"
38
- })
39
-
40
- for lvol in db_controller.get_lvols():
41
- ret = check_lvol(lvol.get_id())
42
- print("*" * 100)
43
- data.append({
44
- "Kind": "LVol",
45
- "UUID": lvol.get_id(),
46
- "Status": "ok" if ret else "failed"
47
- })
33
+ for device in node.nvme_devices:
34
+ ret = check_device(device.get_id())
35
+ result &= ret
36
+ print("*" * 100)
37
+ data.append({
38
+ "Kind": "Device",
39
+ "UUID": device.get_id(),
40
+ "Status": "ok" if ret else "failed"
41
+ })
42
+
43
+ for lvol in db_controller.get_lvols(cluster_id):
44
+ ret = check_lvol(lvol.get_id())
45
+ result &= ret
46
+ print("*" * 100)
47
+ data.append({
48
+ "Kind": "LVol",
49
+ "UUID": lvol.get_id(),
50
+ "Status": "ok" if ret else "failed"
51
+ })
48
52
  print(utils.print_table(data))
49
- return True
53
+ return result
50
54
 
51
55
 
52
56
  def _check_node_docker_api(ip):
53
57
  try:
54
- node_docker = docker.DockerClient(base_url=f"tcp://{ip}:2375", version="auto")
58
+ node_docker = docker.DockerClient(base_url=f"tcp://{ip}:2375", version="auto", timeout=3)
55
59
  ret = node_docker.info()
56
60
  if ret:
57
61
  logger.debug(ret)
@@ -65,7 +69,7 @@ def _check_node_rpc(rpc_ip, rpc_port, rpc_username, rpc_password):
65
69
  try:
66
70
  rpc_client = RPCClient(
67
71
  rpc_ip, rpc_port, rpc_username, rpc_password,
68
- timeout=5, retry=3)
72
+ timeout=10, retry=1)
69
73
  ret = rpc_client.get_version()
70
74
  if ret:
71
75
  logger.debug(f"SPDK version: {ret['version']}")
@@ -167,7 +171,7 @@ def check_node(node_id, with_devices=True):
167
171
 
168
172
  def check_device(device_id):
169
173
  db_controller = DBController()
170
- device = db_controller.get_storage_devices(device_id)
174
+ device = db_controller.get_storage_device_by_id(device_id)
171
175
  if not device:
172
176
  logger.error("device not found")
173
177
  return False
@@ -192,8 +196,8 @@ def check_device(device_id):
192
196
  snode.rpc_username, snode.rpc_password)
193
197
 
194
198
  bdevs_stack = [device.nvme_bdev, device.testing_bdev, device.alceml_bdev, device.pt_bdev]
195
- if device.jm_bdev:
196
- bdevs_stack.append(device.jm_bdev)
199
+ # if device.jm_bdev:
200
+ # bdevs_stack.append(device.jm_bdev)
197
201
  logger.info(f"Checking Device: {device_id}, status:{device.status}")
198
202
  problems = 0
199
203
  for bdev in bdevs_stack:
@@ -231,7 +235,7 @@ def check_device(device_id):
231
235
 
232
236
  def check_remote_device(device_id):
233
237
  db_controller = DBController()
234
- device = db_controller.get_storage_devices(device_id)
238
+ device = db_controller.get_storage_device_by_id(device_id)
235
239
  if not device:
236
240
  logger.error("device not found")
237
241
  return False
@@ -241,7 +245,7 @@ def check_remote_device(device_id):
241
245
  return False
242
246
 
243
247
  result = True
244
- for node in db_controller.get_storage_nodes():
248
+ for node in db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id):
245
249
  if node.status == StorageNode.STATUS_ONLINE:
246
250
  if node.get_id() == snode.get_id():
247
251
  continue
@@ -306,20 +310,6 @@ def check_lvol_on_node(lvol_id, node_id):
306
310
  logger.exception(e)
307
311
  return False
308
312
 
309
- # check ndcs+npcs <= online devices
310
- # then change its status to offline if fails this check
311
- online_devices = 0
312
- for node in db_controller.get_storage_nodes():
313
- for dev in node.nvme_devices:
314
- if dev.status == dev.STATUS_ONLINE:
315
- online_devices += 1
316
-
317
- # if lvol.ndcs + lvol.npcs < online_devices:
318
- # logger.info(f"Checking Distr ndcs+npcs: {lvol.ndcs}+{lvol.npcs}, online devices: {online_devices} ... ok")
319
- # else:
320
- # logger.info(f"Checking Distr ndcs+npcs: {lvol.ndcs}+{lvol.npcs}, online devices: {online_devices} ... failed")
321
- # passed = False
322
-
323
313
  return passed
324
314
 
325
315