sbcli-pre 1.2.5__zip → 1.2.7__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
  2. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
  3. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
  4. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
  5. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
  6. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
  7. sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
  8. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
  9. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
  10. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
  11. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
  12. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
  13. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
  14. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
  15. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
  16. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
  17. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
  18. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
  19. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
  20. sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
  21. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
  22. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
  23. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
  24. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
  25. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
  26. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
  27. sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
  28. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
  29. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
  30. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
  31. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
  32. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
  33. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
  34. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
  35. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
  36. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
  37. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
  38. sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
  39. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
  40. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
  41. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
  42. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
  43. sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
  44. sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
  45. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
  46. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
  47. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
  48. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
  49. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
  50. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
  51. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
  52. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
  53. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
  54. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
  55. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
  56. sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
  57. sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
  58. sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
  59. sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
  60. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
  61. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
  62. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  63. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
  64. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
  65. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
  66. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
  67. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
  68. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
  69. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
  70. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
  71. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
  72. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
  73. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
  74. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
  75. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
  76. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
  77. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
  78. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
  79. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
  80. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
  81. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
  82. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
  83. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
  84. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
  85. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
  86. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
  87. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
  88. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
  89. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
  90. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
  91. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
  92. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
  93. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
  94. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  95. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  96. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
  97. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
  98. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
  99. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
  100. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
  101. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
  102. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
  103. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
  104. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
  105. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
  106. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
  107. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
  108. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
  109. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
  110. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
  111. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
  112. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
  113. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  114. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
  115. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
  116. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
  117. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
  118. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
  119. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
  120. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
  121. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  122. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  123. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  124. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  125. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  126. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  127. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
  128. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  129. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  130. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
  131. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
  132. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
  133. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
  134. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
  135. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
  136. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  137. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  138. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
  139. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
  140. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
  141. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
  142. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
@@ -8,7 +8,7 @@ from datetime import datetime
8
8
 
9
9
 
10
10
  from simplyblock_core import constants, kv_store, cluster_ops, storage_node_ops, distr_controller
11
- from simplyblock_core.controllers import storage_events, health_controller, device_controller
11
+ from simplyblock_core.controllers import health_controller, device_controller, tasks_controller
12
12
  from simplyblock_core.models.cluster import Cluster
13
13
  from simplyblock_core.models.nvme_device import NVMeDevice
14
14
  from simplyblock_core.models.storage_node import StorageNode
@@ -31,8 +31,8 @@ db_store = kv_store.KVStore()
31
31
  db_controller = kv_store.DBController(kv_store=db_store)
32
32
 
33
33
 
34
- def get_cluster_target_status(cluster):
35
- snodes = db_controller.get_storage_nodes()
34
+ def get_cluster_target_status(cluster_id):
35
+ snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
36
36
 
37
37
  online_nodes = 0
38
38
  offline_nodes = 0
@@ -66,8 +66,8 @@ def get_cluster_target_status(cluster):
66
66
  logger.debug(f"online_devices: {online_devices}")
67
67
  logger.debug(f"offline_devices: {offline_devices}")
68
68
 
69
- # if more than two affected modes then cluster is suspended
70
- if affected_nodes > 2:
69
+ # if more than two affected nodes then cluster is suspended
70
+ if affected_nodes > 2 or offline_nodes > 2:
71
71
  return Cluster.STATUS_SUSPENDED
72
72
 
73
73
  # if any device goes offline then cluster is degraded
@@ -85,7 +85,7 @@ def update_cluster_status(cluster_id):
85
85
  cluster = db_controller.get_cluster_by_id(cluster_id)
86
86
 
87
87
  if cluster.ha_type == "ha":
88
- cluster_target_status = get_cluster_target_status(cluster)
88
+ cluster_target_status = get_cluster_target_status(cluster_id)
89
89
  logger.info(f"Target cluster status {cluster_target_status}, current status: {cluster.status}")
90
90
  if cluster.status == cluster_target_status:
91
91
  return
@@ -111,48 +111,53 @@ def set_node_online(node):
111
111
  def set_node_offline(node):
112
112
  if node.status != StorageNode.STATUS_UNREACHABLE:
113
113
  storage_node_ops.set_node_status(snode.get_id(), StorageNode.STATUS_UNREACHABLE)
114
+ # add node to auto restart
115
+ tasks_controller.add_node_to_auto_restart(node)
114
116
 
115
117
 
116
118
  logger.info("Starting node monitor")
117
119
  while True:
118
- # get storage nodes
119
- nodes = db_controller.get_storage_nodes()
120
- for snode in nodes:
121
- if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
122
- logger.info(f"Node status is: {snode.status}, skipping")
123
- continue
124
-
125
- logger.info(f"Checking node {snode.hostname}")
126
-
127
- # 1- check node ping
128
- ping_check = health_controller._check_node_ping(snode.mgmt_ip)
129
- logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
130
-
131
- # 2- check node API
132
- node_api_check = health_controller._check_node_api(snode.mgmt_ip)
133
- logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
134
-
135
- # 3- check node RPC
136
- node_rpc_check = health_controller._check_node_rpc(
137
- snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
138
- logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
139
-
140
- # 4- docker API
141
- node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
142
- logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
143
-
144
- is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
145
- if is_node_online:
146
- set_node_online(snode)
147
- else:
148
- set_node_offline(snode)
149
-
150
- if not ping_check and not node_rpc_check:
151
- # node is dead, set devices offline
152
- for dev in snode.nvme_devices:
153
- device_controller.device_set_unavailable(dev.get_id())
154
-
155
- update_cluster_status(snode.cluster_id)
120
+ clusters = db_controller.get_clusters()
121
+ for cluster in clusters:
122
+ cluster_id = cluster.get_id()
123
+ # get storage nodes
124
+ nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
125
+ for snode in nodes:
126
+ if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
127
+ logger.info(f"Node status is: {snode.status}, skipping")
128
+ continue
129
+
130
+ logger.info(f"Checking node {snode.hostname}")
131
+
132
+ # 1- check node ping
133
+ ping_check = health_controller._check_node_ping(snode.mgmt_ip)
134
+ logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
135
+
136
+ # 2- check node API
137
+ node_api_check = health_controller._check_node_api(snode.mgmt_ip)
138
+ logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
139
+
140
+ # 3- check node RPC
141
+ node_rpc_check = health_controller._check_node_rpc(
142
+ snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
143
+ logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
144
+
145
+ # 4- docker API
146
+ node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
147
+ logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
148
+
149
+ is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
150
+ if is_node_online:
151
+ set_node_online(snode)
152
+ else:
153
+ set_node_offline(snode)
154
+
155
+ if not ping_check and not node_rpc_check:
156
+ # node is dead, set devices offline
157
+ for dev in snode.nvme_devices:
158
+ device_controller.device_set_unavailable(dev.get_id())
159
+
160
+ update_cluster_status(cluster_id)
156
161
 
157
162
  logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} seconds")
158
163
  time.sleep(constants.NODE_MONITOR_INTERVAL_SEC)
@@ -0,0 +1,61 @@
1
+ # coding=utf-8
2
+ import logging
3
+ import time
4
+ import sys
5
+
6
+
7
+ from simplyblock_core import constants, kv_store
8
+ from simplyblock_core.controllers import tasks_events
9
+ from simplyblock_core.models.job_schedule import JobSchedule
10
+
11
+
12
+ # Import the GELF logger
13
+ from graypy import GELFUDPHandler
14
+
15
+
16
+ def task_runner(task):
17
+ task.status = JobSchedule.STATUS_RUNNING
18
+ task.write_to_db(db_controller.kv_store)
19
+ tasks_events.task_updated(task)
20
+
21
+ time.sleep(30)
22
+
23
+ task.function_result = "sleep 30"
24
+ task.status = JobSchedule.STATUS_DONE
25
+ task.write_to_db(db_controller.kv_store)
26
+ tasks_events.task_updated(task)
27
+
28
+ return True
29
+
30
+
31
+ # configure logging
32
+ logger_handler = logging.StreamHandler(stream=sys.stdout)
33
+ logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
34
+ gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
35
+ logger = logging.getLogger()
36
+ logger.addHandler(gelf_handler)
37
+ logger.addHandler(logger_handler)
38
+ logger.setLevel(logging.DEBUG)
39
+
40
+ # get DB controller
41
+ db_controller = kv_store.DBController()
42
+
43
+ logger.info("Starting Tasks runner...")
44
+ while True:
45
+ time.sleep(3)
46
+ clusters = db_controller.get_clusters()
47
+ if not clusters:
48
+ logger.error("No clusters found!")
49
+ else:
50
+ for cl in clusters:
51
+ tasks = db_controller.get_job_tasks(cl.get_id(), reverse=False)
52
+ for task in tasks:
53
+ delay_seconds = constants.TASK_EXEC_INTERVAL_SEC
54
+ if task.function_name == JobSchedule.FN_DEV_MIG:
55
+ while task.status != JobSchedule.STATUS_DONE:
56
+ res = task_runner(task)
57
+ if res:
58
+ tasks_events.task_updated(task)
59
+ else:
60
+ time.sleep(delay_seconds)
61
+ delay_seconds *= 2
@@ -5,7 +5,7 @@ import sys
5
5
 
6
6
 
7
7
  from simplyblock_core import constants, kv_store, storage_node_ops
8
- from simplyblock_core.controllers import device_controller
8
+ from simplyblock_core.controllers import device_controller, tasks_events
9
9
  from simplyblock_core.models.job_schedule import JobSchedule
10
10
  from simplyblock_core.models.nvme_device import NVMeDevice
11
11
 
@@ -15,6 +15,19 @@ from graypy import GELFUDPHandler
15
15
  from simplyblock_core.models.storage_node import StorageNode
16
16
 
17
17
 
18
+ # configure logging
19
+ logger_handler = logging.StreamHandler(stream=sys.stdout)
20
+ logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
21
+ gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
22
+ logger = logging.getLogger()
23
+ logger.addHandler(gelf_handler)
24
+ logger.addHandler(logger_handler)
25
+ logger.setLevel(logging.DEBUG)
26
+
27
+ # get DB controller
28
+ db_controller = kv_store.DBController()
29
+
30
+
18
31
  def _get_node_unavailable_devices_count(node_id):
19
32
  node = db_controller.get_storage_node_by_id(node_id)
20
33
  devices = []
@@ -31,10 +44,20 @@ def _get_device(task):
31
44
  return dev
32
45
 
33
46
 
47
+ def _validate_no_task_node_restart(cluster_id, node_id):
48
+ tasks = db_controller.get_job_tasks(cluster_id)
49
+ for task in tasks:
50
+ if task.function_name == JobSchedule.FN_NODE_RESTART and task.node_id == node_id:
51
+ if task.status != JobSchedule.STATUS_DONE:
52
+ logger.info(f"Task found, skip adding new task: {task.get_id()}")
53
+ return False
54
+ return True
55
+
56
+
34
57
  def task_runner(task):
35
- if task.function_name == "device_restart":
58
+ if task.function_name == JobSchedule.FN_DEV_RESTART:
36
59
  return task_runner_device(task)
37
- if task.function_name == "node_restart":
60
+ if task.function_name == JobSchedule.FN_NODE_RESTART:
38
61
  return task_runner_node(task)
39
62
 
40
63
 
@@ -49,38 +72,55 @@ def task_runner_device(task):
49
72
  device_controller.device_set_retries_exhausted(device.get_id(), True)
50
73
  return True
51
74
 
75
+ if not _validate_no_task_node_restart(task.cluster_id, task.node_id):
76
+ task.function_result = "canceled: node restart found"
77
+ task.status = JobSchedule.STATUS_DONE
78
+ task.write_to_db(db_controller.kv_store)
79
+ device_controller.device_set_unavailable(device.get_id())
80
+ return True
81
+
82
+ if task.canceled:
83
+ task.function_result = "canceled"
84
+ task.status = JobSchedule.STATUS_DONE
85
+ task.write_to_db(db_controller.kv_store)
86
+ return True
87
+
52
88
  node = db_controller.get_storage_node_by_id(task.node_id)
53
89
  if node.status != StorageNode.STATUS_ONLINE:
54
- logger.error(f"Node is not online: {node.get_id()} , skipping task: {task.get_id()}")
90
+ logger.error(f"Node is not online: {node.get_id()}, retry")
55
91
  task.function_result = "Node is offline"
56
92
  task.retry += 1
57
93
  task.write_to_db(db_controller.kv_store)
58
94
  return False
59
95
 
60
96
  if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
61
- logger.info(f"Device is online: {device.get_id()}, no restart needed")
62
- task.function_result = "skipped because dev is online"
97
+ logger.info(f"Device is online: {device.get_id()}")
98
+ task.function_result = "Device is online"
63
99
  task.status = JobSchedule.STATUS_DONE
64
100
  task.write_to_db(db_controller.kv_store)
65
101
  return True
66
102
 
67
- task.status = JobSchedule.STATUS_RUNNING
68
- task.write_to_db(db_controller.kv_store)
69
-
70
- # resetting device
71
- logger.info(f"Resetting device {device.get_id()}")
72
- device_controller.reset_storage_device(device.get_id())
73
- time.sleep(5)
74
- device = _get_device(task)
75
- if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
76
- logger.info(f"Device is online: {device.get_id()}")
77
- task.function_result = "done"
103
+ if device.status in [NVMeDevice.STATUS_REMOVED, NVMeDevice.STATUS_FAILED]:
104
+ logger.info(f"Device is not unavailable: {device.get_id()}, {device.status} , stopping task")
105
+ task.function_result = f"stopped because dev is {device.status}"
78
106
  task.status = JobSchedule.STATUS_DONE
79
107
  task.write_to_db(db_controller.kv_store)
80
108
  return True
81
109
 
82
- logger.info(f"Restarting device {device.get_id()}")
83
- device_controller.restart_device(device.get_id(), force=True)
110
+ if task.status != JobSchedule.STATUS_RUNNING:
111
+ task.status = JobSchedule.STATUS_RUNNING
112
+ task.write_to_db(db_controller.kv_store)
113
+ tasks_events.task_updated(task)
114
+
115
+ # set device online for the first 3 retries
116
+ if task.retry < 3:
117
+ logger.info(f"Set device online {device.get_id()}")
118
+ device_controller.device_set_online(device.get_id())
119
+ else:
120
+ logger.info(f"Restarting device {device.get_id()}")
121
+ device_controller.restart_device(device.get_id(), force=True)
122
+
123
+ # check device status
84
124
  time.sleep(5)
85
125
  device = _get_device(task)
86
126
  if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
@@ -104,22 +144,37 @@ def task_runner_node(task):
104
144
  storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_UNREACHABLE)
105
145
  return True
106
146
 
107
- if _get_node_unavailable_devices_count(node.get_id()) == 0:
108
- logger.info(f"Node is online: {node.get_id()}, no restart needed")
109
- task.function_result = "skipped because node is online"
147
+ if node.status == StorageNode.STATUS_REMOVED:
148
+ logger.info(f"Node is removed: {task.node_id}, stopping task")
149
+ task.function_result = f"Node is removed"
110
150
  task.status = JobSchedule.STATUS_DONE
111
151
  task.write_to_db(db_controller.kv_store)
112
152
  return True
113
153
 
114
- task.status = JobSchedule.STATUS_RUNNING
115
- task.write_to_db(db_controller.kv_store)
154
+ if _get_node_unavailable_devices_count(node.get_id()) == 0 and node.status == StorageNode.STATUS_ONLINE:
155
+ logger.info(f"Node is online: {node.get_id()}")
156
+ task.function_result = "Node is online"
157
+ task.status = JobSchedule.STATUS_DONE
158
+ task.write_to_db(db_controller.kv_store)
159
+ return True
160
+
161
+ if task.canceled:
162
+ task.function_result = "canceled"
163
+ task.status = JobSchedule.STATUS_DONE
164
+ task.write_to_db(db_controller.kv_store)
165
+ return True
166
+
167
+ if task.status != JobSchedule.STATUS_RUNNING:
168
+ task.status = JobSchedule.STATUS_RUNNING
169
+ task.write_to_db(db_controller.kv_store)
170
+ tasks_events.task_updated(task)
116
171
 
117
172
  # shutting down node
118
173
  logger.info(f"Shutdown node {node.get_id()}")
119
174
  ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True)
120
175
  if ret:
121
176
  logger.info(f"Node shutdown succeeded")
122
- time.sleep(5)
177
+ time.sleep(3)
123
178
 
124
179
  # resetting node
125
180
  logger.info(f"Restart node {node.get_id()}")
@@ -127,8 +182,9 @@ def task_runner_node(task):
127
182
  if ret:
128
183
  logger.info(f"Node restart succeeded")
129
184
 
130
- if _get_node_unavailable_devices_count(node.get_id()) == 0:
131
- logger.info(f"Node is online: {node.get_id()}, no restart needed")
185
+ time.sleep(5)
186
+ if _get_node_unavailable_devices_count(node.get_id()) == 0 and node.status == StorageNode.STATUS_ONLINE:
187
+ logger.info(f"Node is online: {node.get_id()}")
132
188
  task.function_result = "done"
133
189
  task.status = JobSchedule.STATUS_DONE
134
190
  task.write_to_db(db_controller.kv_store)
@@ -139,19 +195,7 @@ def task_runner_node(task):
139
195
  return False
140
196
 
141
197
 
142
- # configure logging
143
- logger_handler = logging.StreamHandler(stream=sys.stdout)
144
- logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
145
- gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
146
- logger = logging.getLogger()
147
- logger.addHandler(gelf_handler)
148
- logger.addHandler(logger_handler)
149
- logger.setLevel(logging.DEBUG)
150
-
151
- # get DB controller
152
- db_controller = kv_store.DBController()
153
-
154
- logger.info("Starting Jobs runner...")
198
+ logger.info("Starting Tasks runner...")
155
199
  while True:
156
200
  time.sleep(3)
157
201
  clusters = db_controller.get_clusters()
@@ -159,11 +203,16 @@ while True:
159
203
  logger.error("No clusters found!")
160
204
  else:
161
205
  for cl in clusters:
162
- tasks = db_controller.get_job_tasks(cl.get_id())
206
+ tasks = db_controller.get_job_tasks(cl.get_id(), reverse=False)
163
207
  for task in tasks:
164
208
  delay_seconds = constants.TASK_EXEC_INTERVAL_SEC
165
- while task.status != JobSchedule.STATUS_DONE:
166
- res = task_runner(task)
167
- if res is False:
168
- time.sleep(delay_seconds)
169
- delay_seconds *= 2
209
+ if task.function_name in [JobSchedule.FN_DEV_RESTART, JobSchedule.FN_NODE_RESTART]:
210
+ while task.status != JobSchedule.STATUS_DONE:
211
+ # get new task object because it could be changed from cancel task
212
+ task = db_controller.get_task_by_id(task.uuid)
213
+ res = task_runner(task)
214
+ if res:
215
+ tasks_events.task_updated(task)
216
+ else:
217
+ time.sleep(delay_seconds)
218
+ delay_seconds *= 2
@@ -95,3 +95,15 @@ class SNodeClient:
95
95
 
96
96
  def leave_swarm(self):
97
97
  return self._request("GET", "leave_swarm")
98
+
99
+ def make_gpt_partitions(self, nbd_device, jm_percent, num_partitions):
100
+ params = {
101
+ "nbd_device": nbd_device,
102
+ "jm_percent": jm_percent,
103
+ "num_partitions": num_partitions,
104
+ }
105
+ return self._request("POST", "make_gpt_partitions", params)
106
+
107
+ def delete_dev_gpt_partitions(self, device_pci):
108
+ params = {"device_pci": device_pci}
109
+ return self._request("POST", "delete_dev_gpt_partitions", params)