sbcli-pre 1.2.4 (zip) → 1.2.5 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
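The same comparison can be reproduced locally for independent verification. A minimal sketch, assuming pip and GNU diffutils are available and that both releases are published as zip sdists (exact archive names and unpacked directory names may differ; "old" and "new" are illustrative):

    # download both sdists without dependencies, unpack, and diff recursively
    pip download --no-deps --no-binary :all: sbcli-pre==1.2.4 -d old
    pip download --no-deps --no-binary :all: sbcli-pre==1.2.5 -d new
    unzip -q old/*.zip -d old && unzip -q new/*.zip -d new
    diff -ruN old/sbcli_pre-1.2.4 new/sbcli_pre-1.2.5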
Files changed (141)
  1. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/PKG-INFO +20 -5
  2. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/README.md +19 -4
  3. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/env_var +1 -1
  4. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/PKG-INFO +20 -5
  5. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/SOURCES.txt +5 -5
  6. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_cli/cli.py +115 -113
  7. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/cluster_ops.py +238 -141
  8. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/constants.py +7 -5
  9. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/caching_node_controller.py +6 -8
  10. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/cluster_events.py +0 -9
  11. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/device_controller.py +63 -56
  12. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/events_controller.py +3 -5
  13. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/health_controller.py +40 -30
  14. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/lvol_controller.py +38 -51
  15. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/pool_controller.py +4 -8
  16. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/snapshot_controller.py +3 -9
  17. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/distr_controller.py +9 -13
  18. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/kv_store.py +29 -47
  19. sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +80 -0
  20. sbcli_pre-1.2.5/simplyblock_core/models/deployer.py +62 -0
  21. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/events.py +1 -9
  22. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/job_schedule.py +0 -6
  23. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/nvme_device.py +4 -42
  24. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/storage_node.py +1 -9
  25. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/rpc_client.py +10 -55
  26. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/__init__.py +4 -0
  27. sbcli_pre-1.2.4/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 → sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml +5 -54
  28. sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +22 -0
  29. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
  30. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/deploy_stack.sh +0 -2
  31. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +13 -22
  32. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm.yml +2 -17
  33. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/haproxy.cfg +0 -15
  34. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/install_deps.sh +0 -1
  35. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
  36. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/device_monitor.py +46 -5
  37. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/distr_event_collector.py +11 -10
  38. sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +136 -0
  39. sbcli_pre-1.2.4/simplyblock_core/services/tasks_runner_restart.py → sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py +46 -95
  40. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_monitor.py +1 -1
  41. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_stat_collector.py +1 -1
  42. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/port_stat_collector.py +1 -0
  43. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/storage_node_monitor.py +44 -49
  44. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/snode_client.py +0 -12
  45. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/storage_node_ops.py +336 -525
  46. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/utils.py +1 -46
  47. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/app.py +2 -1
  48. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/snode_ops.py +25 -103
  49. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_cluster.py +43 -20
  50. sbcli_pre-1.2.5/simplyblock_web/blueprints/web_api_deployer.py +394 -0
  51. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_device.py +7 -10
  52. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_lvol.py +5 -9
  53. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_pool.py +5 -14
  54. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_storage_node.py +10 -3
  55. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/node_utils.py +2 -0
  56. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/utils.py +0 -8
  57. sbcli_pre-1.2.4/simplyblock_core/controllers/tasks_controller.py +0 -103
  58. sbcli_pre-1.2.4/simplyblock_core/controllers/tasks_events.py +0 -37
  59. sbcli_pre-1.2.4/simplyblock_core/mgmt_node_ops.py +0 -205
  60. sbcli_pre-1.2.4/simplyblock_core/services/health_check_service.py +0 -134
  61. sbcli_pre-1.2.4/simplyblock_core/services/tasks_runner_migration.py +0 -61
  62. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/pyproject.toml +0 -0
  63. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  64. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/entry_points.txt +0 -0
  65. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/requires.txt +0 -0
  66. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/top_level.txt +0 -0
  67. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/setup.cfg +0 -0
  68. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/setup.py +0 -0
  69. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_cli/main.py +0 -0
  70. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/__init__.py +0 -0
  71. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/cnode_client.py +0 -0
  72. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/compute_node_ops.py +0 -0
  73. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/__init__.py +0 -0
  74. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/device_events.py +0 -0
  75. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/lvol_events.py +0 -0
  76. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/mgmt_events.py +0 -0
  77. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/pool_events.py +0 -0
  78. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/snapshot_events.py +0 -0
  79. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/controllers/storage_events.py +0 -0
  80. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/__init__.py +0 -0
  81. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/base_model.py +0 -0
  82. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/caching_node.py +0 -0
  83. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/cluster.py +0 -0
  84. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/compute_node.py +0 -0
  85. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/global_settings.py +0 -0
  86. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/iface.py +0 -0
  87. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/lvol_model.py +0 -0
  88. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/mgmt_node.py +0 -0
  89. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/pool.py +0 -0
  90. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/port_stat.py +0 -0
  91. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/snapshot.py +0 -0
  92. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/models/stats.py +0 -0
  93. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/pci_utils.py +0 -0
  94. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  95. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  96. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/config_docker.sh +0 -0
  97. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/devices.json +0 -0
  98. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
  99. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
  100. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
  101. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/pools.json +0 -0
  102. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/datasource.yml +0 -0
  103. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/db_config_double.sh +0 -0
  104. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/db_config_single.sh +0 -0
  105. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/prometheus.yml +0 -0
  106. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/run_ssh.sh +0 -0
  107. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/set_db_config.sh +0 -0
  108. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/scripts/stack_deploy_wait.sh +0 -0
  109. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/__init__.py +0 -0
  110. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/caching_node_monitor.py +0 -0
  111. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/cap_monitor.py +0 -0
  112. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/install_service.sh +0 -0
  113. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/log_agg_service.py +0 -0
  114. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  115. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/remove_service.sh +0 -0
  116. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/service_template.service +0 -0
  117. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/shell_utils.py +0 -0
  118. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/__init__.py +0 -0
  119. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/auth_middleware.py +0 -0
  120. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/__init__.py +0 -0
  121. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  122. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  123. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  124. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  125. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  126. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  127. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  128. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  129. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/caching_node_app.py +0 -0
  130. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/caching_node_app_k8s.py +0 -0
  131. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/node_webapp.py +0 -0
  132. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/snode_app.py +0 -0
  133. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/delete.py +0 -0
  134. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy.py +0 -0
  135. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  136. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  137. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/is_up.py +0 -0
  138. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/list_deps.py +0 -0
  139. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/rpac.yaml +0 -0
  140. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/static/tst.py +0 -0
  141. {sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0

sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py (new file)
@@ -0,0 +1,136 @@
+# coding=utf-8
+import logging
+
+import time
+import sys
+from datetime import datetime
+
+
+from simplyblock_core.controllers import health_controller, storage_events, device_events
+from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.rpc_client import RPCClient
+from simplyblock_core import constants, kv_store
+
+# Import the GELF logger
+from graypy import GELFUDPHandler
+
+def set_node_health_check(snode, health_check_status):
+    snode = db_controller.get_storage_node_by_id(snode.get_id())
+    if snode.health_check == health_check_status:
+        return
+    old_status = snode.health_check
+    snode.health_check = health_check_status
+    snode.updated_at = str(datetime.now())
+    snode.write_to_db(db_store)
+    storage_events.snode_health_check_change(snode, snode.health_check, old_status, caused_by="monitor")
+
+
+def set_device_health_check(cluster_id, device, health_check_status):
+    if device.health_check == health_check_status:
+        return
+    nodes = db_controller.get_storage_nodes()
+    for node in nodes:
+        if node.nvme_devices:
+            for dev in node.nvme_devices:
+                if dev.get_id() == device.get_id():
+                    old_status = dev.health_check
+                    dev.health_check = health_check_status
+                    node.write_to_db(db_store)
+                    device_events.device_health_check_change(
+                        dev, dev.health_check, old_status, caused_by="monitor")
+
+
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_store = kv_store.KVStore()
+db_controller = kv_store.DBController()
+
+logger.info("Starting health check service")
+while True:
+    cluster_id = ""
+    cl = db_controller.get_clusters()
+    if cl:
+        cluster_id = cl[0].get_id()
+
+    snodes = db_controller.get_storage_nodes()
+    if not snodes:
+        logger.error("storage nodes list is empty")
+
+    for snode in snodes:
+        logger.info("Node: %s, status %s", snode.get_id(), snode.status)
+
+        if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+            logger.info(f"Node status is: {snode.status}, skipping")
+            continue
+
+        # 1- check node ping
+        ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+        logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+        # 2- check node API
+        node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+        logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+        if snode.status == StorageNode.STATUS_OFFLINE:
+            set_node_health_check(snode, ping_check & node_api_check)
+            continue
+
+        # 3- check node RPC
+        node_rpc_check = health_controller._check_node_rpc(
+            snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+        logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+        # 4- docker API
+        node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+        logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+        is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+
+        health_check_status = is_node_online
+        if not node_rpc_check:
+            logger.info("Putting all devices to unavailable state because RPC check failed")
+            for dev in snode.nvme_devices:
+                if dev.io_error:
+                    logger.debug(f"Skipping Device action because of io_error {dev.get_id()}")
+                    continue
+                set_device_health_check(cluster_id, dev, False)
+        else:
+            logger.info(f"Node device count: {len(snode.nvme_devices)}")
+            node_devices_check = True
+            node_remote_devices_check = True
+
+            for dev in snode.nvme_devices:
+                if dev.io_error:
+                    logger.debug(f"Skipping Device check because of io_error {dev.get_id()}")
+                    continue
+                ret = health_controller.check_device(dev.get_id())
+                set_device_health_check(cluster_id, dev, ret)
+                if dev.status == dev.STATUS_ONLINE:
+                    node_devices_check &= ret
+
+            logger.info(f"Node remote device: {len(snode.remote_devices)}")
+            rpc_client = RPCClient(
+                snode.mgmt_ip, snode.rpc_port,
+                snode.rpc_username, snode.rpc_password,
+                timeout=5, retry=3)
+            for remote_device in snode.remote_devices:
+                ret = rpc_client.get_bdevs(remote_device.remote_bdev)
+                if ret:
+                    logger.info(f"Checking bdev: {remote_device.remote_bdev} ... ok")
+                else:
+                    logger.info(f"Checking bdev: {remote_device.remote_bdev} ... not found")
+                node_remote_devices_check &= bool(ret)
+
+            health_check_status = is_node_online and node_devices_check and node_remote_devices_check
+        set_node_health_check(snode, health_check_status)
+
+    time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC)
+
sbcli_pre-1.2.4/simplyblock_core/services/tasks_runner_restart.py → sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py
@@ -5,7 +5,7 @@ import sys


 from simplyblock_core import constants, kv_store, storage_node_ops
-from simplyblock_core.controllers import device_controller, tasks_events
+from simplyblock_core.controllers import device_controller
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.nvme_device import NVMeDevice

@@ -15,19 +15,6 @@ from graypy import GELFUDPHandler
 from simplyblock_core.models.storage_node import StorageNode


-# configure logging
-logger_handler = logging.StreamHandler(stream=sys.stdout)
-logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
-gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
-logger = logging.getLogger()
-logger.addHandler(gelf_handler)
-logger.addHandler(logger_handler)
-logger.setLevel(logging.DEBUG)
-
-# get DB controller
-db_controller = kv_store.DBController()
-
-
 def _get_node_unavailable_devices_count(node_id):
     node = db_controller.get_storage_node_by_id(node_id)
     devices = []
@@ -44,20 +31,10 @@ def _get_device(task):
     return dev


-def _validate_no_task_node_restart(cluster_id, node_id):
-    tasks = db_controller.get_job_tasks(cluster_id)
-    for task in tasks:
-        if task.function_name == JobSchedule.FN_NODE_RESTART and task.node_id == node_id:
-            if task.status != JobSchedule.STATUS_DONE:
-                logger.info(f"Task found, skip adding new task: {task.get_id()}")
-                return False
-    return True
-
-
 def task_runner(task):
-    if task.function_name == JobSchedule.FN_DEV_RESTART:
+    if task.function_name == "device_restart":
         return task_runner_device(task)
-    if task.function_name == JobSchedule.FN_NODE_RESTART:
+    if task.function_name == "node_restart":
         return task_runner_node(task)


@@ -72,55 +49,38 @@ def task_runner_device(task):
         device_controller.device_set_retries_exhausted(device.get_id(), True)
         return True

-    if not _validate_no_task_node_restart(task.cluster_id, task.node_id):
-        task.function_result = "canceled: node restart found"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        device_controller.device_set_unavailable(device.get_id())
-        return True
-
-    if task.canceled:
-        task.function_result = "canceled"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        return True
-
     node = db_controller.get_storage_node_by_id(task.node_id)
     if node.status != StorageNode.STATUS_ONLINE:
-        logger.error(f"Node is not online: {node.get_id()}, retry")
+        logger.error(f"Node is not online: {node.get_id()} , skipping task: {task.get_id()}")
         task.function_result = "Node is offline"
         task.retry += 1
         task.write_to_db(db_controller.kv_store)
         return False

     if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
-        logger.info(f"Device is online: {device.get_id()}")
-        task.function_result = "Device is online"
+        logger.info(f"Device is online: {device.get_id()}, no restart needed")
+        task.function_result = "skipped because dev is online"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True

-    if device.status in [NVMeDevice.STATUS_REMOVED, NVMeDevice.STATUS_FAILED]:
-        logger.info(f"Device is not unavailable: {device.get_id()}, {device.status} , stopping task")
-        task.function_result = f"stopped because dev is {device.status}"
+    task.status = JobSchedule.STATUS_RUNNING
+    task.write_to_db(db_controller.kv_store)
+
+    # resetting device
+    logger.info(f"Resetting device {device.get_id()}")
+    device_controller.reset_storage_device(device.get_id())
+    time.sleep(5)
+    device = _get_device(task)
+    if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
+        logger.info(f"Device is online: {device.get_id()}")
+        task.function_result = "done"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True

-    if task.status != JobSchedule.STATUS_RUNNING:
-        task.status = JobSchedule.STATUS_RUNNING
-        task.write_to_db(db_controller.kv_store)
-        tasks_events.task_updated(task)
-
-    # set device online for the first 3 retries
-    if task.retry < 3:
-        logger.info(f"Set device online {device.get_id()}")
-        device_controller.device_set_online(device.get_id())
-    else:
-        logger.info(f"Restarting device {device.get_id()}")
-        device_controller.restart_device(device.get_id(), force=True)
-
-    # check device status
+    logger.info(f"Restarting device {device.get_id()}")
+    device_controller.restart_device(device.get_id(), force=True)
     time.sleep(5)
     device = _get_device(task)
     if device.status == NVMeDevice.STATUS_ONLINE and device.io_error is False:
@@ -144,37 +104,22 @@ def task_runner_node(task):
         storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_UNREACHABLE)
         return True

-    if node.status == StorageNode.STATUS_REMOVED:
-        logger.info(f"Node is removed: {task.node_id}, stopping task")
-        task.function_result = f"Node is removed"
+    if _get_node_unavailable_devices_count(node.get_id()) == 0:
+        logger.info(f"Node is online: {node.get_id()}, no restart needed")
+        task.function_result = "skipped because node is online"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
         return True

-    if _get_node_unavailable_devices_count(node.get_id()) == 0 and node.status == StorageNode.STATUS_ONLINE:
-        logger.info(f"Node is online: {node.get_id()}")
-        task.function_result = "Node is online"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        return True
-
-    if task.canceled:
-        task.function_result = "canceled"
-        task.status = JobSchedule.STATUS_DONE
-        task.write_to_db(db_controller.kv_store)
-        return True
-
-    if task.status != JobSchedule.STATUS_RUNNING:
-        task.status = JobSchedule.STATUS_RUNNING
-        task.write_to_db(db_controller.kv_store)
-        tasks_events.task_updated(task)
+    task.status = JobSchedule.STATUS_RUNNING
+    task.write_to_db(db_controller.kv_store)

     # shutting down node
     logger.info(f"Shutdown node {node.get_id()}")
     ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True)
     if ret:
         logger.info(f"Node shutdown succeeded")
-    time.sleep(3)
+    time.sleep(5)

     # resetting node
     logger.info(f"Restart node {node.get_id()}")
@@ -182,9 +127,8 @@ def task_runner_node(task):
     if ret:
         logger.info(f"Node restart succeeded")

-    time.sleep(5)
-    if _get_node_unavailable_devices_count(node.get_id()) == 0 and node.status == StorageNode.STATUS_ONLINE:
-        logger.info(f"Node is online: {node.get_id()}")
+    if _get_node_unavailable_devices_count(node.get_id()) == 0:
+        logger.info(f"Node is online: {node.get_id()}, no restart needed")
         task.function_result = "done"
         task.status = JobSchedule.STATUS_DONE
         task.write_to_db(db_controller.kv_store)
@@ -195,7 +139,19 @@ def task_runner_node(task):
     return False


-logger.info("Starting Tasks runner...")
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_controller = kv_store.DBController()
+
+logger.info("Starting Jobs runner...")
 while True:
     time.sleep(3)
     clusters = db_controller.get_clusters()
@@ -203,16 +159,11 @@ while True:
         logger.error("No clusters found!")
     else:
         for cl in clusters:
-            tasks = db_controller.get_job_tasks(cl.get_id(), reverse=False)
+            tasks = db_controller.get_job_tasks(cl.get_id())
             for task in tasks:
                 delay_seconds = constants.TASK_EXEC_INTERVAL_SEC
-                if task.function_name in [JobSchedule.FN_DEV_RESTART, JobSchedule.FN_NODE_RESTART]:
-                    while task.status != JobSchedule.STATUS_DONE:
-                        # get new task object because it could be changed from cancel task
-                        task = db_controller.get_task_by_id(task.get_id())
-                        res = task_runner(task)
-                        if res:
-                            tasks_events.task_updated(task)
-                        else:
-                            time.sleep(delay_seconds)
-                            delay_seconds *= 2
+                while task.status != JobSchedule.STATUS_DONE:
+                    res = task_runner(task)
+                    if res is False:
+                        time.sleep(delay_seconds)
+                        delay_seconds *= 2

{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_monitor.py
@@ -49,7 +49,7 @@ db_controller = kv_store.DBController()

 logger.info("Starting LVol monitor...")
 while True:
-    lvols = db_controller.get_lvols()  # pass
+    lvols = db_controller.get_lvols()
     if not lvols:
         logger.error("LVols list is empty")

{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_stat_collector.py
@@ -115,7 +115,7 @@ logger.info("Starting stats collector...")
 while True:

     pools = db_controller.get_pools()
-    all_lvols = db_controller.get_lvols()  # pass
+    all_lvols = db_controller.get_lvols()
     for pool in pools:
         lvols = []
         for lvol in all_lvols:

{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/port_stat_collector.py
@@ -74,3 +74,4 @@ while True:
                 update_port_stats(snode, nic, stats)
             else:
                 logger.error("Error getting port stats: %s", nic.get_id())
+
{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/services/storage_node_monitor.py
@@ -8,7 +8,7 @@ from datetime import datetime


 from simplyblock_core import constants, kv_store, cluster_ops, storage_node_ops, distr_controller
-from simplyblock_core.controllers import health_controller, device_controller, tasks_controller
+from simplyblock_core.controllers import storage_events, health_controller, device_controller
 from simplyblock_core.models.cluster import Cluster
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
@@ -31,8 +31,8 @@ db_store = kv_store.KVStore()
 db_controller = kv_store.DBController(kv_store=db_store)


-def get_cluster_target_status(cluster_id):
-    snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+def get_cluster_target_status(cluster):
+    snodes = db_controller.get_storage_nodes()

     online_nodes = 0
     offline_nodes = 0
@@ -66,8 +66,8 @@ def get_cluster_target_status(cluster_id):
     logger.debug(f"online_devices: {online_devices}")
     logger.debug(f"offline_devices: {offline_devices}")

-    # if more than two affected nodes then cluster is suspended
-    if affected_nodes > 2 or offline_nodes > 2:
+    # if more than two affected modes then cluster is suspended
+    if affected_nodes > 2:
         return Cluster.STATUS_SUSPENDED

     # if any device goes offline then cluster is degraded
@@ -85,7 +85,7 @@ def update_cluster_status(cluster_id):
     cluster = db_controller.get_cluster_by_id(cluster_id)

     if cluster.ha_type == "ha":
-        cluster_target_status = get_cluster_target_status(cluster_id)
+        cluster_target_status = get_cluster_target_status(cluster)
         logger.info(f"Target cluster status {cluster_target_status}, current status: {cluster.status}")
         if cluster.status == cluster_target_status:
             return
@@ -111,53 +111,48 @@ def set_node_online(node):
 def set_node_offline(node):
     if node.status != StorageNode.STATUS_UNREACHABLE:
         storage_node_ops.set_node_status(snode.get_id(), StorageNode.STATUS_UNREACHABLE)
-        # add node to auto restart
-        tasks_controller.add_node_to_auto_restart(node)


 logger.info("Starting node monitor")
 while True:
-    clusters = db_controller.get_clusters()
-    for cluster in clusters:
-        cluster_id = cluster.get_id()
-        # get storage nodes
-        nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
-        for snode in nodes:
-            if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
-                logger.info(f"Node status is: {snode.status}, skipping")
-                continue
-
-            logger.info(f"Checking node {snode.hostname}")
-
-            # 1- check node ping
-            ping_check = health_controller._check_node_ping(snode.mgmt_ip)
-            logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
-
-            # 2- check node API
-            node_api_check = health_controller._check_node_api(snode.mgmt_ip)
-            logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
-
-            # 3- check node RPC
-            node_rpc_check = health_controller._check_node_rpc(
-                snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
-            logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
-
-            # 4- docker API
-            node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
-            logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
-
-            is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
-            if is_node_online:
-                set_node_online(snode)
-            else:
-                set_node_offline(snode)
-
-            if not ping_check and not node_rpc_check:
-                # node is dead, set devices offline
-                for dev in snode.nvme_devices:
-                    device_controller.device_set_unavailable(dev.get_id())
-
-            update_cluster_status(cluster_id)
+    # get storage nodes
+    nodes = db_controller.get_storage_nodes()
+    for snode in nodes:
+        if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+            logger.info(f"Node status is: {snode.status}, skipping")
+            continue
+
+        logger.info(f"Checking node {snode.hostname}")
+
+        # 1- check node ping
+        ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+        logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+        # 2- check node API
+        node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+        logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+        # 3- check node RPC
+        node_rpc_check = health_controller._check_node_rpc(
+            snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+        logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+        # 4- docker API
+        node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+        logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+        is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+        if is_node_online:
+            set_node_online(snode)
+        else:
+            set_node_offline(snode)
+
+        if not ping_check and not node_rpc_check:
+            # node is dead, set devices offline
+            for dev in snode.nvme_devices:
+                device_controller.device_set_unavailable(dev.get_id())
+
+        update_cluster_status(snode.cluster_id)

     logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} seconds")
     time.sleep(constants.NODE_MONITOR_INTERVAL_SEC)

{sbcli_pre-1.2.4 → sbcli_pre-1.2.5}/simplyblock_core/snode_client.py
@@ -95,15 +95,3 @@ class SNodeClient:

     def leave_swarm(self):
         return self._request("GET", "leave_swarm")
-
-    def make_gpt_partitions(self, nbd_device, jm_percent, num_partitions):
-        params = {
-            "nbd_device": nbd_device,
-            "jm_percent": jm_percent,
-            "num_partitions": num_partitions,
-        }
-        return self._request("POST", "make_gpt_partitions", params)
-
-    def delete_dev_gpt_partitions(self, device_pci):
-        params = {"device_pci": device_pci}
-        return self._request("POST", "delete_dev_gpt_partitions", params)