sbcli-pre 1.3.5__zip → 1.3.7__zip

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/PKG-INFO +1 -1
  2. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/env_var +2 -1
  3. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
  4. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/sbcli_pre.egg-info/SOURCES.txt +2 -0
  5. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/cluster_ops.py +0 -26
  6. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/device_controller.py +17 -0
  7. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/health_controller.py +1 -1
  8. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/tasks_controller.py +21 -12
  9. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboard.yml +12 -0
  10. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboards/cluster.json +2355 -0
  11. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboards/devices.json +2436 -0
  12. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboards/lvols.json +2441 -0
  13. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboards/node-exporter.json +23743 -0
  14. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboards/nodes.json +2434 -0
  15. sbcli_pre-1.3.7/simplyblock_core/scripts/dashboards/pools.json +2399 -0
  16. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/datasource.yml +3 -2
  17. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/deploy_stack.sh +0 -7
  18. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +79 -8
  19. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/install_deps.sh +0 -2
  20. sbcli_pre-1.3.7/simplyblock_core/scripts/objstore.yml +3 -0
  21. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/distr_event_collector.py +2 -2
  22. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/health_check_service.py +1 -1
  23. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/tasks_runner_restart.py +14 -1
  24. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/storage_node_ops.py +29 -11
  25. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/snode_ops.py +2 -2
  26. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_deployer.py +3 -3
  27. sbcli_pre-1.3.5/simplyblock_core/scripts/dashboards/cluster.json +0 -2357
  28. sbcli_pre-1.3.5/simplyblock_core/scripts/dashboards/devices.json +0 -2438
  29. sbcli_pre-1.3.5/simplyblock_core/scripts/dashboards/lvols.json +0 -2443
  30. sbcli_pre-1.3.5/simplyblock_core/scripts/dashboards/node-exporter.json +0 -23745
  31. sbcli_pre-1.3.5/simplyblock_core/scripts/dashboards/nodes.json +0 -2436
  32. sbcli_pre-1.3.5/simplyblock_core/scripts/dashboards/pools.json +0 -2401
  33. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/README.md +0 -0
  34. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/pyproject.toml +0 -0
  35. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  36. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
  37. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/sbcli_pre.egg-info/requires.txt +0 -0
  38. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/sbcli_pre.egg-info/top_level.txt +0 -0
  39. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/setup.cfg +0 -0
  40. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/setup.py +0 -0
  41. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_cli/cli.py +0 -0
  42. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_cli/main.py +0 -0
  43. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/__init__.py +0 -0
  44. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/cnode_client.py +0 -0
  45. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/compute_node_ops.py +0 -0
  46. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/constants.py +0 -0
  47. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/__init__.py +0 -0
  48. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/caching_node_controller.py +0 -0
  49. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/cluster_events.py +0 -0
  50. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/device_events.py +0 -0
  51. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/events_controller.py +0 -0
  52. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/lvol_controller.py +0 -0
  53. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/lvol_events.py +0 -0
  54. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
  55. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/pool_controller.py +0 -0
  56. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/pool_events.py +0 -0
  57. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/snapshot_controller.py +0 -0
  58. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
  59. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/storage_events.py +0 -0
  60. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/controllers/tasks_events.py +0 -0
  61. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/distr_controller.py +0 -0
  62. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/kv_store.py +0 -0
  63. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/mgmt_node_ops.py +0 -0
  64. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/__init__.py +0 -0
  65. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/base_model.py +0 -0
  66. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/caching_node.py +0 -0
  67. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/cluster.py +0 -0
  68. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/compute_node.py +0 -0
  69. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/deployer.py +0 -0
  70. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/events.py +0 -0
  71. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/global_settings.py +0 -0
  72. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/iface.py +0 -0
  73. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/job_schedule.py +0 -0
  74. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/lvol_model.py +0 -0
  75. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/mgmt_node.py +0 -0
  76. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/nvme_device.py +0 -0
  77. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/pool.py +0 -0
  78. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/port_stat.py +0 -0
  79. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/snapshot.py +0 -0
  80. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/stats.py +0 -0
  81. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/models/storage_node.py +0 -0
  82. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/pci_utils.py +0 -0
  83. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/rpc_client.py +0 -0
  84. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/__init__.py +0 -0
  85. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +0 -0
  86. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  87. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  88. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/config_docker.sh +0 -0
  89. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
  90. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
  91. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/docker-compose-swarm.yml +0 -0
  92. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/haproxy.cfg +0 -0
  93. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/prometheus.yml +0 -0
  94. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
  95. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
  96. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/scripts/stack_deploy_wait.sh +0 -0
  97. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/__init__.py +0 -0
  98. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
  99. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/cap_monitor.py +0 -0
  100. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/capacity_and_stats_collector.py +0 -0
  101. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/device_monitor.py +0 -0
  102. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/install_service.sh +0 -0
  103. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/log_agg_service.py +0 -0
  104. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/lvol_monitor.py +0 -0
  105. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/lvol_stat_collector.py +0 -0
  106. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  107. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/port_stat_collector.py +0 -0
  108. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/remove_service.sh +0 -0
  109. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/service_template.service +0 -0
  110. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/storage_node_monitor.py +0 -0
  111. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/services/tasks_runner_migration.py +0 -0
  112. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/shell_utils.py +0 -0
  113. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/snode_client.py +0 -0
  114. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_core/utils.py +0 -0
  115. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/__init__.py +0 -0
  116. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/app.py +0 -0
  117. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/auth_middleware.py +0 -0
  118. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/__init__.py +0 -0
  119. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  120. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  121. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  122. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  123. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  124. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  125. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_cluster.py +0 -0
  126. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_device.py +0 -0
  127. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_lvol.py +0 -0
  128. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  129. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_pool.py +0 -0
  130. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  131. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/blueprints/web_api_storage_node.py +0 -0
  132. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/caching_node_app.py +0 -0
  133. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
  134. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/node_utils.py +0 -0
  135. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/node_webapp.py +0 -0
  136. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/snode_app.py +0 -0
  137. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/delete.py +0 -0
  138. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/deploy.py +0 -0
  139. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  140. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  141. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/is_up.py +0 -0
  142. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/list_deps.py +0 -0
  143. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/rpac.yaml +0 -0
  144. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/static/tst.py +0 -0
  145. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
  146. {sbcli_pre-1.3.5 → sbcli_pre-1.3.7}/simplyblock_web/utils.py +0 -0
@@ -1,10 +1,11 @@
1
1
  apiVersion: 1
2
2
  datasources:
3
- - name: Prometheus
3
+ - name: Thanos
4
4
  type: prometheus
5
- url: http://prometheus:9090
5
+ url: http://thanos-query:9091
6
6
  isDefault: true
7
7
  access: proxy
8
+ uid: PBFA97CFB590B2093
8
9
  editable: true
9
10
  - name: GRAYLOG
10
11
  type: elasticsearch
@@ -21,13 +21,6 @@ fi
21
21
 
22
22
  docker network create monitoring-net -d overlay --attachable
23
23
 
24
- INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
25
-
26
- #if [ -n "$INSTANCE_ID" ]
27
- #then
28
- # export USE_EFS="rexray/efs"
29
- #fi
30
-
31
24
  docker stack deploy --compose-file="$DIR"/docker-compose-swarm-monitoring.yml monitoring
32
25
 
33
26
  # wait for the services to become online
@@ -68,6 +68,7 @@ services:
68
68
  ClusterIP: "HAProxy"
69
69
  ClusterSecret: "${CLUSTER_SECRET}"
70
70
  deploy:
71
+ mode: global
71
72
  placement:
72
73
  constraints: [node.role == manager]
73
74
  networks:
@@ -76,6 +77,7 @@ services:
76
77
  pushgateway:
77
78
  image: prom/pushgateway
78
79
  deploy:
80
+ mode: global
79
81
  placement:
80
82
  constraints: [node.role == manager]
81
83
  networks:
@@ -90,9 +92,82 @@ services:
90
92
  command:
91
93
  - "--config.file=/etc/prometheus/prometheus.yml"
92
94
  - "--storage.tsdb.path=/prometheus"
95
+ - "--storage.tsdb.no-lockfile"
93
96
  - "--storage.tsdb.retention.time=${RETENTION_PERIOD}"
97
+ - "--storage.tsdb.min-block-duration=2h"
98
+ - "--storage.tsdb.max-block-duration=2h"
94
99
  restart: "always"
95
100
  deploy:
101
+ mode: global
102
+ placement:
103
+ constraints: [node.role == manager]
104
+ networks:
105
+ - monitoring-net
106
+
107
+ thanos-sidecar:
108
+ image: thanosio/thanos:v0.31.0
109
+ user: root
110
+ command:
111
+ - sidecar
112
+ - --tsdb.path=/prometheus
113
+ - --prometheus.url=http://prometheus:9090
114
+ - --objstore.config-file=/etc/thanos/objstore.yml
115
+ volumes:
116
+ - prometheus_data:/prometheus
117
+ - ./objstore.yml:/etc/thanos/objstore.yml
118
+ deploy:
119
+ mode: global
120
+ placement:
121
+ constraints: [node.role == manager]
122
+ networks:
123
+ - monitoring-net
124
+
125
+ thanos-store:
126
+ image: thanosio/thanos:v0.31.0
127
+ command:
128
+ - store
129
+ - --objstore.config-file=/etc/thanos/objstore.yml
130
+ - --index-cache-size=500MB
131
+ - --chunk-pool-size=500MB
132
+ volumes:
133
+ - ./objstore.yml:/etc/thanos/objstore.yml
134
+ deploy:
135
+ mode: global
136
+ placement:
137
+ constraints: [node.role == manager]
138
+ networks:
139
+ - monitoring-net
140
+
141
+ thanos-query:
142
+ image: thanosio/thanos:v0.31.0
143
+ command:
144
+ - query
145
+ - --http-address=0.0.0.0:9091
146
+ - --store=thanos-store:10901
147
+ - --store=thanos-sidecar:10901
148
+ deploy:
149
+ mode: global
150
+ placement:
151
+ constraints: [node.role == manager]
152
+ networks:
153
+ - monitoring-net
154
+
155
+ thanos-compactor:
156
+ image: thanosio/thanos:v0.31.0
157
+ command:
158
+ - compact
159
+ - --data-dir=/data
160
+ - --objstore.config-file=/etc/thanos/objstore.yml
161
+ - --retention.resolution-raw=30d
162
+ - --retention.resolution-5m=60d
163
+ - --retention.resolution-1h=90d
164
+ - --compact.concurrency=1
165
+ - --wait
166
+ volumes:
167
+ - ./objstore.yml:/etc/thanos/objstore.yml
168
+ - thanos_compactor_data:/data
169
+ deploy:
170
+ mode: global
96
171
  placement:
97
172
  constraints: [node.role == manager]
98
173
  networks:
@@ -133,6 +208,8 @@ services:
133
208
  - ./datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml
134
209
  - grafana_data:/var/lib/grafana
135
210
  - ./alerting:/etc/grafana/provisioning/alerting
211
+ - ./dashboard.yml:/etc/grafana/provisioning/dashboards/main.yaml
212
+ - ./dashboards:/var/lib/grafana/dashboards
136
213
  deploy:
137
214
  placement:
138
215
  constraints: [node.role == manager]
@@ -152,19 +229,13 @@ services:
152
229
 
153
230
  volumes:
154
231
  mongodb_data:
155
- driver: ${USE_EFS:-local}
156
232
  os_data:
157
- driver: ${USE_EFS:-local}
158
233
  graylog_data:
159
- driver: ${USE_EFS:-local}
160
234
  graylog_journal:
161
- driver: ${USE_EFS:-local}
162
235
  grafana_data:
163
- driver: ${USE_EFS:-local}
164
- graylog_config:
165
- driver: ${USE_EFS:-local}
236
+ graylog_config:
166
237
  prometheus_data:
167
- driver: ${USE_EFS:-local}
238
+ thanos_compactor_data:
168
239
 
169
240
  networks:
170
241
  monitoring-net:
@@ -15,8 +15,6 @@ sudo yum install hostname pkg-config git wget python3-pip yum-utils docker-ce do
15
15
  sudo systemctl enable docker
16
16
  sudo systemctl start docker
17
17
 
18
- #sudo docker plugin install rexray/efs --grant-all-permissions EFS_TAG=$INSTANCE_ID REXRAY_PREEMPT=true
19
-
20
18
  wget https://github.com/apple/foundationdb/releases/download/7.3.3/foundationdb-clients-7.3.3-1.el7.x86_64.rpm -q
21
19
  sudo rpm -U foundationdb-clients-7.3.3-1.el7.x86_64.rpm --quiet --reinstall
22
20
  rm -f foundationdb-clients-7.3.3-1.el7.x86_64.rpm
@@ -0,0 +1,3 @@
1
+ type: FILESYSTEM
2
+ config:
3
+ directory: /mnt/thanos
@@ -160,8 +160,8 @@ while True:
160
160
  logger.info(f"Processing event: {eid}")
161
161
  process_event(eid)
162
162
 
163
- logger.info(f"Discarding events: {len(events)}")
164
- client.distr_status_events_discard_then_get(len(events), 0)
163
+ logger.info(f"Discarding events: {len(events)}")
164
+ client.distr_status_events_discard_then_get(len(events), 0)
165
165
 
166
166
  except Exception as e:
167
167
  logger.error("Failed to process distr events")
@@ -125,7 +125,7 @@ while True:
125
125
  logger.info(f"Checking bdev: {remote_device.remote_bdev} ... ok")
126
126
  else:
127
127
  logger.info(f"Checking bdev: {remote_device.remote_bdev} ... not found")
128
- node_remote_devices_check &= bool(ret)
128
+ # node_remote_devices_check &= bool(ret)
129
129
 
130
130
  health_check_status = is_node_online and node_devices_check and node_remote_devices_check
131
131
  set_node_health_check(snode, health_check_status)
@@ -5,7 +5,7 @@ import sys
5
5
 
6
6
 
7
7
  from simplyblock_core import constants, kv_store, storage_node_ops
8
- from simplyblock_core.controllers import device_controller, tasks_events
8
+ from simplyblock_core.controllers import device_controller, tasks_events, health_controller
9
9
  from simplyblock_core.models.job_schedule import JobSchedule
10
10
  from simplyblock_core.models.nvme_device import NVMeDevice
11
11
 
@@ -169,6 +169,19 @@ def task_runner_node(task):
169
169
  task.write_to_db(db_controller.kv_store)
170
170
  tasks_events.task_updated(task)
171
171
 
172
+ # is node reachable?
173
+ ping_check = health_controller._check_node_ping(node.mgmt_ip)
174
+ logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... {ping_check}")
175
+ node_api_check = health_controller._check_node_api(node.mgmt_ip)
176
+ logger.info(f"Check: node API {node.mgmt_ip}:5000 ... {node_api_check}")
177
+ if not ping_check or not node_api_check:
178
+ # node is unreachable, retry
179
+ logger.info(f"Node is not reachable: {task.node_id}, retry")
180
+ task.function_result = f"Node is unreachable, retry"
181
+ task.retry += 1
182
+ task.write_to_db(db_controller.kv_store)
183
+ return False
184
+
172
185
  # shutting down node
173
186
  logger.info(f"Shutdown node {node.get_id()}")
174
187
  ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True)
@@ -973,6 +973,12 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False):
973
973
  logger.error(f"Can not remove online node: {node_id}")
974
974
  return False
975
975
 
976
+ task_id = tasks_controller.get_active_node_restart_task(snode.cluster_id, snode.get_id())
977
+ if task_id:
978
+ logger.error(f"Restart task found: {task_id}, can not remove storage node")
979
+ if force_remove is False:
980
+ return False
981
+
976
982
  if snode.lvols:
977
983
  if force_migrate:
978
984
  for lvol_id in snode.lvols:
@@ -1064,6 +1070,11 @@ def restart_storage_node(
1064
1070
  logger.error(f"Can not restart online node: {node_id}")
1065
1071
  return False
1066
1072
 
1073
+ task_id = tasks_controller.get_active_node_restart_task(snode.cluster_id, snode.get_id())
1074
+ if task_id:
1075
+ logger.error(f"Restart task found: {task_id}, can not restart storage node")
1076
+ return False
1077
+
1067
1078
  logger.info("Setting node state to restarting")
1068
1079
  old_status = snode.status
1069
1080
  snode.status = StorageNode.STATUS_RESTARTING
@@ -1469,16 +1480,11 @@ def shutdown_storage_node(node_id, force=False):
1469
1480
  if force is False:
1470
1481
  return False
1471
1482
 
1472
- # cls = db_controller.get_clusters(id=snode.cluster_id)
1473
- # snodes = db_controller.get_storage_nodes()
1474
- # online_nodes = 0
1475
- # for node in snodes:
1476
- # if node.status == node.STATUS_ONLINE:
1477
- # online_nodes += 1
1478
- # if cls[0].ha_type == "ha" and online_nodes <= 3:
1479
- # logger.warning(f"Cluster mode is HA but online storage nodes are less than 3")
1480
- # if force is False:
1481
- # return False
1483
+ task_id = tasks_controller.get_active_node_restart_task(snode.cluster_id, snode.get_id())
1484
+ if task_id:
1485
+ logger.error(f"Restart task found: {task_id}, can not shutdown storage node")
1486
+ if force is False:
1487
+ return False
1482
1488
 
1483
1489
  logger.info("Shutting down node")
1484
1490
  old_status = snode.status
@@ -1538,7 +1544,14 @@ def suspend_storage_node(node_id, force=False):
1538
1544
  logger.info("Node found: %s in state: %s", snode.hostname, snode.status)
1539
1545
  if snode.status != StorageNode.STATUS_ONLINE:
1540
1546
  logger.error("Node is not in online state")
1541
- return False
1547
+ if force is False:
1548
+ return False
1549
+
1550
+ task_id = tasks_controller.get_active_node_restart_task(snode.cluster_id, snode.get_id())
1551
+ if task_id:
1552
+ logger.error(f"Restart task found: {task_id}, can not suspend storage node")
1553
+ if force is False:
1554
+ return False
1542
1555
 
1543
1556
  cluster = db_controller.get_cluster_by_id(snode.cluster_id)
1544
1557
  snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
@@ -1599,6 +1612,11 @@ def resume_storage_node(node_id):
1599
1612
  logger.error("Node is not in suspended state")
1600
1613
  return False
1601
1614
 
1615
+ task_id = tasks_controller.get_active_node_restart_task(snode.cluster_id, snode.get_id())
1616
+ if task_id:
1617
+ logger.error(f"Restart task found: {task_id}, can not resume storage node")
1618
+ return False
1619
+
1602
1620
  logger.info("Resuming node")
1603
1621
 
1604
1622
  logger.info("Sending cluster event updates")
@@ -100,9 +100,9 @@ def spdk_process_start():
100
100
  node.remove(force=True)
101
101
  time.sleep(2)
102
102
 
103
- spdk_debug = 0
103
+ spdk_debug = ""
104
104
  if set_debug:
105
- spdk_debug = 1
105
+ spdk_debug = "1"
106
106
 
107
107
  spdk_image = constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE
108
108
  if 'spdk_image' in data and data['spdk_image']:
@@ -30,8 +30,8 @@ ssm = boto3.client('ssm', region_name=region)
30
30
  s3 = boto3.client('s3', region_name=region)
31
31
 
32
32
 
33
- def get_instance_tf_engine_instance_id():
34
- tag_value = 'tfengine'
33
+ def get_instance_tf_engine_instance_id(workspace: str):
34
+ tag_value = f'{workspace}-tfengine'
35
35
  tag_key = 'Name'
36
36
 
37
37
  ec2 = boto3.client('ec2', region_name=region)
@@ -145,7 +145,7 @@ def update_cluster(d, kv_store, storage_nodes, availability_zone):
145
145
  d.status = "in_progress"
146
146
  d.write_to_db(kv_store)
147
147
 
148
- instance_ids = get_instance_tf_engine_instance_id()
148
+ instance_ids = get_instance_tf_engine_instance_id(d.tf_workspace)
149
149
  if len(instance_ids) == 0:
150
150
  # wait for a min and try again before returning error on the API
151
151
  print('no instance IDs')