sbcli-pre 1.2.5 (zip) → 1.2.7 (zip)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
  2. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
  3. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
  4. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
  5. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
  6. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
  7. sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
  8. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
  9. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
  10. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
  11. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
  12. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
  13. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
  14. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
  15. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
  16. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
  17. sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
  18. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
  19. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
  20. sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
  21. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
  22. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
  23. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
  24. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
  25. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
  26. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
  27. sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
  28. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
  29. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
  30. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
  31. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
  32. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
  33. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
  34. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
  35. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
  36. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
  37. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
  38. sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
  39. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
  40. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
  41. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
  42. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
  43. sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
  44. sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
  45. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
  46. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
  47. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
  48. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
  49. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
  50. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
  51. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
  52. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
  53. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
  54. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
  55. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
  56. sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
  57. sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
  58. sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
  59. sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
  60. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
  61. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
  62. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
  63. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
  64. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
  65. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
  66. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
  67. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
  68. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
  69. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
  70. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
  71. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
  72. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
  73. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
  74. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
  75. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
  76. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
  77. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
  78. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
  79. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
  80. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
  81. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
  82. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
  83. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
  84. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
  85. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
  86. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
  87. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
  88. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
  89. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
  90. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
  91. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
  92. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
  93. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
  94. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
  95. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
  96. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
  97. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
  98. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
  99. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
  100. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
  101. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
  102. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
  103. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
  104. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
  105. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
  106. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
  107. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
  108. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
  109. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
  110. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
  111. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
  112. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
  113. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
  114. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
  115. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
  116. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
  117. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
  118. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
  119. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
  120. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
  121. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
  122. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
  123. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
  124. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
  125. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
  126. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
  127. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
  128. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
  129. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
  130. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
  131. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
  132. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
  133. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
  134. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
  135. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
  136. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
  137. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
  138. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
  139. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
  140. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
  141. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
  142. {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2
@@ -12,15 +12,26 @@ contactPoints:
  name: grafana-alerts
  receivers:
  - uid: grafana
- type: slack
+ type: {{ ALERT_TYPE }}
+ {% if ALERT_TYPE == 'slack' %}
  settings:
  username: grafana_bot
- url: 'https://hooks.slack.com/services/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'
+ url: '{{ CONTACT_POINT }}'
  title: |
- {{ template "slack.title" . }}
+ {{ '{{' }} template "slack.title" . {{ '}}' }}
  text: |
- {{ template "slack.message" . }}
+ {{ '{{' }} template "slack.message" . {{ '}}' }}
+ {% else %}
+ settings:
+ addresses: '{{ CONTACT_POINT }}'
+ subject: |
+ {{ '{{' }} template "email.subject" . {{ '}}' }}
+ body: |
+ {{ '{{' }} template "email.body" . {{ '}}' }}
+ {% endif %}

+ {% if ALERT_TYPE == 'slack' %}
+ {% raw %}
  templates:
  - orgId: 1
  name: slack.title
@@ -38,7 +49,9 @@ templates:
  *Description*: {{ .Annotations.description }}
  {{ end -}}
  *Log message*: {{ index .Labels "message" }}
- *Explore logs:* https://grafanaURL.com/explore?orgId=1
+ {% endraw %}
+ *Explore logs:* {{ GRAFANA_ENDPOINT }}
+ {% raw %}
  {{ if .DashboardURL -}}
  *Go to dashboard:* {{ .DashboardURL }}
  {{- end }}
@@ -65,3 +78,39 @@ templates:
  {{ end }}

  {{- end }}
+ {% endraw %}
+ {% else %}
+ {% raw %}
+ - orgId: 1
+ name: email.subject
+ template: |-
+ {{ define "email.subject" -}}
+ [{{ .Status | toUpper }}] Grafana Alert
+ {{- end -}}
+ - orgId: 1
+ name: email.body
+ template: |-
+ {{ define "email.body" -}}
+ Alert: {{ .Labels.alertname }}
+ {{ if .Annotations -}}
+ Summary: {{ .Annotations.summary}}
+ Description: {{ .Annotations.description }}
+ {{ end -}}
+ Log message: {{ index .Labels "message" }}
+ Explore logs: {{ GRAFANA_ENDPOINT }}
+ {{ if .DashboardURL -}}
+ Go to dashboard: {{ .DashboardURL }}
+ {{- end }}
+ {{ if .PanelURL -}}
+ Go to panel: {{ .PanelURL }}
+ {{- end }}
+ Details:
+ {{ range .Labels.SortedPairs -}}
+ - {{ .Name }}: `{{ .Value }}`
+ {{ end -}}
+ {{ if .SilenceURL -}}
+ Silence this alert: {{ .SilenceURL }}
+ {{- end }}
+ {{- end }}
+ {% endraw %}
+ {% endif %}
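Note: the alerting resource file is now shipped as a Jinja2 template (alert_resources.yaml.j2) driven by ALERT_TYPE, CONTACT_POINT and GRAFANA_ENDPOINT; Grafana's own Go-template braces are kept literal via the {{ '{{' }} ... {{ '}}' }} escapes and {% raw %} blocks. A minimal sketch of how such a template could be rendered into a concrete provisioning file (the variable values and output path below are illustrative assumptions, not the package's actual deploy wiring):

    # Hypothetical rendering step for alert_resources.yaml.j2 (values are placeholders)
    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader("simplyblock_core/scripts/alerting"))
    template = env.get_template("alert_resources.yaml.j2")

    rendered = template.render(
        ALERT_TYPE="slack",   # or "email", which selects the addresses/subject/body branch
        CONTACT_POINT="https://hooks.slack.com/services/XXX/YYY/ZZZ",
        GRAFANA_ENDPOINT="http://localhost/grafana/explore?orgId=1",
    )

    # Write the concrete file that Grafana alert provisioning would pick up.
    with open("alert_resources.yaml", "w") as f:
        f.write(rendered)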
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json
@@ -2354,4 +2354,4 @@
  "version": 5,
  "weekStart": ""
  }
- }
+ }
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh
@@ -19,6 +19,15 @@ then
  export FDB_CLUSTER_FILE_CONTENTS=$FDB_CLUSTER_FILE_CONTENTS
  fi

+ docker network create monitoring-net -d overlay --attachable
+
+ INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+
+ if [ -n "$INSTANCE_ID" ]
+ then
+ export USE_EFS="rexray/efs"
+ fi
+
  docker stack deploy --compose-file="$DIR"/docker-compose-swarm-monitoring.yml monitoring

  # wait for the services to become online
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml
@@ -9,6 +9,8 @@ services:
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  opensearch:
  image: "opensearchproject/opensearch:2.4.0"
@@ -24,6 +26,8 @@ services:
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  graylog:
  hostname: "server"
@@ -34,16 +38,16 @@ services:
  GRAYLOG_PASSWORD_SECRET: "${GRAYLOG_PASSWORD_SECRET}"
  GRAYLOG_ROOT_PASSWORD_SHA2: "${GRAYLOG_ROOT_PASSWORD_SHA2}"
  GRAYLOG_HTTP_BIND_ADDRESS: "0.0.0.0:9000"
- GRAYLOG_HTTP_EXTERNAL_URI: "http://localhost:9000/"
+ GRAYLOG_HTTP_EXTERNAL_URI: "http://localhost/graylog/"
  GRAYLOG_ELASTICSEARCH_HOSTS: "http://opensearch:9200"
  GRAYLOG_MONGODB_URI: "mongodb://mongodb:27017/graylog"
+ GRAYLOG_SKIP_PREFLIGHT_CHECKS: "true"
  ports:
  - "5044:5044/tcp" # Beats
  - "5140:5140/udp" # Syslog
  - "5140:5140/tcp" # Syslog
  - "5555:5555/tcp" # RAW TCP
  - "5555:5555/udp" # RAW TCP
- - "9000:9000/tcp" # Server API
  - "12201:12201/tcp" # GELF TCP
  - "12201:12201/udp" # GELF UDP
  - "13301:13301/tcp" # Forwarder data
@@ -54,22 +58,28 @@ services:
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  promagent:
  image: simplyblock/promagent
  environment:
  ClusterID: "${CLUSTER_ID}"
- ClusterIP: "${CLUSTER_IP}"
+ ClusterIP: "HAProxy"
  ClusterSecret: "${CLUSTER_SECRET}"
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  pushgateway:
  image: prom/pushgateway
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  prometheus:
  image: prom/prometheus:v2.44.0
@@ -85,6 +95,8 @@ services:
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  node-exporter:
  image: prom/node-exporter:v1.7.0
@@ -105,7 +117,9 @@ services:
  mode: global
  placement:
  constraints: [node.role == worker]
-
+ networks:
+ - monitoring-net
+
  grafana:
  image: grafana/grafana:10.0.12
  environment:
@@ -114,19 +128,16 @@ services:
  GF_ALERTING_ENABLED: "true"
  GF_PATHS_PROVISIONING: "/etc/grafana/provisioning"
  GF_INSTALL_PLUGINS: "grafana-opensearch-datasource"
+ GF_SERVER_ROOT_URL: "http://localhost/grafana/"
  volumes:
  - ./datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml
  - grafana_data:/var/lib/grafana
  - ./alerting:/etc/grafana/provisioning/alerting
- restart: "always"
- ports:
- - target: 3000
- published: 3000
- protocol: tcp
- mode: host
  deploy:
  placement:
  constraints: [node.role == manager]
+ networks:
+ - monitoring-net

  CleanupGraylog:
  image: $SIMPLYBLOCK_DOCKER_IMAGE
@@ -136,19 +147,25 @@ services:
  deploy:
  placement:
  constraints: [node.role == manager]
-
- ### monitoring ###
+ networks:
+ - monitoring-net

  volumes:
  mongodb_data:
+ driver: ${USE_EFS:-local}
  os_data:
+ driver: ${USE_EFS:-local}
  graylog_data:
+ driver: ${USE_EFS:-local}
  graylog_journal:
+ driver: ${USE_EFS:-local}
  grafana_data:
+ driver: ${USE_EFS:-local}
+ graylog_config:
+ driver: ${USE_EFS:-local}
  prometheus_data:
- alertmanager_data:
+ driver: ${USE_EFS:-local}

  networks:
- hostnet:
+ monitoring-net:
  external: true
- name: host
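Note: every named volume now sets driver: ${USE_EFS:-local}, so the monitoring stack falls back to the local driver unless deploy_stack.sh exported USE_EFS="rexray/efs" (which it only does when the EC2 instance-metadata endpoint answers). Compose's ${VAR:-default} substitution behaves roughly like this plain-Python analogue, shown purely for illustration:

    # Analogue of Compose's ${USE_EFS:-local} default substitution
    import os

    # "rexray/efs" when deploy_stack.sh detected an EC2 instance, otherwise "local"
    driver = os.environ.get("USE_EFS") or "local"
    print(driver)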
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml
@@ -114,6 +114,7 @@ services:
  - 8404:8404
  networks:
  - localnet
+ - monitoring-net
  volumes:
  - "$DIR/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg"

@@ -185,9 +186,20 @@ services:
  networks:
  - hostnet

- TasksRunner:
+ TasksRunnerRestart:
  image: $SIMPLYBLOCK_DOCKER_IMAGE
- command: "python simplyblock_core/services/job_tasks.py"
+ command: "python simplyblock_core/services/tasks_runner_restart.py"
+ deploy:
+ placement:
+ constraints: [node.role == manager]
+ volumes:
+ - "/etc/foundationdb:/etc/foundationdb"
+ networks:
+ - hostnet
+
+ TasksRunnerMigration:
+ image: $SIMPLYBLOCK_DOCKER_IMAGE
+ command: "python simplyblock_core/services/tasks_runner_migration.py"
  deploy:
  placement:
  constraints: [node.role == manager]
@@ -200,6 +212,9 @@ volumes:
  os_data:

  networks:
+ monitoring-net:
+ external: true
+
  hostnet:
  external: true
  name: host
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg
@@ -42,6 +42,16 @@ backend wep_api_services
  balance roundrobin
  server-template webapi- 3 WebAppAPI:5000 check resolvers docker init-addr libc,none

+ backend grafana_services
+ balance roundrobin
+ http-request set-path %[path,regsub(^/grafana/?,/)]
+ server-template grafana- 1 grafana:3000 check resolvers docker init-addr libc,none
+
+ backend graylog_services
+ balance roundrobin
+ http-request set-path %[path,regsub(^/graylog/?,/)]
+ server-template graylog- 1 graylog:9000 check resolvers docker init-addr libc,none
+
  frontend stats_front
  bind *:8404
  stats enable
@@ -52,4 +62,9 @@ frontend stats_front

  frontend web_api_front
  bind *:80
+
+ use_backend grafana_services if { path /grafana } || { path_beg /grafana/ }
+ use_backend graylog_services if { path /graylog } || { path_beg /graylog/ }
+
  default_backend wep_api_services
+
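Note: HAProxy now exposes Grafana and Graylog behind the /grafana and /graylog path prefixes, stripping the prefix with http-request set-path before proxying to grafana:3000 and graylog:9000; this matches the new GF_SERVER_ROOT_URL and GRAYLOG_HTTP_EXTERNAL_URI values in the monitoring compose file. A rough smoke test of the routing, assuming it is run on the management node where HAProxy listens on port 80:

    # Illustrative check of the new path-based routing (host/port are assumptions)
    import requests

    base = "http://localhost"  # HAProxy web_api_front on *:80
    for prefix in ("/grafana/", "/graylog/"):
        resp = requests.get(base + prefix, timeout=5, allow_redirects=True)
        # Anything other than the default wep_api_services response suggests the
        # use_backend rules matched; print status and content type for inspection.
        print(prefix, resp.status_code, resp.headers.get("Content-Type"))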
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh
@@ -15,6 +15,8 @@ sudo yum install hostname pkg-config git wget python3-pip yum-utils docker-ce do
  sudo systemctl enable docker
  sudo systemctl start docker

+ sudo docker plugin install --grant-all-permissions rexray/efs REXRAY_PREEMPT=true
+
  wget https://github.com/apple/foundationdb/releases/download/7.3.3/foundationdb-clients-7.3.3-1.el7.x86_64.rpm -q
  sudo rpm -U foundationdb-clients-7.3.3-1.el7.x86_64.rpm --quiet --reinstall
  rm -f foundationdb-clients-7.3.3-1.el7.x86_64.rpm
@@ -29,6 +31,7 @@ sudo sed -i 's/#X11UseLocalhost yes/X11UseLocalhost no/g' /etc/ssh/sshd_config

  sudo service sshd restart
  sudo modprobe nvme-tcp
+ sudo modprobe nbd

  sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1

{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh
@@ -6,7 +6,7 @@ opt_h=0
  opt_l=""
  opt_r=0
  opt_s=5
- opt_t=600 # 10 min
+ opt_t=900 # 15 min

  start_epoc=$(date +%s)
  cmd_min_timeout=15
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py
@@ -193,7 +193,7 @@ while True:
  logger.info(f"Device is skipped: {device.get_id()} status: {device.status}")
  continue
  capacity_dict = rpc_client.alceml_get_capacity(device.alceml_bdev)
- stats_dict = rpc_client.get_device_stats(device.alloc_bdev)
+ stats_dict = rpc_client.get_device_stats(device.nvme_bdev)
  record = add_device_stats(cl, device, capacity_dict, stats_dict)
  if record:
  devices_records.append(record)
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py
@@ -5,7 +5,7 @@ import sys
  import uuid

  from simplyblock_core import constants, kv_store
- from simplyblock_core.models.job_schedule import JobSchedule
+ from simplyblock_core.controllers import tasks_controller
  from simplyblock_core.models.nvme_device import NVMeDevice
  from simplyblock_core.models.storage_node import StorageNode

@@ -27,47 +27,6 @@ db_store = kv_store.KVStore()
  db_controller = kv_store.DBController()


- def add_device_to_auto_restart(device):
- tasks = db_controller.get_job_tasks(device.cluster_id)
- for task in tasks:
- if task.device_id == device.get_id():
- if task.status != JobSchedule.STATUS_DONE:
- logger.info(f"Task found, skip adding new task: {task.get_id()}")
- return
-
- ds = JobSchedule()
- ds.uuid = str(uuid.uuid4())
- ds.cluster_id = device.cluster_id
- ds.node_id = device.node_id
- ds.device_id = device.get_id()
- ds.date = int(time.time())
- ds.function_name = "device_restart"
- ds.status = 'new'
-
- ds.write_to_db(db_store)
- return ds.get_id()
-
-
- def add_node_to_auto_restart(node):
- tasks = db_controller.get_job_tasks(node.cluster_id)
- for task in tasks:
- if task.node_id == node.get_id():
- if task.status != JobSchedule.STATUS_DONE:
- logger.info(f"Task found, skip adding new task: {task.get_id()}")
- return
-
- ds = JobSchedule()
- ds.uuid = str(uuid.uuid4())
- ds.cluster_id = node.cluster_id
- ds.node_id = node.get_id()
- ds.date = int(time.time())
- ds.function_name = "node_restart"
- ds.status = 'new'
-
- ds.write_to_db(db_store)
- return ds.get_id()
-
-
  logger.info("Starting Device monitor...")
  while True:
  nodes = db_controller.get_storage_nodes()
@@ -89,9 +48,9 @@ while True:
  logger.info("Adding device to auto restart")
  auto_restart_devices.append(dev)

- if len(auto_restart_devices) == 1:
- add_device_to_auto_restart(auto_restart_devices[0])
- elif len(auto_restart_devices) >= 2 and len(online_devices) == 0:
- add_node_to_auto_restart(node)
+ if len(auto_restart_devices) >= 2 or len(online_devices) == 0:
+ tasks_controller.add_node_to_auto_restart(node)
+ elif len(auto_restart_devices) == 1:
+ tasks_controller.add_device_to_auto_restart(auto_restart_devices[0])

  time.sleep(constants.DEV_MONITOR_INTERVAL_SEC)
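Note: the ad-hoc JobSchedule creation helpers were moved out of the monitor into the new tasks_controller (see simplyblock_core/controllers/tasks_controller.py in the file list), and the escalation condition was reworked: a node-level restart is now scheduled when two or more devices need a restart or when no device is left online, otherwise a single-device restart is scheduled. A standalone restatement of that decision rule (the function name and return values are illustrative, not the package's API):

    # Illustrative restatement of the new escalation rule in device_monitor.py
    def pick_restart_action(auto_restart_devices, online_devices):
        """Return which auto-restart task the monitor would create."""
        if len(auto_restart_devices) >= 2 or len(online_devices) == 0:
            return "node_restart"      # escalate to restarting the whole storage node
        if len(auto_restart_devices) == 1:
            return "device_restart"    # restart only the single failed device
        return "none"

    assert pick_restart_action(["d1", "d2"], ["d3"]) == "node_restart"
    assert pick_restart_action(["d1"], []) == "node_restart"
    assert pick_restart_action(["d1"], ["d2"]) == "device_restart"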
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py
@@ -88,7 +88,7 @@ def process_lvol_event(event):
  if event.message in ["error_open", 'error_read', "error_write", "error_unmap"]:
  vuid = event.object_dict['vuid']
  lvol = None
- for lv in db_controller.get_lvols():
+ for lv in db_controller.get_lvols(): # pass
  if lv.vuid == vuid:
  lvol = lv
  break
@@ -127,7 +127,6 @@ def process_event(event_id):

  hostname = utils.get_hostname()
  logger.info("Starting Distr event collector...")
- logger.info(f"Node:{hostname}")
  while True:
  time.sleep(constants.DISTR_EVENT_COLLECTOR_INTERVAL_SEC)

@@ -141,14 +140,13 @@ while True:
  snode.rpc_port,
  snode.rpc_username,
  snode.rpc_password,
- timeout=3, retry=2
- )
- num_of_events = constants.DISTR_EVENT_COLLECTOR_NUM_OF_EVENTS
+ timeout=10, retry=2)
+
  try:
- # events = client.distr_status_events_get()
- events = client.distr_status_events_discard_then_get(0, num_of_events)
+ events = client.distr_status_events_discard_then_get(0, constants.DISTR_EVENT_COLLECTOR_NUM_OF_EVENTS)
+
  if not events:
- logger.error("Distr events empty")
+ logger.debug("no events found")
  continue

  logger.info(f"Found events: {len(events)}")
@@ -161,10 +159,11 @@ while True:
  for eid in event_ids:
  logger.info(f"Processing event: {eid}")
  process_event(eid)
- logger.info(f"Discarding events: {num_of_events}")
- events = client.distr_status_events_discard_then_get(num_of_events, 0)
+
+ logger.info(f"Discarding events: {len(events)}")
+ client.distr_status_events_discard_then_get(len(events), 0)

  except Exception as e:
- logger.error("Failed to get distr events")
+ logger.error("Failed to process distr events")
  logger.exception(e)
  continue
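Note: the collector now fetches a batch with distr_status_events_discard_then_get(0, N), processes it, and then discards exactly len(events) rather than the fixed constant, so events that arrived after the fetch are not silently dropped. A condensed sketch of one fetch/process/discard cycle (client construction and error handling trimmed; names follow the diff, the helper itself is illustrative):

    # Condensed sketch of the fetch -> process -> discard cycle (illustrative only)
    def collect_once(client, num_of_events, handle_event, logger):
        # Fetch up to num_of_events without discarding anything yet.
        events = client.distr_status_events_discard_then_get(0, num_of_events)
        if not events:
            logger.debug("no events found")
            return 0
        for event in events:
            handle_event(event)
        # Discard only what was actually fetched, leaving newer events queued.
        client.distr_status_events_discard_then_get(len(events), 0)
        return len(events)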
sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py
@@ -0,0 +1,134 @@
+ # coding=utf-8
+ import logging
+
+ import time
+ import sys
+ from datetime import datetime
+
+
+ from simplyblock_core.controllers import health_controller, storage_events, device_events
+ from simplyblock_core.models.storage_node import StorageNode
+ from simplyblock_core.rpc_client import RPCClient
+ from simplyblock_core import constants, kv_store
+
+ # Import the GELF logger
+ from graypy import GELFUDPHandler
+
+ def set_node_health_check(snode, health_check_status):
+ snode = db_controller.get_storage_node_by_id(snode.get_id())
+ if snode.health_check == health_check_status:
+ return
+ old_status = snode.health_check
+ snode.health_check = health_check_status
+ snode.updated_at = str(datetime.now())
+ snode.write_to_db(db_store)
+ storage_events.snode_health_check_change(snode, snode.health_check, old_status, caused_by="monitor")
+
+
+ def set_device_health_check(cluster_id, device, health_check_status):
+ if device.health_check == health_check_status:
+ return
+ nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+ for node in nodes:
+ if node.nvme_devices:
+ for dev in node.nvme_devices:
+ if dev.get_id() == device.get_id():
+ old_status = dev.health_check
+ dev.health_check = health_check_status
+ node.write_to_db(db_store)
+ device_events.device_health_check_change(
+ dev, dev.health_check, old_status, caused_by="monitor")
+
+
+ # configure logging
+ logger_handler = logging.StreamHandler(stream=sys.stdout)
+ logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+ gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+ logger = logging.getLogger()
+ logger.addHandler(gelf_handler)
+ logger.addHandler(logger_handler)
+ logger.setLevel(logging.DEBUG)
+
+ # get DB controller
+ db_store = kv_store.KVStore()
+ db_controller = kv_store.DBController()
+
+ logger.info("Starting health check service")
+ while True:
+ clusters = db_controller.get_clusters()
+ for cluster in clusters:
+ cluster_id = cluster.get_id()
+ snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+ if not snodes:
+ logger.error("storage nodes list is empty")
+
+ for snode in snodes:
+ logger.info("Node: %s, status %s", snode.get_id(), snode.status)
+
+ if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+ logger.info(f"Node status is: {snode.status}, skipping")
+ continue
+
+ # 1- check node ping
+ ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+ logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+ # 2- check node API
+ node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+ logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+ if snode.status == StorageNode.STATUS_OFFLINE:
+ set_node_health_check(snode, ping_check & node_api_check)
+ continue
+
+ # 3- check node RPC
+ node_rpc_check = health_controller._check_node_rpc(
+ snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+ logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+ # 4- docker API
+ node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+ logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+ is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+
+ health_check_status = is_node_online
+ if not node_rpc_check:
+ logger.info("Putting all devices to unavailable state because RPC check failed")
+ for dev in snode.nvme_devices:
+ if dev.io_error:
+ logger.debug(f"Skipping Device action because of io_error {dev.get_id()}")
+ continue
+ set_device_health_check(cluster_id, dev, False)
+ else:
+ logger.info(f"Node device count: {len(snode.nvme_devices)}")
+ node_devices_check = True
+ node_remote_devices_check = True
+
+ for dev in snode.nvme_devices:
+ if dev.io_error:
+ logger.debug(f"Skipping Device check because of io_error {dev.get_id()}")
+ continue
+ ret = health_controller.check_device(dev.get_id())
+ set_device_health_check(cluster_id, dev, ret)
+ if dev.status == dev.STATUS_ONLINE:
+ node_devices_check &= ret
+
+ logger.info(f"Node remote device: {len(snode.remote_devices)}")
+ rpc_client = RPCClient(
+ snode.mgmt_ip, snode.rpc_port,
+ snode.rpc_username, snode.rpc_password,
+ timeout=10, retry=1)
+ for remote_device in snode.remote_devices:
+ ret = rpc_client.get_bdevs(remote_device.remote_bdev)
+ if ret:
+ logger.info(f"Checking bdev: {remote_device.remote_bdev} ... ok")
+ else:
+ logger.info(f"Checking bdev: {remote_device.remote_bdev} ... not found")
+ node_remote_devices_check &= bool(ret)
+
+ health_check_status = is_node_online and node_devices_check and node_remote_devices_check
+ set_node_health_check(snode, health_check_status)
+
+ time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC)
+
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py
@@ -49,7 +49,7 @@ db_controller = kv_store.DBController()

  logger.info("Starting LVol monitor...")
  while True:
- lvols = db_controller.get_lvols()
+ lvols = db_controller.get_lvols() # pass
  if not lvols:
  logger.error("LVols list is empty")

{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py
@@ -115,7 +115,7 @@ logger.info("Starting stats collector...")
  while True:

  pools = db_controller.get_pools()
- all_lvols = db_controller.get_lvols()
+ all_lvols = db_controller.get_lvols() # pass
  for pool in pools:
  lvols = []
  for lvol in all_lvols:
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py
@@ -74,4 +74,3 @@ while True:
  update_port_stats(snode, nic, stats)
  else:
  logger.error("Error getting port stats: %s", nic.get_id())
-