sbcli-pre 1.2.5__zip → 1.2.7__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/env_var +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/PKG-INFO +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/SOURCES.txt +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/cli.py +138 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cluster_ops.py +138 -235
- sbcli_pre-1.2.7/simplyblock_core/constants.py +91 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/caching_node_controller.py +8 -6
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/cluster_events.py +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_controller.py +56 -63
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/events_controller.py +5 -3
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/health_controller.py +30 -40
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_controller.py +75 -39
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_controller.py +8 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_controller.py +36 -3
- sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_controller.py +103 -0
- sbcli_pre-1.2.7/simplyblock_core/controllers/tasks_events.py +37 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/distr_controller.py +13 -9
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/kv_store.py +62 -20
- sbcli_pre-1.2.7/simplyblock_core/mgmt_node_ops.py +205 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/events.py +9 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/job_schedule.py +6 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/nvme_device.py +42 -4
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/storage_node.py +14 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/rpc_client.py +55 -10
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/__init__.py +0 -4
- sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 +54 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh +9 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +32 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml +17 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg +15 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh +3 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/stack_deploy_wait.sh +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py +5 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py +10 -11
- sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py +134 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_monitor.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py +1 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/port_stat_collector.py +0 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/storage_node_monitor.py +49 -44
- sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_migration.py +61 -0
- sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py → sbcli_pre-1.2.7/simplyblock_core/services/tasks_runner_restart.py +95 -46
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/snode_client.py +12 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/storage_node_ops.py +630 -358
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/utils.py +126 -1
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/snode_ops.py +103 -25
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_cluster.py +20 -43
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_device.py +10 -7
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_lvol.py +9 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_pool.py +14 -5
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_storage_node.py +15 -15
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_utils.py +0 -2
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/utils.py +8 -0
- sbcli_pre-1.2.5/simplyblock_core/constants.py +0 -65
- sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +0 -80
- sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +0 -22
- sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +0 -136
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/README.md +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/pyproject.toml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/dependency_links.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/entry_points.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/requires.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/sbcli_pre.egg-info/top_level.txt +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.cfg +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/setup.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_cli/main.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/cnode_client.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/compute_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/device_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/lvol_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/mgmt_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/pool_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/snapshot_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/controllers/storage_events.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/base_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/cluster.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/compute_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/global_settings.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/iface.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/lvol_model.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/pool.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/port_stat.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/models/stats.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/pci_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/config_docker.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/devices.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/dashboards/pools.json +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/datasource.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_double.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/db_config_single.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/prometheus.yml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/run_ssh.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/set_db_config.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/caching_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/cap_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/install_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/log_agg_service.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/remove_service.sh +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/service_template.service +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/shell_utils.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/auth_middleware.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/__init__.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_basic.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_deployer.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/caching_node_app_k8s.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/node_webapp.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/snode_app.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/delete.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_cnode.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/deploy_spdk.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/is_up.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/list_deps.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/rpac.yaml +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/static/tst.py +0 -0
- {sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml → sbcli_pre-1.2.7/simplyblock_core/scripts/alerting/alert_resources.yaml.j2
RENAMED
@@ -12,15 +12,26 @@ contactPoints:
     name: grafana-alerts
     receivers:
       - uid: grafana
-        type:
+        type: {{ ALERT_TYPE }}
+{% if ALERT_TYPE == 'slack' %}
         settings:
           username: grafana_bot
-          url: '
+          url: '{{ CONTACT_POINT }}'
           title: |
-            {{ template "slack.title" . }}
+            {{ '{{' }} template "slack.title" . {{ '}}' }}
           text: |
-            {{ template "slack.message" . }}
+            {{ '{{' }} template "slack.message" . {{ '}}' }}
+{% else %}
+        settings:
+          addresses: '{{ CONTACT_POINT }}'
+          subject: |
+            {{ '{{' }} template "email.subject" . {{ '}}' }}
+          body: |
+            {{ '{{' }} template "email.body" . {{ '}}' }}
+{% endif %}
 
+{% if ALERT_TYPE == 'slack' %}
+{% raw %}
 templates:
   - orgId: 1
     name: slack.title
@@ -38,7 +49,9 @@ templates:
       *Description*: {{ .Annotations.description }}
       {{ end -}}
       *Log message*: {{ index .Labels "message" }}
-
+{% endraw %}
+      *Explore logs:* {{ GRAFANA_ENDPOINT }}
+{% raw %}
       {{ if .DashboardURL -}}
       *Go to dashboard:* {{ .DashboardURL }}
       {{- end }}
@@ -65,3 +78,39 @@ templates:
       {{ end }}
 
       {{- end }}
+{% endraw %}
+{% else %}
+{% raw %}
+  - orgId: 1
+    name: email.subject
+    template: |-
+      {{ define "email.subject" -}}
+      [{{ .Status | toUpper }}] Grafana Alert
+      {{- end -}}
+  - orgId: 1
+    name: email.body
+    template: |-
+      {{ define "email.body" -}}
+      Alert: {{ .Labels.alertname }}
+      {{ if .Annotations -}}
+      Summary: {{ .Annotations.summary}}
+      Description: {{ .Annotations.description }}
+      {{ end -}}
+      Log message: {{ index .Labels "message" }}
+      Explore logs: {{ GRAFANA_ENDPOINT }}
+      {{ if .DashboardURL -}}
+      Go to dashboard: {{ .DashboardURL }}
+      {{- end }}
+      {{ if .PanelURL -}}
+      Go to panel: {{ .PanelURL }}
+      {{- end }}
+      Details:
+      {{ range .Labels.SortedPairs -}}
+      - {{ .Name }}: `{{ .Value }}`
+      {{ end -}}
+      {{ if .SilenceURL -}}
+      Silence this alert: {{ .SilenceURL }}
+      {{- end }}
+      {{- end }}
+{% endraw %}
+{% endif %}
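The rename to .j2 turns the file into a Jinja2 template, which is why every Go-template brace that Grafana itself must see is either wrapped in {% raw %} or emitted through {{ '{{' }} … {{ '}}' }}. A minimal sketch of that escaping (not package code; assumes only stock Jinja2):

    # Jinja2 would otherwise consume {{ ... }} itself, so the .j2 file emits
    # literal braces that Grafana's own Go templating sees after rendering.
    from jinja2 import Template  # pip install jinja2

    src = """title: {{ '{{' }} template "slack.title" . {{ '}}' }}"""
    print(Template(src).render())
    # prints: title: {{ template "slack.title" . }}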
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/deploy_stack.sh
RENAMED
@@ -19,6 +19,15 @@ then
   export FDB_CLUSTER_FILE_CONTENTS=$FDB_CLUSTER_FILE_CONTENTS
 fi
 
+docker network create monitoring-net -d overlay --attachable
+
+INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+
+if [ -n "$INSTANCE_ID" ]
+then
+  export USE_EFS="rexray/efs"
+fi
+
 docker stack deploy --compose-file="$DIR"/docker-compose-swarm-monitoring.yml monitoring
 
 # wait for the services to become online
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml
RENAMED
@@ -9,6 +9,8 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   opensearch:
     image: "opensearchproject/opensearch:2.4.0"
@@ -24,6 +26,8 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   graylog:
     hostname: "server"
@@ -34,16 +38,16 @@ services:
       GRAYLOG_PASSWORD_SECRET: "${GRAYLOG_PASSWORD_SECRET}"
       GRAYLOG_ROOT_PASSWORD_SHA2: "${GRAYLOG_ROOT_PASSWORD_SHA2}"
       GRAYLOG_HTTP_BIND_ADDRESS: "0.0.0.0:9000"
-      GRAYLOG_HTTP_EXTERNAL_URI: "http://localhost
+      GRAYLOG_HTTP_EXTERNAL_URI: "http://localhost/graylog/"
       GRAYLOG_ELASTICSEARCH_HOSTS: "http://opensearch:9200"
       GRAYLOG_MONGODB_URI: "mongodb://mongodb:27017/graylog"
+      GRAYLOG_SKIP_PREFLIGHT_CHECKS: "true"
     ports:
       - "5044:5044/tcp" # Beats
       - "5140:5140/udp" # Syslog
       - "5140:5140/tcp" # Syslog
       - "5555:5555/tcp" # RAW TCP
       - "5555:5555/udp" # RAW TCP
-      - "9000:9000/tcp" # Server API
       - "12201:12201/tcp" # GELF TCP
       - "12201:12201/udp" # GELF UDP
       - "13301:13301/tcp" # Forwarder data
@@ -54,22 +58,28 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   promagent:
     image: simplyblock/promagent
    environment:
       ClusterID: "${CLUSTER_ID}"
-      ClusterIP: "
+      ClusterIP: "HAProxy"
       ClusterSecret: "${CLUSTER_SECRET}"
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   pushgateway:
     image: prom/pushgateway
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   prometheus:
     image: prom/prometheus:v2.44.0
@@ -85,6 +95,8 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   node-exporter:
     image: prom/node-exporter:v1.7.0
@@ -105,7 +117,9 @@ services:
       mode: global
       placement:
         constraints: [node.role == worker]
-
+    networks:
+      - monitoring-net
+
   grafana:
     image: grafana/grafana:10.0.12
     environment:
@@ -114,19 +128,16 @@ services:
       GF_ALERTING_ENABLED: "true"
       GF_PATHS_PROVISIONING: "/etc/grafana/provisioning"
       GF_INSTALL_PLUGINS: "grafana-opensearch-datasource"
+      GF_SERVER_ROOT_URL: "http://localhost/grafana/"
     volumes:
       - ./datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml
       - grafana_data:/var/lib/grafana
       - ./alerting:/etc/grafana/provisioning/alerting
-    restart: "always"
-    ports:
-      - target: 3000
-        published: 3000
-        protocol: tcp
-        mode: host
     deploy:
       placement:
         constraints: [node.role == manager]
+    networks:
+      - monitoring-net
 
   CleanupGraylog:
     image: $SIMPLYBLOCK_DOCKER_IMAGE
@@ -136,19 +147,25 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-
-
+    networks:
+      - monitoring-net
 
 volumes:
   mongodb_data:
+    driver: ${USE_EFS:-local}
   os_data:
+    driver: ${USE_EFS:-local}
   graylog_data:
+    driver: ${USE_EFS:-local}
   graylog_journal:
+    driver: ${USE_EFS:-local}
   grafana_data:
+    driver: ${USE_EFS:-local}
+  graylog_config:
+    driver: ${USE_EFS:-local}
   prometheus_data:
-
+    driver: ${USE_EFS:-local}
 
 networks:
-
+  monitoring-net:
     external: true
-    name: host
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/docker-compose-swarm.yml
RENAMED
@@ -114,6 +114,7 @@ services:
       - 8404:8404
     networks:
       - localnet
+      - monitoring-net
     volumes:
       - "$DIR/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg"
 
@@ -185,9 +186,20 @@ services:
     networks:
       - hostnet
 
-
+  TasksRunnerRestart:
     image: $SIMPLYBLOCK_DOCKER_IMAGE
-    command: "python simplyblock_core/services/
+    command: "python simplyblock_core/services/tasks_runner_restart.py"
+    deploy:
+      placement:
+        constraints: [node.role == manager]
+    volumes:
+      - "/etc/foundationdb:/etc/foundationdb"
+    networks:
+      - hostnet
+
+  TasksRunnerMigration:
+    image: $SIMPLYBLOCK_DOCKER_IMAGE
+    command: "python simplyblock_core/services/tasks_runner_migration.py"
     deploy:
       placement:
         constraints: [node.role == manager]
@@ -200,6 +212,9 @@ volumes:
   os_data:
 
 networks:
+  monitoring-net:
+    external: true
+
   hostnet:
     external: true
     name: host
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/haproxy.cfg
RENAMED
@@ -42,6 +42,16 @@ backend wep_api_services
     balance roundrobin
     server-template webapi- 3 WebAppAPI:5000 check resolvers docker init-addr libc,none
 
+backend grafana_services
+    balance roundrobin
+    http-request set-path %[path,regsub(^/grafana/?,/)]
+    server-template grafana- 1 grafana:3000 check resolvers docker init-addr libc,none
+
+backend graylog_services
+    balance roundrobin
+    http-request set-path %[path,regsub(^/graylog/?,/)]
+    server-template graylog- 1 graylog:9000 check resolvers docker init-addr libc,none
+
 frontend stats_front
     bind *:8404
     stats enable
@@ -52,4 +62,9 @@ frontend stats_front
 
 frontend web_api_front
     bind *:80
+
+    use_backend grafana_services if { path /grafana } || { path_beg /grafana/ }
+    use_backend graylog_services if { path /graylog } || { path_beg /graylog/ }
+
     default_backend wep_api_services
+
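With Grafana and Graylog no longer publishing their own ports (the 3000 and 9000 mappings were dropped above), HAProxy now routes the /grafana and /graylog prefixes to them and strips the prefix before forwarding. What regsub(^/grafana/?,/) does to the request path, approximated with Python's re.sub (a sketch, not package code):

    # HAProxy's regsub(^/grafana/?,/) path rewrite, approximated:
    import re

    for path in ["/grafana", "/grafana/", "/grafana/d/abc"]:
        print(re.sub(r"^/grafana/?", "/", path))
    # prints: /  then  /  then  /d/abc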
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/scripts/install_deps.sh
RENAMED
@@ -15,6 +15,8 @@ sudo yum install hostname pkg-config git wget python3-pip yum-utils docker-ce do
 sudo systemctl enable docker
 sudo systemctl start docker
 
+sudo docker plugin install --grant-all-permissions rexray/efs REXRAY_PREEMPT=true
+
 wget https://github.com/apple/foundationdb/releases/download/7.3.3/foundationdb-clients-7.3.3-1.el7.x86_64.rpm -q
 sudo rpm -U foundationdb-clients-7.3.3-1.el7.x86_64.rpm --quiet --reinstall
 rm -f foundationdb-clients-7.3.3-1.el7.x86_64.rpm
@@ -29,6 +31,7 @@ sudo sed -i 's/#X11UseLocalhost yes/X11UseLocalhost no/g' /etc/ssh/sshd_config
 
 sudo service sshd restart
 sudo modprobe nvme-tcp
+sudo modprobe nbd
 
 sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
 
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/capacity_and_stats_collector.py
RENAMED
@@ -193,7 +193,7 @@ while True:
             logger.info(f"Device is skipped: {device.get_id()} status: {device.status}")
             continue
         capacity_dict = rpc_client.alceml_get_capacity(device.alceml_bdev)
-        stats_dict = rpc_client.get_device_stats(device.
+        stats_dict = rpc_client.get_device_stats(device.nvme_bdev)
         record = add_device_stats(cl, device, capacity_dict, stats_dict)
         if record:
             devices_records.append(record)
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/device_monitor.py
RENAMED
@@ -5,7 +5,7 @@ import sys
 import uuid
 
 from simplyblock_core import constants, kv_store
-from simplyblock_core.
+from simplyblock_core.controllers import tasks_controller
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
 
@@ -27,47 +27,6 @@ db_store = kv_store.KVStore()
 db_controller = kv_store.DBController()
 
 
-def add_device_to_auto_restart(device):
-    tasks = db_controller.get_job_tasks(device.cluster_id)
-    for task in tasks:
-        if task.device_id == device.get_id():
-            if task.status != JobSchedule.STATUS_DONE:
-                logger.info(f"Task found, skip adding new task: {task.get_id()}")
-                return
-
-    ds = JobSchedule()
-    ds.uuid = str(uuid.uuid4())
-    ds.cluster_id = device.cluster_id
-    ds.node_id = device.node_id
-    ds.device_id = device.get_id()
-    ds.date = int(time.time())
-    ds.function_name = "device_restart"
-    ds.status = 'new'
-
-    ds.write_to_db(db_store)
-    return ds.get_id()
-
-
-def add_node_to_auto_restart(node):
-    tasks = db_controller.get_job_tasks(node.cluster_id)
-    for task in tasks:
-        if task.node_id == node.get_id():
-            if task.status != JobSchedule.STATUS_DONE:
-                logger.info(f"Task found, skip adding new task: {task.get_id()}")
-                return
-
-    ds = JobSchedule()
-    ds.uuid = str(uuid.uuid4())
-    ds.cluster_id = node.cluster_id
-    ds.node_id = node.get_id()
-    ds.date = int(time.time())
-    ds.function_name = "node_restart"
-    ds.status = 'new'
-
-    ds.write_to_db(db_store)
-    return ds.get_id()
-
-
 logger.info("Starting Device monitor...")
 while True:
     nodes = db_controller.get_storage_nodes()
@@ -89,9 +48,9 @@ while True:
                 logger.info("Adding device to auto restart")
                 auto_restart_devices.append(dev)
 
-        if len(auto_restart_devices) ==
-
-        elif len(auto_restart_devices)
-
+        if len(auto_restart_devices) >= 2 or len(online_devices) == 0:
+            tasks_controller.add_node_to_auto_restart(node)
+        elif len(auto_restart_devices) == 1:
+            tasks_controller.add_device_to_auto_restart(auto_restart_devices[0])
 
         time.sleep(constants.DEV_MONITOR_INTERVAL_SEC)
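The helper functions deleted above (and the truncated old conditionals) now live in the new tasks_controller module listed earlier; the decision rule itself is worth stating on its own. A sketch of the policy under a hypothetical name, distilled from the diff:

    # Restart policy as it reads in the new code (function name hypothetical):
    # two or more failed devices, or no online devices left, escalates to a
    # node restart task; exactly one failed device gets a device restart task.
    def pick_restart_action(auto_restart_devices: list, online_devices: list):
        if len(auto_restart_devices) >= 2 or len(online_devices) == 0:
            return "node_restart"
        if len(auto_restart_devices) == 1:
            return "device_restart"
        return None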
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/distr_event_collector.py
RENAMED
@@ -88,7 +88,7 @@ def process_lvol_event(event):
     if event.message in ["error_open", 'error_read', "error_write", "error_unmap"]:
         vuid = event.object_dict['vuid']
         lvol = None
-        for lv in db_controller.get_lvols():
+        for lv in db_controller.get_lvols():  # pass
             if lv.vuid == vuid:
                 lvol = lv
                 break
@@ -127,7 +127,6 @@ def process_event(event_id):
 
 hostname = utils.get_hostname()
 logger.info("Starting Distr event collector...")
-logger.info(f"Node:{hostname}")
 while True:
     time.sleep(constants.DISTR_EVENT_COLLECTOR_INTERVAL_SEC)
 
@@ -141,14 +140,13 @@ while True:
             snode.rpc_port,
             snode.rpc_username,
             snode.rpc_password,
-            timeout=
-
-        num_of_events = constants.DISTR_EVENT_COLLECTOR_NUM_OF_EVENTS
+            timeout=10, retry=2)
+
         try:
-
-
+            events = client.distr_status_events_discard_then_get(0, constants.DISTR_EVENT_COLLECTOR_NUM_OF_EVENTS)
+
             if not events:
-                logger.
+                logger.debug("no events found")
                 continue
 
             logger.info(f"Found events: {len(events)}")
@@ -161,10 +159,11 @@ while True:
             for eid in event_ids:
                 logger.info(f"Processing event: {eid}")
                 process_event(eid)
-
-
+
+            logger.info(f"Discarding events: {len(events)}")
+            client.distr_status_events_discard_then_get(len(events), 0)
 
         except Exception as e:
-            logger.error("Failed to
+            logger.error("Failed to process distr events")
             logger.exception(e)
             continue
sbcli_pre-1.2.7/simplyblock_core/services/health_check_service.py
ADDED
@@ -0,0 +1,134 @@
+# coding=utf-8
+import logging
+
+import time
+import sys
+from datetime import datetime
+
+
+from simplyblock_core.controllers import health_controller, storage_events, device_events
+from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.rpc_client import RPCClient
+from simplyblock_core import constants, kv_store
+
+# Import the GELF logger
+from graypy import GELFUDPHandler
+
+
+def set_node_health_check(snode, health_check_status):
+    snode = db_controller.get_storage_node_by_id(snode.get_id())
+    if snode.health_check == health_check_status:
+        return
+    old_status = snode.health_check
+    snode.health_check = health_check_status
+    snode.updated_at = str(datetime.now())
+    snode.write_to_db(db_store)
+    storage_events.snode_health_check_change(snode, snode.health_check, old_status, caused_by="monitor")
+
+
+def set_device_health_check(cluster_id, device, health_check_status):
+    if device.health_check == health_check_status:
+        return
+    nodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+    for node in nodes:
+        if node.nvme_devices:
+            for dev in node.nvme_devices:
+                if dev.get_id() == device.get_id():
+                    old_status = dev.health_check
+                    dev.health_check = health_check_status
+                    node.write_to_db(db_store)
+                    device_events.device_health_check_change(
+                        dev, dev.health_check, old_status, caused_by="monitor")
+
+
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_store = kv_store.KVStore()
+db_controller = kv_store.DBController()
+
+logger.info("Starting health check service")
+while True:
+    clusters = db_controller.get_clusters()
+    for cluster in clusters:
+        cluster_id = cluster.get_id()
+        snodes = db_controller.get_storage_nodes_by_cluster_id(cluster_id)
+        if not snodes:
+            logger.error("storage nodes list is empty")
+
+        for snode in snodes:
+            logger.info("Node: %s, status %s", snode.get_id(), snode.status)
+
+            if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+                logger.info(f"Node status is: {snode.status}, skipping")
+                continue
+
+            # 1- check node ping
+            ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+            logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+            # 2- check node API
+            node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+            logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+            if snode.status == StorageNode.STATUS_OFFLINE:
+                set_node_health_check(snode, ping_check & node_api_check)
+                continue
+
+            # 3- check node RPC
+            node_rpc_check = health_controller._check_node_rpc(
+                snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+            logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+            # 4- docker API
+            node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+            logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+            is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+
+            health_check_status = is_node_online
+            if not node_rpc_check:
+                logger.info("Putting all devices to unavailable state because RPC check failed")
+                for dev in snode.nvme_devices:
+                    if dev.io_error:
+                        logger.debug(f"Skipping Device action because of io_error {dev.get_id()}")
+                        continue
+                    set_device_health_check(cluster_id, dev, False)
+            else:
+                logger.info(f"Node device count: {len(snode.nvme_devices)}")
+                node_devices_check = True
+                node_remote_devices_check = True
+
+                for dev in snode.nvme_devices:
+                    if dev.io_error:
+                        logger.debug(f"Skipping Device check because of io_error {dev.get_id()}")
+                        continue
+                    ret = health_controller.check_device(dev.get_id())
+                    set_device_health_check(cluster_id, dev, ret)
+                    if dev.status == dev.STATUS_ONLINE:
+                        node_devices_check &= ret
+
+                logger.info(f"Node remote device: {len(snode.remote_devices)}")
+                rpc_client = RPCClient(
+                    snode.mgmt_ip, snode.rpc_port,
+                    snode.rpc_username, snode.rpc_password,
+                    timeout=10, retry=1)
+                for remote_device in snode.remote_devices:
+                    ret = rpc_client.get_bdevs(remote_device.remote_bdev)
+                    if ret:
+                        logger.info(f"Checking bdev: {remote_device.remote_bdev} ... ok")
+                    else:
+                        logger.info(f"Checking bdev: {remote_device.remote_bdev} ... not found")
+                    node_remote_devices_check &= bool(ret)
+
+                health_check_status = is_node_online and node_devices_check and node_remote_devices_check
+            set_node_health_check(snode, health_check_status)
+
+    time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC)
+
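The service's aggregation rule is strict conjunction: a node is healthy only if ping, the node API, RPC, and the docker API all pass, and then only if every online device and every remote bdev also checks out. Condensed (a sketch with hypothetical names, not package code):

    # Node health as the service computes it: all probes AND all device checks.
    def node_health(ping, api, rpc, docker, device_checks, remote_checks) -> bool:
        is_node_online = ping and api and rpc and docker
        return is_node_online and all(device_checks) and all(remote_checks)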
{sbcli_pre-1.2.5 → sbcli_pre-1.2.7}/simplyblock_core/services/lvol_stat_collector.py
RENAMED
@@ -115,7 +115,7 @@ logger.info("Starting stats collector...")
 while True:
 
     pools = db_controller.get_pools()
-    all_lvols = db_controller.get_lvols()
+    all_lvols = db_controller.get_lvols()  # pass
     for pool in pools:
         lvols = []
         for lvol in all_lvols: