sbcli-pre 1.2.3__zip → 1.2.5__zip
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/PKG-INFO +20 -5
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/README.md +19 -4
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/env_var +1 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/PKG-INFO +20 -5
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/SOURCES.txt +5 -5
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_cli/cli.py +115 -113
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/cluster_ops.py +238 -141
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/constants.py +7 -5
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/caching_node_controller.py +6 -8
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/cluster_events.py +0 -9
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/device_controller.py +63 -56
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/events_controller.py +3 -5
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/health_controller.py +40 -30
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/lvol_controller.py +36 -42
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/pool_controller.py +4 -8
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/snapshot_controller.py +3 -9
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/distr_controller.py +9 -13
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/kv_store.py +29 -47
- sbcli_pre-1.2.5/simplyblock_core/mgmt_node_ops.py +80 -0
- sbcli_pre-1.2.5/simplyblock_core/models/deployer.py +62 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/events.py +1 -9
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/job_schedule.py +0 -6
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/nvme_device.py +4 -42
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/storage_node.py +1 -9
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/rpc_client.py +10 -55
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/__init__.py +4 -0
- sbcli_pre-1.2.3/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 → sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml +5 -54
- sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh +22 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/cluster.json +1 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/deploy_stack.sh +0 -2
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml +13 -22
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm.yml +2 -17
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/haproxy.cfg +0 -15
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/install_deps.sh +0 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/capacity_and_stats_collector.py +1 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/device_monitor.py +44 -3
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/distr_event_collector.py +11 -10
- sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py +136 -0
- sbcli_pre-1.2.3/simplyblock_core/services/tasks_runner_restart.py → sbcli_pre-1.2.5/simplyblock_core/services/job_tasks.py +46 -93
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_monitor.py +1 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/lvol_stat_collector.py +1 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/port_stat_collector.py +1 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/storage_node_monitor.py +44 -49
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/snode_client.py +0 -12
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/storage_node_ops.py +336 -525
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/utils.py +1 -46
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/app.py +2 -1
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/snode_ops.py +25 -103
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_cluster.py +43 -20
- sbcli_pre-1.2.5/simplyblock_web/blueprints/web_api_deployer.py +394 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_device.py +7 -10
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_lvol.py +5 -9
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_pool.py +5 -14
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_storage_node.py +10 -3
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/node_utils.py +2 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/utils.py +0 -8
- sbcli_pre-1.2.3/simplyblock_core/controllers/tasks_controller.py +0 -103
- sbcli_pre-1.2.3/simplyblock_core/controllers/tasks_events.py +0 -37
- sbcli_pre-1.2.3/simplyblock_core/mgmt_node_ops.py +0 -205
- sbcli_pre-1.2.3/simplyblock_core/services/health_check_service.py +0 -134
- sbcli_pre-1.2.3/simplyblock_core/services/tasks_runner_migration.py +0 -61
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/pyproject.toml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/dependency_links.txt +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/entry_points.txt +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/requires.txt +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/sbcli_pre.egg-info/top_level.txt +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/setup.cfg +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/setup.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_cli/main.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/__init__.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/cnode_client.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/compute_node_ops.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/__init__.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/device_events.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/lvol_events.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/mgmt_events.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/pool_events.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/snapshot_events.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/controllers/storage_events.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/__init__.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/base_model.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/caching_node.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/cluster.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/compute_node.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/global_settings.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/iface.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/lvol_model.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/mgmt_node.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/pool.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/port_stat.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/snapshot.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/models/stats.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/pci_utils.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/alerting/alert_rules.yaml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/clean_local_storage_deploy.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/config_docker.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/devices.json +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/lvols.json +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/node-exporter.json +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/nodes.json +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/dashboards/pools.json +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/datasource.yml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/db_config_double.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/db_config_single.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/prometheus.yml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/run_ssh.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/set_db_config.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/stack_deploy_wait.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/__init__.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/caching_node_monitor.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/cap_monitor.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/install_service.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/log_agg_service.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/mgmt_node_monitor.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/remove_service.sh +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/service_template.service +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/shell_utils.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/__init__.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/auth_middleware.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/__init__.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/caching_node_ops.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/caching_node_ops_k8s.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_basic.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_caching_docker.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/node_api_caching_ks.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_caching_node.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_mgmt_node.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/blueprints/web_api_snapshot.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/caching_node_app.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/caching_node_app_k8s.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/node_webapp.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/snode_app.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/delete.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy_cnode.yaml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/deploy_spdk.yaml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/is_up.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/list_deps.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/rpac.yaml +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/static/tst.py +0 -0
- {sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_web/templates/deploy_spdk.yaml.j2 +0 -0
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/__init__.py
RENAMED
@@ -35,6 +35,10 @@ def deploy_stack(cli_pass, dev_ip, image_name, graylog_password, cluster_id, log
     return __run_script(
         ['sudo', 'bash', '-x', os.path.join(DIR_PATH, 'deploy_stack.sh'), cli_pass, dev_ip, image_name, pass_hash, graylog_password, cluster_id, log_del_interval, metrics_retention_period])
 
+def apply_dashboard(grafanaPassword):
+    return __run_script(
+        ['sudo', 'bash', '-x', os.path.join(DIR_PATH, 'apply_dashboard.sh'), grafanaPassword])
+
 
 def deploy_cleaner():
     return __run_script(['sudo', 'bash', '-x', os.path.join(DIR_PATH, 'clean_local_storage_deploy.sh')])
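
The new apply_dashboard() helper in simplyblock_core/scripts/__init__.py simply shells out to apply_dashboard.sh through the package's __run_script wrapper. A minimal usage sketch, assuming the monitoring stack is already deployed on the local node (the password value is a placeholder, not something shipped with the package):

from simplyblock_core import scripts

# hypothetical call; apply_dashboard.sh is invoked with sudo under the hood
ret = scripts.apply_dashboard("my-grafana-password")
print(ret)
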
sbcli_pre-1.2.3/simplyblock_core/scripts/alerting/alert_resources.yaml.j2 → sbcli_pre-1.2.5/simplyblock_core/scripts/alerting/alert_resources.yaml
RENAMED
@@ -12,26 +12,15 @@ contactPoints:
     name: grafana-alerts
     receivers:
       - uid: grafana
-        type:
-        {% if ALERT_TYPE == 'slack' %}
+        type: slack
         settings:
           username: grafana_bot
-          url: '
+          url: 'https://hooks.slack.com/services/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'
           title: |
-            {{
+            {{ template "slack.title" . }}
           text: |
-            {{
-        {% else %}
-        settings:
-          addresses: '{{ CONTACT_POINT }}'
-          subject: |
-            {{ '{{' }} template "email.subject" . {{ '}}' }}
-          body: |
-            {{ '{{' }} template "email.body" . {{ '}}' }}
-        {% endif %}
+            {{ template "slack.message" . }}
 
-{% if ALERT_TYPE == 'slack' %}
-{% raw %}
 templates:
   - orgId: 1
     name: slack.title
@@ -49,9 +38,7 @@ templates:
       *Description*: {{ .Annotations.description }}
       {{ end -}}
       *Log message*: {{ index .Labels "message" }}
-
-      *Explore logs:* {{ GRAFANA_ENDPOINT }}
-{% raw %}
+      *Explore logs:* https://grafanaURL.com/explore?orgId=1
       {{ if .DashboardURL -}}
       *Go to dashboard:* {{ .DashboardURL }}
       {{- end }}
@@ -78,39 +65,3 @@ templates:
       {{ end }}
 
      {{- end }}
-{% endraw %}
-{% else %}
-{% raw %}
-  - orgId: 1
-    name: email.subject
-    template: |-
-      {{ define "email.subject" -}}
-      [{{ .Status | toUpper }}] Grafana Alert
-      {{- end -}}
-  - orgId: 1
-    name: email.body
-    template: |-
-      {{ define "email.body" -}}
-      Alert: {{ .Labels.alertname }}
-      {{ if .Annotations -}}
-      Summary: {{ .Annotations.summary}}
-      Description: {{ .Annotations.description }}
-      {{ end -}}
-      Log message: {{ index .Labels "message" }}
-      Explore logs: {{ GRAFANA_ENDPOINT }}
-      {{ if .DashboardURL -}}
-      Go to dashboard: {{ .DashboardURL }}
-      {{- end }}
-      {{ if .PanelURL -}}
-      Go to panel: {{ .PanelURL }}
-      {{- end }}
-      Details:
-      {{ range .Labels.SortedPairs -}}
-      - {{ .Name }}: `{{ .Value }}`
-      {{ end -}}
-      {{ if .SilenceURL -}}
-      Silence this alert: {{ .SilenceURL }}
-      {{- end }}
-      {{- end }}
-{% endraw %}
-{% endif %}
sbcli_pre-1.2.5/simplyblock_core/scripts/apply_dashboard.sh
ADDED
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+TD=$(dirname -- "$(readlink -f -- "$0")")
+
+# Grafana Password
+export grafanaPassword=$1
+
+# Grafana username
+GF_ADMIN_USER=admin
+
+HOST=0.0.0.0:3000
+
+DASHBOARDS="${TD}/dashboards"
+for dashboard in "${DASHBOARDS}/cluster.json" "${DASHBOARDS}/devices.json" "${DASHBOARDS}/nodes.json" "${DASHBOARDS}/lvols.json" "${DASHBOARDS}/pools.json" "${DASHBOARDS}/node-exporter.json"; do
+    echo -e "\nUploading dashboard: ${dashboard}"
+    curl -X POST -H "Content-Type: application/json" \
+        -d "@${dashboard}" \
+        "http://${GF_ADMIN_USER}:${grafanaPassword}@${HOST}/api/dashboards/import"
+    echo ""
+done
+
+echo "Cluster deployment complete."
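
The new script iterates over the bundled dashboard JSON files and POSTs each one to Grafana's /api/dashboards/import endpoint using basic auth. A rough Python equivalent of the same loop, assuming Grafana answers on 127.0.0.1:3000 and the dashboards directory is available on disk (host, credentials and path are illustrative assumptions):

import glob
import requests

GRAFANA = "http://127.0.0.1:3000"
AUTH = ("admin", "my-grafana-password")  # GF_ADMIN_USER / grafanaPassword in the script

for path in sorted(glob.glob("simplyblock_core/scripts/dashboards/*.json")):
    with open(path) as f:
        resp = requests.post(
            f"{GRAFANA}/api/dashboards/import",
            auth=AUTH,
            headers={"Content-Type": "application/json"},
            data=f.read(),
        )
    print(path, resp.status_code)
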
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/deploy_stack.sh
RENAMED
@@ -19,8 +19,6 @@ then
   export FDB_CLUSTER_FILE_CONTENTS=$FDB_CLUSTER_FILE_CONTENTS
 fi
 
-docker network create monitoring-net -d overlay --attachable
-
 docker stack deploy --compose-file="$DIR"/docker-compose-swarm-monitoring.yml monitoring
 
 # wait for the services to become online
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm-monitoring.yml
RENAMED
@@ -9,8 +9,6 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   opensearch:
     image: "opensearchproject/opensearch:2.4.0"
@@ -26,8 +24,6 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   graylog:
     hostname: "server"
@@ -38,7 +34,7 @@ services:
       GRAYLOG_PASSWORD_SECRET: "${GRAYLOG_PASSWORD_SECRET}"
       GRAYLOG_ROOT_PASSWORD_SHA2: "${GRAYLOG_ROOT_PASSWORD_SHA2}"
       GRAYLOG_HTTP_BIND_ADDRESS: "0.0.0.0:9000"
-      GRAYLOG_HTTP_EXTERNAL_URI: "http://localhost/
+      GRAYLOG_HTTP_EXTERNAL_URI: "http://localhost:9000/"
       GRAYLOG_ELASTICSEARCH_HOSTS: "http://opensearch:9200"
       GRAYLOG_MONGODB_URI: "mongodb://mongodb:27017/graylog"
     ports:
@@ -47,6 +43,7 @@ services:
       - "5140:5140/tcp" # Syslog
       - "5555:5555/tcp" # RAW TCP
       - "5555:5555/udp" # RAW TCP
+      - "9000:9000/tcp" # Server API
       - "12201:12201/tcp" # GELF TCP
       - "12201:12201/udp" # GELF UDP
       - "13301:13301/tcp" # Forwarder data
@@ -57,8 +54,6 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   promagent:
     image: simplyblock/promagent
@@ -69,16 +64,12 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   pushgateway:
     image: prom/pushgateway
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   prometheus:
     image: prom/prometheus:v2.44.0
@@ -94,8 +85,6 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   node-exporter:
     image: prom/node-exporter:v1.7.0
@@ -116,9 +105,7 @@ services:
       mode: global
       placement:
         constraints: [node.role == worker]
-    networks:
-      - monitoring-net
-
+
 
   grafana:
     image: grafana/grafana:10.0.12
@@ -127,16 +114,19 @@ services:
       GF_ALERTING_ENABLED: "true"
      GF_PATHS_PROVISIONING: "/etc/grafana/provisioning"
       GF_INSTALL_PLUGINS: "grafana-opensearch-datasource"
-      GF_SERVER_ROOT_URL: "http://localhost/grafana/"
     volumes:
       - ./datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml
       - grafana_data:/var/lib/grafana
       - ./alerting:/etc/grafana/provisioning/alerting
+    restart: "always"
+    ports:
+      - target: 3000
+        published: 3000
+        protocol: tcp
+        mode: host
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
 
   CleanupGraylog:
     image: $SIMPLYBLOCK_DOCKER_IMAGE
@@ -146,8 +136,8 @@ services:
     deploy:
       placement:
         constraints: [node.role == manager]
-    networks:
-      - monitoring-net
+
+### monitoring ###
 
 volumes:
   mongodb_data:
@@ -159,5 +149,6 @@ volumes:
   alertmanager_data:
 
 networks:
-  monitoring-net:
+  hostnet:
     external: true
+    name: host
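
With these changes the monitoring services leave the monitoring-net overlay, and Grafana publishes port 3000 directly on the manager host (host mode) instead of being served under a /grafana sub-path. A quick reachability sketch against Grafana's health endpoint, assuming the manager node is reachable as 127.0.0.1 (the host is an assumption):

import requests

# Grafana's unauthenticated health endpoint; adjust the host to your manager node
r = requests.get("http://127.0.0.1:3000/api/health", timeout=5)
print(r.status_code, r.json())
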
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/docker-compose-swarm.yml
RENAMED
@@ -114,7 +114,6 @@ services:
       - 8404:8404
     networks:
       - localnet
-      - monitoring-net
     volumes:
       - "$DIR/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg"
 
@@ -186,20 +185,9 @@ services:
     networks:
       - hostnet
 
-
+  TasksRunner:
     image: $SIMPLYBLOCK_DOCKER_IMAGE
-    command: "python simplyblock_core/services/
-    deploy:
-      placement:
-        constraints: [node.role == manager]
-    volumes:
-      - "/etc/foundationdb:/etc/foundationdb"
-    networks:
-      - hostnet
-
-  TasksRunnerMigration:
-    image: $SIMPLYBLOCK_DOCKER_IMAGE
-    command: "python simplyblock_core/services/tasks_runner_migration.py"
+    command: "python simplyblock_core/services/job_tasks.py"
     deploy:
       placement:
         constraints: [node.role == manager]
@@ -212,9 +200,6 @@ volumes:
   os_data:
 
 networks:
-  monitoring-net:
-    external: true
-
   hostnet:
     external: true
     name: host
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/scripts/haproxy.cfg
RENAMED
@@ -42,16 +42,6 @@ backend wep_api_services
     balance roundrobin
     server-template webapi- 3 WebAppAPI:5000 check resolvers docker init-addr libc,none
 
-backend grafana_services
-    balance roundrobin
-    http-request set-path %[path,regsub(^/grafana/?,/)]
-    server-template grafana- 1 grafana:3000 check resolvers docker init-addr libc,none
-
-backend graylog_services
-    balance roundrobin
-    http-request set-path %[path,regsub(^/graylog/?,/)]
-    server-template graylog- 1 graylog:9000 check resolvers docker init-addr libc,none
-
 frontend stats_front
     bind *:8404
     stats enable
@@ -62,9 +52,4 @@ frontend stats_front
 
 frontend web_api_front
     bind *:80
-
-    use_backend grafana_services if { path /grafana } || { path_beg /grafana/ }
-    use_backend graylog_services if { path /graylog } || { path_beg /graylog/ }
-
     default_backend wep_api_services
-
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/capacity_and_stats_collector.py
RENAMED
@@ -193,7 +193,7 @@ while True:
             logger.info(f"Device is skipped: {device.get_id()} status: {device.status}")
             continue
         capacity_dict = rpc_client.alceml_get_capacity(device.alceml_bdev)
-        stats_dict = rpc_client.get_device_stats(device.
+        stats_dict = rpc_client.get_device_stats(device.alloc_bdev)
         record = add_device_stats(cl, device, capacity_dict, stats_dict)
         if record:
             devices_records.append(record)
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/device_monitor.py
RENAMED
@@ -5,7 +5,7 @@ import sys
 import uuid
 
 from simplyblock_core import constants, kv_store
-from simplyblock_core.
+from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
 
@@ -27,6 +27,47 @@ db_store = kv_store.KVStore()
 db_controller = kv_store.DBController()
 
 
+def add_device_to_auto_restart(device):
+    tasks = db_controller.get_job_tasks(device.cluster_id)
+    for task in tasks:
+        if task.device_id == device.get_id():
+            if task.status != JobSchedule.STATUS_DONE:
+                logger.info(f"Task found, skip adding new task: {task.get_id()}")
+                return
+
+    ds = JobSchedule()
+    ds.uuid = str(uuid.uuid4())
+    ds.cluster_id = device.cluster_id
+    ds.node_id = device.node_id
+    ds.device_id = device.get_id()
+    ds.date = int(time.time())
+    ds.function_name = "device_restart"
+    ds.status = 'new'
+
+    ds.write_to_db(db_store)
+    return ds.get_id()
+
+
+def add_node_to_auto_restart(node):
+    tasks = db_controller.get_job_tasks(node.cluster_id)
+    for task in tasks:
+        if task.node_id == node.get_id():
+            if task.status != JobSchedule.STATUS_DONE:
+                logger.info(f"Task found, skip adding new task: {task.get_id()}")
+                return
+
+    ds = JobSchedule()
+    ds.uuid = str(uuid.uuid4())
+    ds.cluster_id = node.cluster_id
+    ds.node_id = node.get_id()
+    ds.date = int(time.time())
+    ds.function_name = "node_restart"
+    ds.status = 'new'
+
+    ds.write_to_db(db_store)
+    return ds.get_id()
+
+
 logger.info("Starting Device monitor...")
 while True:
     nodes = db_controller.get_storage_nodes()
@@ -49,8 +90,8 @@ while True:
                 auto_restart_devices.append(dev)
 
         if len(auto_restart_devices) == 1:
-
+            add_device_to_auto_restart(auto_restart_devices[0])
         elif len(auto_restart_devices) >= 2 and len(online_devices) == 0:
-
+            add_node_to_auto_restart(node)
 
     time.sleep(constants.DEV_MONITOR_INTERVAL_SEC)
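
The monitor no longer restarts anything inline; it records the work as JobSchedule entries ("device_restart" / "node_restart"), presumably consumed by the new job_tasks.py runner. A hedged sketch for listing the queued tasks of a cluster, using only the names visible in the hunk above (the cluster id is a placeholder):

from simplyblock_core import kv_store

db_controller = kv_store.DBController()
for task in db_controller.get_job_tasks("<cluster-uuid>"):
    # function_name is "device_restart" or "node_restart"; status starts as 'new'
    print(task.get_id(), task.function_name, task.node_id, task.device_id, task.status)
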
{sbcli_pre-1.2.3 → sbcli_pre-1.2.5}/simplyblock_core/services/distr_event_collector.py
RENAMED
@@ -88,7 +88,7 @@ def process_lvol_event(event):
     if event.message in ["error_open", 'error_read', "error_write", "error_unmap"]:
         vuid = event.object_dict['vuid']
         lvol = None
-        for lv in db_controller.get_lvols():
+        for lv in db_controller.get_lvols():
             if lv.vuid == vuid:
                 lvol = lv
                 break
@@ -127,6 +127,7 @@ def process_event(event_id):
 
 hostname = utils.get_hostname()
 logger.info("Starting Distr event collector...")
+logger.info(f"Node:{hostname}")
 while True:
     time.sleep(constants.DISTR_EVENT_COLLECTOR_INTERVAL_SEC)
 
@@ -140,13 +141,14 @@ while True:
             snode.rpc_port,
             snode.rpc_username,
             snode.rpc_password,
-            timeout=
-
+            timeout=3, retry=2
+        )
+        num_of_events = constants.DISTR_EVENT_COLLECTOR_NUM_OF_EVENTS
         try:
-            events = client.
-
+            # events = client.distr_status_events_get()
+            events = client.distr_status_events_discard_then_get(0, num_of_events)
             if not events:
-                logger.
+                logger.error("Distr events empty")
                 continue
 
             logger.info(f"Found events: {len(events)}")
@@ -159,11 +161,10 @@ while True:
             for eid in event_ids:
                 logger.info(f"Processing event: {eid}")
                 process_event(eid)
-
-
-            client.distr_status_events_discard_then_get(len(events), 0)
+            logger.info(f"Discarding events: {num_of_events}")
+            events = client.distr_status_events_discard_then_get(num_of_events, 0)
 
         except Exception as e:
-            logger.error("Failed to
+            logger.error("Failed to get distr events")
             logger.exception(e)
             continue
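
The collector now pages events through distr_status_events_discard_then_get(discard_count, fetch_count): the first call peeks at up to N events without discarding anything, and the call made after processing discards that batch. This reading of the two arguments is inferred from the hunk above rather than from documented RPC behaviour; the connection details below are placeholders:

from simplyblock_core import constants
from simplyblock_core.rpc_client import RPCClient

client = RPCClient("10.0.0.1", 8080, "user", "pass", timeout=3, retry=2)
num_of_events = constants.DISTR_EVENT_COLLECTOR_NUM_OF_EVENTS

events = client.distr_status_events_discard_then_get(0, num_of_events)   # fetch up to N, discard none
# ... process each returned event ...
client.distr_status_events_discard_then_get(num_of_events, 0)            # discard the processed N
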
sbcli_pre-1.2.5/simplyblock_core/services/health_check_service.py
ADDED
@@ -0,0 +1,136 @@
+# coding=utf-8
+import logging
+
+import time
+import sys
+from datetime import datetime
+
+
+from simplyblock_core.controllers import health_controller, storage_events, device_events
+from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.rpc_client import RPCClient
+from simplyblock_core import constants, kv_store
+
+# Import the GELF logger
+from graypy import GELFUDPHandler
+
+def set_node_health_check(snode, health_check_status):
+    snode = db_controller.get_storage_node_by_id(snode.get_id())
+    if snode.health_check == health_check_status:
+        return
+    old_status = snode.health_check
+    snode.health_check = health_check_status
+    snode.updated_at = str(datetime.now())
+    snode.write_to_db(db_store)
+    storage_events.snode_health_check_change(snode, snode.health_check, old_status, caused_by="monitor")
+
+
+def set_device_health_check(cluster_id, device, health_check_status):
+    if device.health_check == health_check_status:
+        return
+    nodes = db_controller.get_storage_nodes()
+    for node in nodes:
+        if node.nvme_devices:
+            for dev in node.nvme_devices:
+                if dev.get_id() == device.get_id():
+                    old_status = dev.health_check
+                    dev.health_check = health_check_status
+                    node.write_to_db(db_store)
+                    device_events.device_health_check_change(
+                        dev, dev.health_check, old_status, caused_by="monitor")
+
+
+# configure logging
+logger_handler = logging.StreamHandler(stream=sys.stdout)
+logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s'))
+gelf_handler = GELFUDPHandler('0.0.0.0', constants.GELF_PORT)
+logger = logging.getLogger()
+logger.addHandler(gelf_handler)
+logger.addHandler(logger_handler)
+logger.setLevel(logging.DEBUG)
+
+# get DB controller
+db_store = kv_store.KVStore()
+db_controller = kv_store.DBController()
+
+logger.info("Starting health check service")
+while True:
+    cluster_id = ""
+    cl = db_controller.get_clusters()
+    if cl:
+        cluster_id = cl[0].get_id()
+
+    snodes = db_controller.get_storage_nodes()
+    if not snodes:
+        logger.error("storage nodes list is empty")
+
+    for snode in snodes:
+        logger.info("Node: %s, status %s", snode.get_id(), snode.status)
+
+        if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE]:
+            logger.info(f"Node status is: {snode.status}, skipping")
+            continue
+
+        # 1- check node ping
+        ping_check = health_controller._check_node_ping(snode.mgmt_ip)
+        logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}")
+
+        # 2- check node API
+        node_api_check = health_controller._check_node_api(snode.mgmt_ip)
+        logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}")
+
+        if snode.status == StorageNode.STATUS_OFFLINE:
+            set_node_health_check(snode, ping_check & node_api_check)
+            continue
+
+        # 3- check node RPC
+        node_rpc_check = health_controller._check_node_rpc(
+            snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
+        logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}")
+
+        # 4- docker API
+        node_docker_check = health_controller._check_node_docker_api(snode.mgmt_ip)
+        logger.info(f"Check: node docker API {snode.mgmt_ip}:2375 ... {node_docker_check}")
+
+        is_node_online = ping_check and node_api_check and node_rpc_check and node_docker_check
+
+        health_check_status = is_node_online
+        if not node_rpc_check:
+            logger.info("Putting all devices to unavailable state because RPC check failed")
+            for dev in snode.nvme_devices:
+                if dev.io_error:
+                    logger.debug(f"Skipping Device action because of io_error {dev.get_id()}")
+                    continue
+                set_device_health_check(cluster_id, dev, False)
+        else:
+            logger.info(f"Node device count: {len(snode.nvme_devices)}")
+            node_devices_check = True
+            node_remote_devices_check = True
+
+            for dev in snode.nvme_devices:
+                if dev.io_error:
+                    logger.debug(f"Skipping Device check because of io_error {dev.get_id()}")
+                    continue
+                ret = health_controller.check_device(dev.get_id())
+                set_device_health_check(cluster_id, dev, ret)
+                if dev.status == dev.STATUS_ONLINE:
+                    node_devices_check &= ret
+
+            logger.info(f"Node remote device: {len(snode.remote_devices)}")
+            rpc_client = RPCClient(
+                snode.mgmt_ip, snode.rpc_port,
+                snode.rpc_username, snode.rpc_password,
+                timeout=5, retry=3)
+            for remote_device in snode.remote_devices:
+                ret = rpc_client.get_bdevs(remote_device.remote_bdev)
+                if ret:
+                    logger.info(f"Checking bdev: {remote_device.remote_bdev} ... ok")
+                else:
+                    logger.info(f"Checking bdev: {remote_device.remote_bdev} ... not found")
+                node_remote_devices_check &= bool(ret)
+
+            health_check_status = is_node_online and node_devices_check and node_remote_devices_check
+        set_node_health_check(snode, health_check_status)
+
+    time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC)
+
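
The new service folds all the individual probes into a single per-node health_check flag: ping, node API, RPC and docker API decide whether the node counts as online, and device plus remote-bdev checks are layered on top when RPC is reachable. A condensed restatement of that aggregation (this helper is not part of the package, just a summary of the logic above):

def aggregate_node_health(ping_ok, api_ok, rpc_ok, docker_ok,
                          devices_ok=True, remote_devices_ok=True):
    # mirrors health_check_service.py: the node must be fully reachable,
    # and its online devices and remote bdevs must all pass their checks
    is_node_online = ping_ok and api_ok and rpc_ok and docker_ok
    return is_node_online and devices_ok and remote_devices_ok

print(aggregate_node_health(True, True, True, True))    # True
print(aggregate_node_health(True, True, False, True))   # False once RPC is unreachable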