continuent-monitors-nagios 0.0.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/tungsten_nagios_backups +140 -0
- data/bin/tungsten_nagios_connector +10 -3
- data/bin/tungsten_nagios_latency +159 -0
- data/bin/tungsten_nagios_monitor_threads +2 -0
- data/bin/tungsten_nagios_online +134 -0
- data/bin/tungsten_nagios_policy +61 -0
- data/bin/tungsten_nagios_progress +105 -0
- data/bin/tungsten_nagios_services +75 -0
- metadata +16 -18
- data/bin/check_tungsten.sh +0 -576
- data/bin/check_tungsten_backups +0 -70
- data/bin/check_tungsten_latency +0 -172
- data/bin/check_tungsten_online +0 -105
- data/bin/check_tungsten_policy +0 -61
- data/bin/check_tungsten_progress +0 -81
- data/bin/check_tungsten_services +0 -95
data/bin/check_tungsten_backups
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# The script should be run as the tungsten user to ensure the
|
4
|
-
# environment is set correctly. Export the environment variables before
|
5
|
-
# calling the script to modify behavior
|
6
|
-
#
|
7
|
-
|
8
|
-
require "#{File.dirname(__FILE__)}/../lib/ruby/tungsten"
|
9
|
-
|
10
|
-
class CheckTungstenBackups
|
11
|
-
include TungstenScript
|
12
|
-
|
13
|
-
def main
|
14
|
-
status = TI.status()
|
15
|
-
unless status.coordinator() == TI.hostname()
|
16
|
-
nagios_ok("Not running check because this node is not the coordinator")
|
17
|
-
end
|
18
|
-
seconds_since_epoch = TU.cmd_result("date +%s").to_i()
|
19
|
-
most_recent_backup = nil
|
20
|
-
|
21
|
-
status.datasources().each{
|
22
|
-
|ds|
|
23
|
-
begin
|
24
|
-
TU.ssh_result("stat -c\"%n %Y\" #{TI.trepctl_property(status.name(), 'replicator.storage.agent.fs.directory')}/store*.properties 2>/dev/null", ds, TI.user()).split("\n").each{
|
25
|
-
|line|
|
26
|
-
stored_backup=line.split(" ")
|
27
|
-
stored_backup[1] = stored_backup[1].to_i()
|
28
|
-
|
29
|
-
if most_recent_backup == nil || stored_backup[1] > most_recent_backup[:seconds]
|
30
|
-
most_recent_backup = {
|
31
|
-
:hostname => ds,
|
32
|
-
:filename => stored_backup[0],
|
33
|
-
:seconds => stored_backup[1]
|
34
|
-
}
|
35
|
-
end
|
36
|
-
}
|
37
|
-
rescue CommandError
|
38
|
-
end
|
39
|
-
}
|
40
|
-
|
41
|
-
if most_recent_backup == nil
|
42
|
-
nagios_critical("Unable to find a backup on any datasource")
|
43
|
-
end
|
44
|
-
|
45
|
-
age = seconds_since_epoch-most_recent_backup[:seconds]
|
46
|
-
if age > @options[:max_backup_age]
|
47
|
-
nagios_critical("#{most_recent_backup[:hostname]}:#{most_recent_backup[:filename]} [#{age}s] is older than #{@options[:max_backup_age]}s")
|
48
|
-
else
|
49
|
-
nagios_ok("The most recent backup is #{most_recent_backup[:hostname]}:#{most_recent_backup[:filename]} [#{age}s]")
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def configure
|
54
|
-
super()
|
55
|
-
|
56
|
-
description("Check all local datasources to make sure one of them has a backup younger than the max allowed age")
|
57
|
-
add_option(:max_backup_age, {
|
58
|
-
:on => "--max-backup-age String",
|
59
|
-
:help => "Maximum allowed age in seconds of a backup on any machine",
|
60
|
-
:parse => method(:parse_integer_option),
|
61
|
-
:default => 86400
|
62
|
-
})
|
63
|
-
end
|
64
|
-
|
65
|
-
def script_name
|
66
|
-
"check_tungsten_backups"
|
67
|
-
end
|
68
|
-
|
69
|
-
self.new().run()
|
70
|
-
end
|
data/bin/check_tungsten_latency
DELETED
@@ -1,172 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
#
|
3
|
-
# Simple Bash Script To Check Tungsten Latency
|
4
|
-
# Nagios Plugin For NRPE
|
5
|
-
#
|
6
|
-
# This script accepts two arguments, {{-w}} and {{-c}}. The {{-w}} flag is
|
7
|
-
# the level at which a warning should be returned. {{-c}} sets the level for
|
8
|
-
# a critical return value. The script uses the maximum latency of any slave
|
9
|
-
# to determine the return value.
|
10
|
-
#
|
11
|
-
OK_STATE=0
|
12
|
-
WARNING_STATE=1
|
13
|
-
CRITICAL_STATE=2
|
14
|
-
THOME=`dirname $0`
|
15
|
-
|
16
|
-
function display_help()
|
17
|
-
{
|
18
|
-
echo "Usage: ./check_tungsten_latency -w warning_level -c critical_level [-h]"
|
19
|
-
echo " -w Throw a warning alert if the maximum latency"
|
20
|
-
echo " is above this level"
|
21
|
-
echo " -c Throw a critical alert if the maximum latency"
|
22
|
-
echo " is above this level"
|
23
|
-
echo " --perfdata Display performance data of the latency"
|
24
|
-
echo " --perslave-perfdata Show performance latency values of each slave."
|
25
|
-
echo " If this is not set the maximum latency will be"
|
26
|
-
echo " displayed in the performace data"
|
27
|
-
echo " -h Display this message"
|
28
|
-
exit 0
|
29
|
-
}
|
30
|
-
|
31
|
-
# We will use this to make some floating point comparisons
|
32
|
-
function float_cond()
|
33
|
-
{
|
34
|
-
local cond=0
|
35
|
-
if [[ $# -gt 0 ]]; then
|
36
|
-
cond=$(echo "$*" | bc -q 2>&1)
|
37
|
-
if [[ $? -ne 0 ]]; then
|
38
|
-
echo "Error: $cond"
|
39
|
-
exit 1
|
40
|
-
fi
|
41
|
-
if [[ -z "$cond" ]]; then cond=0; fi
|
42
|
-
if [[ "$cond" != 0 && "$cond" != 1 ]]; then cond=0; fi
|
43
|
-
fi
|
44
|
-
local stat=$((cond == 0))
|
45
|
-
return $stat
|
46
|
-
}
|
47
|
-
|
48
|
-
warning_level=0
|
49
|
-
critical_level=0
|
50
|
-
perfdata="false"
|
51
|
-
performance_data_default_glue=""
|
52
|
-
performance_data_suffix=""
|
53
|
-
perfdata_allslaves=""
|
54
|
-
|
55
|
-
for arg
|
56
|
-
do
|
57
|
-
delim=""
|
58
|
-
case "$arg" in
|
59
|
-
#translate --gnu-long-options to -g (short options)
|
60
|
-
--perfdata) args="${args}-p ";;
|
61
|
-
--perslave-perfdata) args="${args}-s ";;
|
62
|
-
#pass through anything else
|
63
|
-
*) [[ "${arg:0:1}" == "-" ]] || delim="\""
|
64
|
-
args="${args}${delim}${arg}${delim} ";;
|
65
|
-
esac
|
66
|
-
done
|
67
|
-
|
68
|
-
#Reset the positional parameters to the short options
|
69
|
-
eval set -- $args
|
70
|
-
|
71
|
-
while getopts "w:c:h:nps" Option
|
72
|
-
do
|
73
|
-
case $Option in
|
74
|
-
w )
|
75
|
-
warning_level=$OPTARG
|
76
|
-
;;
|
77
|
-
c )
|
78
|
-
critical_level=$OPTARG
|
79
|
-
;;
|
80
|
-
h )
|
81
|
-
display_help
|
82
|
-
;;
|
83
|
-
p )
|
84
|
-
perfdata="true"
|
85
|
-
;;
|
86
|
-
s )
|
87
|
-
perfdata_allslaves="true"
|
88
|
-
;;
|
89
|
-
esac
|
90
|
-
done
|
91
|
-
if float_cond "$warning_level == 0"; then
|
92
|
-
echo "Error: warning_level has not been set"
|
93
|
-
echo ""
|
94
|
-
display_help
|
95
|
-
fi
|
96
|
-
|
97
|
-
if float_cond "$critical_level == 0"; then
|
98
|
-
echo "Error: critical_level has not been set"
|
99
|
-
echo ""
|
100
|
-
display_help
|
101
|
-
fi
|
102
|
-
|
103
|
-
if [ "$perfdata" == "true" ]; then
|
104
|
-
performance_data_default_glue=" "
|
105
|
-
performance_data_suffix=";$warning_level;$critical_level;;"
|
106
|
-
fi
|
107
|
-
|
108
|
-
error_message=""
|
109
|
-
error_messaage_glue=""
|
110
|
-
performance_data_glue=""
|
111
|
-
performance_data="| "
|
112
|
-
max_latency=0
|
113
|
-
|
114
|
-
manager_running=`${THOME}/../../tungsten-manager/bin/manager status | grep "PID" | wc -l`
|
115
|
-
# Check the manager status
|
116
|
-
if [ $manager_running -eq 0 ]; then
|
117
|
-
echo "CRITICAL: Manager is not running"
|
118
|
-
exit $CRITICAL_STATE
|
119
|
-
fi
|
120
|
-
|
121
|
-
latency_values=`echo "ls -l" | ${THOME}/../../tungsten-manager/bin/cctrl | grep -E "REPLICATOR\(role=[relay|slave]|appliedLatency" | tr -d "| "`
|
122
|
-
|
123
|
-
current_slave=""
|
124
|
-
for line in $latency_values
|
125
|
-
do
|
126
|
-
if [[ $current_slave == "" ]]
|
127
|
-
then
|
128
|
-
current_slave=`echo $line | grep "REPLICATOR" | cut -f 1 -d ":"`
|
129
|
-
else
|
130
|
-
applied_latency=`echo $line | grep "appliedLatency" | cut -f 2 -d ":"`
|
131
|
-
|
132
|
-
if float_cond "$applied_latency > $max_latency"; then
|
133
|
-
max_latency=$applied_latency
|
134
|
-
fi
|
135
|
-
|
136
|
-
if float_cond "$applied_latency > $warning_level"; then
|
137
|
-
error_message="$error_message$error_message_glue$current_slave=$applied_latency""s"
|
138
|
-
error_message_glue=", "
|
139
|
-
fi
|
140
|
-
|
141
|
-
if float_cond "$applied_latency == -1"; then
|
142
|
-
error_message="$error_message$error_message_glue$current_slave is missing latency information"
|
143
|
-
error_message_glue=", "
|
144
|
-
fi
|
145
|
-
|
146
|
-
performance_data="$performance_data$performance_data_glue$current_slave=$applied_latency$performance_data_suffix"
|
147
|
-
performance_data_glue="$performance_data_default_glue"
|
148
|
-
current_slave=""
|
149
|
-
fi
|
150
|
-
done
|
151
|
-
|
152
|
-
if [ "$perfdata_allslaves" != "true" ]; then
|
153
|
-
performance_data="| max_latency=${max_latency}$performance_data_suffix"
|
154
|
-
fi
|
155
|
-
|
156
|
-
if [ "$perfdata" == "false" ]; then
|
157
|
-
performance_data=""
|
158
|
-
fi
|
159
|
-
|
160
|
-
if float_cond "$max_latency > $critical_level"; then
|
161
|
-
echo "CRITICAL: $error_message $performance_data"
|
162
|
-
exit $CRITICAL_STATE
|
163
|
-
fi
|
164
|
-
|
165
|
-
if [[ $error_message != "" ]]; then
|
166
|
-
echo "WARNING: $error_message $performance_data"
|
167
|
-
exit $WARNING_STATE
|
168
|
-
fi
|
169
|
-
|
170
|
-
echo "OK: All slaves are running normally (max_latency=${max_latency}) $performance_data "
|
171
|
-
|
172
|
-
exit $OK_STATE
|
data/bin/check_tungsten_online
DELETED
@@ -1,105 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# Copyright (C) 2014 Continuent, Inc.
|
3
|
-
#
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
5
|
-
# not use this file except in compliance with the License. You may obtain
|
6
|
-
# a copy of the License at
|
7
|
-
#
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
-
#
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
12
|
-
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
13
|
-
# License for the specific language governing permissions and limitations
|
14
|
-
# under the License.
|
15
|
-
#
|
16
|
-
# Initial developer(s): Jeff Mace
|
17
|
-
# Contributor(s):
|
18
|
-
#
|
19
|
-
# Simple Bash Script To Check Tungsten Services
|
20
|
-
# Nagios Plugin For NRPE
|
21
|
-
#
|
22
|
-
# This script does not accept any arguments. It will return a warning if any
|
23
|
-
# of Tungsten resources is not in an {{ONLINE}} state. It uses the output of
|
24
|
-
# the {{ls resources}} command to determine the current state.
|
25
|
-
#
|
26
|
-
OK_STATE=0
|
27
|
-
WARNING_STATE=1
|
28
|
-
CRITICAL_STATE=2
|
29
|
-
THOME=`dirname $0`
|
30
|
-
|
31
|
-
error_message=""
|
32
|
-
error_messaage_glue=""
|
33
|
-
offline_count=0
|
34
|
-
dataservice=""
|
35
|
-
skip_shun=0
|
36
|
-
|
37
|
-
function display_help()
|
38
|
-
{
|
39
|
-
echo "Usage: ./check_tungsten_online -s dataservice [-h]"
|
40
|
-
echo " -s The data service you would like to check"
|
41
|
-
echo " -h Display this message"
|
42
|
-
echo " -n Skip Shunned Services"
|
43
|
-
exit 0
|
44
|
-
}
|
45
|
-
|
46
|
-
while getopts "s:h:n" Option
|
47
|
-
do
|
48
|
-
case $Option in
|
49
|
-
h )
|
50
|
-
display_help
|
51
|
-
;;
|
52
|
-
s )
|
53
|
-
dataservice=$OPTARG
|
54
|
-
;;
|
55
|
-
n )
|
56
|
-
skip_shun=1
|
57
|
-
;;
|
58
|
-
esac
|
59
|
-
done
|
60
|
-
|
61
|
-
manager_running=`${THOME}/../../tungsten-manager/bin/manager status | grep "PID" | wc -l`
|
62
|
-
# Check the manager status
|
63
|
-
if [ $manager_running -eq 0 ]; then
|
64
|
-
echo "CRITICAL: Manager is not running"
|
65
|
-
exit $CRITICAL_STATE
|
66
|
-
fi
|
67
|
-
|
68
|
-
if [ "$dataservice" == "" ]; then
|
69
|
-
offline_services=`echo "ls resources" | ${THOME}/../../tungsten-manager/bin/cctrl | grep \| | grep : | grep -v ONLINE | tr -d "| " | cut -f 1,2 -d ":"`
|
70
|
-
else
|
71
|
-
offline_services=`echo "use $dataservice; ls" | ${THOME}/../../tungsten-manager/bin/cctrl -multi | grep "(\(composite \)\?master\|(\(composite \)\?slave\|(relay" | grep -v ONLINE | tr -d "|" | cut -f 1 -d "("`
|
72
|
-
fi
|
73
|
-
|
74
|
-
for offline_service in $offline_services
|
75
|
-
do
|
76
|
-
offline_count=$(($offline_count+1))
|
77
|
-
error_message="$error_message$error_message_glue$offline_service"
|
78
|
-
error_message_glue=", "
|
79
|
-
done
|
80
|
-
|
81
|
-
if [ $offline_count -gt 0 ]
|
82
|
-
then
|
83
|
-
echo "CRITICAL: $error_message are not ONLINE"
|
84
|
-
exit $CRITICAL_STATE
|
85
|
-
fi
|
86
|
-
|
87
|
-
|
88
|
-
if [ $skip_shun -eq 0 ]
|
89
|
-
then
|
90
|
-
if [ "$dataservice" == "" ]; then
|
91
|
-
shunned=`echo "ls" | ${THOME}/../../tungsten-manager/bin/cctrl | grep 'SHUNNED' | wc -l`
|
92
|
-
else
|
93
|
-
shunned=`echo "use $dataservice; ls" | ${THOME}/../../tungsten-manager/bin/cctrl -multi | grep 'SHUNNED' | wc -l`
|
94
|
-
fi
|
95
|
-
|
96
|
-
if [ $shunned -gt 0 ]
|
97
|
-
then
|
98
|
-
echo "CRITICAL: Dataservices are shunned"
|
99
|
-
exit $CRITICAL_STATE
|
100
|
-
fi
|
101
|
-
fi
|
102
|
-
|
103
|
-
|
104
|
-
echo "OK: All services are online"
|
105
|
-
exit $OK_STATE
|
data/bin/check_tungsten_policy
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
#
|
3
|
-
# Simple Bash Script To Check Tungsten Policy
|
4
|
-
# Nagios Plugin For NRPE
|
5
|
-
#
|
6
|
-
# This script does not accept any arguments. It will return error if the
|
7
|
-
# cluster is in maintenance mode
|
8
|
-
#
|
9
|
-
OK_STATE=0
|
10
|
-
WARNING_STATE=1
|
11
|
-
CRITICAL_STATE=2
|
12
|
-
THOME=`dirname $0`
|
13
|
-
|
14
|
-
error_message=""
|
15
|
-
error_messaage_glue=""
|
16
|
-
offline_count=0
|
17
|
-
dataservice=""
|
18
|
-
|
19
|
-
function display_help()
|
20
|
-
{
|
21
|
-
echo "Usage: ./check_tungsten_policy -s dataservice [-h]"
|
22
|
-
echo " -s The data service you would like to check"
|
23
|
-
echo " -h Display this message"
|
24
|
-
exit 0
|
25
|
-
}
|
26
|
-
|
27
|
-
while getopts "s:h" Option
|
28
|
-
do
|
29
|
-
case $Option in
|
30
|
-
h )
|
31
|
-
display_help
|
32
|
-
;;
|
33
|
-
s )
|
34
|
-
dataservice=$OPTARG
|
35
|
-
;;
|
36
|
-
esac
|
37
|
-
done
|
38
|
-
|
39
|
-
manager_running=`${THOME}/../../tungsten-manager/bin/manager status | grep "PID" | wc -l`
|
40
|
-
# Check the manager status
|
41
|
-
if [ $manager_running -eq 0 ]; then
|
42
|
-
echo "CRITICAL: Manager is not running"
|
43
|
-
exit $CRITICAL_STATE
|
44
|
-
fi
|
45
|
-
|
46
|
-
if [ "$dataservice" == "" ]; then
|
47
|
-
maint_mode=`echo "ls " | ${THOME}/../../tungsten-manager/bin/cctrl | grep MAINTENANCE | wc -l`
|
48
|
-
else
|
49
|
-
maint_mode=`echo "use $dataservice; ls " | ${THOME}/../../tungsten-manager/bin/cctrl | grep MAINTENANCE | wc -l`
|
50
|
-
fi
|
51
|
-
|
52
|
-
|
53
|
-
if [ $maint_mode -gt 0 ]
|
54
|
-
then
|
55
|
-
echo "CRITICAL: Cluster is in Maintenance mode"
|
56
|
-
exit $CRITICAL_STATE
|
57
|
-
fi
|
58
|
-
|
59
|
-
|
60
|
-
echo "OK: Cluster is in Automatic Mode"
|
61
|
-
exit $OK_STATE
|
data/bin/check_tungsten_progress
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
#
|
3
|
-
# Simple Bash Script To Check Tungsten Progress
|
4
|
-
# Nagios Plugin For NRPE
|
5
|
-
#
|
6
|
-
# This script accepts three arguments, {{-w}}, {{-c}} and {{-t}}. The {{-w}} flag
|
7
|
-
# is the level at which a warning should be returned. {{-c}} sets the level for
|
8
|
-
# a critical return value. The {{-t}} flag sets the amount of time used to wait
|
9
|
-
# between monitoring the progress of the cluster. The script uses the difference
|
10
|
-
# between the values to determine if a warning or critical alert should be
|
11
|
-
# issued
|
12
|
-
#
|
13
|
-
OK_STATE=0
|
14
|
-
WARNING_STATE=1
|
15
|
-
CRITICAL_STATE=2
|
16
|
-
THOME=`dirname $0`
|
17
|
-
|
18
|
-
function display_help()
|
19
|
-
{
|
20
|
-
echo "Usage: ./check_tungsten_progress -t time [-h]"
|
21
|
-
echo " -t The number of seconds to wait when monitoring progress"
|
22
|
-
echo " -h Display this message"
|
23
|
-
exit 0
|
24
|
-
}
|
25
|
-
|
26
|
-
# We will use this to make some floating point comparisons
|
27
|
-
function float_cond()
|
28
|
-
{
|
29
|
-
local cond=0
|
30
|
-
if [[ $# -gt 0 ]]; then
|
31
|
-
cond=$(echo "$*" | bc -q 2>&1)
|
32
|
-
if [[ $? -ne 0 ]]; then
|
33
|
-
echo "Error: $cond"
|
34
|
-
exit 1
|
35
|
-
fi
|
36
|
-
if [[ -z "$cond" ]]; then cond=0; fi
|
37
|
-
if [[ "$cond" != 0 && "$cond" != 1 ]]; then cond=0; fi
|
38
|
-
fi
|
39
|
-
local stat=$((cond == 0))
|
40
|
-
return $stat
|
41
|
-
}
|
42
|
-
|
43
|
-
time_period=1
|
44
|
-
while getopts "t:h" Option
|
45
|
-
do
|
46
|
-
case $Option in
|
47
|
-
t)
|
48
|
-
time_period=$OPTARG
|
49
|
-
;;
|
50
|
-
h )
|
51
|
-
display_help
|
52
|
-
;;
|
53
|
-
esac
|
54
|
-
done
|
55
|
-
|
56
|
-
if float_cond "$time_period == 0"; then
|
57
|
-
echo "Error: time_period has not been set"
|
58
|
-
echo ""
|
59
|
-
display_help
|
60
|
-
fi
|
61
|
-
|
62
|
-
is_online=`${THOME}/../../tungsten-replicator/bin/trepctl status | grep "state" | grep "ONLINE" | wc -l`
|
63
|
-
if float_cond "$is_online == 0"; then
|
64
|
-
echo "CRITICAL: Replicator is not ONLINE"
|
65
|
-
exit $CRITICAL_STATE
|
66
|
-
fi
|
67
|
-
|
68
|
-
pre_progress_number=`${THOME}/../../tungsten-replicator/bin/trepctl status | grep "appliedLastSeqno" | tr -d "| " | awk -F":" '{print $2}'`
|
69
|
-
echo "cluster heartbeat" | ${THOME}/../../tungsten-manager/bin/cctrl > /dev/null
|
70
|
-
sleep $time_period
|
71
|
-
post_progress_number=`${THOME}/../../tungsten-replicator/bin/trepctl status | grep "appliedLastSeqno" | tr -d "| " | awk -F":" '{print $2}'`
|
72
|
-
|
73
|
-
progress_number=`echo "$post_progress_number - $pre_progress_number" | bc -q 2>/dev/null`
|
74
|
-
|
75
|
-
if float_cond "$progress_number < 1"; then
|
76
|
-
echo "WARNING: Replicator did not show progress"
|
77
|
-
exit $WARNING_STATE
|
78
|
-
fi
|
79
|
-
|
80
|
-
echo "OK: Replicator is making progress"
|
81
|
-
exit $OK_STATE
|