continuent-monitors-nagios 0.0.3 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/tungsten_nagios_backups +140 -0
- data/bin/tungsten_nagios_connector +10 -3
- data/bin/tungsten_nagios_latency +159 -0
- data/bin/tungsten_nagios_monitor_threads +2 -0
- data/bin/tungsten_nagios_online +134 -0
- data/bin/tungsten_nagios_policy +61 -0
- data/bin/tungsten_nagios_progress +105 -0
- data/bin/tungsten_nagios_services +75 -0
- metadata +16 -18
- data/bin/check_tungsten.sh +0 -576
- data/bin/check_tungsten_backups +0 -70
- data/bin/check_tungsten_latency +0 -172
- data/bin/check_tungsten_online +0 -105
- data/bin/check_tungsten_policy +0 -61
- data/bin/check_tungsten_progress +0 -81
- data/bin/check_tungsten_services +0 -95
data/bin/check_tungsten_backups
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# The script should be run as the tungsten user to ensure the
|
4
|
-
# environment is set correctly. Export the environment variables before
|
5
|
-
# calling the script to modify behavior
|
6
|
-
#
|
7
|
-
|
8
|
-
require "#{File.dirname(__FILE__)}/../lib/ruby/tungsten"
|
9
|
-
|
10
|
-
class CheckTungstenBackups
|
11
|
-
include TungstenScript
|
12
|
-
|
13
|
-
def main
|
14
|
-
status = TI.status()
|
15
|
-
unless status.coordinator() == TI.hostname()
|
16
|
-
nagios_ok("Not running check because this node is not the coordinator")
|
17
|
-
end
|
18
|
-
seconds_since_epoch = TU.cmd_result("date +%s").to_i()
|
19
|
-
most_recent_backup = nil
|
20
|
-
|
21
|
-
status.datasources().each{
|
22
|
-
|ds|
|
23
|
-
begin
|
24
|
-
TU.ssh_result("stat -c\"%n %Y\" #{TI.trepctl_property(status.name(), 'replicator.storage.agent.fs.directory')}/store*.properties 2>/dev/null", ds, TI.user()).split("\n").each{
|
25
|
-
|line|
|
26
|
-
stored_backup=line.split(" ")
|
27
|
-
stored_backup[1] = stored_backup[1].to_i()
|
28
|
-
|
29
|
-
if most_recent_backup == nil || stored_backup[1] > most_recent_backup[:seconds]
|
30
|
-
most_recent_backup = {
|
31
|
-
:hostname => ds,
|
32
|
-
:filename => stored_backup[0],
|
33
|
-
:seconds => stored_backup[1]
|
34
|
-
}
|
35
|
-
end
|
36
|
-
}
|
37
|
-
rescue CommandError
|
38
|
-
end
|
39
|
-
}
|
40
|
-
|
41
|
-
if most_recent_backup == nil
|
42
|
-
nagios_critical("Unable to find a backup on any datasource")
|
43
|
-
end
|
44
|
-
|
45
|
-
age = seconds_since_epoch-most_recent_backup[:seconds]
|
46
|
-
if age > @options[:max_backup_age]
|
47
|
-
nagios_critical("#{most_recent_backup[:hostname]}:#{most_recent_backup[:filename]} [#{age}s] is older than #{@options[:max_backup_age]}s")
|
48
|
-
else
|
49
|
-
nagios_ok("The most recent backup is #{most_recent_backup[:hostname]}:#{most_recent_backup[:filename]} [#{age}s]")
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def configure
|
54
|
-
super()
|
55
|
-
|
56
|
-
description("Check all local datasources to make sure one of them has a backup younger than the max allowed age")
|
57
|
-
add_option(:max_backup_age, {
|
58
|
-
:on => "--max-backup-age String",
|
59
|
-
:help => "Maximum allowed age in seconds of a backup on any machine",
|
60
|
-
:parse => method(:parse_integer_option),
|
61
|
-
:default => 86400
|
62
|
-
})
|
63
|
-
end
|
64
|
-
|
65
|
-
def script_name
|
66
|
-
"check_tungsten_backups"
|
67
|
-
end
|
68
|
-
|
69
|
-
self.new().run()
|
70
|
-
end
|
data/bin/check_tungsten_latency
DELETED
@@ -1,172 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
#
|
3
|
-
# Simple Bash Script To Check Tungsten Latency
|
4
|
-
# Nagios Plugin For NRPE
|
5
|
-
#
|
6
|
-
# This script accepts two arguments, {{-w}} and {{-c}}. The {{-w}} flag is
|
7
|
-
# the level at which a warning should be returned. {{-c}} sets the level for
|
8
|
-
# a critical return value. The script uses the maximum latency of any slave
|
9
|
-
# to determine the return value.
|
10
|
-
#
|
11
|
-
OK_STATE=0
|
12
|
-
WARNING_STATE=1
|
13
|
-
CRITICAL_STATE=2
|
14
|
-
THOME=`dirname $0`
|
15
|
-
|
16
|
-
function display_help()
|
17
|
-
{
|
18
|
-
echo "Usage: ./check_tungsten_latency -w warning_level -c critical_level [-h]"
|
19
|
-
echo " -w Throw a warning alert if the maximum latency"
|
20
|
-
echo " is above this level"
|
21
|
-
echo " -c Throw a critical alert if the maximum latency"
|
22
|
-
echo " is above this level"
|
23
|
-
echo " --perfdata Display performance data of the latency"
|
24
|
-
echo " --perslave-perfdata Show performance latency values of each slave."
|
25
|
-
echo " If this is not set the maximum latency will be"
|
26
|
-
echo " displayed in the performace data"
|
27
|
-
echo " -h Display this message"
|
28
|
-
exit 0
|
29
|
-
}
|
30
|
-
|
31
|
-
# We will use this to make some floating point comparisons
|
32
|
-
function float_cond()
|
33
|
-
{
|
34
|
-
local cond=0
|
35
|
-
if [[ $# -gt 0 ]]; then
|
36
|
-
cond=$(echo "$*" | bc -q 2>&1)
|
37
|
-
if [[ $? -ne 0 ]]; then
|
38
|
-
echo "Error: $cond"
|
39
|
-
exit 1
|
40
|
-
fi
|
41
|
-
if [[ -z "$cond" ]]; then cond=0; fi
|
42
|
-
if [[ "$cond" != 0 && "$cond" != 1 ]]; then cond=0; fi
|
43
|
-
fi
|
44
|
-
local stat=$((cond == 0))
|
45
|
-
return $stat
|
46
|
-
}
|
47
|
-
|
48
|
-
warning_level=0
|
49
|
-
critical_level=0
|
50
|
-
perfdata="false"
|
51
|
-
performance_data_default_glue=""
|
52
|
-
performance_data_suffix=""
|
53
|
-
perfdata_allslaves=""
|
54
|
-
|
55
|
-
for arg
|
56
|
-
do
|
57
|
-
delim=""
|
58
|
-
case "$arg" in
|
59
|
-
#translate --gnu-long-options to -g (short options)
|
60
|
-
--perfdata) args="${args}-p ";;
|
61
|
-
--perslave-perfdata) args="${args}-s ";;
|
62
|
-
#pass through anything else
|
63
|
-
*) [[ "${arg:0:1}" == "-" ]] || delim="\""
|
64
|
-
args="${args}${delim}${arg}${delim} ";;
|
65
|
-
esac
|
66
|
-
done
|
67
|
-
|
68
|
-
#Reset the positional parameters to the short options
|
69
|
-
eval set -- $args
|
70
|
-
|
71
|
-
while getopts "w:c:h:nps" Option
|
72
|
-
do
|
73
|
-
case $Option in
|
74
|
-
w )
|
75
|
-
warning_level=$OPTARG
|
76
|
-
;;
|
77
|
-
c )
|
78
|
-
critical_level=$OPTARG
|
79
|
-
;;
|
80
|
-
h )
|
81
|
-
display_help
|
82
|
-
;;
|
83
|
-
p )
|
84
|
-
perfdata="true"
|
85
|
-
;;
|
86
|
-
s )
|
87
|
-
perfdata_allslaves="true"
|
88
|
-
;;
|
89
|
-
esac
|
90
|
-
done
|
91
|
-
if float_cond "$warning_level == 0"; then
|
92
|
-
echo "Error: warning_level has not been set"
|
93
|
-
echo ""
|
94
|
-
display_help
|
95
|
-
fi
|
96
|
-
|
97
|
-
if float_cond "$critical_level == 0"; then
|
98
|
-
echo "Error: critical_level has not been set"
|
99
|
-
echo ""
|
100
|
-
display_help
|
101
|
-
fi
|
102
|
-
|
103
|
-
if [ "$perfdata" == "true" ]; then
|
104
|
-
performance_data_default_glue=" "
|
105
|
-
performance_data_suffix=";$warning_level;$critical_level;;"
|
106
|
-
fi
|
107
|
-
|
108
|
-
error_message=""
|
109
|
-
error_messaage_glue=""
|
110
|
-
performance_data_glue=""
|
111
|
-
performance_data="| "
|
112
|
-
max_latency=0
|
113
|
-
|
114
|
-
manager_running=`${THOME}/../../tungsten-manager/bin/manager status | grep "PID" | wc -l`
|
115
|
-
# Check the manager status
|
116
|
-
if [ $manager_running -eq 0 ]; then
|
117
|
-
echo "CRITICAL: Manager is not running"
|
118
|
-
exit $CRITICAL_STATE
|
119
|
-
fi
|
120
|
-
|
121
|
-
latency_values=`echo "ls -l" | ${THOME}/../../tungsten-manager/bin/cctrl | grep -E "REPLICATOR\(role=[relay|slave]|appliedLatency" | tr -d "| "`
|
122
|
-
|
123
|
-
current_slave=""
|
124
|
-
for line in $latency_values
|
125
|
-
do
|
126
|
-
if [[ $current_slave == "" ]]
|
127
|
-
then
|
128
|
-
current_slave=`echo $line | grep "REPLICATOR" | cut -f 1 -d ":"`
|
129
|
-
else
|
130
|
-
applied_latency=`echo $line | grep "appliedLatency" | cut -f 2 -d ":"`
|
131
|
-
|
132
|
-
if float_cond "$applied_latency > $max_latency"; then
|
133
|
-
max_latency=$applied_latency
|
134
|
-
fi
|
135
|
-
|
136
|
-
if float_cond "$applied_latency > $warning_level"; then
|
137
|
-
error_message="$error_message$error_message_glue$current_slave=$applied_latency""s"
|
138
|
-
error_message_glue=", "
|
139
|
-
fi
|
140
|
-
|
141
|
-
if float_cond "$applied_latency == -1"; then
|
142
|
-
error_message="$error_message$error_message_glue$current_slave is missing latency information"
|
143
|
-
error_message_glue=", "
|
144
|
-
fi
|
145
|
-
|
146
|
-
performance_data="$performance_data$performance_data_glue$current_slave=$applied_latency$performance_data_suffix"
|
147
|
-
performance_data_glue="$performance_data_default_glue"
|
148
|
-
current_slave=""
|
149
|
-
fi
|
150
|
-
done
|
151
|
-
|
152
|
-
if [ "$perfdata_allslaves" != "true" ]; then
|
153
|
-
performance_data="| max_latency=${max_latency}$performance_data_suffix"
|
154
|
-
fi
|
155
|
-
|
156
|
-
if [ "$perfdata" == "false" ]; then
|
157
|
-
performance_data=""
|
158
|
-
fi
|
159
|
-
|
160
|
-
if float_cond "$max_latency > $critical_level"; then
|
161
|
-
echo "CRITICAL: $error_message $performance_data"
|
162
|
-
exit $CRITICAL_STATE
|
163
|
-
fi
|
164
|
-
|
165
|
-
if [[ $error_message != "" ]]; then
|
166
|
-
echo "WARNING: $error_message $performance_data"
|
167
|
-
exit $WARNING_STATE
|
168
|
-
fi
|
169
|
-
|
170
|
-
echo "OK: All slaves are running normally (max_latency=${max_latency}) $performance_data "
|
171
|
-
|
172
|
-
exit $OK_STATE
|
data/bin/check_tungsten_online
DELETED
@@ -1,105 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
# Copyright (C) 2014 Continuent, Inc.
|
3
|
-
#
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
5
|
-
# not use this file except in compliance with the License. You may obtain
|
6
|
-
# a copy of the License at
|
7
|
-
#
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
-
#
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
12
|
-
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
13
|
-
# License for the specific language governing permissions and limitations
|
14
|
-
# under the License.
|
15
|
-
#
|
16
|
-
# Initial developer(s): Jeff Mace
|
17
|
-
# Contributor(s):
|
18
|
-
#
|
19
|
-
# Simple Bash Script To Check Tungsten Services
|
20
|
-
# Nagios Plugin For NRPE
|
21
|
-
#
|
22
|
-
# This script accepts optional {{-s}}, {{-n}} and {{-h}} flags. It returns critical if any
|
23
|
-
# of Tungsten resources is not in an {{ONLINE}} state. It uses the output of
|
24
|
-
# the {{ls resources}} command to determine the current state.
|
25
|
-
#
|
26
|
-
OK_STATE=0
|
27
|
-
WARNING_STATE=1
|
28
|
-
CRITICAL_STATE=2
|
29
|
-
THOME=`dirname $0`
|
30
|
-
|
31
|
-
error_message=""
|
32
|
-
error_messaage_glue=""
|
33
|
-
offline_count=0
|
34
|
-
dataservice=""
|
35
|
-
skip_shun=0
|
36
|
-
|
37
|
-
function display_help()
|
38
|
-
{
|
39
|
-
echo "Usage: ./check_tungsten_online -s dataservice [-h]"
|
40
|
-
echo " -s The data service you would like to check"
|
41
|
-
echo " -h Display this message"
|
42
|
-
echo " -n Skip Shunned Services"
|
43
|
-
exit 0
|
44
|
-
}
|
45
|
-
|
46
|
-
while getopts "s:h:n" Option
|
47
|
-
do
|
48
|
-
case $Option in
|
49
|
-
h )
|
50
|
-
display_help
|
51
|
-
;;
|
52
|
-
s )
|
53
|
-
dataservice=$OPTARG
|
54
|
-
;;
|
55
|
-
n )
|
56
|
-
skip_shun=1
|
57
|
-
;;
|
58
|
-
esac
|
59
|
-
done
|
60
|
-
|
61
|
-
manager_running=`${THOME}/../../tungsten-manager/bin/manager status | grep "PID" | wc -l`
|
62
|
-
# Check the manager status
|
63
|
-
if [ $manager_running -eq 0 ]; then
|
64
|
-
echo "CRITICAL: Manager is not running"
|
65
|
-
exit $CRITICAL_STATE
|
66
|
-
fi
|
67
|
-
|
68
|
-
if [ "$dataservice" == "" ]; then
|
69
|
-
offline_services=`echo "ls resources" | ${THOME}/../../tungsten-manager/bin/cctrl | grep \| | grep : | grep -v ONLINE | tr -d "| " | cut -f 1,2 -d ":"`
|
70
|
-
else
|
71
|
-
offline_services=`echo "use $dataservice; ls" | ${THOME}/../../tungsten-manager/bin/cctrl -multi | grep "(\(composite \)\?master\|(\(composite \)\?slave\|(relay" | grep -v ONLINE | tr -d "|" | cut -f 1 -d "("`
|
72
|
-
fi
|
73
|
-
|
74
|
-
for offline_service in $offline_services
|
75
|
-
do
|
76
|
-
offline_count=$(($offline_count+1))
|
77
|
-
error_message="$error_message$error_message_glue$offline_service"
|
78
|
-
error_message_glue=", "
|
79
|
-
done
|
80
|
-
|
81
|
-
if [ $offline_count -gt 0 ]
|
82
|
-
then
|
83
|
-
echo "CRITICAL: $error_message are not ONLINE"
|
84
|
-
exit $CRITICAL_STATE
|
85
|
-
fi
|
86
|
-
|
87
|
-
|
88
|
-
if [ $skip_shun -eq 0 ]
|
89
|
-
then
|
90
|
-
if [ "$dataservice" == "" ]; then
|
91
|
-
shunned=`echo "ls" | ${THOME}/../../tungsten-manager/bin/cctrl | grep 'SHUNNED' | wc -l`
|
92
|
-
else
|
93
|
-
shunned=`echo "use $dataservice; ls" | ${THOME}/../../tungsten-manager/bin/cctrl -multi | grep 'SHUNNED' | wc -l`
|
94
|
-
fi
|
95
|
-
|
96
|
-
if [ $shunned -gt 0 ]
|
97
|
-
then
|
98
|
-
echo "CRITICAL: Dataservices are shunned"
|
99
|
-
exit $CRITICAL_STATE
|
100
|
-
fi
|
101
|
-
fi
|
102
|
-
|
103
|
-
|
104
|
-
echo "OK: All services are online"
|
105
|
-
exit $OK_STATE
|
data/bin/check_tungsten_policy
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
#
|
3
|
-
# Simple Bash Script To Check Tungsten Policy
|
4
|
-
# Nagios Plugin For NRPE
|
5
|
-
#
|
6
|
-
# This script accepts optional -s and -h flags. It will return critical if the
|
7
|
-
# cluster is in maintenance mode
|
8
|
-
#
|
9
|
-
OK_STATE=0
|
10
|
-
WARNING_STATE=1
|
11
|
-
CRITICAL_STATE=2
|
12
|
-
THOME=`dirname $0`
|
13
|
-
|
14
|
-
error_message=""
|
15
|
-
error_messaage_glue=""
|
16
|
-
offline_count=0
|
17
|
-
dataservice=""
|
18
|
-
|
19
|
-
function display_help()
|
20
|
-
{
|
21
|
-
echo "Usage: ./check_tungsten_policy -s dataservice [-h]"
|
22
|
-
echo " -s The data service you would like to check"
|
23
|
-
echo " -h Display this message"
|
24
|
-
exit 0
|
25
|
-
}
|
26
|
-
|
27
|
-
while getopts "s:h" Option
|
28
|
-
do
|
29
|
-
case $Option in
|
30
|
-
h )
|
31
|
-
display_help
|
32
|
-
;;
|
33
|
-
s )
|
34
|
-
dataservice=$OPTARG
|
35
|
-
;;
|
36
|
-
esac
|
37
|
-
done
|
38
|
-
|
39
|
-
manager_running=`${THOME}/../../tungsten-manager/bin/manager status | grep "PID" | wc -l`
|
40
|
-
# Check the manager status
|
41
|
-
if [ $manager_running -eq 0 ]; then
|
42
|
-
echo "CRITICAL: Manager is not running"
|
43
|
-
exit $CRITICAL_STATE
|
44
|
-
fi
|
45
|
-
|
46
|
-
if [ "$dataservice" == "" ]; then
|
47
|
-
maint_mode=`echo "ls " | ${THOME}/../../tungsten-manager/bin/cctrl | grep MAINTENANCE | wc -l`
|
48
|
-
else
|
49
|
-
maint_mode=`echo "use $dataservice; ls " | ${THOME}/../../tungsten-manager/bin/cctrl | grep MAINTENANCE | wc -l`
|
50
|
-
fi
|
51
|
-
|
52
|
-
|
53
|
-
if [ $maint_mode -gt 0 ]
|
54
|
-
then
|
55
|
-
echo "CRITICAL: Cluster is in Maintenance mode"
|
56
|
-
exit $CRITICAL_STATE
|
57
|
-
fi
|
58
|
-
|
59
|
-
|
60
|
-
echo "OK: Cluster is in Automatic Mode"
|
61
|
-
exit $OK_STATE
|
data/bin/check_tungsten_progress
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
#!/bin/bash
|
2
|
-
#
|
3
|
-
# Simple Bash Script To Check Tungsten Progress
|
4
|
-
# Nagios Plugin For NRPE
|
5
|
-
#
|
6
|
-
# This script accepts one argument, {{-t}}; no {{-w}} or {{-c}} flags are
|
7
|
-
# parsed. The {{-t}} flag sets the number of seconds to wait between two
|
8
|
-
# readings of the replicator's appliedLastSeqno value. A critical alert is
|
9
|
-
# returned if the replicator is not ONLINE. The script uses the difference
|
10
|
-
# between the two readings to determine if a warning alert should be
|
11
|
-
# issued
|
12
|
-
#
|
13
|
-
OK_STATE=0
|
14
|
-
WARNING_STATE=1
|
15
|
-
CRITICAL_STATE=2
|
16
|
-
THOME=`dirname $0`
|
17
|
-
|
18
|
-
function display_help()
|
19
|
-
{
|
20
|
-
echo "Usage: ./check_tungsten_progress -t time [-h]"
|
21
|
-
echo " -t The number of seconds to wait when monitoring progress"
|
22
|
-
echo " -h Display this message"
|
23
|
-
exit 0
|
24
|
-
}
|
25
|
-
|
26
|
-
# We will use this to make some floating point comparisons
|
27
|
-
function float_cond()
|
28
|
-
{
|
29
|
-
local cond=0
|
30
|
-
if [[ $# -gt 0 ]]; then
|
31
|
-
cond=$(echo "$*" | bc -q 2>&1)
|
32
|
-
if [[ $? -ne 0 ]]; then
|
33
|
-
echo "Error: $cond"
|
34
|
-
exit 1
|
35
|
-
fi
|
36
|
-
if [[ -z "$cond" ]]; then cond=0; fi
|
37
|
-
if [[ "$cond" != 0 && "$cond" != 1 ]]; then cond=0; fi
|
38
|
-
fi
|
39
|
-
local stat=$((cond == 0))
|
40
|
-
return $stat
|
41
|
-
}
|
42
|
-
|
43
|
-
time_period=1
|
44
|
-
while getopts "t:h" Option
|
45
|
-
do
|
46
|
-
case $Option in
|
47
|
-
t)
|
48
|
-
time_period=$OPTARG
|
49
|
-
;;
|
50
|
-
h )
|
51
|
-
display_help
|
52
|
-
;;
|
53
|
-
esac
|
54
|
-
done
|
55
|
-
|
56
|
-
if float_cond "$time_period == 0"; then
|
57
|
-
echo "Error: time_period has not been set"
|
58
|
-
echo ""
|
59
|
-
display_help
|
60
|
-
fi
|
61
|
-
|
62
|
-
is_online=`${THOME}/../../tungsten-replicator/bin/trepctl status | grep "state" | grep "ONLINE" | wc -l`
|
63
|
-
if float_cond "$is_online == 0"; then
|
64
|
-
echo "CRITICAL: Replicator is not ONLINE"
|
65
|
-
exit $CRITICAL_STATE
|
66
|
-
fi
|
67
|
-
|
68
|
-
pre_progress_number=`${THOME}/../../tungsten-replicator/bin/trepctl status | grep "appliedLastSeqno" | tr -d "| " | awk -F":" '{print $2}'`
|
69
|
-
echo "cluster heartbeat" | ${THOME}/../../tungsten-manager/bin/cctrl > /dev/null
|
70
|
-
sleep $time_period
|
71
|
-
post_progress_number=`${THOME}/../../tungsten-replicator/bin/trepctl status | grep "appliedLastSeqno" | tr -d "| " | awk -F":" '{print $2}'`
|
72
|
-
|
73
|
-
progress_number=`echo "$post_progress_number - $pre_progress_number" | bc -q 2>/dev/null`
|
74
|
-
|
75
|
-
if float_cond "$progress_number < 1"; then
|
76
|
-
echo "WARNING: Replicator did not show progress"
|
77
|
-
exit $WARNING_STATE
|
78
|
-
fi
|
79
|
-
|
80
|
-
echo "OK: Replicator is making progress"
|
81
|
-
exit $OK_STATE
|