nagios-herald 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +9 -0
  4. data/CHANGELOG.md +11 -0
  5. data/CONTRIBUTING.md +28 -0
  6. data/Gemfile +5 -0
  7. data/LICENSE +21 -0
  8. data/README.md +94 -0
  9. data/Rakefile +9 -0
  10. data/bin/draw_stack_bars +76 -0
  11. data/bin/dump_nagios_env.sh +25 -0
  12. data/bin/get_ganglia_graph +82 -0
  13. data/bin/get_graph +50 -0
  14. data/bin/get_graphite_graph +58 -0
  15. data/bin/nagios-herald +6 -0
  16. data/bin/splunk_alert_frequency +54 -0
  17. data/contrib/nrpe-plugins/check_cpu_stats.sh +186 -0
  18. data/contrib/nrpe-plugins/check_disk.sh +34 -0
  19. data/contrib/nrpe-plugins/check_mem.pl +181 -0
  20. data/contrib/nrpe-plugins/nrpe-plugin-examples.md +11 -0
  21. data/docs/config.md +62 -0
  22. data/docs/example_alerts.md +48 -0
  23. data/docs/formatters.md +180 -0
  24. data/docs/helpers.md +12 -0
  25. data/docs/images/cpu_no_context.png +0 -0
  26. data/docs/images/cpu_with_context.png +0 -0
  27. data/docs/images/disk_space_no_context.png +0 -0
  28. data/docs/images/disk_space_with_context.png +0 -0
  29. data/docs/images/memory_high_no_context.png +0 -0
  30. data/docs/images/memory_high_with_context.png +0 -0
  31. data/docs/images/nagios-herald-formatter-content-example.png +0 -0
  32. data/docs/images/nagios-herald.png +0 -0
  33. data/docs/images/stack-bars.png +0 -0
  34. data/docs/images/vanilla-nagios.png +0 -0
  35. data/docs/messages.md +16 -0
  36. data/docs/nagios-config.md +74 -0
  37. data/docs/tools.md +79 -0
  38. data/etc/config.yml.example +14 -0
  39. data/etc/readme.md +2 -0
  40. data/lib/nagios-herald/config.rb +25 -0
  41. data/lib/nagios-herald/executor.rb +265 -0
  42. data/lib/nagios-herald/formatter_loader.rb +82 -0
  43. data/lib/nagios-herald/formatters/base.rb +524 -0
  44. data/lib/nagios-herald/formatters/check_cpu.rb +71 -0
  45. data/lib/nagios-herald/formatters/check_disk.rb +143 -0
  46. data/lib/nagios-herald/formatters/check_logstash.rb +155 -0
  47. data/lib/nagios-herald/formatters/check_memory.rb +42 -0
  48. data/lib/nagios-herald/formatters/example.rb +19 -0
  49. data/lib/nagios-herald/formatters.rb +1 -0
  50. data/lib/nagios-herald/helpers/ganglia_graph.rb +99 -0
  51. data/lib/nagios-herald/helpers/graphite_graph.rb +85 -0
  52. data/lib/nagios-herald/helpers/logstash_query.rb +125 -0
  53. data/lib/nagios-herald/helpers/splunk_alert_frequency.rb +170 -0
  54. data/lib/nagios-herald/helpers/splunk_query.rb +119 -0
  55. data/lib/nagios-herald/helpers/url_image.rb +76 -0
  56. data/lib/nagios-herald/helpers.rb +5 -0
  57. data/lib/nagios-herald/logging.rb +48 -0
  58. data/lib/nagios-herald/message_loader.rb +40 -0
  59. data/lib/nagios-herald/messages/base.rb +56 -0
  60. data/lib/nagios-herald/messages/email.rb +150 -0
  61. data/lib/nagios-herald/messages/irc.rb +58 -0
  62. data/lib/nagios-herald/messages/pager.rb +75 -0
  63. data/lib/nagios-herald/messages.rb +3 -0
  64. data/lib/nagios-herald/test_helpers/base_test_case.rb +82 -0
  65. data/lib/nagios-herald/util.rb +45 -0
  66. data/lib/nagios-herald/version.rb +3 -0
  67. data/lib/nagios-herald.rb +7 -0
  68. data/lib/stackbars/__init__.py +0 -0
  69. data/lib/stackbars/chart_utils.py +25 -0
  70. data/lib/stackbars/grouped_stackbars.py +97 -0
  71. data/lib/stackbars/pilfonts/Tahoma.ttf +0 -0
  72. data/lib/stackbars/pilfonts/aerial.ttf +0 -0
  73. data/lib/stackbars/pilfonts/arial_black.ttf +0 -0
  74. data/lib/stackbars/stackbar.py +100 -0
  75. data/nagios-herald.gemspec +33 -0
  76. data/test/env_files/check_cpu_idle.CRITICAL +199 -0
  77. data/test/env_files/check_cpu_iowait.WARNING +199 -0
  78. data/test/env_files/check_disk.CRITICAL +197 -0
  79. data/test/env_files/check_disk.CRITICAL_ICINGA +197 -0
  80. data/test/env_files/check_disk.RECOVERY +197 -0
  81. data/test/env_files/check_memory.CRITICAL +197 -0
  82. data/test/env_files/nagios_vars.EXAMPLE +197 -0
  83. data/test/unit/test_config.rb +31 -0
  84. data/test/unit/test_executor.rb +65 -0
  85. data/test/unit/test_formatter_base.rb +131 -0
  86. data/test/unit/test_formatter_check_cpu_idle_critical.rb +135 -0
  87. data/test/unit/test_formatter_check_memory.rb +135 -0
  88. data/test/unit/test_icinga_variables.rb +31 -0
  89. data/test/unit/test_logging.rb +35 -0
  90. data/test/unit/test_message_email.rb +69 -0
  91. data/test/unit/test_message_pager.rb +69 -0
  92. metadata +204 -0
@@ -0,0 +1,186 @@
1
+ #!/bin/bash
2
+ # ========================================================================================
3
+ # CPU Utilization Statistics plugin for Nagios
4
+ #
5
+ # Written by : Steve Bosek
6
+ # Release : 2.1
7
+ # Creation date : 8 September 2007
8
+ # Revision date : 28 Februar 2008
9
+ # Package : DTB Nagios Plugin
10
+ # Description : Nagios plugin (script) to check cpu utilization statistics.
11
+ # This script has been designed and written on Unix plateform (Linux, Aix, Solaris),
12
+ # requiring iostat as external program. The locations of these can easily
13
+ # be changed by editing the variables $IOSTAT at the top of the script.
14
+ # The script is used to query 4 of the key cpu statistics (user,system,iowait,idle)
15
+ # at the same time. Note though that there is only one set of warning
16
+ # and critical values for iowait percent.
17
+ #
18
+ # Usage : ./check_cpu_stats.sh [-w <warn>] [-c <crit] ( [ -i <intervals in second> ] [ -n <report number> ])
19
+ # ----------------------------------------------------------------------------------------
20
+ #
21
+ # TODO: Support for HP-UX
22
+ #
23
+ #
24
+ # ========================================================================================
25
+ #
26
+ # HISTORY :
27
+ # Release | Date | Authors | Description
28
+ # --------------+---------------+---------------+------------------------------------------
29
+ # 2.0 | 16.02.08 | Steve Bosek | Solaris support and new parameters
30
+ # | | | New Parameters : - iostat seconds intervals
31
+ # | | | - iostat report number
32
+ # 2.1 | 08.06.08 | Steve Bosek | Bug perfdata and convert comma in point for Linux result
33
+ # -----------------------------------------------------------------------------------------
34
+ #
35
+ # =========================================================================================
36
+
37
+ # Paths to commands used in this script. These may have to be modified to match your system setup.
38
+
39
+ IOSTAT=/usr/bin/iostat
40
+
41
+ # Nagios return codes
42
+ STATE_OK=0
43
+ STATE_WARNING=1
44
+ STATE_CRITICAL=2
45
+ STATE_UNKNOWN=3
46
+
47
+ # Plugin parameters value if not define
48
+ WARNING_THRESHOLD=${WARNING_THRESHOLD:="30"}
49
+ CRITICAL_THRESHOLD=${CRITICAL_THRESHOLD:="100"}
50
+ INTERVAL_SEC=${INTERVAL_SEC:="3"}
51
+ NUM_REPORT=${NUM_REPORT:="2"}
52
+
53
+ # Plugin variable description
54
+ PROGNAME=$(basename $0)
55
+ RELEASE="Revision 2.1"
56
+ AUTHOR="(c) 2008 Steve Bosek (steve.bosek@gmail.com)"
57
+
58
+ if [ ! -x $IOSTAT ]; then
59
+ echo "UNKNOWN: iostat not found or is not executable by the nagios user."
60
+ exit $STATE_UNKNOWN
61
+ fi
62
+
63
+ # Functions plugin usage
64
+ print_release() {
65
+ echo "$RELEASE $AUTHOR"
66
+ }
67
+
68
+ print_usage() {
69
+ echo ""
70
+ echo "$PROGNAME $RELEASE - CPU Utilization check script for Nagios"
71
+ echo ""
72
+ echo "Usage: check_cpu_stats.sh -w -c -wi -ci (-i -n)"
73
+ echo ""
74
+ echo " -w Warning level in % for cpu iowait"
75
+ echo " -c Crical level in % for cpu iowait"
76
+ echo " -wi Warn if cpu idle is less than x"
77
+ echo " -ci Critical is CPU idle is less than x"
78
+ echo " -i Interval in seconds for iostat (default : 3)"
79
+ echo " -n Number report for iostat (default : 2)"
80
+ echo " -h Show this page"
81
+ echo ""
82
+ echo "Usage: $PROGNAME"
83
+ echo "Usage: $PROGNAME --help"
84
+ echo ""
85
+ }
86
+
87
+ print_help() {
88
+ print_usage
89
+ echo ""
90
+ echo "This plugin will check cpu utilization (user,system,iowait,idle in %)"
91
+ echo ""
92
+ exit 0
93
+ }
94
+
95
+ # Parse parameters
96
+ while [ $# -gt 0 ]; do
97
+ case "$1" in
98
+ -h | --help)
99
+ print_help
100
+ exit $STATE_OK
101
+ ;;
102
+ -v | --version)
103
+ print_release
104
+ exit $STATE_OK
105
+ ;;
106
+ -w | --warning)
107
+ shift
108
+ WARNING_THRESHOLD=$1
109
+ ;;
110
+ -c | --critical)
111
+ shift
112
+ CRITICAL_THRESHOLD=$1
113
+ ;;
114
+ -wi | --warn-idle)
115
+ shift
116
+ WARN_IDLE=$1
117
+ ;;
118
+ -ci | --critical-idle)
119
+ shift
120
+ CRIT_IDLE=$1
121
+ ;;
122
+ -i | --interval)
123
+ shift
124
+ INTERVAL_SEC=$1
125
+ ;;
126
+ -n | --number)
127
+ shift
128
+ NUM_REPORT=$1
129
+ ;;
130
+ *) echo "Unknown argument: $1"
131
+ print_usage
132
+ exit $STATE_UNKNOWN
133
+ ;;
134
+ esac
135
+ shift
136
+ done
137
+
138
+ # CPU Utilization Statistics Unix Plateform ( Linux,AIX,Solaris are supported )
139
+ case `uname` in
140
+ Linux ) CPU_REPORT=`iostat -c $INTERVAL_SEC $NUM_REPORT|tail -2|head -n 1| tr -s " " " " `
141
+ CPU_USER=`echo $CPU_REPORT | cut -d " " -f 1 `
142
+ CPU_SYSTEM=`echo $CPU_REPORT | cut -d " " -f 3 `
143
+ CPU_IOWAIT=`echo $CPU_REPORT | cut -d " " -f 4 `
144
+ CPU_IO=`echo $CPU_IOWAIT | sed s/\\\./""/`
145
+ CPU_IDLE=`echo $CPU_REPORT | cut -d " " -f 6`
146
+ CPU_IDL=`echo $CPU_IDLE | sed s/\\\./""/`
147
+ ;;
148
+ *) echo "UNKNOWN: `uname` not yet supported by this plugin. Coming soon !"
149
+ exit $STATE_UNKNOWN
150
+ ;;
151
+ esac
152
+
153
+ WARNING_THRESH=$(( $WARNING_THRESHOLD * 100 ))
154
+ CRIT_THRESH=$(( $CRITICAL_THRESHOLD * 100 ))
155
+ WARN_IDLE_THRESH=$(( $WARN_IDLE * 100 ))
156
+ CRIT_IDLE_THRESH=$(( $CRIT_IDLE * 100 ))
157
+
158
+ OUTPUT="user=${CPU_USER}% system=${CPU_SYSTEM}% iowait=${CPU_IOWAIT}% idle=${CPU_IDLE}% | user = ${CPU_USER}, system = ${CPU_SYSTEM}, iowait = ${CPU_IOWAIT}, idle = ${CPU_IDLE} "
159
+
160
+ find_top_five_procs_by_cpu() {
161
+ echo "TOP 5 PROCESSES BY CPU:"
162
+ ps -eo %cpu,cputime,user,pid,args --sort -%cpu | head -n 6 | awk '{command = substr($0, index($0,$5)); printf "%5s %12s %12s %6s %s\n", $1, $2, $3, $4, command}'
163
+ }
164
+
165
+ # Return
166
+ if [ $CPU_IO -ge $CRIT_THRESH ]; then
167
+ echo "CRITICAL CPU iowait is > ${CRITICAL_THRESHOLD}%: ${OUTPUT}"
168
+ find_top_five_procs_by_cpu
169
+ exit $STATE_CRITICAL
170
+ elif [ $CPU_IO -ge $WARNING_THRESH ]; then
171
+ echo "WARNING CPU iowait is > ${WARNING_THRESHOLD}%: ${OUTPUT}"
172
+ find_top_five_procs_by_cpu
173
+ exit $STATE_WARNING
174
+ elif [ $CPU_IDL -le $CRIT_IDLE_THRESH ]; then
175
+ echo "CRITICAL CPU idle is < ${CRIT_IDLE}%: ${OUTPUT}"
176
+ find_top_five_procs_by_cpu
177
+ exit $STATE_CRITICAL
178
+ elif [ $CPU_IDL -le $WARN_IDLE_THRESH ]; then
179
+ echo "WARNING CPU idle is < ${WARN_IDLE}%: ${OUTPUT}"
180
+ find_top_five_procs_by_cpu
181
+ exit $STATE_WARNING
182
+ else
183
+ echo "OK: ${OUTPUT}"
184
+ exit $STATE_OK
185
+ fi
186
+
@@ -0,0 +1,34 @@
1
+ #!/bin/bash
2
+ # This script wraps around the `check_disk` command available in the nagios-plugins package.
3
+
4
+ WARN_THRES="10%"
5
+ CRIT_THRES="5%"
6
+ EXCLUDE_PATH=""
7
+ EXCLUDE_FS=""
8
+
9
+ # We can exclude directories...
10
+ EXCLUDE_OPTS=""
11
+ EXCLUDE_FS_OPTS=""
12
+
13
+ if [ -n "$EXCLUDE_PATH" ]
14
+ then
15
+ EXCLUDE_OPTS="-x ${EXCLUDE_PATH}"
16
+ fi
17
+
18
+ if [ -n "$EXCLUDE_FS" ]
19
+ then
20
+ EXCLUDE_FS_OPTS="-x ${EXCLUDE_FS}"
21
+ fi
22
+
23
+ # Call check_disk
24
+ /usr/local/nagios-plugins/check_disk -l -e -w $WARN_THRES -c $CRIT_THRES $EXCLUDE_OPTS $EXCLUDE_FS_OPTS
25
+
26
+ # Store the return code so we can exit with the right code even after doing other things.
27
+ RETURN=$?
28
+
29
+ # Print the check's thresholds.
30
+ printf "\nTHRESHOLDS - WARNING:%s;CRITICAL:%s;\n\n" $WARN_THRES $CRIT_THRES
31
+ # Print the output of `df` for the 'additional details' section.
32
+ df -h
33
+
34
+ exit $RETURN
@@ -0,0 +1,181 @@
1
+ #!/usr/bin/perl -w
2
+ # $Id: check_mem.pl 2 2002-02-28 06:42:51Z egalstad $
3
+
4
+ # check_mem.pl Copyright (C) 2000 Dan Larsson <dl@tyfon.net>
5
+ #
6
+ # This program is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 2
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # This program is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty
13
+ # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # you should have received a copy of the GNU General Public License
17
+ # along with this program (or with Nagios); if not, write to the
18
+ # Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19
+ # Boston, MA 02111-1307, USA
20
+
21
+ # Tell Perl what we need to use
22
+ use strict;
23
+ use Getopt::Std;
24
+
25
+ use vars qw($opt_c $opt_f $opt_u $opt_w
26
+ $free_memory $used_memory $total_memory $cached_memory
27
+ $crit_level $warn_level
28
+ %exit_codes @memlist
29
+ $percent $fmt_pct $fmt_used $fmt_free $fmt_cached $fmt_slabs
30
+ $verb_err $command_line $slabs_reclaimable);
31
+
32
+ # Predefined exit codes for Nagios
33
+ %exit_codes = ('UNKNOWN' , 3,
34
+ 'OK' , 0,
35
+ 'WARNING' , 1,
36
+ 'CRITICAL', 2,);
37
+
38
+ # Turn this to 1 to see reason for parameter errors (if any)
39
+ $verb_err = 1;
40
+
41
+ # This the unix command string that brings Perl the data
42
+ $command_line = `free |grep Mem|awk '{print \$2,\$3,\$4,\$7}'`;
43
+
44
+ chomp $command_line;
45
+ @memlist = split(/ /, $command_line);
46
+
47
+ # Get the amount used by dentry_cache etc, as this counts as "free" too.
48
+ $slabs_reclaimable = `grep SReclaimable /proc/meminfo | awk '{print \$2}'`;
49
+ chomp $slabs_reclaimable;
50
+
51
+ # Time for calculations. Cached and the slabs reclaimable shouldn't count as "used"
52
+ # because they can and will be used by the kernel if needs be (before swapping)
53
+ $cached_memory = $memlist[3];
54
+ $used_memory = $memlist[1] - $cached_memory - $slabs_reclaimable;
55
+ $free_memory = $memlist[2] + $cached_memory + $slabs_reclaimable;
56
+ $total_memory = $memlist[0];
57
+
58
+ # All our machines have over a GB of RAM. Stop this sillyness.
59
+ $used_memory = $used_memory / 1024 / 1024;
60
+ $free_memory = $free_memory / 1024 / 1024;
61
+ $total_memory = $total_memory / 1024 / 1024;
62
+ $cached_memory = $cached_memory / 1024 / 1024;
63
+ $slabs_reclaimable = $slabs_reclaimable / 1024 / 1024;
64
+
65
+ # Some pretty formatting for output purposes.
66
+ $fmt_free = sprintf "%.3f", $free_memory;
67
+ $fmt_used = sprintf "%.3f", $used_memory;
68
+ $fmt_cached = sprintf "%.3f", $cached_memory;
69
+ $fmt_slabs = sprintf "%.3f", $slabs_reclaimable;
70
+
71
+ # Get the options
72
+ if ($#ARGV le 0)
73
+ {
74
+ &usage;
75
+ }
76
+ else
77
+ {
78
+ getopts('c:fuw:');
79
+ }
80
+
81
+ # Shortcircuit the switches
82
+ if (!$opt_w or $opt_w == 0 or !$opt_c or $opt_c == 0)
83
+ {
84
+ print "*** You must define WARN and CRITICAL levels!" if ($verb_err);
85
+ &usage;
86
+ }
87
+ elsif (!$opt_f and !$opt_u)
88
+ {
89
+ print "*** You must select to monitor either USED or FREE memory!" if ($verb_err);
90
+ &usage;
91
+ }
92
+
93
+ # Check if levels are sane
94
+ if ($opt_w <= $opt_c and $opt_f)
95
+ {
96
+ print "*** WARN level must not be less than CRITICAL when checking FREE memory!" if ($verb_err);
97
+ &usage;
98
+ }
99
+ elsif ($opt_w >= $opt_c and $opt_u)
100
+ {
101
+ print "*** WARN level must not be greater than CRITICAL when checking USED memory!" if ($verb_err);
102
+ &usage;
103
+ }
104
+
105
+ $warn_level = $opt_w;
106
+ $crit_level = $opt_c;
107
+
108
+ sub find_top_five_procs_by_mem {
109
+ # Find the top 5 process by memory usage; sort by RSS in descending order.
110
+ my @top_five_procs = qx/ps -eo %mem,rss,user,pid,args --sort -rss | head -n 6 | awk '{command = substr(\$0, index(\$0,\$5)); printf "%5s %12s %12s %6s %s\\n", \$1, \$2, \$3, \$4, command}'/;
111
+ print 'TOP 5 PROCESSES BY MEMORY USAGE:\n';
112
+ foreach my $line (@top_five_procs) {
113
+ chomp $line;
114
+ print $line . '\n';
115
+ }
116
+ }
117
+
118
+ if ($opt_f)
119
+ {
120
+ $percent = $free_memory / $total_memory * 100;
121
+ $fmt_pct = sprintf "%.1f", $percent;
122
+ if ($percent <= $crit_level)
123
+ {
124
+ print "Memory CRITICAL - $fmt_pct% free ($fmt_free GB total including $fmt_cached GB cached, $fmt_slabs GB reclaimable) \n";
125
+ find_top_five_procs_by_mem();
126
+ exit $exit_codes{'CRITICAL'};
127
+ }
128
+ elsif ($percent <= $warn_level)
129
+ {
130
+ print "Memory WARNING - $fmt_pct% free ($fmt_free GB total including $fmt_cached GB cached, $fmt_slabs GB reclaimable) \n";
131
+ find_top_five_procs_by_mem();
132
+ exit $exit_codes{'WARNING'};
133
+ }
134
+ else
135
+ {
136
+ print "Memory OK - $fmt_pct% free ($fmt_free GB total including $fmt_cached GB cached, $fmt_slabs GB reclaimable) \n";
137
+ exit $exit_codes{'OK'};
138
+ }
139
+ }
140
+ elsif ($opt_u)
141
+ {
142
+ $percent = $used_memory / $total_memory * 100;
143
+ $fmt_pct = sprintf "%.1f", $percent;
144
+ if ($percent >= $crit_level)
145
+ {
146
+ print "Memory CRITICAL - $fmt_pct% used ($fmt_used GB total plus $fmt_cached GB cached, $fmt_slabs GB reclaimable)\n";
147
+ find_top_five_procs_by_mem();
148
+ exit $exit_codes{'CRITICAL'};
149
+ }
150
+ elsif ($percent >= $warn_level)
151
+ {
152
+ print "Memory WARNING - $fmt_pct% used ($fmt_used GB total plus $fmt_cached GB cached, $fmt_slabs GB reclaimable)\n";
153
+ find_top_five_procs_by_mem();
154
+ exit $exit_codes{'WARNING'};
155
+ }
156
+ else
157
+ {
158
+ print "Memory OK - $fmt_pct% used ($fmt_used GB total plus $fmt_cached GB cached, $fmt_slabs GB reclaimable)\n";
159
+ exit $exit_codes{'OK'};
160
+ }
161
+ }
162
+
163
+ # Show usage
164
+ sub usage()
165
+ {
166
+ print "\ncheck_mem.pl v1.0 - Nagios Plugin\n\n";
167
+ print "usage:\n";
168
+ print " check_mem.pl -<f|u> -w <warnlevel> -c <critlevel>\n\n";
169
+ print "options:\n";
170
+ print " -f Check FREE memory\n";
171
+ print " -u Check USED memory\n";
172
+ print " -w PERCENT Percent free/used when to warn\n";
173
+ print " -c PERCENT Percent free/used when critical\n";
174
+ print "\nCopyright (C) 2000 Dan Larsson <dl\@tyfon.net>\n";
175
+ print "check_mem.pl comes with absolutely NO WARRANTY either implied or explicit\n";
176
+ print "This program is licensed under the terms of the\n";
177
+ print "GNU General Public License (check source code for details)\n";
178
+ exit $exit_codes{'UNKNOWN'};
179
+ }
180
+
181
+
@@ -0,0 +1,11 @@
1
+ # NRPE Plugin Examples
2
+
3
+ The following NRPE scripts are provided as examples of check scripts that
4
+ correspond to similarly named ``nagios-herald`` formatters. Their output is
5
+ used by the formatters to provide more context in alerts.
6
+
7
+ * ``check_cpu_stats.sh`` - A modified version of the script available at
8
+ [http://exchange.nagios.org/directory/Plugins/System-Metrics/CPU-Usage-and-Load/check_cpu_stats-2Esh/details](http://exchange.nagios.org/directory/Plugins/System-Metrics/CPU-Usage-and-Load/check_cpu_stats-2Esh/details)
9
+ * ``check_disk.sh`` - A wrapper around the ``check_disk`` command available in the ``nagios-plugins`` package.
10
+ * ``check_mem.pl`` - A modified version of the script available at
11
+ [http://exchange.nagios.org/directory/Plugins/System-Metrics/Memory/check_mem-2Epl/details](http://exchange.nagios.org/directory/Plugins/System-Metrics/Memory/check_mem-2Epl/details)
data/docs/config.md ADDED
@@ -0,0 +1,62 @@
1
+ # Configuration
2
+
3
+ ``nagios-herald`` supports a YAML-based configuration file. The configuration file is named
4
+ ``config.yml`` and lives in the ``etc/`` directory of this project. This project provides
5
+ an [example](/etc/config.yml.example) configuration file that you can use to get started.
6
+
7
+ ## Command Line Options
8
+
9
+ ``nagios-herald`` provides several command line options, some of which can override values
10
+ in the configuration file. During runtime, ``nagios-herald`` merges the configuration
11
+ and command line options into a single hash available through the code.
12
+ Command line options **always override configuration file values, when there is a conflict**.
13
+
14
+ ## Accessing Configuration Values
15
+
16
+ All configuration file values and command line options are available in a single, globally
17
+ available hash named ``Config.config``. See below for an example configuration file.
18
+
19
+ ```
20
+ # define the FQDN of servers we call on to provide context in notifications
21
+ servers:
22
+ ganglia: ganglia.example.com
23
+ graphite: graphite.example.com
24
+ splunk:
25
+ url: https://splunk.example.com:8089/services/search/jobs
26
+ username: splunkuser
27
+ password: splunkpass
28
+ logstash:
29
+ url: http://logstash.example.com:9200
30
+ result_field_truncate: 200
31
+ ```
32
+
33
+ To access the value for the Ganglia server URI, one would write code similar to the below:
34
+
35
+ ```ruby
36
+ ganglia_uri = Config.config[:servers][:ganglia]
37
+ ```
38
+
39
+ ### Notable Configuration Values
40
+
41
+ Two of the most important configuration values are ``formatter_dir`` and ``logfile``.
42
+
43
+ ``formatter_dir`` (equivalent to the ``--formatter-dir`` command line option) tells **nagios-herald**
44
+ where to locate **your custom formatters**. It will load those in addition to the built-in formatters.
45
+ Custom formatters are given precedence allowing formatter authors to override the built-in formatters.
46
+
47
+ ``logfile`` tells **nagios-herald** where to log its output. This is especially critical to catch errors
48
+ should they arise. If **nagios-herald** isn't sending notifications, **it's a bug**; consult ``logfile`` for
49
+ details. Optionally, setting ``trace`` to **true** (equivalent to ``--trace`` on the command line) will
50
+ provide a backtrace to aid in debugging.
51
+
52
+ ### Icinga Support
53
+
54
+ ``nagios-herald`` relies on the environment variables set when a notification is sent.
55
+ Those variables are prefixed with ``NAGIOS_``. Icinga, a popular variant of Nagios,
56
+ prefixes those variables with ``ICINGA_``. To enable support for Icinga, set the
57
+ ``icinga`` configuraion value:
58
+
59
+ ``icinga: true``
60
+
61
+ If that value is set, calling ``Util#get_nagios_var`` will replace ``NAGIOS_``
62
+ with ``ICINGA_`` in the environment variable's name.
@@ -0,0 +1,48 @@
1
+ # Example Alerts
2
+
3
+ To demonstrate how ``nagios-herald`` can format alerts to be more legible and useful, see below for some example alerts without context (before ``nagios-herald``) and **with** context (**after** ``nagios-herald``).
4
+
5
+ ## CPU Alert
6
+
7
+ ### No Context
8
+
9
+ This is a basic CPU alert informing the operator that *something* is wrong with a server's overall processor utilization.
10
+
11
+ ![cpu_no_context](/docs/images/cpu_no_context.png)
12
+
13
+ ### With Context
14
+
15
+ The CPU check has been updated to inform the operator of which threshold has been exceeded and lists the top 5 processes by processor utilization and ``nagios-herald`` formatted the content to make it more legible.
16
+
17
+ ![cpu_with_context](/docs/images/cpu_with_context.png)
18
+
19
+ ## Disk Space Alert
20
+
21
+ Among most operators, disk space alerts probably garner the most disdain for their lack of utility and frequency of delivery.
22
+
23
+ ### No Context
24
+
25
+ It's easy to understand why disk space alerts are perceived to be useless given the following example.
26
+
27
+ ![disk_space_no_context](/docs/images/disk_space_no_context.png)
28
+
29
+ ### With Context
30
+
31
+ Now, imagine how much more useful it would be if the check output the results of ``df`` and ``nagios-herald`` could use that information to provide context. In the example below, ``nagios-herald`` was used to add a **stack bar** to indicate which volume exceeded the threshold, add a **Ganglia graph** of disk utilization for the past 24 hours, **highlight** the relevant volume in the ``df`` output, and even inform the operator of the **number of times in the last week** that the alert fired.
32
+
33
+ ![disk_space_with_context](/docs/images/disk_space_with_context.png)
34
+
35
+ ## Memory Alert
36
+
37
+ ### No Context
38
+
39
+ Memory alerts that simply inform the operator that some threshold have been exceeded aren't very helpful.
40
+
41
+ ![memory_high_no_context](/docs/images/memory_high_no_context.png)
42
+
43
+ ### With Context
44
+
45
+ The first thing most operators do is run ``top`` to understand what processes are running, sorted by memory usage. In the example below, the check was updated to output the top 5 processes by memory utilization and ``nagios-herald`` formatted the output for legibility.
46
+
47
+ ![memory_high_with_context](/docs/images/memory_high_with_context.png)
48
+