riemann-monitors 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +2 -0
- data/LICENSE +21 -0
- data/README.adoc +75 -0
- data/Rakefile +125 -0
- data/bin/riemann-apache-status +98 -0
- data/bin/riemann-bench +71 -0
- data/bin/riemann-cloudant +58 -0
- data/bin/riemann-consul +106 -0
- data/bin/riemann-dir-files-count +55 -0
- data/bin/riemann-dir-space +55 -0
- data/bin/riemann-diskstats +95 -0
- data/bin/riemann-fd +66 -0
- data/bin/riemann-freeswitch +119 -0
- data/bin/riemann-haproxy +58 -0
- data/bin/riemann-health +289 -0
- data/bin/riemann-httpstatus +73 -0
- data/bin/riemann-kvminstance +22 -0
- data/bin/riemann-memcached +38 -0
- data/bin/riemann-net +81 -0
- data/bin/riemann-nginx-status +84 -0
- data/bin/riemann-ntp +35 -0
- data/bin/riemann-proc +131 -0
- data/bin/riemann-varnish +54 -0
- data/bin/riemann-zookeeper +41 -0
- data/data/statfields +49 -0
- data/lib/riemann-monitors/main.rb +111 -0
- data/lib/riemann-monitors/version.rb +6 -0
- data/lib/riemann-monitors.rb +8 -0
- data/project.yaml +12 -0
- data/riemann-monitors.gemspec +73 -0
- metadata +210 -0
@@ -0,0 +1,95 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative "../lib/riemann-monitors"
|
4
|
+
|
5
|
+
# Samples /proc/diskstats on every tick and reports per-device I/O metrics.
# Counters are reported as per-second rates derived from the previous sample;
# "io reqs" is reported as the raw sampled value.
class Riemann::Monitors::Diskstats
  include Riemann::Monitors

  opt :devices, "Devices to monitor", :type => :strings, :default => nil
  opt :ignore_devices, "Devices to ignore", :type => :strings, :default =>nil

  # Labels applied, in order, to the numbers that follow the device name on a
  # /proc/diskstats line. Any additional trailing numbers are ignored because
  # the labels are zipped against the values.
  FIELDS = [
    'reads reqs',
    'reads merged',
    'reads sector',
    'reads time',
    'writes reqs',
    'writes merged',
    'writes sector',
    'writes time',
    'io reqs',
    'io time',
    'io weighted'
  ].freeze

  def initialize
    @old_state = nil
  end

  # Reads /proc/diskstats and returns a hash mapping "<device> <field>" to the
  # Integer value on that line, after applying the --devices and
  # --ignore-devices filters. Lines mentioning "ram" or "loop" are skipped.
  def state
    state = {}

    File.read('/proc/diskstats').split("\n").each do |line|
      next if line =~ /(ram|loop)/
      next unless line =~ /^(?:\s+\d+){2}\s+([\w\d\-]+) (.*)$/

      dev = $1
      values = $2.split(/\s+/).map { |str| str.to_i }

      FIELDS.zip(values).each do |field, value|
        state["#{dev} #{field}"] = value
      end
    end

    # Keep only the requested devices (the device is the first word of the key).
    if (wanted = opts[:devices])
      state = state.select do |service, _value|
        wanted.include? service.split(' ').first
      end
    end

    # Drop explicitly ignored devices.
    if (ignored = opts[:ignore_devices])
      state = state.reject do |service, _value|
        ignored.include? service.split(' ').first
      end
    end

    state
  end

  # Takes a new sample and, once a previous sample exists, reports one event
  # per metric. Nothing is reported on the first tick.
  def tick
    state = self.state

    if @old_state
      state.each do |service, metric|
        if service =~ /io reqs$/
          # Reported as sampled, not as a rate.
          report(
            :service => "diskstats " + service,
            :metric => metric,
            :state => "ok"
          )
          next
        end

        # A device that showed up after the previous sample has no baseline
        # yet; skip it this tick instead of failing on `metric - nil`.
        previous = @old_state[service]
        next if previous.nil?

        delta = metric - previous

        report(
          :service => "diskstats " + service,
          :metric => (delta.to_f / opts[:interval]),
          :state => "ok"
        )

        if service =~ /io time$/
          # Extra "io util" event: delta divided by the interval scaled by 1000.
          report(:service => "diskstats " + service.gsub(/time/, 'util'),
                 :metric => (delta.to_f / (opts[:interval]*1000)),
                 :state => "ok")
        end
      end
    end

    @old_state = state
  end

end
|
94
|
+
|
95
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# supplied by the Riemann::Monitors mixin (confirm in lib/riemann-monitors).
Riemann::Monitors::Diskstats.run
|
data/bin/riemann-fd
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Reports current file descriptor use to riemann.
|
4
|
+
# By default reports the total system fd usage, can also report usage of individual processes
|
5
|
+
|
6
|
+
require_relative "../lib/riemann-monitors"
|
7
|
+
|
8
|
+
# Reports open file descriptor counts (as counted by `lsof | wc -l`) for the
# whole system and, optionally, for individual process ids.
class Riemann::Monitors::Health
  include Riemann::Monitors

  opt :fd_sys_warning, "open file descriptor threshold for system", :default => 800
  opt :fd_sys_critical, "open file descriptor critical threshold for system", :default => 900
  opt :fd_proc_warning, "open file descriptor threshold for process", :default => 800
  opt :fd_proc_critical, "open file descriptor critical threshold for process", :default => 900
  opt :processes, "list of processes to measure fd usage in addition to system total", :type => :ints

  def initialize
    @limits = {
      :fd => {:critical => opts[:fd_sys_critical], :warning => opts[:fd_sys_warning]},
      :process => {:critical => opts[:fd_proc_critical], :warning => opts[:fd_proc_warning]},
    }
    ostype = `uname -s`.chomp.downcase
    puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == "linux"
    # Only a Linux implementation exists; every OS is routed to it.
    @fd = method :linux_fd
  end

  # Sends one event; state is stringified and metric coerced to Float.
  def alert(service, state, metric, description)
    report(
      :service => service.to_s,
      :state => state.to_s,
      :metric => metric.to_f,
      :description => description
    )
  end

  # Reports the system-wide count, then one event per pid in opts[:processes].
  def linux_fd
    sys_used = Integer(`lsof | wc -l`)
    alert "fd sys", fd_state(sys_used, @limits[:fd]), sys_used, "system is using #{sys_used} fds"

    return if opts[:processes].nil?

    opts[:processes].each do |process|
      used = Integer(`lsof -p #{process} | wc -l`)
      # Ask ps for exactly this pid. Grepping the full process table for the
      # number also matches unrelated rows whose command name contains it as
      # a word (e.g. "kworker/12:1" for pid 12). Only the first word of the
      # command name is used; it is nil when the pid does not exist.
      name = `ps -p #{process} -o comm=`.split.first
      alert "fd #{name} #{process}", fd_state(used, @limits[:process]), used,
            "process #{name} #{process} is using #{used} fds"
    end
  end

  def tick
    @fd.call
  end

  private

  # Maps a count onto :critical / :warning / :ok using strict ">" comparisons
  # against the given {:critical, :warning} limits.
  def fd_state(used, limits)
    if used > limits[:critical]
      :critical
    elsif used > limits[:warning]
      :warning
    else
      :ok
    end
  end
end
|
65
|
+
|
66
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# supplied by the Riemann::Monitors mixin (confirm in lib/riemann-monitors).
Riemann::Monitors::Health.run
|
@@ -0,0 +1,119 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Load this gem's own library, as the sibling scripts do; the package ships
# lib/riemann-monitors.rb and has no lib/riemann/tools.
require_relative "../lib/riemann-monitors"
|
4
|
+
|
5
|
+
# Reports FreeSWITCH call, channel, conference and thread counts, plus daemon
# and CLI liveness, using `fs_cli`, `ps` and the daemon pidfile.
class Riemann::Monitors::FreeSWITCH
  include Riemann::Monitors
  require 'timeout' # exec_with_timeout relies on Timeout.timeout

  opt :calls_warning, "Calls warning threshold", :default => 100
  opt :calls_critical, "Calls critical threshold", :default => 300
  opt :pid_file, "FreeSWITCH daemon pidfile", :type => String, :default => "/var/run/freeswitch/freeswitch.pid"

  def initialize
    @limits = {
      :calls => {:critical => opts[:calls_critical], :warning => opts[:calls_warning]}
    }
  end

  # True when no process with the given pid exists (getpgid raises ESRCH).
  def dead_proc?(pid)
    Process.getpgid(pid)
    false
  rescue Errno::ESRCH
    true
  end

  # Sends one event; state is stringified and metric coerced to Float
  # (a nil metric therefore becomes 0.0).
  def alert(service, state, metric, description)
    report(
      :service => service.to_s,
      :state => state.to_s,
      :metric => metric.to_f,
      :description => description
    )
  end

  # Runs cmd in its own process group with stdout/stderr closed.
  # Returns true if it exits with status 0 within `timeout` seconds,
  # false if it exits non-zero or had to be killed.
  def exec_with_timeout(cmd, timeout)
    pid = Process.spawn(cmd, [:err, :out] => :close, :pgroup => true)
    begin
      Timeout.timeout(timeout) do
        Process.waitpid(pid, 0)
        $?.exitstatus == 0
      end
    rescue Timeout::Error
      Process.kill(15, -Process.getpgid(pid))
      # Reap the killed child in the background so it does not linger as a
      # zombie; a blocking wait could hang if the child ignores SIGTERM.
      Process.detach(pid)
      puts "Killed pid: #{pid}"
      false
    end
  end

  def tick
    # Determine how many current calls I have according to FreeSWITCH
    fs_calls = %x[fs_cli -x "show calls count"| grep -Po '^\\d+'].to_i

    # Determine how many current channels I have according to FreeSWITCH
    fs_channels = %x[fs_cli -x "show channels count"| grep -Po '^\\d+'].to_i

    # Determine how many conferences I have according to FreeSWITCH
    fs_conferences = %x[fs_cli -x "conference list"| grep -Pco '^Conference'].to_i

    # Try to read the pidfile; fall back to a dummy negative pid on failure.
    begin
      fs_pid = File.read(opts[:pid_file]).to_i
    rescue
      puts "Couldn't read pidfile: #{opts[:pid_file]}"
      fs_pid = -666
    end

    fs_threads = fs_pid > 0 ? %x[ps huH p #{fs_pid} | wc -l].to_i : 0

    # Calls, channels and conferences are all graded against the calls limits.
    report_count "calls", fs_calls
    report_count "channels", fs_channels
    report_count "conferences", fs_conferences

    # The thread count is informational only and always reported as ok.
    alert "FreeSWITCH current threads", :ok, fs_threads, "Number of threads are #{fs_threads}"

    # Submit status to riemann
    if dead_proc?(fs_pid)
      alert "FreeSWITCH status", :critical, -1, "FreeSWITCH service status: not running"
    else
      alert "FreeSWITCH status", :ok, nil, "FreeSWITCH service status: running"
    end

    # Submit CLI status to riemann using timeout in case it's unresponsive
    if exec_with_timeout("fs_cli -x status", 2)
      alert "FreeSWITCH CLI status", :ok, nil, "FreeSWITCH CLI status: responsive"
    else
      alert "FreeSWITCH CLI status", :critical, -1, "FreeSWITCH CLI status: not responding"
    end
  end

  private

  # Reports "FreeSWITCH current <what>" with a state derived from the calls
  # thresholds (strict ">" comparisons).
  def report_count(what, count)
    limits = @limits[:calls]
    state = if count > limits[:critical]
              :critical
            elsif count > limits[:warning]
              :warning
            else
              :ok
            end
    alert "FreeSWITCH current #{what}", state, count, "Number of #{what} are #{count}"
  end
end
|
118
|
+
|
119
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# supplied by the Riemann::Monitors mixin (confirm in lib/riemann-monitors).
Riemann::Monitors::FreeSWITCH.run
|
data/bin/riemann-haproxy
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Gathers haproxy CSV statistics and submits them to Riemann.
|
4
|
+
|
5
|
+
# Load this gem's own library, as the sibling scripts do; the package ships
# lib/riemann-monitors.rb and has no lib/riemann/tools.
require_relative "../lib/riemann-monitors"
|
6
|
+
|
7
|
+
# Fetches the haproxy statistics page in CSV form and reports every column of
# every row as a metric, plus one up/down state event per row.
class Riemann::Monitors::Haproxy
  include Riemann::Monitors
  require 'net/http'
  require 'csv'

  opt :stats_url, "Full url to haproxy stats (eg: https://user:password@host.com:9999/stats)", :required => true, :type => :string

  def initialize
    # Appending ";csv" requests the CSV rendering of the stats page.
    @uri = URI(opts[:stats_url]+';csv')
  end

  def tick
    # The header line starts with "# "; parse what follows it. Options are
    # passed as keywords: a positional options hash is rejected by current
    # versions of the csv library.
    csv = CSV.parse(get_csv.body.split("# ")[1], :headers => true)
    csv.each do |row|
      row = row.to_hash
      ns = "haproxy #{row['pxname']} #{row['svname']}"

      row.each do |property, metric|
        # pxname/svname are already part of the service name; nil headers
        # are skipped as well.
        next if property.nil? || property == 'pxname' || property == 'svname'

        report(
          :host => @uri.host,
          :service => "#{ns} #{property}",
          :metric => metric.to_f,
          :tags => ['haproxy']
        )
      end

      report(
        :host => @uri.host,
        :service => "#{ns} state",
        :state => (['UP', 'OPEN'].include?(row['status']) ? 'ok' : 'critical'),
        :tags => ['haproxy']
      )
    end
  end

  # Performs the GET (over TLS for https URLs, with basic auth when the URL
  # carries credentials) and returns the Net::HTTP response.
  def get_csv
    http = Net::HTTP.new(@uri.host, @uri.port)
    http.use_ssl = true if @uri.scheme == 'https'
    http.start do |h|
      get = Net::HTTP::Get.new(@uri.request_uri)
      unless @uri.userinfo.nil?
        # URI#user / URI#password split on the first ":" only, so a password
        # that itself contains ":" is passed through intact.
        get.basic_auth @uri.user, @uri.password
      end
      h.request get
    end
  end

end
|
57
|
+
|
58
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# supplied by the Riemann::Monitors mixin (confirm in lib/riemann-monitors).
Riemann::Monitors::Haproxy.run
|
data/bin/riemann-health
ADDED
@@ -0,0 +1,289 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Reports current CPU, disk, load average, and memory use to riemann.
|
4
|
+
|
5
|
+
require_relative "../lib/riemann-monitors"
|
6
|
+
|
7
|
+
# Reports CPU, memory, disk and load health. The collectors are chosen once in
# #initialize from `uname -s` (darwin, freebsd, otherwise Linux) and invoked
# from #tick for each check listed in opts[:checks].
class Riemann::Monitors::Health
  include Riemann::Monitors

  opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9
  opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95
  opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9
  opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95
  opt :load_warning, "Load warning threshold (load average / core)", :default => 3
  opt :load_critical, "Load critical threshold (load average / core)", :default => 8
  opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85
  opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95
  opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'load', 'memory', 'disk']

  def initialize
    @limits = {
      :cpu => {:critical => opts[:cpu_critical], :warning => opts[:cpu_warning]},
      :disk => {:critical => opts[:disk_critical], :warning => opts[:disk_warning]},
      :load => {:critical => opts[:load_critical], :warning => opts[:load_warning]},
      :memory => {:critical => opts[:memory_critical], :warning => opts[:memory_warning]}
    }
    case (@ostype = `uname -s`.chomp.downcase)
    when 'darwin'
      @cores = `sysctl -n hw.ncpu`.to_i
      @cpu = method :darwin_cpu
      @disk = method :disk
      @load = method :darwin_load
      @memory = method :darwin_memory
      darwin_top # prime @topdata so the darwin_* collectors have a timestamp
    when 'freebsd'
      @cores = `sysctl -n hw.ncpu`.to_i
      @cpu = method :freebsd_cpu
      @disk = method :disk
      @load = method :freebsd_load
      @memory = method :freebsd_memory
    else
      @cores = cores
      puts "WARNING: OS '#{@ostype}' not explicitly supported. Falling back to Linux" unless @ostype == "linux"
      @cpu = method :linux_cpu
      @disk = method :disk
      @load = method :linux_load
      @memory = method :linux_memory
    end

    opts[:checks].each do |check|
      case check
      when "disk"
        @disk_enabled = true
      when "load"
        @load_enabled = true
      when "cpu"
        @cpu_enabled = true
      when "memory"
        @memory_enabled = true
      end
    end
  end

  # Sends one event; state is stringified and metric coerced to Float
  # (a nil metric therefore becomes 0.0).
  def alert(service, state, metric, description)
    report(
      :service => service.to_s,
      :state => state.to_s,
      :metric => metric.to_f,
      :description => description
    )
  end

  # Counts distinct "physical id:core id" pairs in /proc/cpuinfo. Entries
  # without a physical id are each counted individually.
  def cores
    i = 0
    File.read("/proc/cpuinfo").split(/\n\n/).inject({}) do |cores, p|
      physical_id = p[/physical id\s+:\s+(\d+)/, 1]
      core_id = p[/core id\s+:\s+(\d+)/, 1]
      if physical_id && core_id
        cores["#{physical_id}:#{core_id}"] = true
      elsif physical_id
        cores["#{physical_id}:"] = true
      else
        cores[i += 1] = true
      end

      cores
    end.size
  end

  # Reports `fraction` for `service` (a key of @limits) with a percentage
  # description. Does nothing when fraction is nil/false.
  def report_pct(service, fraction, report)
    return unless fraction

    alert service, level_for(service, fraction), fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
  end

  # CPU use between this tick and the previous one, from the first four
  # counters of the "cpu" line of /proc/stat. The first call only records
  # a baseline.
  def linux_cpu
    new = File.read('/proc/stat')
    unless new[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
      alert 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
      return false
    end
    u2, n2, s2, i2 = [$1, $2, $3, $4].map { |e| e.to_i }

    if @old_cpu
      u1, n1, s1, i1 = @old_cpu

      used = (u2+n2+s2) - (u1+n1+s1)
      total = used + i2-i1
      fraction = used.to_f / total

      report_pct :cpu, fraction, "user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    end

    @old_cpu = [u2, n2, s2, i2]
  end

  def linux_load
    load = File.read('/proc/loadavg').split(/\s+/)[0].to_f / @cores
    alert "load", level_for(:load, load), load, "1-minute load average/core is #{load}"
  end

  # Memory in use = 1 - (MemFree + Buffers + Cached) / MemTotal.
  def linux_memory
    m = File.read('/proc/meminfo').split(/\n/).inject({}) { |info, line|
      x = line.split(/:?\s+/)
      # Assume kB...
      info[x[0]] = x[1].to_i
      info
    }

    free = m['MemFree'].to_i + m['Buffers'].to_i + m['Cached'].to_i
    total = m['MemTotal'].to_i
    fraction = 1 - (free.to_f / total)

    report_pct :memory, fraction, "used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  end

  def freebsd_cpu
    u2, n2, s2, t2, i2 = `sysctl -n kern.cp_time 2>/dev/null`.split.map{ |e| e.to_i } #FreeBSD has 5 cpu stats

    if @old_cpu
      u1, n1, s1, t1, i1 = @old_cpu

      used = (u2+n2+s2+t2) - (u1+n1+s1+t1)
      total = used + i2-i1
      fraction = used.to_f / total

      report_pct :cpu, fraction, "user+nice+system+interrupt\n\n#{`ps -axo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    end

    @old_cpu = [u2, n2, s2, t2, i2]
  end

  def freebsd_load
    m = `uptime`.split(':')[-1].chomp.gsub(/\s+/,'').split(',')
    load = m[0].to_f / @cores
    alert "load", level_for(:load, load), load, "1-minute load average/core is #{load}"
  end

  # Memory in use = (wired + active pages) / total pages.
  def freebsd_memory
    meminfo = `sysctl -n vm.stats.vm.v_page_count vm.stats.vm.v_wire_count vm.stats.vm.v_active_count 2>/dev/null`.chomp.split
    fraction = (meminfo[1].to_f + meminfo[2].to_f) / meminfo[0].to_f

    report_pct :memory, fraction, "used\n\n#{`ps -axo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  end

  # Runs `top -l 1` once and caches load, cpu and memory figures in @topdata
  # together with the time they were taken.
  def darwin_top
    raw = `top -l 1 | grep -i "^\\(cpu\\|physmem\\|load\\)"`.chomp
    @topdata = {:stamp => Time.now.to_i }
    raw.each_line do |ln|
      # Capture the whole first figure: darwin_load describes the value as the
      # 1-minute average, which top prints first.
      if ln.match(/Load Avg: ([0-9.]+), [0-9.]+, [0-9.]+/i)
        @topdata[:load] = $1.to_f
      elsif ln.match(/CPU usage: [0-9.]+% user, [0-9.]+% sys, ([0-9.]+)% idle/i)
        @topdata[:cpu] = 1 - ($1.to_f / 100)
      elsif mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) wired, ([0-9]+)([BKMGT]) active, ([0-9]+)([BKMGT]) inactive, ([0-9]+)([BKMGT]) used, ([0-9]+)([BKMGT]) free/i)
        wired = top_bytes(mdat[1], mdat[2])
        active = top_bytes(mdat[3], mdat[4])
        inactive = top_bytes(mdat[5], mdat[6])
        used = top_bytes(mdat[7], mdat[8])
        free = top_bytes(mdat[9], mdat[10])
        @topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
      # This is for OSX Mavericks which
      # uses a different format for top
      # Example: PhysMem: 4662M used (1328M wired), 2782M unused.
      elsif mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) used \(([0-9]+)([BKMGT]) wired\), ([0-9]+)([BKMGT]) unused/i)
        used = top_bytes(mdat[1], mdat[2])
        unused = top_bytes(mdat[5], mdat[6])
        @topdata[:memory] = (used).to_f / (used + unused)
      end
    end
  end

  def darwin_cpu
    # Refresh the cached top output once it is at least one interval old.
    darwin_top unless (Time.now.to_i - @topdata[:stamp]) < opts[:interval]
    unless @topdata[:cpu]
      alert 'cpu', :unknown, nil, "unable to get CPU stats from top"
      return false
    end
    report_pct :cpu, @topdata[:cpu], "usage\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  end

  def darwin_load
    darwin_top unless (Time.now.to_i - @topdata[:stamp]) < opts[:interval]
    unless @topdata[:load]
      alert 'load', :unknown, nil, "unable to get load ave from top"
      return false
    end
    metric = @topdata[:load] / @cores
    alert "load", level_for(:load, metric), metric, "1-minute load average per core is #{metric}"
  end

  def darwin_memory
    darwin_top unless (Time.now.to_i - @topdata[:stamp]) < opts[:interval]
    unless @topdata[:memory]
      alert 'memory', :unknown, nil, "unable to get memory data from top"
      return false
    end
    report_pct :memory, @topdata[:memory], "usage\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  end

  # Raw `df -P` output excluding iso9660 filesystems; the exclusion flag
  # differs between the BSD-style and the default invocation.
  def df
    case @ostype
    when 'darwin', 'freebsd'
      `df -P -t noiso9660`
    else
      `df -P --exclude-type=iso9660`
    end
  end

  # One "disk <mountpoint>" event per df row whose first column contains "/".
  def disk
    df.split(/\n/).each do |r|
      f = r.split(/\s+/)
      next if f[0] == 'Filesystem'
      next unless f[0] =~ /\// # Needs at least one slash in the mount path

      # Calculate capacity
      x = f[4].to_f/100

      alert "disk #{f[5]}", level_for(:disk, x), x, "#{f[4]} used"
    end
  end

  def tick
    @cpu.call if @cpu_enabled
    @memory.call if @memory_enabled
    @disk.call if @disk_enabled
    @load.call if @load_enabled
  end

  private

  # Grades value against @limits[key] with strict ">" comparisons.
  def level_for(key, value)
    limits = @limits[key]
    if value > limits[:critical]
      :critical
    elsif value > limits[:warning]
      :warning
    else
      :ok
    end
  end

  # Converts a figure from top such as ("4662", "M") into bytes. The unit is
  # upcased because the PhysMem patterns match case-insensitively.
  def top_bytes(amount, unit)
    amount.to_i * (1024 ** "BKMGT".index(unit.upcase))
  end
end
|
288
|
+
|
289
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# supplied by the Riemann::Monitors mixin (confirm in lib/riemann-monitors).
Riemann::Monitors::Health.run
|
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Checks the status of an HTTP request and submits it to Riemann
|
4
|
+
require_relative "../lib/riemann-monitors"
|
5
|
+
|
6
|
+
# Issues a GET or HEAD request against a URI on every tick and reports the
# numeric status code, round-trip time and (for GET) body length.
class Riemann::Monitors::HTTPStatus
  include Riemann::Monitors
  require "net/http"
  require "uri"

  opt :uri, "Target URI", type: String, default: 'http://localhost/'
  opt :sitename, "Site name (abbreviation for identifying uri)", type: String
  opt :method, "HTTP Method (get, head)", type: String, default: "head"

  # Raises RuntimeError when opts[:method] is neither "get" nor "head"
  # (case-insensitive).
  def initialize
    @uri = URI.parse(opts[:uri])
    # Slashes in the site name are replaced because "/" separates the parts
    # of the service name built below.
    @sitename = opts[:sitename]&.gsub(/\//, '_') if opts.has_key?(:sitename)
    @servicename = ["http", opts[:method], @sitename].join("/")
    @verbclass = case opts[:method]
                 when /\Aget\z/i
                   Net::HTTP::Get
                 when /\Ahead\z/i
                   Net::HTTP::Head
                 else
                   raise "This script only makes sense with get and head methods. You might want a custom script."
                 end
  end

  # Reports one event for a successful request. When the request failed,
  # get_connection has already reported the failure and nothing more is sent.
  def tick
    response, roundtrip = get_connection
    return if response.nil?

    metrics = {service: @servicename,
               description: @sitename,
               tags: ["http"]}
    code = response.code&.to_i
    unless code.nil?
      metrics[:state] = "ok"
      metrics[:metric] = code
    end
    metrics[:roundtrip] = roundtrip unless roundtrip.nil?
    if !response.body.nil? && @verbclass != Net::HTTP::Head
      metrics[:body_length] = response.body.length
    end
    report(metrics)
  end

  # Performs the request and returns [response, roundtrip_seconds]. On any
  # StandardError a critical event is reported and the values gathered up to
  # the failure are returned (nil where nothing was obtained).
  def get_connection()
    response = nil
    roundtrip = nil
    begin
      # Monotonic clock: the measured duration is immune to wall-clock jumps.
      started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      # TLS and the open timeout must be supplied to `start`: the connection
      # is already open inside the block, so setting them there has no effect.
      # opts[:timeout] is not declared in this file; presumably it is a common
      # option from Riemann::Monitors (confirm).
      Net::HTTP.start(@uri.host, @uri.port,
                      use_ssl: @uri.scheme == "https",
                      open_timeout: opts[:timeout]) do |http|
        request = @verbclass.new(@uri)
        response = http.request(request)
        response.read_body
      end
      roundtrip = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
    rescue => e
      report(:service => @servicename,
             :state => "critical",
             :description => "http connection failure: #{e.class} - #{e.message}",
             :tags => ["http", "action"])
    end
    [response, roundtrip]
  end
end
|
72
|
+
|
73
|
+
# Script entry point. `run` is not defined in this file; presumably it is
# supplied by the Riemann::Monitors mixin (confirm in lib/riemann-monitors).
Riemann::Monitors::HTTPStatus.run
|