riemann-tools 0.2.13 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.docker/Dockerfile +7 -0
- data/.docker/publish.sh +35 -0
- data/.github/dependabot.yml +11 -0
- data/.github/workflows/ci.yml +42 -0
- data/.github/workflows/codeql-analysis.yml +72 -0
- data/.gitignore +6 -0
- data/.rspec +2 -0
- data/.rubocop.yml +32 -0
- data/.travis.yml +31 -0
- data/CHANGELOG.md +422 -0
- data/Gemfile +6 -0
- data/ISSUE_TEMPLATE.md +15 -0
- data/README.markdown +14 -15
- data/Rakefile +23 -0
- data/SECURITY.md +42 -0
- data/bin/riemann-apache-status +92 -77
- data/bin/riemann-bench +54 -48
- data/bin/riemann-cloudant +44 -39
- data/bin/riemann-consul +82 -75
- data/bin/riemann-dir-files-count +53 -46
- data/bin/riemann-dir-space +53 -46
- data/bin/riemann-diskstats +78 -74
- data/bin/riemann-fd +68 -47
- data/bin/riemann-freeswitch +108 -102
- data/bin/riemann-haproxy +46 -39
- data/bin/riemann-health +4 -335
- data/bin/riemann-kvminstance +18 -12
- data/bin/riemann-memcached +35 -28
- data/bin/riemann-net +4 -103
- data/bin/riemann-nginx-status +74 -66
- data/bin/riemann-ntp +4 -32
- data/bin/riemann-portcheck +40 -30
- data/bin/riemann-proc +96 -89
- data/bin/riemann-varnish +51 -44
- data/bin/riemann-zookeeper +38 -33
- data/lib/riemann/tools/health.rb +347 -0
- data/lib/riemann/tools/net.rb +104 -0
- data/lib/riemann/tools/ntp.rb +41 -0
- data/lib/riemann/tools/utils.rb +17 -0
- data/lib/riemann/tools/version.rb +7 -0
- data/lib/riemann/tools.rb +40 -33
- data/riemann-tools.gemspec +42 -0
- data/tools/riemann-aws/LICENSE +21 -0
- data/tools/riemann-aws/README.md +54 -0
- data/tools/riemann-aws/Rakefile +37 -0
- data/tools/riemann-aws/bin/riemann-aws-billing +93 -0
- data/tools/riemann-aws/bin/riemann-aws-rds-status +68 -0
- data/tools/riemann-aws/bin/riemann-aws-sqs-status +50 -0
- data/tools/riemann-aws/bin/riemann-aws-status +83 -0
- data/tools/riemann-aws/bin/riemann-elb-metrics +168 -0
- data/tools/riemann-aws/bin/riemann-s3-list +87 -0
- data/tools/riemann-aws/bin/riemann-s3-status +102 -0
- data/tools/riemann-chronos/LICENSE +21 -0
- data/tools/riemann-chronos/README.md +10 -0
- data/tools/riemann-chronos/Rakefile +37 -0
- data/tools/riemann-chronos/bin/riemann-chronos +161 -0
- data/tools/riemann-docker/LICENSE +21 -0
- data/tools/riemann-docker/README.md +10 -0
- data/tools/riemann-docker/Rakefile +36 -0
- data/tools/riemann-docker/bin/riemann-docker +206 -0
- data/tools/riemann-elasticsearch/LICENSE +21 -0
- data/tools/riemann-elasticsearch/README.md +10 -0
- data/tools/riemann-elasticsearch/Rakefile +37 -0
- data/tools/riemann-elasticsearch/bin/riemann-elasticsearch +174 -0
- data/tools/riemann-marathon/LICENSE +21 -0
- data/tools/riemann-marathon/README.md +10 -0
- data/tools/riemann-marathon/Rakefile +37 -0
- data/tools/riemann-marathon/bin/riemann-marathon +163 -0
- data/tools/riemann-mesos/LICENSE +21 -0
- data/tools/riemann-mesos/README.md +10 -0
- data/tools/riemann-mesos/Rakefile +37 -0
- data/tools/riemann-mesos/bin/riemann-mesos +146 -0
- data/tools/riemann-munin/LICENSE +21 -0
- data/tools/riemann-munin/README.md +10 -0
- data/tools/riemann-munin/Rakefile +36 -0
- data/tools/riemann-munin/bin/riemann-munin +43 -0
- data/tools/riemann-rabbitmq/LICENSE +21 -0
- data/tools/riemann-rabbitmq/README.md +10 -0
- data/tools/riemann-rabbitmq/Rakefile +37 -0
- data/tools/riemann-rabbitmq/bin/riemann-rabbitmq +273 -0
- data/tools/riemann-riak/LICENSE +21 -0
- data/tools/riemann-riak/README.md +10 -0
- data/tools/riemann-riak/Rakefile +36 -0
- data/tools/riemann-riak/bin/riemann-riak +323 -0
- data/tools/riemann-riak/bin/riemann-riak-keys +13 -0
- data/tools/riemann-riak/bin/riemann-riak-ring +9 -0
- data/tools/riemann-riak/riak_status/key_count.erl +13 -0
- data/tools/riemann-riak/riak_status/riak_status.rb +152 -0
- data/tools/riemann-riak/riak_status/ringready.erl +9 -0
- metadata +195 -34
data/bin/riemann-freeswitch
CHANGED
@@ -1,118 +1,124 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
+
require 'English'
|
4
5
|
|
5
|
-
|
6
|
-
include Riemann::Tools
|
6
|
+
Process.setproctitle($PROGRAM_NAME)
|
7
7
|
|
8
|
-
|
9
|
-
opt :calls_critical, "Calls critical threshold", :default => 300
|
10
|
-
opt :pid_file, "FreeSWITCH daemon pidfile", :type => String, :default => "/var/run/freeswitch/freeswitch.pid"
|
8
|
+
require File.expand_path('../lib/riemann/tools', __dir__)
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
10
|
+
module Riemann
|
11
|
+
module Tools
|
12
|
+
class FreeSWITCH
|
13
|
+
include Riemann::Tools
|
17
14
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
false
|
22
|
-
rescue Errno::ESRCH
|
23
|
-
true
|
24
|
-
end
|
25
|
-
end
|
15
|
+
opt :calls_warning, 'Calls warning threshold', default: 100
|
16
|
+
opt :calls_critical, 'Calls critical threshold', default: 300
|
17
|
+
opt :pid_file, 'FreeSWITCH daemon pidfile', type: String, default: '/var/run/freeswitch/freeswitch.pid'
|
26
18
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
:metric => metric.to_f,
|
32
|
-
:description => description
|
33
|
-
)
|
34
|
-
end
|
35
|
-
|
36
|
-
def exec_with_timeout(cmd, timeout)
|
37
|
-
pid = Process.spawn(cmd, {[:err,:out] => :close, :pgroup => true})
|
38
|
-
begin
|
39
|
-
Timeout.timeout(timeout) do
|
40
|
-
Process.waitpid(pid, 0)
|
41
|
-
$?.exitstatus == 0
|
19
|
+
def initialize
|
20
|
+
@limits = {
|
21
|
+
calls: { critical: opts[:calls_critical], warning: opts[:calls_warning] },
|
22
|
+
}
|
42
23
|
end
|
43
|
-
rescue Timeout::Error
|
44
|
-
Process.kill(15, -Process.getpgid(pid))
|
45
|
-
puts "Killed pid: #{pid}"
|
46
|
-
false
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
def tick
|
51
|
-
# Determine how many current calls I have according to FreeSWITCH
|
52
|
-
fs_calls = %x[fs_cli -x "show calls count"| grep -Po '^\\d+'].to_i
|
53
|
-
|
54
|
-
# Determine how many current channels I have according to FreeSWITCH
|
55
|
-
fs_channels = %x[fs_cli -x "show channels count"| grep -Po '^\\d+'].to_i
|
56
|
-
|
57
|
-
# Determine how many conferences I have according to FreeSWITCH
|
58
|
-
fs_conferences = %x[fs_cli -x "conference list"| grep -Pco '^Conference'].to_i
|
59
|
-
|
60
|
-
# Try to read pidfile. If it fails use Devil's dummy PID
|
61
|
-
begin
|
62
|
-
fs_pid = File.read(opts[:pid_file]).to_i
|
63
|
-
rescue
|
64
|
-
puts "Couldn't read pidfile: #{opts[:pid_file]}"
|
65
|
-
fs_pid = -666
|
66
|
-
end
|
67
|
-
|
68
|
-
fs_threads = fs_pid > 0 ? %x[ps huH p #{fs_pid} | wc -l].to_i : 0
|
69
24
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
alert "FreeSWITCH current calls", :ok, fs_calls, "Number of calls are #{fs_calls}"
|
77
|
-
end
|
78
|
-
|
79
|
-
# Submit channels to riemann
|
80
|
-
if fs_channels > @limits[:calls][:critical]
|
81
|
-
alert "FreeSWITCH current channels", :critical, fs_channels, "Number of channels are #{fs_channels}"
|
82
|
-
elsif fs_channels > @limits[:calls][:warning]
|
83
|
-
alert "FreeSWITCH current channels", :warning, fs_channels, "Number of channels are #{fs_channels}"
|
84
|
-
else
|
85
|
-
alert "FreeSWITCH current channels", :ok, fs_channels, "Number of channels are #{fs_channels}"
|
86
|
-
end
|
87
|
-
|
88
|
-
# Submit conferences to riemann
|
89
|
-
if fs_conferences > @limits[:calls][:critical]
|
90
|
-
alert "FreeSWITCH current conferences", :critical, fs_conferences, "Number of conferences are #{fs_conferences}"
|
91
|
-
elsif fs_conferences > @limits[:calls][:warning]
|
92
|
-
alert "FreeSWITCH current conferences", :warning, fs_conferences, "Number of conferences are #{fs_conferences}"
|
93
|
-
else
|
94
|
-
alert "FreeSWITCH current conferences", :ok, fs_conferences, "Number of conferences are #{fs_conferences}"
|
95
|
-
end
|
25
|
+
def dead_proc?(pid)
|
26
|
+
Process.getpgid(pid)
|
27
|
+
false
|
28
|
+
rescue Errno::ESRCH
|
29
|
+
true
|
30
|
+
end
|
96
31
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
32
|
+
def alert(service, state, metric, description)
|
33
|
+
report(
|
34
|
+
service: service.to_s,
|
35
|
+
state: state.to_s,
|
36
|
+
metric: metric.to_f,
|
37
|
+
description: description,
|
38
|
+
)
|
39
|
+
end
|
101
40
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
41
|
+
def exec_with_timeout(cmd, timeout)
|
42
|
+
pid = Process.spawn(cmd, { %i[err out] => :close, :pgroup => true })
|
43
|
+
begin
|
44
|
+
Timeout.timeout(timeout) do
|
45
|
+
Process.waitpid(pid, 0)
|
46
|
+
$CHILD_STATUS.exitstatus.zero?
|
47
|
+
end
|
48
|
+
rescue Timeout::Error
|
49
|
+
Process.kill(15, -Process.getpgid(pid))
|
50
|
+
puts "Killed pid: #{pid}"
|
51
|
+
false
|
52
|
+
end
|
53
|
+
end
|
108
54
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
55
|
+
def tick
|
56
|
+
# Determine how many current calls I have according to FreeSWITCH
|
57
|
+
fs_calls = `fs_cli -x "show calls count"| grep -Po '^\\d+'`.to_i
|
58
|
+
|
59
|
+
# Determine how many current channels I have according to FreeSWITCH
|
60
|
+
fs_channels = `fs_cli -x "show channels count"| grep -Po '^\\d+'`.to_i
|
61
|
+
|
62
|
+
# Determine how many conferences I have according to FreeSWITCH
|
63
|
+
fs_conferences = `fs_cli -x "conference list"| grep -Pco '^Conference'`.to_i
|
64
|
+
|
65
|
+
# Try to read pidfile. If it fails use Devil's dummy PID
|
66
|
+
begin
|
67
|
+
fs_pid = File.read(opts[:pid_file]).to_i
|
68
|
+
rescue StandardError
|
69
|
+
puts "Couldn't read pidfile: #{opts[:pid_file]}"
|
70
|
+
fs_pid = -666
|
71
|
+
end
|
72
|
+
|
73
|
+
fs_threads = fs_pid.positive? ? `ps huH p #{fs_pid} | wc -l`.to_i : 0
|
74
|
+
|
75
|
+
# Submit calls to riemann
|
76
|
+
if fs_calls > @limits[:calls][:critical]
|
77
|
+
alert 'FreeSWITCH current calls', :critical, fs_calls, "Number of calls are #{fs_calls}"
|
78
|
+
elsif fs_calls > @limits[:calls][:warning]
|
79
|
+
alert 'FreeSWITCH current calls', :warning, fs_calls, "Number of calls are #{fs_calls}"
|
80
|
+
else
|
81
|
+
alert 'FreeSWITCH current calls', :ok, fs_calls, "Number of calls are #{fs_calls}"
|
82
|
+
end
|
83
|
+
|
84
|
+
# Submit channels to riemann
|
85
|
+
if fs_channels > @limits[:calls][:critical]
|
86
|
+
alert 'FreeSWITCH current channels', :critical, fs_channels, "Number of channels are #{fs_channels}"
|
87
|
+
elsif fs_channels > @limits[:calls][:warning]
|
88
|
+
alert 'FreeSWITCH current channels', :warning, fs_channels, "Number of channels are #{fs_channels}"
|
89
|
+
else
|
90
|
+
alert 'FreeSWITCH current channels', :ok, fs_channels, "Number of channels are #{fs_channels}"
|
91
|
+
end
|
92
|
+
|
93
|
+
# Submit conferences to riemann
|
94
|
+
if fs_conferences > @limits[:calls][:critical]
|
95
|
+
alert 'FreeSWITCH current conferences', :critical, fs_conferences,
|
96
|
+
"Number of conferences are #{fs_conferences}"
|
97
|
+
elsif fs_conferences > @limits[:calls][:warning]
|
98
|
+
alert 'FreeSWITCH current conferences', :warning, fs_conferences,
|
99
|
+
"Number of conferences are #{fs_conferences}"
|
100
|
+
else
|
101
|
+
alert 'FreeSWITCH current conferences', :ok, fs_conferences, "Number of conferences are #{fs_conferences}"
|
102
|
+
end
|
103
|
+
|
104
|
+
# Submit threads to riemann
|
105
|
+
alert 'FreeSWITCH current threads', :ok, fs_threads, "Number of threads are #{fs_threads}" if fs_threads
|
106
|
+
|
107
|
+
# Submit status to riemann
|
108
|
+
if dead_proc?(fs_pid)
|
109
|
+
alert 'FreeSWITCH status', :critical, -1, 'FreeSWITCH service status: not running'
|
110
|
+
else
|
111
|
+
alert 'FreeSWITCH status', :ok, nil, 'FreeSWITCH service status: running'
|
112
|
+
end
|
113
|
+
|
114
|
+
# Submit CLI status to riemann using timeout in case it's unresponsive
|
115
|
+
if exec_with_timeout('fs_cli -x status', 2)
|
116
|
+
alert 'FreeSWITCH CLI status', :ok, nil, 'FreeSWITCH CLI status: responsive'
|
117
|
+
else
|
118
|
+
alert 'FreeSWITCH CLI status', :critical, -1, 'FreeSWITCH CLI status: not responding'
|
119
|
+
end
|
120
|
+
end
|
114
121
|
end
|
115
|
-
|
116
122
|
end
|
117
123
|
end
|
118
124
|
|
data/bin/riemann-haproxy
CHANGED
@@ -1,58 +1,65 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
Process.setproctitle($PROGRAM_NAME)
|
2
5
|
|
3
6
|
# Gathers haproxy CSV statistics and submits them to Riemann.
|
4
7
|
|
5
|
-
require File.expand_path('
|
8
|
+
require File.expand_path('../lib/riemann/tools', __dir__)
|
6
9
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
module Riemann
|
11
|
+
module Tools
|
12
|
+
class Haproxy
|
13
|
+
include Riemann::Tools
|
14
|
+
require 'net/http'
|
15
|
+
require 'csv'
|
11
16
|
|
12
|
-
|
17
|
+
opt :stats_url, 'Full url to haproxy stats (eg: https://user:password@host.com:9999/stats)', required: true,
|
18
|
+
type: :string
|
13
19
|
|
14
|
-
|
15
|
-
|
16
|
-
|
20
|
+
def initialize
|
21
|
+
@uri = URI("#{opts[:stats_url]};csv")
|
22
|
+
end
|
23
|
+
|
24
|
+
def tick
|
25
|
+
csv.each do |row|
|
26
|
+
row = row.to_hash
|
27
|
+
ns = "haproxy #{row['pxname']} #{row['svname']}"
|
28
|
+
row.each do |property, metric|
|
29
|
+
next if property.nil? || property == 'pxname' || property == 'svname'
|
30
|
+
|
31
|
+
report(
|
32
|
+
host: @uri.host,
|
33
|
+
service: "#{ns} #{property}",
|
34
|
+
metric: metric.to_f,
|
35
|
+
tags: ['haproxy'],
|
36
|
+
)
|
37
|
+
end
|
17
38
|
|
18
|
-
def tick
|
19
|
-
csv = CSV.parse(get_csv.body.split("# ")[1], { :headers => true })
|
20
|
-
csv.each do |row|
|
21
|
-
row = row.to_hash
|
22
|
-
ns = "haproxy #{row['pxname']} #{row['svname']}"
|
23
|
-
row.each do |property, metric|
|
24
|
-
unless (property.nil? || property == 'pxname' || property == 'svname')
|
25
39
|
report(
|
26
|
-
:
|
27
|
-
:
|
28
|
-
:
|
29
|
-
:
|
40
|
+
host: @uri.host,
|
41
|
+
service: "#{ns} state",
|
42
|
+
state: (%w[UP OPEN].include?(row['status']) ? 'ok' : 'critical'),
|
43
|
+
tags: ['haproxy'],
|
30
44
|
)
|
31
45
|
end
|
32
46
|
end
|
33
47
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
http.start do |h|
|
47
|
-
get = Net::HTTP::Get.new(@uri.request_uri)
|
48
|
-
unless @uri.userinfo.nil?
|
49
|
-
userinfo = @uri.userinfo.split(":")
|
50
|
-
get.basic_auth userinfo[0], userinfo[1]
|
48
|
+
def csv
|
49
|
+
http = Net::HTTP.new(@uri.host, @uri.port)
|
50
|
+
http.use_ssl = true if @uri.scheme == 'https'
|
51
|
+
http.start do |h|
|
52
|
+
get = Net::HTTP::Get.new(@uri.request_uri)
|
53
|
+
unless @uri.userinfo.nil?
|
54
|
+
userinfo = @uri.userinfo.split(':')
|
55
|
+
get.basic_auth userinfo[0], userinfo[1]
|
56
|
+
end
|
57
|
+
h.request get
|
58
|
+
end
|
59
|
+
CSV.parse(http.body.split('# ')[1], { headers: true })
|
51
60
|
end
|
52
|
-
h.request get
|
53
61
|
end
|
54
62
|
end
|
55
|
-
|
56
63
|
end
|
57
64
|
|
58
65
|
Riemann::Tools::Haproxy.run
|
data/bin/riemann-health
CHANGED
@@ -1,341 +1,10 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
require File.expand_path('../../lib/riemann/tools', __FILE__)
|
6
|
-
|
7
|
-
class Riemann::Tools::Health
|
8
|
-
include Riemann::Tools
|
9
|
-
|
10
|
-
opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9
|
11
|
-
opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95
|
12
|
-
opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9
|
13
|
-
opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95
|
14
|
-
opt :load_warning, "Load warning threshold (load average / core)", :default => 3
|
15
|
-
opt :load_critical, "Load critical threshold (load average / core)", :default => 8
|
16
|
-
opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85
|
17
|
-
opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95
|
18
|
-
opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'load', 'memory', 'disk']
|
19
|
-
|
20
|
-
def initialize
|
21
|
-
@limits = {
|
22
|
-
:cpu => {:critical => opts[:cpu_critical], :warning => opts[:cpu_warning]},
|
23
|
-
:disk => {:critical => opts[:disk_critical], :warning => opts[:disk_warning]},
|
24
|
-
:load => {:critical => opts[:load_critical], :warning => opts[:load_warning]},
|
25
|
-
:memory => {:critical => opts[:memory_critical], :warning => opts[:memory_warning]}
|
26
|
-
}
|
27
|
-
case (@ostype = `uname -s`.chomp.downcase)
|
28
|
-
when 'darwin'
|
29
|
-
@cores = `sysctl -n hw.ncpu`.to_i
|
30
|
-
@cpu = method :darwin_cpu
|
31
|
-
@disk = method :disk
|
32
|
-
@load = method :darwin_load
|
33
|
-
@memory = method :darwin_memory
|
34
|
-
darwin_top
|
35
|
-
when 'freebsd'
|
36
|
-
@cores = `sysctl -n hw.ncpu`.to_i
|
37
|
-
@cpu = method :freebsd_cpu
|
38
|
-
@disk = method :disk
|
39
|
-
@load = method :bsd_load
|
40
|
-
@memory = method :freebsd_memory
|
41
|
-
when 'openbsd'
|
42
|
-
@cores = `sysctl -n hw.ncpu`.to_i
|
43
|
-
@cpu = method :openbsd_cpu
|
44
|
-
@disk = method :disk
|
45
|
-
@load = method :bsd_load
|
46
|
-
@memory = method :openbsd_memory
|
47
|
-
when 'sunos'
|
48
|
-
@cores = `mpstat -a 2>/dev/null`.split[33].to_i
|
49
|
-
@cpu = method :sunos_cpu
|
50
|
-
@disk = method :disk
|
51
|
-
@load = method :bsd_load
|
52
|
-
@memory = method :sunos_memory
|
53
|
-
else
|
54
|
-
@cores = `nproc`.to_i
|
55
|
-
puts "WARNING: OS '#{@ostype}' not explicitly supported. Falling back to Linux" unless @ostype == "linux"
|
56
|
-
@cpu = method :linux_cpu
|
57
|
-
@disk = method :disk
|
58
|
-
@load = method :linux_load
|
59
|
-
@memory = method :linux_memory
|
60
|
-
end
|
61
|
-
|
62
|
-
opts[:checks].each do |check|
|
63
|
-
case check
|
64
|
-
when "disk"
|
65
|
-
@disk_enabled = true
|
66
|
-
when "load"
|
67
|
-
@load_enabled = true
|
68
|
-
when "cpu"
|
69
|
-
@cpu_enabled = true
|
70
|
-
when "memory"
|
71
|
-
@memory_enabled = true
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
def alert(service, state, metric, description)
|
77
|
-
report(
|
78
|
-
:service => service.to_s,
|
79
|
-
:state => state.to_s,
|
80
|
-
:metric => metric.to_f,
|
81
|
-
:description => description
|
82
|
-
)
|
83
|
-
end
|
84
|
-
|
85
|
-
def report_pct(service, fraction, report)
|
86
|
-
if fraction
|
87
|
-
if fraction > @limits[service][:critical]
|
88
|
-
alert service, :critical, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
|
89
|
-
elsif fraction > @limits[service][:warning]
|
90
|
-
alert service, :warning, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
|
91
|
-
else
|
92
|
-
alert service, :ok, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
def linux_cpu
|
98
|
-
new = File.read('/proc/stat')
|
99
|
-
unless new[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
|
100
|
-
alert 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
|
101
|
-
return false
|
102
|
-
end
|
103
|
-
u2, n2, s2, i2 = [$1, $2, $3, $4].map { |e| e.to_i }
|
104
|
-
|
105
|
-
if @old_cpu
|
106
|
-
u1, n1, s1, i1 = @old_cpu
|
107
|
-
|
108
|
-
used = (u2+n2+s2) - (u1+n1+s1)
|
109
|
-
total = used + i2-i1
|
110
|
-
fraction = used.to_f / total
|
111
|
-
|
112
|
-
report_pct :cpu, fraction, "user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
113
|
-
end
|
114
|
-
|
115
|
-
@old_cpu = [u2, n2, s2, i2]
|
116
|
-
end
|
117
|
-
|
118
|
-
def linux_load
|
119
|
-
load = File.read('/proc/loadavg').split(/\s+/)[0].to_f / @cores
|
120
|
-
if load > @limits[:load][:critical]
|
121
|
-
alert "load", :critical, load, "1-minute load average/core is #{load}"
|
122
|
-
elsif load > @limits[:load][:warning]
|
123
|
-
alert "load", :warning, load, "1-minute load average/core is #{load}"
|
124
|
-
else
|
125
|
-
alert "load", :ok, load, "1-minute load average/core is #{load}"
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
def linux_memory
|
130
|
-
m = File.read('/proc/meminfo').split(/\n/).inject({}) { |info, line|
|
131
|
-
x = line.split(/:?\s+/)
|
132
|
-
# Assume kB...
|
133
|
-
info[x[0]] = x[1].to_i
|
134
|
-
info
|
135
|
-
}
|
136
|
-
|
137
|
-
free = m['MemFree'].to_i + m['Buffers'].to_i + m['Cached'].to_i
|
138
|
-
total = m['MemTotal'].to_i
|
139
|
-
fraction = 1 - (free.to_f / total)
|
140
|
-
|
141
|
-
report_pct :memory, fraction, "used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
142
|
-
end
|
143
|
-
|
144
|
-
def freebsd_cpu
|
145
|
-
u2, n2, s2, t2, i2 = `sysctl -n kern.cp_time 2>/dev/null`.split.map{ |e| e.to_i } #FreeBSD has 5 cpu stats
|
146
|
-
|
147
|
-
if @old_cpu
|
148
|
-
u1, n1, s1, t1, i1 = @old_cpu
|
149
|
-
|
150
|
-
used = (u2+n2+s2+t2) - (u1+n1+s1+t1)
|
151
|
-
total = used + i2-i1
|
152
|
-
fraction = used.to_f / total
|
153
|
-
|
154
|
-
report_pct :cpu, fraction, "user+nice+sytem+interrupt\n\n#{`ps -axo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
155
|
-
end
|
156
|
-
|
157
|
-
@old_cpu = [u2, n2, s2, t2, i2]
|
158
|
-
end
|
159
|
-
|
160
|
-
def openbsd_cpu
|
161
|
-
u2, n2, s2, t2, i2 = `sysctl -n kern.cp_time 2>/dev/null`.split(',').map{ |e| e.to_i } #OpenBSD separates with ,
|
162
|
-
|
163
|
-
if @old_cpu
|
164
|
-
u1, n1, s1, t1, i1 = @old_cpu
|
4
|
+
Process.setproctitle($PROGRAM_NAME)
|
165
5
|
|
166
|
-
|
167
|
-
total = used + i2-i1
|
168
|
-
fraction = used.to_f / total
|
169
|
-
|
170
|
-
report_pct :cpu, fraction, "user+nice+sytem+interrupt\n\n#{`ps -axo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
171
|
-
end
|
172
|
-
|
173
|
-
@old_cpu = [u2, n2, s2, t2, i2]
|
174
|
-
end
|
175
|
-
|
176
|
-
def sunos_cpu
|
177
|
-
mpstats = `mpstat -a 2>/dev/null`.split
|
178
|
-
u2 = mpstats[29].to_i
|
179
|
-
s2 = mpstats[30].to_i
|
180
|
-
t2 = mpstats[31].to_i
|
181
|
-
i2 = mpstats[32].to_i
|
182
|
-
|
183
|
-
if @old_cpu
|
184
|
-
u1, s1, t1, i1 = @old_cpu
|
185
|
-
|
186
|
-
used = (u2+s2+t2) - (u1+s1+t1)
|
187
|
-
total = used + i2-i1
|
188
|
-
if i2 == i1 && used == 0 #If the system is <1% used in both samples then total will be 0 + (99 - 99), avoid a div by 0
|
189
|
-
fraction = 0
|
190
|
-
else
|
191
|
-
fraction = used.to_f / total
|
192
|
-
end
|
193
|
-
|
194
|
-
report_pct :cpu, fraction, "user+sytem+interrupt\n\n#{`ps -ao pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
195
|
-
end
|
196
|
-
|
197
|
-
@old_cpu = [u2, s2, t2, i2]
|
198
|
-
end
|
199
|
-
|
200
|
-
def bsd_load
|
201
|
-
m = `uptime`.split(':')[-1].chomp.gsub(/\s+/,'').split(',')
|
202
|
-
load = m[0].to_f / @cores
|
203
|
-
if load > @limits[:load][:critical]
|
204
|
-
alert "load", :critical, load, "1-minute load average/core is #{load}"
|
205
|
-
elsif load > @limits[:load][:warning]
|
206
|
-
alert "load", :warning, load, "1-minute load average/core is #{load}"
|
207
|
-
else
|
208
|
-
alert "load", :ok, load, "1-minute load average/core is #{load}"
|
209
|
-
end
|
210
|
-
end
|
211
|
-
|
212
|
-
def freebsd_memory
|
213
|
-
meminfo = `sysctl -n vm.stats.vm.v_page_count vm.stats.vm.v_wire_count vm.stats.vm.v_active_count 2>/dev/null`.chomp.split
|
214
|
-
fraction = (meminfo[1].to_f + meminfo[2].to_f) / meminfo[0].to_f
|
215
|
-
|
216
|
-
report_pct :memory, fraction, "used\n\n#{`ps -axo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
217
|
-
end
|
218
|
-
|
219
|
-
def openbsd_memory
|
220
|
-
meminfo = `vmstat 2>/dev/null`.chomp.split
|
221
|
-
fraction = meminfo[28].to_f / meminfo[29].to_f #The ratio of active to free memory unlike the others :(
|
222
|
-
|
223
|
-
report_pct :memory, fraction, "used\n\n#{`ps -axo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
224
|
-
end
|
225
|
-
|
226
|
-
def sunos_memory
|
227
|
-
meminfo = `vmstat 2>/dev/null`.chomp.split
|
228
|
-
total_mem = `prtconf | grep Memory`.split[2].to_f * 1024 # reports in GB but vmstat is in MB
|
229
|
-
fraction = ( total_mem - meminfo[32].to_f ) / total_mem
|
230
|
-
|
231
|
-
report_pct :memory, fraction, "used\n\n#{`ps -ao pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
232
|
-
end
|
233
|
-
|
234
|
-
def darwin_top
|
235
|
-
raw = `top -l 1 | grep -i "^\\(cpu\\|physmem\\|load\\)"`.chomp
|
236
|
-
@topdata = {:stamp => Time.now.to_i }
|
237
|
-
raw.each_line do |ln|
|
238
|
-
if ln.match(/Load Avg: [0-9.]+, [0-9.]+, ([0-9.])+/i)
|
239
|
-
@topdata[:load] = $1.to_f
|
240
|
-
elsif ln.match(/CPU usage: [0-9.]+% user, [0-9.]+% sys, ([0-9.]+)% idle/i)
|
241
|
-
@topdata[:cpu] = 1 - ($1.to_f / 100)
|
242
|
-
elsif mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) wired, ([0-9]+)([BKMGT]) active, ([0-9]+)([BKMGT]) inactive, ([0-9]+)([BKMGT]) used, ([0-9]+)([BKMGT]) free/i)
|
243
|
-
wired = mdat[1].to_i * (1024 ** "BKMGT".index(mdat[2]))
|
244
|
-
active = mdat[3].to_i * (1024 ** "BKMGT".index(mdat[4]))
|
245
|
-
inactive = mdat[5].to_i * (1024 ** "BKMGT".index(mdat[6]))
|
246
|
-
used = mdat[7].to_i * (1024 ** "BKMGT".index(mdat[8]))
|
247
|
-
free = mdat[9].to_i * (1024 ** "BKMGT".index(mdat[10]))
|
248
|
-
@topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
|
249
|
-
# This is for OSX Mavericks which
|
250
|
-
# uses a different format for top
|
251
|
-
# Example: PhysMem: 4662M used (1328M wired), 2782M unused.
|
252
|
-
elsif mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) used \(([0-9]+)([BKMGT]) wired\), ([0-9]+)([BKMGT]) unused/i)
|
253
|
-
used = mdat[1].to_i * (1024 ** "BKMGT".index(mdat[2]))
|
254
|
-
wired = mdat[3].to_i * (1024 ** "BKMGT".index(mdat[4]))
|
255
|
-
unused = mdat[5].to_i * (1024 ** "BKMGT".index(mdat[6]))
|
256
|
-
@topdata[:memory] = (used).to_f / (used + unused)
|
257
|
-
end
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
def darwin_cpu
|
262
|
-
darwin_top unless (Time.now.to_i - @topdata[:stamp]) < opts[:interval]
|
263
|
-
unless @topdata[:cpu]
|
264
|
-
alert 'cpu', :unknown, nil, "unable to get CPU stats from top"
|
265
|
-
return false
|
266
|
-
end
|
267
|
-
report_pct :cpu, @topdata[:cpu], "usage\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
268
|
-
end
|
269
|
-
|
270
|
-
def darwin_load
|
271
|
-
darwin_top unless (Time.now.to_i - @topdata[:stamp]) < opts[:interval]
|
272
|
-
unless @topdata[:load]
|
273
|
-
alert 'load', :unknown, nil, "unable to get load ave from top"
|
274
|
-
return false
|
275
|
-
end
|
276
|
-
metric = @topdata[:load] / @cores
|
277
|
-
if metric > @limits[:load][:critical]
|
278
|
-
alert "load", :critical, metric, "1-minute load average per core is #{metric}"
|
279
|
-
elsif metric > @limits[:load][:warning]
|
280
|
-
alert "load", :warning, metric, "1-minute load average per core is #{metric}"
|
281
|
-
else
|
282
|
-
alert "load", :ok, metric, "1-minute load average per core is #{metric}"
|
283
|
-
end
|
284
|
-
end
|
285
|
-
|
286
|
-
def darwin_memory
|
287
|
-
darwin_top unless (Time.now.to_i - @topdata[:stamp]) < opts[:interval]
|
288
|
-
unless @topdata[:memory]
|
289
|
-
alert 'memory', :unknown, nil, "unable to get memory data from top"
|
290
|
-
return false
|
291
|
-
end
|
292
|
-
report_pct :memory, @topdata[:memory], "usage\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
|
293
|
-
end
|
294
|
-
|
295
|
-
def df
|
296
|
-
case @ostype
|
297
|
-
when 'darwin', 'freebsd', 'openbsd'
|
298
|
-
`df -P -t noiso9660`
|
299
|
-
when 'sunos'
|
300
|
-
`df -P` # Is there a good way to exlude iso9660 here?
|
301
|
-
else
|
302
|
-
`df -P --exclude-type=iso9660`
|
303
|
-
end
|
304
|
-
end
|
305
|
-
|
306
|
-
def disk
|
307
|
-
df.split(/\n/).each do |r|
|
308
|
-
f = r.split(/\s+/)
|
309
|
-
next if f[0] == 'Filesystem'
|
310
|
-
next unless f[0] =~ /\// # Needs at least one slash in the mount path
|
311
|
-
|
312
|
-
# Calculate capacity
|
313
|
-
x = f[4].to_f/100
|
314
|
-
|
315
|
-
if x > @limits[:disk][:critical]
|
316
|
-
alert "disk #{f[5]}", :critical, x, "#{f[4]} used"
|
317
|
-
elsif x > @limits[:disk][:warning]
|
318
|
-
alert "disk #{f[5]}", :warning, x, "#{f[4]} used"
|
319
|
-
else
|
320
|
-
alert "disk #{f[5]}", :ok, x, "#{f[4]} used"
|
321
|
-
end
|
322
|
-
end
|
323
|
-
end
|
6
|
+
# Reports current CPU, disk, load average, and memory use to riemann.
|
324
7
|
|
325
|
-
|
326
|
-
if @cpu_enabled
|
327
|
-
@cpu.call
|
328
|
-
end
|
329
|
-
if @memory_enabled
|
330
|
-
@memory.call
|
331
|
-
end
|
332
|
-
if @disk_enabled
|
333
|
-
@disk.call
|
334
|
-
end
|
335
|
-
if @load_enabled
|
336
|
-
@load.call
|
337
|
-
end
|
338
|
-
end
|
339
|
-
end
|
8
|
+
require 'riemann/tools/health'
|
340
9
|
|
341
10
|
Riemann::Tools::Health.run
|