riemann-tools.haf 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Reports current file descriptor use to riemann.
4
+ # By default reports the total system fd usage, can also report usage of individual processes
5
+
6
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
7
+
8
# Reports open file descriptor usage to Riemann: always the system-wide
# total and, when --processes is given, one event per listed pid.
#
# NOTE(review): the general health reporter also defines a class named
# Riemann::Tools::Health; confirm the two scripts are never loaded into the
# same process.
class Riemann::Tools::Health
  include Riemann::Tools

  opt :fd_sys_warning, "open file descriptor threshold for system", :default => 800
  opt :fd_sys_critical, "open file descriptor critical threshold for system", :default => 900
  opt :fd_proc_warning, "open file descriptor threshold for process", :default => 800
  opt :fd_proc_critical, "open file descriptor critical threshold for process", :default => 900
  opt :processes, "list of processes to measure fd usage in addition to system total", :type => :ints

  # Captures the configured thresholds and selects the collector. Only a
  # Linux collector exists, so every other OS gets a warning on stdout and
  # the Linux implementation.
  def initialize
    @limits = {
      :fd => {:critical => opts[:fd_sys_critical], :warning => opts[:fd_sys_warning]},
      :process => {:critical => opts[:fd_proc_critical], :warning => opts[:fd_proc_warning]},
    }
    ostype = `uname -s`.chomp.downcase
    puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == "linux"
    @fd = method :linux_fd
  end

  # Sends one event through report.
  #
  # service     - service name (converted with to_s)
  # state       - state (converted with to_s)
  # metric      - numeric value (converted with to_f)
  # description - free-form text
  def alert(service, state, metric, description)
    report(
      :service => service.to_s,
      :state => state.to_s,
      :metric => metric.to_f,
      :description => description
    )
  end

  # Measures fd usage as the line count of lsof output and reports it for
  # the whole system, then for every pid in opts[:processes] (if any).
  def linux_fd
    sys_used = Integer(`lsof | wc -l`)
    alert "fd sys", fd_state(:fd, sys_used), sys_used, "system is using #{sys_used} fds"

    # Array() turns a missing --processes option (nil) into an empty list.
    Array(opts[:processes]).each do |process|
      used = Integer(`lsof -p #{process} | wc -l`)
      # Ask ps for exactly this pid; "comm=" suppresses the header line.
      # Grepping the full process table for the pid could match unrelated
      # rows and mis-split command names that contain spaces.
      name = `ps -p #{process} -o comm=`.strip
      alert "fd #{name} #{process}", fd_state(:process, used), used,
            "process #{name} #{process} is using #{used} fds"
    end
  end

  # Runs the collector chosen in initialize.
  def tick
    @fd.call
  end

  private

  # Grades a measurement against the limits stored under +kind+
  # (:fd or :process). Returns :critical, :warning or :ok.
  def fd_state(kind, used)
    limits = @limits[kind]
    if used > limits[:critical]
      :critical
    elsif used > limits[:warning]
      :warning
    else
      :ok
    end
  end
end
65
+
66
+ Riemann::Tools::Health.run
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
# Reports the current FreeSWITCH call and channel counts to Riemann.
class Riemann::Tools::FreeSWITCH
  include Riemann::Tools

  # Queries fs_cli for both counters, then submits one "info" event per
  # counter. The leading digits of each command's output are the metric.
  def tick
    # Both commands run before anything is reported.
    call_count = %x[fs_cli -x "show calls count"| grep -Po '^\\d+'].to_i
    channel_count = %x[fs_cli -x "show channels count"| grep -Po '^\\d+'].to_i

    readings = [
      ["FreeSWITCH current calls", call_count],
      ["FreeSWITCH current channels", channel_count]
    ]

    readings.each do |service, count|
      report(
        :service => service,
        :metric => count,
        :state => "info"
      )
    end
  end
end
30
+
31
+ Riemann::Tools::FreeSWITCH.run
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Gathers haproxy CSV statistics and submits them to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
# Fetches the haproxy CSV statistics page and submits every column of every
# row to Riemann as its own event.
class Riemann::Tools::Haproxy
  include Riemann::Tools
  require 'net/http'
  require 'csv'

  opt :stats_url, "Full url to haproxy stats (eg: https://user:password@host.com:9999/stats)", :required => true, :type => :string

  # Builds the CSV export URL by appending ";csv" to the configured stats URL.
  def initialize
    @uri = URI(opts[:stats_url]+';csv')
  end

  # Downloads the CSV and reports one event per (row, column) pair, skipping
  # the pxname/svname columns (they form the service name) and header-less
  # columns. The state is 'ok' when the row's status is UP or OPEN,
  # otherwise 'critical'.
  def tick
    # The header row starts with "# "; strip just that marker instead of
    # splitting the whole body on it, which returned nil when the marker was
    # missing and cut the data short if "# " appeared again later.
    body = get_csv.body.to_s.sub(/\A#\s*/, '')

    # Options are passed as keywords: a positional options hash is rejected
    # by the csv library shipped with Ruby 3.
    CSV.parse(body, :headers => true).each do |row|
      row = row.to_hash
      ns = "haproxy #{row['pxname']} #{row['svname']}"
      state = ['UP', 'OPEN'].include?(row['status']) ? 'ok' : 'critical'

      row.each do |property, metric|
        next if property.nil? || property == 'pxname' || property == 'svname'

        report(
          :host => @uri.host,
          :service => "#{ns} #{property}",
          :metric => metric.to_f,
          :state => state,
          :tags => ['haproxy']
        )
      end
    end
  end

  # Performs the HTTP(S) GET for the stats URL and returns the response.
  # Basic auth is sent when the URL carries user info.
  def get_csv
    http = Net::HTTP.new(@uri.host, @uri.port)
    http.use_ssl = true if @uri.scheme == 'https'
    http.start do |h|
      get = Net::HTTP::Get.new(@uri.request_uri)
      # URI#user / URI#password split on the first ':' only, so a password
      # that itself contains ':' is passed through intact.
      get.basic_auth(@uri.user, @uri.password) unless @uri.userinfo.nil?
      h.request get
    end
  end

end
51
+
52
+ Riemann::Tools::Haproxy.run
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Reports current CPU, disk, load average, and memory use to riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
+ class Riemann::Tools::Health
8
+ include Riemann::Tools
9
+
10
+ opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9
11
+ opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95
12
+ opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9
13
+ opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95
14
+ opt :load_warning, "Load warning threshold (load average / core)", :default => 3
15
+ opt :load_critical, "Load critical threshold (load average / core)", :default => 8
16
+ opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85
17
+ opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95
18
+ opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'load', 'memory', 'disk']
19
+
20
# Captures the configured thresholds, binds the OS-specific collectors
# (chosen from `uname -s`), and records which checks were requested.
def initialize
  @limits = {
    :cpu    => {:critical => opts[:cpu_critical],    :warning => opts[:cpu_warning]},
    :disk   => {:critical => opts[:disk_critical],   :warning => opts[:disk_warning]},
    :load   => {:critical => opts[:load_critical],   :warning => opts[:load_warning]},
    :memory => {:critical => opts[:memory_critical], :warning => opts[:memory_warning]}
  }

  ostype = `uname -s`.chomp.downcase
  if ostype == 'darwin'
    @cores = `sysctl -n hw.ncpu`.to_i
    @cpu = method(:darwin_cpu)
    @disk = method(:disk)
    @load = method(:darwin_load)
    @memory = method(:darwin_memory)
    # Take a first `top` sample so @topdata exists before the first tick.
    darwin_top
  elsif ostype == 'freebsd'
    @cores = `sysctl -n hw.ncpu`.to_i
    @cpu = method(:freebsd_cpu)
    @disk = method(:disk)
    @load = method(:freebsd_load)
    @memory = method(:freebsd_memory)
  else
    # Anything else is treated as Linux; warn when that is only a guess.
    @cores = cores
    puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == "linux"
    @cpu = method(:linux_cpu)
    @disk = method(:disk)
    @load = method(:linux_load)
    @memory = method(:linux_memory)
  end

  # Flags stay unset (nil) for checks that were not requested; unknown
  # check names are ignored.
  requested = opts[:checks]
  @disk_enabled = true if requested.include?("disk")
  @load_enabled = true if requested.include?("load")
  @cpu_enabled = true if requested.include?("cpu")
  @memory_enabled = true if requested.include?("memory")
end
63
+
64
# Sends one event through report.
#
# service     - service name (converted with to_s)
# state       - state (converted with to_s)
# metric      - numeric value (converted with to_f, so nil becomes 0.0)
# description - free-form text, passed through unchanged
def alert(service, state, metric, description)
  report(
    service: service.to_s,
    state: state.to_s,
    metric: metric.to_f,
    description: description
  )
end
72
+
73
# Counts CPU cores from /proc/cpuinfo.
#
# Each blank-line-separated entry is reduced to a key: "physical:core" when
# both ids are present, "physical:" when only the physical id is, and a
# fresh counter value otherwise (so such entries are each counted). The
# result is the number of distinct keys.
def cores
  unnamed = 0

  keys = File.read("/proc/cpuinfo").split(/\n\n/).map do |entry|
    physical_id = entry[/physical id\s+:\s+(\d+)/, 1]
    core_id = entry[/core id\s+:\s+(\d+)/, 1]

    if physical_id && core_id
      "#{physical_id}:#{core_id}"
    elsif physical_id
      "#{physical_id}:"
    else
      unnamed += 1
    end
  end

  keys.uniq.size
end
89
+
90
# Grades a fractional measurement against @limits[service] and sends it
# through alert with a "NN.NN% <report>" description.
#
# service  - key into @limits, also used as the event's service
# fraction - measured value; nothing is sent when it is nil or false
# report   - text appended after the formatted percentage
def report_pct(service, fraction, report)
  return unless fraction

  state =
    if fraction > @limits[service][:critical]
      :critical
    elsif fraction > @limits[service][:warning]
      :warning
    else
      :ok
    end

  alert service, state, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
end
101
+
102
# Reports CPU usage on Linux as the fraction of jiffies spent in
# user+nice+system since the previous call, read from /proc/stat.
#
# The first call only records a baseline. Returns false (after an :unknown
# alert) when /proc/stat has no cpu line; otherwise returns the sample that
# was stored for the next call.
def linux_cpu
  stat = File.read('/proc/stat')
  unless stat[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
    alert 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
    return false
  end
  u2, n2, s2, i2 = [$1, $2, $3, $4].map { |e| e.to_i }

  if @old_cpu
    u1, n1, s1, i1 = @old_cpu

    used = (u2 + n2 + s2) - (u1 + n1 + s1)
    total = used + i2 - i1

    # With no elapsed jiffies the division yields NaN, which compares false
    # against every threshold and would be reported as "ok"; skip the
    # report until the counters have advanced.
    if total > 0
      fraction = used.to_f / total
      report_pct :cpu, fraction, "user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    end
  end

  @old_cpu = [u2, n2, s2, i2]
end
122
+
123
# Reports the third field of /proc/loadavg divided by @cores, graded
# against the :load limits.
def linux_load
  per_core = File.read('/proc/loadavg').split(/\s+/)[2].to_f / @cores

  state =
    if per_core > @limits[:load][:critical]
      :critical
    elsif per_core > @limits[:load][:warning]
      :warning
    else
      :ok
    end

  alert "load", state, per_core, "15-minute load average/core is #{per_core}"
end
133
+
134
# Reports the used fraction of RAM on Linux from /proc/meminfo, counting
# MemFree + Buffers + Cached as free, together with the top memory
# consumers from ps.
def linux_memory
  meminfo = {}
  File.read('/proc/meminfo').split(/\n/).each do |line|
    key, value = line.split(/:?\s+/)
    # Assume kB...
    meminfo[key] = value.to_i
  end

  # Missing keys read as nil and count as 0.
  free = %w[MemFree Buffers Cached].inject(0) { |sum, key| sum + meminfo[key].to_i }
  total = meminfo['MemTotal'].to_i
  fraction = 1 - (free.to_f / total)

  report_pct :memory, fraction, "used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
end
148
+
149
# Reports CPU usage on FreeBSD as the fraction of ticks spent in
# user+nice+system+interrupt since the previous call, read from
# `sysctl -n kern.cp_time` (five counters, idle last).
#
# The first call only records a baseline. Returns false (after an :unknown
# alert) when sysctl does not yield five counters; otherwise returns the
# sample that was stored for the next call.
def freebsd_cpu
  sample = `sysctl -n kern.cp_time 2>/dev/null`.split.map { |e| e.to_i }

  # An empty or short result would otherwise store nils and fail with nil
  # arithmetic on the next tick.
  if sample.size < 5
    alert 'cpu', :unknown, nil, "unable to get CPU stats from sysctl kern.cp_time"
    return false
  end
  u2, n2, s2, t2, i2 = sample

  if @old_cpu
    u1, n1, s1, t1, i1 = @old_cpu

    used = (u2 + n2 + s2 + t2) - (u1 + n1 + s1 + t1)
    total = used + i2 - i1

    # With no elapsed ticks the division yields NaN, which would be
    # reported as "ok"; skip the report until the counters have advanced.
    if total > 0
      fraction = used.to_f / total
      report_pct :cpu, fraction, "user+nice+system+interrupt\n\n#{`ps -axo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    end
  end

  @old_cpu = [u2, n2, s2, t2, i2]
end
164
+
165
# Reports the last field of `uptime` output divided by @cores, graded
# against the :load limits.
#
# Returns false (after an :unknown alert) when that field is not a plain
# decimal number, instead of failing on a nil match.
def freebsd_load
  m = `uptime`.split[-1].to_s.match(/^[0-9]*\.[0-9]*$/)
  unless m
    alert 'load', :unknown, nil, "unable to get load average from uptime"
    return false
  end

  per_core = m[0].to_f / @cores
  state =
    if per_core > @limits[:load][:critical]
      :critical
    elsif per_core > @limits[:load][:warning]
      :warning
    else
      :ok
    end

  alert "load", state, per_core, "15-minute load average/core is #{per_core}"
end
176
+
177
# Reports the used fraction of memory on FreeBSD as
# (wired pages + active pages) / total pages, read from sysctl, together
# with the top memory consumers from ps.
def freebsd_memory
  # sysctl prints one value per requested name, in the order requested.
  page_count, wired, active =
    `sysctl -n vm.stats.vm.v_page_count vm.stats.vm.v_wire_count vm.stats.vm.v_active_count 2>/dev/null`.chomp.split

  fraction = (wired.to_f + active.to_f) / page_count.to_f

  report_pct :memory, fraction, "used\n\n#{`ps -axo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
end
183
+
184
# Samples `top -l 1` once and caches the parsed values in @topdata:
#
#   :stamp  - Time.now.to_i when the sample was taken
#   :load   - third value of the "Load Avg" line
#   :cpu    - 1 - idle% / 100 from the "CPU usage" line
#   :memory - (wired + active + used) / (wired + active + used + inactive + free)
#             from the "PhysMem" line
#
# Keys for lines that are missing or do not match stay unset.
def darwin_top
  raw = `top -l 1 | grep -i "^\\(cpu\\|physmem\\|load\\)"`.chomp
  @topdata = {:stamp => Time.now.to_i }
  raw.each_line do |ln|
    # The quantifier belongs inside the capture group; outside it, only the
    # last character of the value was captured (e.g. "5" from "1.85").
    if (load = ln.match(/Load Avg: [0-9.]+, [0-9.]+, ([0-9.]+)/i))
      @topdata[:load] = load[1].to_f
    elsif (cpu = ln.match(/CPU usage: [0-9.]+% user, [0-9.]+% sys, ([0-9.]+)% idle/i))
      @topdata[:cpu] = 1 - (cpu[1].to_f / 100)
    elsif (mem = ln.match(/PhysMem: ([0-9]+)([BKMGT]) wired, ([0-9]+)([BKMGT]) active, ([0-9]+)([BKMGT]) inactive, ([0-9]+)([BKMGT]) used, ([0-9]+)([BKMGT]) free/i))
      # Captures come in (amount, unit) pairs. The unit's position in
      # "BKMGT" is the power of 1024; upcase it because the pattern is
      # case-insensitive and a lower-case unit would not be found.
      wired, active, inactive, used, free = [1, 3, 5, 7, 9].map do |i|
        mem[i].to_i * (1024 ** "BKMGT".index(mem[i + 1].upcase))
      end
      @topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
    end
  end
end
202
+
203
# Reports the CPU usage cached in @topdata, refreshing the cache first when
# it is at least opts[:interval] seconds old. Returns false (after an
# :unknown alert) when no CPU value is available.
def darwin_cpu
  age = Time.now.to_i - @topdata[:stamp]
  darwin_top if age >= opts[:interval]

  usage = @topdata[:cpu]
  if usage
    report_pct :cpu, usage, "usage\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  else
    alert 'cpu', :unknown, nil, "unable to get CPU stats from top"
    false
  end
end
211
+
212
# Reports the load value cached in @topdata divided by @cores, refreshing
# the cache first when it is at least opts[:interval] seconds old. Returns
# false (after an :unknown alert) when no load value is available.
def darwin_load
  age = Time.now.to_i - @topdata[:stamp]
  darwin_top if age >= opts[:interval]

  sampled = @topdata[:load]
  unless sampled
    alert 'load', :unknown, nil, "unable to get load ave from top"
    return false
  end

  per_core = sampled / @cores
  state =
    if per_core > @limits[:load][:critical]
      :critical
    elsif per_core > @limits[:load][:warning]
      :warning
    else
      :ok
    end

  alert "load", state, per_core, "15-minute load average per core is #{per_core}"
end
227
+
228
# Reports the memory fraction cached in @topdata, refreshing the cache
# first when it is at least opts[:interval] seconds old. Returns false
# (after an :unknown alert) when no memory value is available.
def darwin_memory
  age = Time.now.to_i - @topdata[:stamp]
  darwin_top if age >= opts[:interval]

  usage = @topdata[:memory]
  if usage
    report_pct :memory, usage, "usage\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  else
    alert 'memory', :unknown, nil, "unable to get memory data from top"
    false
  end
end
236
+
237
# Reports the used fraction of every filesystem listed by `df -P` whose
# device name starts with "/", one event per mount point, graded against
# the :disk limits.
def disk
  `df -P`.split(/\n/).each do |r|
    # Limit the split to df's six columns so that a mount point containing
    # spaces stays whole in the last field.
    f = r.split(/\s+/, 6)
    # Also skips the header row, whose first field is "Filesystem".
    next unless f[0] =~ /^\//

    x = f[4].to_f/100
    state =
      if x > @limits[:disk][:critical]
        :critical
      elsif x > @limits[:disk][:warning]
        :warning
      else
        :ok
      end

    alert "disk #{f[5]}", state, x, "#{f[4]} used"
  end
end
253
+
254
# Runs every enabled check once, in the order cpu, memory, disk, load.
def tick
  @cpu.call if @cpu_enabled
  @memory.call if @memory_enabled
  @disk.call if @disk_enabled
  @load.call if @load_enabled
end
268
+ end
269
+
270
+ Riemann::Tools::Health.run
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
# Reports the number of running KVM instances to Riemann.
class Riemann::Tools::KVM
  include Riemann::Tools

  # Counts the lines of `virsh list` that contain "i-" and submits the count
  # as a single "info" event.
  def tick
    running = %x[virsh list |grep i-|wc -l].to_i

    event = {
      :service => "KVM Running VMs",
      :metric => running,
      :state => "info"
    }
    report(event)
  end
end
21
+
22
+ Riemann::Tools::KVM.run
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Gathers memcached STATS and submits them to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
# Gathers memcached STATS and submits each statistic to Riemann as its own
# event.
class Riemann::Tools::Memcached
  include Riemann::Tools
  require 'socket'

  opt :memcached_host, "Memcached hostname", :default => 'localhost'
  opt :memcached_port, "Memcached port", :default => 11211

  # Opens a connection, sends "stats", and reports every "STAT <name>
  # <value>" line until "END" (or end of stream). The socket is closed even
  # when reading or reporting raises.
  def tick
    sock = TCPSocket.new(opts[:memcached_host], opts[:memcached_port])
    begin
      sock.print("stats\r\n")
      sock.flush

      # The first response line is read and dropped, as before.
      # NOTE(review): presumably this is the "STAT pid" line; confirm that
      # skipping it is intended.
      sock.gets

      # gets returns nil once the server closes the connection; stop then
      # rather than calling strip on nil.
      while (line = sock.gets)
        break if line.strip == 'END'

        m = line.match(/STAT (\w+) (\S+)/)
        next unless m # ignore lines that are not statistics

        report(
          :host => opts[:memcached_host].dup,
          :service => "memcached #{m[1]}",
          :metric => m[2].to_f,
          :state => 'ok',
          :tags => ['memcached']
        )
      end
    ensure
      sock.close
    end
  end
end
36
+
37
+ Riemann::Tools::Memcached.run