riemann-tools.haf 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Reports current file descriptor use to riemann.
4
+ # By default reports the total system fd usage, can also report usage of individual processes
5
+
6
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
7
+
8
+ class Riemann::Tools::Health
9
+ include Riemann::Tools
10
+
11
+ opt :fd_sys_warning, "open file descriptor threshold for system", :default => 800
12
+ opt :fd_sys_critical, "open file descriptor critical threshold for system", :default => 900
13
+ opt :fd_proc_warning, "open file descriptor threshold for process", :default => 800
14
+ opt :fd_proc_critical, "open file descriptor critical threshold for process", :default => 900
15
+ opt :processes, "list of processes to measure fd usage in addition to system total", :type => :ints
16
+
17
# Builds the warning/critical thresholds from the command-line options and
# picks the file-descriptor probe. Only a Linux probe exists in this script,
# so any other kernel name reported by `uname -s` prints a warning and the
# Linux probe is used anyway.
def initialize
  system_limits  = {:critical => opts[:fd_sys_critical], :warning => opts[:fd_sys_warning]}
  process_limits = {:critical => opts[:fd_proc_critical], :warning => opts[:fd_proc_warning]}
  @limits = {:fd => system_limits, :process => process_limits}

  kernel = `uname -s`.chomp.downcase
  if kernel != "linux"
    puts "WARNING: OS '#{kernel}' not explicitly supported. Falling back to Linux"
  end

  # Stored as a Method object so tick can simply call it.
  @fd = method(:linux_fd)
end
26
+
27
# Forwards one event to `report`.
#
# service     - service name; converted with to_s
# state       - event state; converted with to_s
# metric      - reading; converted with to_f (so nil becomes 0.0)
# description - passed through unchanged
def alert(service, state, metric, description)
  event = {
    :service     => service.to_s,
    :state       => state.to_s,
    :metric      => metric.to_f,
    :description => description
  }
  report(event)
end
35
+
36
# Reports open file descriptor usage: one "fd sys" event for the whole system
# and, when opts[:processes] is set, one event per listed pid.
#
# Counts are the number of lines printed by `lsof`.
# NOTE(review): lsof normally prints a header line, which is counted too.
#
# The process name is looked up with `ps -p PID -o comm=`, which asks ps for
# exactly that pid. The previous `ps axo comm,pid | grep -w PID` could also
# match other rows (for example a command whose name contains the number as a
# word). When the pid no longer exists the name is empty, as before.
def linux_fd
  sys_used = Integer(`lsof | wc -l`)
  alert "fd sys", fd_state(sys_used, @limits[:fd]), sys_used, "system is using #{sys_used} fds"

  (opts[:processes] || []).each do |process|
    used = Integer(`lsof -p #{process} | wc -l`)
    name = `ps -p #{process} -o comm=`.strip
    alert "fd #{name} #{process}", fd_state(used, @limits[:process]), used,
          "process #{name} #{process} is using #{used} fds"
  end
end

# Maps a reading onto :critical, :warning or :ok. Both comparisons are strict,
# so a reading equal to a threshold stays in the lower state.
def fd_state(used, limits)
  if used > limits[:critical]
    :critical
  elsif used > limits[:warning]
    :warning
  else
    :ok
  end
end
60
+
61
# Invokes the probe stored in @fd by initialize.
def tick
  @fd.()
end
64
+ end
65
+
66
+ Riemann::Tools::Health.run
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
+ class Riemann::Tools::FreeSWITCH
6
+ include Riemann::Tools
7
+
8
# Reports the current FreeSWITCH call and channel counts, each as an
# "info"-state event.
def tick
  # Read both counters before submitting anything.
  fs_calls    = fs_count("calls")
  fs_channels = fs_count("channels")

  report(
    :service => "FreeSWITCH current calls",
    :metric  => fs_calls,
    :state   => "info"
  )

  report(
    :service => "FreeSWITCH current channels",
    :metric  => fs_channels,
    :state   => "info"
  )
end

# Runs `fs_cli -x "show <kind> count"` and returns the digits at the start of
# the first output line that begins with a digit, as an Integer (0 when no
# line does). The digits are extracted in Ruby rather than with `grep -Po`,
# because -P is a GNU grep extension that other grep implementations lack.
def fs_count(kind)
  %x[fs_cli -x "show #{kind} count"][/^\d+/].to_i
end
29
+ end
30
+
31
+ Riemann::Tools::FreeSWITCH.run
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Gathers haproxy CSV statistics and submits them to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
+ class Riemann::Tools::Haproxy
8
+ include Riemann::Tools
9
+ require 'net/http'
10
+ require 'csv'
11
+
12
+ opt :stats_url, "Full url to haproxy stats (eg: https://user:password@host.com:9999/stats)", :required => true, :type => :string
13
+
14
# Parses the configured stats URL, with ";csv" appended, into @uri.
def initialize
  csv_url = opts[:stats_url] + ';csv'
  @uri = URI(csv_url)
end
17
+
18
# Fetches the haproxy CSV statistics and reports one event per cell.
#
# Every column except pxname/svname (and columns with an empty header) becomes
# an event named "haproxy <pxname> <svname> <column>" whose metric is the cell
# converted with to_f. All events of a row share one state: 'ok' when the
# row's status is UP or OPEN, 'critical' otherwise.
def tick
  # Drop the "# " that precedes the header line. Only a leading marker is
  # removed, so a body without it (or with "# " later on) still parses.
  body = get_csv.body.to_s.sub(/\A#\s*/, '')

  # Options are passed as trailing hash arguments: Ruby 3 rejects a positional
  # options hash here with ArgumentError.
  CSV.parse(body, :headers => true).each do |row|
    fields = row.to_hash
    ns     = "haproxy #{fields['pxname']} #{fields['svname']}"
    state  = ['UP', 'OPEN'].include?(fields['status']) ? 'ok' : 'critical'

    fields.each do |property, metric|
      next if property.nil? || property == 'pxname' || property == 'svname'

      report(
        :host    => @uri.host,
        :service => "#{ns} #{property}",
        :metric  => metric.to_f,
        :state   => state,
        :tags    => ['haproxy']
      )
    end
  end
end
36
+
37
# Performs a GET for @uri and returns whatever the connection's `request`
# call returns. TLS is switched on for https URLs, and HTTP basic auth is
# added when the URL carries userinfo.
def get_csv
  http = Net::HTTP.new(@uri.host, @uri.port)
  http.use_ssl = true if @uri.scheme == 'https'

  http.start do |connection|
    request = Net::HTTP::Get.new(@uri.request_uri)
    # URI splits userinfo at the first colon only, so a password that itself
    # contains ":" is kept whole (a plain split(":") cut it short).
    request.basic_auth(@uri.user, @uri.password) if @uri.userinfo
    connection.request(request)
  end
end
49
+
50
+ end
51
+
52
+ Riemann::Tools::Haproxy.run
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Reports current CPU, disk, load average, and memory use to riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
+ class Riemann::Tools::Health
8
+ include Riemann::Tools
9
+
10
+ opt :cpu_warning, "CPU warning threshold (fraction of total jiffies)", :default => 0.9
11
+ opt :cpu_critical, "CPU critical threshold (fraction of total jiffies)", :default => 0.95
12
+ opt :disk_warning, "Disk warning threshold (fraction of space used)", :default => 0.9
13
+ opt :disk_critical, "Disk critical threshold (fraction of space used)", :default => 0.95
14
+ opt :load_warning, "Load warning threshold (load average / core)", :default => 3
15
+ opt :load_critical, "Load critical threshold (load average / core)", :default => 8
16
+ opt :memory_warning, "Memory warning threshold (fraction of RAM)", :default => 0.85
17
+ opt :memory_critical, "Memory critical threshold (fraction of RAM)", :default => 0.95
18
+ opt :checks, "A list of checks to run.", :type => :strings, :default => ['cpu', 'load', 'memory', 'disk']
19
+
20
# Builds the threshold table, selects the per-OS probes and records which
# checks were requested.
#
# `uname -s` picks the probe set: darwin and freebsd have their own cpu, load
# and memory probes; anything else uses the Linux ones (with a warning when
# the kernel is not actually linux). The disk probe is shared by all three.
def initialize
  @limits = [:cpu, :disk, :load, :memory].inject({}) do |limits, check|
    limits[check] = {
      :critical => opts[:"#{check}_critical"],
      :warning  => opts[:"#{check}_warning"]
    }
    limits
  end

  ostype = `uname -s`.chomp.downcase
  @disk = method(:disk)

  if ostype == 'darwin'
    @cores  = `sysctl -n hw.ncpu`.to_i
    @cpu    = method(:darwin_cpu)
    @load   = method(:darwin_load)
    @memory = method(:darwin_memory)
    # Take a first `top` sample so @topdata exists before the first tick.
    darwin_top
  elsif ostype == 'freebsd'
    @cores  = `sysctl -n hw.ncpu`.to_i
    @cpu    = method(:freebsd_cpu)
    @load   = method(:freebsd_load)
    @memory = method(:freebsd_memory)
  else
    @cores = cores
    if ostype != "linux"
      puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux"
    end
    @cpu    = method(:linux_cpu)
    @load   = method(:linux_load)
    @memory = method(:linux_memory)
  end

  # Only the four known check names set a flag; other names are ignored.
  opts[:checks].each do |check|
    next unless %w[disk load cpu memory].include?(check)
    instance_variable_set("@#{check}_enabled", true)
  end
end
63
+
64
# Forwards one event to `report`, normalising the argument types first:
# service and state via to_s, metric via to_f (nil becomes 0.0). The
# description is passed through unchanged.
def alert(service, state, metric, description)
  payload = {}
  payload[:service]     = service.to_s
  payload[:state]       = state.to_s
  payload[:metric]      = metric.to_f
  payload[:description] = description
  report(payload)
end
72
+
73
# Counts distinct cores listed in /proc/cpuinfo.
#
# Each blank-line-separated stanza is keyed by "physical id:core id", so
# stanzas sharing both ids count once. A stanza with only a physical id is
# keyed by that id alone, and a stanza with neither gets its own sequence
# number and therefore always counts.
def cores
  unnumbered = 0
  keys = File.read("/proc/cpuinfo").split(/\n\n/).map do |stanza|
    physical_id = stanza[/physical id\s+:\s+(\d+)/, 1]
    core_id     = stanza[/core id\s+:\s+(\d+)/, 1]

    if physical_id && core_id
      "#{physical_id}:#{core_id}"
    elsif physical_id
      "#{physical_id}:"
    else
      unnumbered += 1
    end
  end
  keys.uniq.size
end
89
+
90
# Alerts on a fractional reading, comparing it against @limits[service].
#
# service  - key into @limits, also used as the event's service
# fraction - the reading; nothing is reported when it is nil or false
# report   - text appended after the percentage in the description
#
# The state is :critical or :warning when the fraction is strictly above the
# matching limit, :ok otherwise.
def report_pct(service, fraction, report)
  return unless fraction

  limits = @limits[service]
  state =
    if fraction > limits[:critical]
      :critical
    elsif fraction > limits[:warning]
      :warning
    else
      :ok
    end

  alert service, state, fraction, "#{sprintf("%.2f", fraction * 100)}% #{report}"
end
101
+
102
# Reports CPU use as the share of non-idle jiffies since the previous call.
#
# Reads the first "cpu" line of /proc/stat (user, nice, system, idle). The
# first call only stores the counters; later calls report
# (user+nice+system delta) / (that delta + idle delta), followed by the
# output of a `ps` listing sorted by CPU share.
#
# Returns false (after an :unknown alert) when /proc/stat has no cpu line.
def linux_cpu
  counters = File.read('/proc/stat').match(/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)
  unless counters
    alert 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
    return false
  end
  u2, n2, s2, i2 = counters.captures.map { |e| e.to_i }

  if @old_cpu
    u1, n1, s1, i1 = @old_cpu

    used  = (u2 + n2 + s2) - (u1 + n1 + s1)
    total = used + i2 - i1

    # With no elapsed jiffies the ratio would be 0/0 (NaN), so nothing is
    # reported for this sample.
    if total > 0
      fraction = used.to_f / total
      report_pct :cpu, fraction, "user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    end
  end

  @old_cpu = [u2, n2, s2, i2]
end
122
+
123
# Alerts on the third field of /proc/loadavg (described in the event as the
# 15-minute load average) divided by @cores, using the :load thresholds with
# strict comparisons.
def linux_load
  per_core = File.read('/proc/loadavg').split(/\s+/)[2].to_f / @cores
  limits = @limits[:load]

  state =
    if per_core > limits[:critical]
      :critical
    elsif per_core > limits[:warning]
      :warning
    else
      :ok
    end

  alert "load", state, per_core, "15-minute load average/core is #{per_core}"
end
133
+
134
# Reports the fraction of memory in use according to /proc/meminfo.
#
# MemFree, Buffers and Cached are all treated as free, so the reported
# fraction is 1 - (MemFree + Buffers + Cached) / MemTotal. Values are used as
# they appear in the file (the original code assumes kB). The event
# description carries a `ps` listing sorted by memory share.
def linux_memory
  meminfo = File.read('/proc/meminfo').split(/\n/).each_with_object({}) do |line, info|
    key, value = line.split(/:?\s+/)
    info[key] = value.to_i
  end

  reclaimable = %w[MemFree Buffers Cached].inject(0) { |sum, key| sum + meminfo[key].to_i }
  capacity    = meminfo['MemTotal'].to_i
  fraction    = 1 - (reclaimable.to_f / capacity)

  report_pct :memory, fraction, "used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
end
148
+
149
# Reports CPU use as the share of non-idle ticks since the previous call.
#
# `sysctl -n kern.cp_time` supplies five counters, read here as user, nice,
# system, interrupt and idle. The first call only stores them; later calls
# report (non-idle delta) / (non-idle delta + idle delta), followed by a `ps`
# listing sorted by CPU share.
#
# Returns false (after an :unknown alert) when sysctl yields fewer than five
# values; previously that left nils in @old_cpu and raised on the next call.
def freebsd_cpu
  sample = `sysctl -n kern.cp_time 2>/dev/null`.split.map { |e| e.to_i }
  if sample.size < 5
    alert 'cpu', :unknown, nil, "unable to get CPU stats from sysctl kern.cp_time"
    return false
  end
  u2, n2, s2, t2, i2 = sample

  if @old_cpu
    u1, n1, s1, t1, i1 = @old_cpu

    used  = (u2 + n2 + s2 + t2) - (u1 + n1 + s1 + t1)
    total = used + i2 - i1

    # With no elapsed ticks the ratio would be 0/0 (NaN), so nothing is
    # reported for this sample.
    if total > 0
      fraction = used.to_f / total
      report_pct :cpu, fraction, "user+nice+system+interrupt\n\n#{`ps -axo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    end
  end

  @old_cpu = [u2, n2, s2, t2, i2]
end
164
+
165
# Alerts on the last token of `uptime` output (described in the event as the
# 15-minute load average) divided by @cores, using the :load thresholds with
# strict comparisons.
#
# The token may use "." or "," as its decimal mark. When it is missing or not
# a decimal number, an :unknown alert is sent and false is returned instead of
# raising NoMethodError on nil.
def freebsd_load
  token = `uptime`.split.last.to_s
  m = token.match(/^[0-9]*[.,][0-9]*$/)
  unless m
    alert 'load', :unknown, nil, "unable to get load average from uptime"
    return false
  end

  load = m[0].tr(',', '.').to_f / @cores
  state =
    if load > @limits[:load][:critical]
      :critical
    elsif load > @limits[:load][:warning]
      :warning
    else
      :ok
    end

  alert "load", state, load, "15-minute load average/core is #{load}"
end
176
+
177
# Reports (wired + active) / total pages, using the three vm.stats.vm
# counters printed by sysctl in the order requested. The event description
# carries a `ps` listing sorted by memory share.
def freebsd_memory
  page_count, wired, active =
    `sysctl -n vm.stats.vm.v_page_count vm.stats.vm.v_wire_count vm.stats.vm.v_active_count 2>/dev/null`.chomp.split
  in_use   = wired.to_f + active.to_f
  fraction = in_use / page_count.to_f

  report_pct :memory, fraction, "used\n\n#{`ps -axo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
end
183
+
184
# Takes one `top -l 1` sample and caches the parsed values in @topdata:
#
#   :stamp  - Time.now.to_i when the sample was taken
#   :load   - third "Load Avg" figure
#   :cpu    - 1 - idle% / 100
#   :memory - (wired + active + used) / (wired + active + used + inactive + free)
#
# A key is simply absent when its line is missing or does not match.
def darwin_top
  raw = `top -l 1 | grep -i "^\\(cpu\\|physmem\\|load\\)"`.chomp
  @topdata = {:stamp => Time.now.to_i }

  raw.each_line do |ln|
    if (mdat = ln.match(/Load Avg: [0-9.]+, [0-9.]+, ([0-9.]+)/i))
      # The quantifier sits inside the group so the whole number is captured;
      # "([0-9.])+" kept only its last character.
      @topdata[:load] = mdat[1].to_f
    elsif (mdat = ln.match(/CPU usage: [0-9.]+% user, [0-9.]+% sys, ([0-9.]+)% idle/i))
      @topdata[:cpu] = 1 - (mdat[1].to_f / 100)
    elsif (mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) wired, ([0-9]+)([BKMGT]) active, ([0-9]+)([BKMGT]) inactive, ([0-9]+)([BKMGT]) used, ([0-9]+)([BKMGT]) free/i))
      # Captures alternate amount/unit: 1-2 wired, 3-4 active, 5-6 inactive,
      # 7-8 used, 9-10 free.
      wired, active, inactive, used, free =
        [1, 3, 5, 7, 9].map { |i| darwin_bytes(mdat[i], mdat[i + 1]) }
      @topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
    end
  end
end

# Converts an amount with a B/K/M/G/T suffix into bytes (powers of 1024).
# The unit is upcased first because the PhysMem pattern is case-insensitive
# and a lowercase letter would otherwise not be found in "BKMGT".
def darwin_bytes(amount, unit)
  amount.to_i * (1024 ** "BKMGT".index(unit.upcase))
end
202
+
203
# Reports the cached CPU fraction from @topdata, taking a fresh `top` sample
# first when the cached one is at least opts[:interval] seconds old.
# Returns false (after an :unknown alert) when no CPU figure is available.
def darwin_cpu
  age = Time.now.to_i - @topdata[:stamp]
  darwin_top if age >= opts[:interval]

  if @topdata[:cpu]
    report_pct :cpu, @topdata[:cpu], "usage\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  else
    alert 'cpu', :unknown, nil, "unable to get CPU stats from top"
    false
  end
end
211
+
212
# Alerts on the cached load figure from @topdata divided by @cores, taking a
# fresh `top` sample first when the cached one is at least opts[:interval]
# seconds old. Thresholds are compared strictly.
# Returns false (after an :unknown alert) when no load figure is available.
def darwin_load
  age = Time.now.to_i - @topdata[:stamp]
  darwin_top if age >= opts[:interval]

  sampled = @topdata[:load]
  unless sampled
    alert 'load', :unknown, nil, "unable to get load ave from top"
    return false
  end

  metric = sampled / @cores
  limits = @limits[:load]
  state =
    if metric > limits[:critical]
      :critical
    elsif metric > limits[:warning]
      :warning
    else
      :ok
    end

  alert "load", state, metric, "15-minute load average per core is #{metric}"
end
227
+
228
# Reports the cached memory fraction from @topdata, taking a fresh `top`
# sample first when the cached one is at least opts[:interval] seconds old.
# Returns false (after an :unknown alert) when no memory figure is available.
def darwin_memory
  age = Time.now.to_i - @topdata[:stamp]
  darwin_top if age >= opts[:interval]

  if @topdata[:memory]
    report_pct :memory, @topdata[:memory], "usage\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"
  else
    alert 'memory', :unknown, nil, "unable to get memory data from top"
    false
  end
end
236
+
237
# Alerts on the used-space percentage of every `df -P` row whose first column
# starts with "/" (the header row and rows such as tmpfs are skipped by that
# test).
#
# The service is "disk <mount point>" and the metric is the capacity column
# divided by 100. Thresholds are compared strictly.
def disk
  `df -P`.split(/\n/).each do |r|
    # Split into at most six fields so that a mount point containing spaces
    # stays intact in the last one instead of being cut at its first word.
    f = r.split(/\s+/, 6)
    next unless f[0] =~ /^\//

    x = f[4].to_f / 100
    state =
      if x > @limits[:disk][:critical]
        :critical
      elsif x > @limits[:disk][:warning]
        :warning
      else
        :ok
      end

    alert "disk #{f[5]}", state, x, "#{f[4]} used"
  end
end
253
+
254
# Runs each enabled probe, in the order cpu, memory, disk, load.
def tick
  @cpu.call    if @cpu_enabled
  @memory.call if @memory_enabled
  @disk.call   if @disk_enabled
  @load.call   if @load_enabled
end
268
+ end
269
+
270
+ Riemann::Tools::Health.run
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
4
+
5
+ class Riemann::Tools::KVM
6
+ include Riemann::Tools
7
+
8
# Reports the number of `virsh list` lines containing "i-" as the
# "KVM Running VMs" metric, always with state "info".
def tick
  # The shell pipeline does the counting; its output is just the number.
  running = `virsh list |grep i-|wc -l`.to_i

  event = {
    :service => "KVM Running VMs",
    :metric  => running,
    :state   => "info"
  }
  report(event)
end
20
+ end
21
+
22
+ Riemann::Tools::KVM.run
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Gathers memcached STATS and submits them to Riemann.
4
+
5
+ require File.expand_path('../../lib/riemann/tools', __FILE__)
6
+
7
+ class Riemann::Tools::Memcached
8
+ include Riemann::Tools
9
+ require 'socket'
10
+
11
+ opt :memcached_host, "Memcached hostname", :default => 'localhost'
12
+ opt :memcached_port, "Memcached port", :default => 11211
13
+
14
# Sends "stats" to memcached and reports every "STAT <name> <value>" line as
# an event named "memcached <name>" whose metric is the value converted with
# to_f.
#
# Reading stops at the END line or when the connection is closed. Lines that
# are not STAT lines are skipped. The socket is closed even when reporting
# raises.
def tick
  sock = TCPSocket.new(opts[:memcached_host], opts[:memcached_port])
  begin
    sock.print("stats\r\n")
    sock.flush

    # The first reply line is read and discarded, exactly as before.
    # NOTE(review): if the server sends no header line this drops the first
    # STAT entry — confirm whether that is intended.
    sock.gets

    # gets returns nil once the peer closes the connection, which ends the loop.
    while (line = sock.gets)
      break if line.strip == 'END'

      m = line.match(/STAT (\w+) (\S+)/)
      next unless m

      report(
        :host    => opts[:memcached_host].dup,
        :service => "memcached #{m[1]}",
        :metric  => m[2].to_f,
        :state   => 'ok',
        :tags    => ['memcached']
      )
    end
  ensure
    sock.close
  end
end
35
+ end
36
+
37
+ Riemann::Tools::Memcached.run