riemann-tools 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +2 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +8 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +25 -2
- data/Rakefile +10 -3
- data/bin/riemann-apache-status +1 -106
- data/bin/riemann-bench +2 -70
- data/bin/riemann-cloudant +1 -56
- data/bin/riemann-consul +1 -106
- data/bin/riemann-dir-files-count +1 -55
- data/bin/riemann-dir-space +1 -55
- data/bin/riemann-diskstats +1 -92
- data/bin/riemann-fd +2 -81
- data/bin/riemann-freeswitch +2 -119
- data/bin/riemann-haproxy +1 -58
- data/bin/riemann-health +0 -2
- data/bin/riemann-kvminstance +2 -22
- data/bin/riemann-memcached +1 -37
- data/bin/riemann-net +0 -2
- data/bin/riemann-nginx-status +1 -85
- data/bin/riemann-ntp +0 -2
- data/bin/riemann-portcheck +1 -44
- data/bin/riemann-proc +1 -108
- data/bin/riemann-varnish +1 -54
- data/bin/riemann-wrapper +75 -0
- data/bin/riemann-zookeeper +1 -39
- data/lib/riemann/tools/apache_status.rb +107 -0
- data/lib/riemann/tools/bench.rb +72 -0
- data/lib/riemann/tools/cloudant.rb +57 -0
- data/lib/riemann/tools/consul_health.rb +107 -0
- data/lib/riemann/tools/dir_files_count.rb +56 -0
- data/lib/riemann/tools/dir_space.rb +56 -0
- data/lib/riemann/tools/diskstats.rb +94 -0
- data/lib/riemann/tools/fd.rb +81 -0
- data/lib/riemann/tools/freeswitch.rb +119 -0
- data/lib/riemann/tools/haproxy.rb +59 -0
- data/lib/riemann/tools/health.rb +150 -19
- data/lib/riemann/tools/kvm.rb +23 -0
- data/lib/riemann/tools/memcached.rb +38 -0
- data/lib/riemann/tools/net.rb +2 -1
- data/lib/riemann/tools/nginx_status.rb +86 -0
- data/lib/riemann/tools/ntp.rb +1 -0
- data/lib/riemann/tools/portcheck.rb +45 -0
- data/lib/riemann/tools/proc.rb +109 -0
- data/lib/riemann/tools/riemann_client_wrapper.rb +43 -0
- data/lib/riemann/tools/uptime_parser.tab.rb +323 -0
- data/lib/riemann/tools/varnish.rb +55 -0
- data/lib/riemann/tools/version.rb +1 -1
- data/lib/riemann/tools/zookeeper.rb +40 -0
- data/lib/riemann/tools.rb +2 -20
- data/riemann-tools.gemspec +4 -1
- data/tools/riemann-aws/Rakefile +6 -9
- data/tools/riemann-aws/bin/riemann-aws-billing +2 -87
- data/tools/riemann-aws/bin/riemann-aws-rds-status +2 -62
- data/tools/riemann-aws/bin/riemann-aws-sqs-status +2 -44
- data/tools/riemann-aws/bin/riemann-aws-status +2 -77
- data/tools/riemann-aws/bin/riemann-elb-metrics +2 -162
- data/tools/riemann-aws/bin/riemann-s3-list +2 -81
- data/tools/riemann-aws/bin/riemann-s3-status +2 -96
- data/tools/riemann-aws/lib/riemann/tools/aws/billing.rb +87 -0
- data/tools/riemann-aws/lib/riemann/tools/aws/elb_metrics.rb +163 -0
- data/tools/riemann-aws/lib/riemann/tools/aws/rds_status.rb +63 -0
- data/tools/riemann-aws/lib/riemann/tools/aws/s3_list.rb +82 -0
- data/tools/riemann-aws/lib/riemann/tools/aws/s3_status.rb +97 -0
- data/tools/riemann-aws/lib/riemann/tools/aws/sqs_status.rb +45 -0
- data/tools/riemann-aws/lib/riemann/tools/aws/status.rb +74 -0
- data/tools/riemann-chronos/Rakefile +6 -9
- data/tools/riemann-chronos/bin/riemann-chronos +1 -154
- data/tools/riemann-chronos/lib/riemann/tools/chronos.rb +157 -0
- data/tools/riemann-docker/Rakefile +5 -8
- data/tools/riemann-docker/bin/riemann-docker +2 -200
- data/tools/riemann-docker/lib/riemann/tools/docker.rb +200 -0
- data/tools/riemann-elasticsearch/Rakefile +6 -9
- data/tools/riemann-elasticsearch/bin/riemann-elasticsearch +1 -167
- data/tools/riemann-elasticsearch/lib/riemann/tools/elasticsearch.rb +170 -0
- data/tools/riemann-marathon/Rakefile +6 -9
- data/tools/riemann-marathon/bin/riemann-marathon +1 -156
- data/tools/riemann-marathon/lib/riemann/tools/marathon.rb +159 -0
- data/tools/riemann-mesos/Rakefile +6 -9
- data/tools/riemann-mesos/bin/riemann-mesos +1 -139
- data/tools/riemann-mesos/lib/riemann/tools/mesos.rb +142 -0
- data/tools/riemann-munin/Rakefile +5 -8
- data/tools/riemann-munin/bin/riemann-munin +1 -36
- data/tools/riemann-munin/lib/riemann/tools/munin.rb +37 -0
- data/tools/riemann-rabbitmq/Rakefile +6 -9
- data/tools/riemann-rabbitmq/bin/riemann-rabbitmq +1 -266
- data/tools/riemann-rabbitmq/lib/riemann/tools/rabbitmq.rb +269 -0
- data/tools/riemann-riak/Rakefile +5 -8
- data/tools/riemann-riak/bin/riemann-riak +1 -316
- data/tools/riemann-riak/bin/riemann-riak-keys +0 -1
- data/tools/riemann-riak/bin/riemann-riak-ring +0 -1
- data/tools/riemann-riak/lib/riemann/tools/riak.rb +317 -0
- metadata +57 -10
- data/.travis.yml +0 -31
- data/tools/riemann-riak/riak_status/key_count.erl +0 -13
- data/tools/riemann-riak/riak_status/riak_status.rb +0 -152
- data/tools/riemann-riak/riak_status/ringready.erl +0 -9
data/lib/riemann/tools/health.rb
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
|
3
3
|
require 'riemann/tools'
|
4
4
|
require 'riemann/tools/utils'
|
5
|
+
require 'riemann/tools/uptime_parser.tab'
|
5
6
|
|
7
|
+
# Reports current CPU, disk, load average, and memory use to riemann.
|
6
8
|
module Riemann
|
7
9
|
module Tools
|
8
10
|
class Health
|
@@ -14,12 +16,18 @@ module Riemann
|
|
14
16
|
opt :disk_warning, 'Disk warning threshold (fraction of space used)', default: 0.9
|
15
17
|
opt :disk_critical, 'Disk critical threshold (fraction of space used)', default: 0.95
|
16
18
|
opt :disk_ignorefs, 'A list of filesystem types to ignore',
|
17
|
-
default: %w[anon_inodefs autofs cd9660 devfs devtmpfs fdescfs iso9660 linprocfs linsysfs nfs overlay procfs tmpfs]
|
19
|
+
default: %w[anon_inodefs autofs cd9660 devfs devtmpfs fdescfs iso9660 linprocfs linsysfs nfs overlay procfs squashfs tmpfs]
|
18
20
|
opt :load_warning, 'Load warning threshold (load average / core)', default: 3.0
|
19
21
|
opt :load_critical, 'Load critical threshold (load average / core)', default: 8.0
|
20
22
|
opt :memory_warning, 'Memory warning threshold (fraction of RAM)', default: 0.85
|
21
23
|
opt :memory_critical, 'Memory critical threshold (fraction of RAM)', default: 0.95
|
22
|
-
opt :
|
24
|
+
opt :uptime_warning, 'Uptime warning threshold', default: 86_400
|
25
|
+
opt :uptime_critical, 'Uptime critical threshold', default: 3600
|
26
|
+
opt :users_warning, 'Users warning threshold', default: 1
|
27
|
+
opt :users_critical, 'Users critical threshold', default: 1
|
28
|
+
opt :swap_warning, 'Swap warning threshold', default: 0.4
|
29
|
+
opt :swap_critical, 'Swap critical threshold', default: 0.5
|
30
|
+
opt :checks, 'A list of checks to run.', type: :strings, default: %w[cpu load memory disk swap]
|
23
31
|
|
24
32
|
def initialize
|
25
33
|
@limits = {
|
@@ -27,6 +35,9 @@ module Riemann
|
|
27
35
|
disk: { critical: opts[:disk_critical], warning: opts[:disk_warning] },
|
28
36
|
load: { critical: opts[:load_critical], warning: opts[:load_warning] },
|
29
37
|
memory: { critical: opts[:memory_critical], warning: opts[:memory_warning] },
|
38
|
+
uptime: { critical: opts[:uptime_critical], warning: opts[:uptime_warning] },
|
39
|
+
users: { critical: opts[:users_critical], warning: opts[:users_warning] },
|
40
|
+
swap: { critical: opts[:swap_critical], warning: opts[:swap_warning] },
|
30
41
|
}
|
31
42
|
case (@ostype = `uname -s`.chomp.downcase)
|
32
43
|
when 'darwin'
|
@@ -35,25 +46,32 @@ module Riemann
|
|
35
46
|
@disk = method :disk
|
36
47
|
@load = method :darwin_load
|
37
48
|
@memory = method :darwin_memory
|
38
|
-
|
49
|
+
@uptime = method :bsd_uptime
|
50
|
+
@swap = method :bsd_swap
|
39
51
|
when 'freebsd'
|
40
52
|
@cores = `sysctl -n hw.ncpu`.to_i
|
41
53
|
@cpu = method :freebsd_cpu
|
42
54
|
@disk = method :disk
|
43
55
|
@load = method :bsd_load
|
44
56
|
@memory = method :freebsd_memory
|
57
|
+
@uptime = method :bsd_uptime
|
58
|
+
@swap = method :bsd_swap
|
45
59
|
when 'openbsd'
|
46
60
|
@cores = `sysctl -n hw.ncpu`.to_i
|
47
61
|
@cpu = method :openbsd_cpu
|
48
62
|
@disk = method :disk
|
49
63
|
@load = method :bsd_load
|
50
64
|
@memory = method :openbsd_memory
|
65
|
+
@uptime = method :bsd_uptime
|
66
|
+
@swap = method :bsd_swap
|
51
67
|
when 'sunos'
|
52
68
|
@cores = `mpstat -a 2>/dev/null`.split[33].to_i
|
53
69
|
@cpu = method :sunos_cpu
|
54
70
|
@disk = method :disk
|
55
71
|
@load = method :bsd_load
|
56
72
|
@memory = method :sunos_memory
|
73
|
+
@uptime = method :bsd_uptime
|
74
|
+
@swap = method :bsd_swap
|
57
75
|
else
|
58
76
|
@cores = `nproc`.to_i
|
59
77
|
puts "WARNING: OS '#{@ostype}' not explicitly supported. Falling back to Linux" unless @ostype == 'linux'
|
@@ -61,8 +79,11 @@ module Riemann
|
|
61
79
|
@disk = method :disk
|
62
80
|
@load = method :linux_load
|
63
81
|
@memory = method :linux_memory
|
82
|
+
@uptime = method :linux_uptime
|
83
|
+
@swap = method :linux_swap
|
64
84
|
@supports_exclude_type = `df --help 2>&1 | grep -e "--exclude-type"` != ''
|
65
85
|
end
|
86
|
+
@users = method :users
|
66
87
|
|
67
88
|
opts[:checks].each do |check|
|
68
89
|
case check
|
@@ -74,6 +95,12 @@ module Riemann
|
|
74
95
|
@cpu_enabled = true
|
75
96
|
when 'memory'
|
76
97
|
@memory_enabled = true
|
98
|
+
when 'uptime'
|
99
|
+
@uptime_enabled = true
|
100
|
+
when 'users'
|
101
|
+
@users_enabled = true
|
102
|
+
when 'swap'
|
103
|
+
@swap_enabled = true
|
77
104
|
end
|
78
105
|
end
|
79
106
|
end
|
@@ -99,6 +126,30 @@ module Riemann
|
|
99
126
|
end
|
100
127
|
end
|
101
128
|
|
129
|
+
# Sends an alert for an integer metric whose severity grows with its value.
#
# service - key into @limits (also passed to alert as the service name).
# value   - the measured number; nothing is reported when it is nil/false.
# report  - text appended after the value in the alert description.
#
# The state is :critical when value reaches the critical limit, :warning
# when it reaches the warning limit, and :ok otherwise.
def report_int(service, value, report)
  return unless value

  thresholds = @limits[service]
  # Check the most severe level first; fall back to :ok when none is reached.
  state = %i[critical warning].find { |level| value >= thresholds[level] } || :ok
  alert service, state, value, "#{value} #{report}"
end
|
140
|
+
|
141
|
+
# Sends the 'uptime' alert. Unlike the other checks, a *small* value is the
# problem here: an uptime below the critical limit is :critical, below the
# warning limit is :warning, anything else is :ok.
#
# uptime - uptime in seconds; also used as the event metric.
def report_uptime(uptime)
  description = uptime_to_human(uptime)

  thresholds = @limits[:uptime]
  state = %i[critical warning].find { |level| uptime < thresholds[level] } || :ok
  alert 'uptime', state, uptime, description
end
|
152
|
+
|
102
153
|
def linux_cpu
|
103
154
|
new = File.read('/proc/stat')
|
104
155
|
unless new[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
|
@@ -205,9 +256,16 @@ module Riemann
|
|
205
256
|
@old_cpu = [u2, s2, t2, i2]
|
206
257
|
end
|
207
258
|
|
259
|
+
# Returns the UptimeParser instance, creating it on first use and reusing it
# afterwards.
def uptime_parser
  @uptime_parser = UptimeParser.new unless @uptime_parser
  @uptime_parser
end

# Returns the parsed output of the `uptime` command. The result is stored in
# @cached_data[:uptime], so the command is run again only after that entry
# has been cleared (or holds nil/false).
def uptime
  cached = @cached_data[:uptime]
  return cached if cached

  @cached_data[:uptime] = uptime_parser.parse(`uptime`)
end
|
266
|
+
|
208
267
|
def bsd_load
|
209
|
-
|
210
|
-
load = m[0].to_f / @cores
|
268
|
+
load = uptime[:load_averages][1] / @cores
|
211
269
|
if load > @limits[:load][:critical]
|
212
270
|
alert 'load', :critical, load, "1-minute load average/core is #{load}"
|
213
271
|
elsif load > @limits[:load][:warning]
|
@@ -240,47 +298,50 @@ module Riemann
|
|
240
298
|
end
|
241
299
|
|
242
300
|
# Runs `top -l 1` once and extracts load, CPU and memory figures from it.
#
# Returns a Hash that may contain:
#   :load   - the first figure of the "Load Avg:" line, as a Float
#   :cpu    - 1 minus the idle fraction from the "CPU usage:" line
#   :memory - fraction of memory in use from the "PhysMem:" line
# A key is absent when the matching line was not found. The Hash is stored
# in @cached_data[:darwin_top] and returned from there on later calls.
def darwin_top
  return @cached_data[:darwin_top] if @cached_data[:darwin_top]

  # The patterns below are case-insensitive, so normalise the unit letter
  # before looking up its power of 1024.
  to_bytes = ->(amount, unit) { amount.to_i * (1024**'BKMGT'.index(unit.upcase)) }

  raw = `top -l 1 | grep -i "^\\(cpu\\|physmem\\|load\\)"`.chomp
  topdata = {}
  raw.each_line do |ln|
    if (mdat = ln.match(/Load Avg: ([0-9.]+), [0-9.]+, [0-9.]+/i))
      # Capture the whole first figure. The previous pattern had the
      # quantifier outside the group on the third figure, so it yielded only
      # that figure's last character. The first figure is used because the
      # consumer labels this value "1-minute load average".
      topdata[:load] = mdat[1].to_f
    elsif (mdat = ln.match(/CPU usage: [0-9.]+% user, [0-9.]+% sys, ([0-9.]+)% idle/i))
      topdata[:cpu] = 1 - (mdat[1].to_f / 100)
    elsif (mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) wired, ([0-9]+)([BKMGT]) active, ([0-9]+)([BKMGT]) inactive, ([0-9]+)([BKMGT]) used, ([0-9]+)([BKMGT]) free/i))
      wired = to_bytes.call(mdat[1], mdat[2])
      active = to_bytes.call(mdat[3], mdat[4])
      inactive = to_bytes.call(mdat[5], mdat[6])
      used = to_bytes.call(mdat[7], mdat[8])
      free = to_bytes.call(mdat[9], mdat[10])
      topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
    # This is for OSX Mavericks which
    # uses a different format for top
    # Example: PhysMem: 4662M used (1328M wired), 2782M unused.
    elsif (mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) used \([0-9]+[BKMGT] wired\), ([0-9]+)([BKMGT]) unused/i))
      used = to_bytes.call(mdat[1], mdat[2])
      unused = to_bytes.call(mdat[3], mdat[4])
      topdata[:memory] = used.to_f / (used + unused)
    end
  end
  @cached_data[:darwin_top] = topdata
end
|
267
328
|
|
268
329
|
# Reports CPU usage taken from darwin_top, with a `ps` listing (passed
# through reverse_numeric_sort_with_header) appended to the description.
# When darwin_top has no :cpu figure, an 'unknown' alert is sent and false
# is returned.
def darwin_cpu
  cpu_fraction = darwin_top[:cpu]

  if cpu_fraction
    report_pct :cpu, cpu_fraction, "usage\n\n#{reverse_numeric_sort_with_header(`ps -eo pcpu,pid,comm`)}"
  else
    alert 'cpu', :unknown, nil, 'unable to get CPU stats from top'
    false
  end
end
|
276
337
|
|
277
338
|
def darwin_load
|
278
|
-
|
279
|
-
unless
|
339
|
+
topdata = darwin_top
|
340
|
+
unless topdata[:load]
|
280
341
|
alert 'load', :unknown, nil, 'unable to get load ave from top'
|
281
342
|
return false
|
282
343
|
end
|
283
|
-
metric =
|
344
|
+
metric = topdata[:load] / @cores
|
284
345
|
if metric > @limits[:load][:critical]
|
285
346
|
alert 'load', :critical, metric, "1-minute load average per core is #{metric}"
|
286
347
|
elsif metric > @limits[:load][:warning]
|
@@ -291,12 +352,12 @@ module Riemann
|
|
291
352
|
end
|
292
353
|
|
293
354
|
# Reports memory usage taken from darwin_top, with a `ps` listing (passed
# through reverse_numeric_sort_with_header) appended to the description.
# When darwin_top has no :memory figure, an 'unknown' alert is sent and
# false is returned.
def darwin_memory
  memory_fraction = darwin_top[:memory]

  if memory_fraction
    report_pct :memory, memory_fraction, "usage\n\n#{reverse_numeric_sort_with_header(`ps -eo pmem,pid,comm`)}"
  else
    alert 'memory', :unknown, nil, 'unable to get memory data from top'
    false
  end
end
|
301
362
|
|
302
363
|
def df
|
@@ -336,11 +397,81 @@ module Riemann
|
|
336
397
|
end
|
337
398
|
end
|
338
399
|
|
400
|
+
# Reports the :uptime figure from the parsed `uptime` output.
def bsd_uptime
  report_uptime(uptime[:uptime])
end

# Reports the first field of /proc/uptime, converted to a Float.
def linux_uptime
  seconds_up, = File.read('/proc/uptime').split(/\s+/)

  report_uptime(seconds_up.to_f)
end

# Reports the :users figure from the parsed `uptime` output, labelled
# "user" for exactly one and "users" otherwise.
def users
  count = uptime[:users]
  suffix = count == 1 ? '' : 's'

  report_int(:users, count, "user#{suffix}")
end
|
417
|
+
|
418
|
+
# Reports the fraction of swap in use, read from the last line of
# `swapinfo` output (columns: device, blocks, used, avail, capacity).
#
# Nothing is reported, and nil is returned, when swapinfo prints nothing,
# when the last line has no numeric blocks/used columns (for example a
# header-only listing), or when the block count is zero.
def bsd_swap
  last_line = `swapinfo`.lines.last
  return unless last_line

  _device, blocks, used, _avail, _capacity = last_line.split(/\s+/)

  total = Integer(blocks)
  # Avoid reporting NaN/Infinity when no swap space is configured.
  return if total.zero?

  report_pct :swap, Float(used) / total, 'used'
rescue ArgumentError, TypeError
  # Non-numeric (ArgumentError) or missing (TypeError) columns: no usable
  # swap information, so stay silent.
  nil
end
|
427
|
+
|
428
|
+
# Reports the fraction of swap in use, summed over every entry of
# /proc/swaps (the first line is a header and is skipped; the third and
# fourth columns are size and used). Nothing is reported when the total
# size is zero.
def linux_swap
  entries = File.read('/proc/swaps').lines.drop(1).map { |row| row.split(/\s+/) }

  size_sum = entries.inject(0.0) { |acc, fields| acc + fields[2].to_f }
  used_sum = entries.inject(0.0) { |acc, fields| acc + fields[3].to_f }

  return if size_sum.zero?

  report_pct :swap, used_sum / size_sum, 'used'
end
|
447
|
+
|
448
|
+
# Formats a duration in seconds as "H:MM", prefixed with "N day(s) " when
# the duration is at least one day. Hours are space-padded to two columns;
# leftover seconds are dropped.
def uptime_to_human(value)
  days, remainder = value.to_i.divmod(86_400)
  hrs, remainder = remainder.divmod(3600)
  mins = remainder / 60

  clock = format('%<hrs>2d:%<mins>02d', hrs: hrs, mins: mins)
  return clock if days.zero?

  "#{days} day#{'s' if days > 1} #{clock}"
end
|
460
|
+
|
339
461
|
# Runs one reporting cycle: clears the per-cycle cache, then calls each
# check whose "<name>_enabled" flag is set, in the fixed order cpu, memory,
# disk, load, uptime, users, swap. Returns the result of the swap step
# (nil when swap is disabled).
def tick
  invalidate_cache

  %i[cpu memory disk load uptime users swap].inject(nil) do |_previous, name|
    instance_variable_get(:"@#{name}").call if instance_variable_get(:"@#{name}_enabled")
  end
end

# Drops everything cached during the previous cycle.
def invalidate_cache
  @cached_data = {}
end
|
345
476
|
end
|
346
477
|
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'riemann/tools'
|
4
|
+
|
5
|
+
module Riemann
|
6
|
+
module Tools
|
7
|
+
# Reports how many KVM guests libvirt lists as running.
class Kvm
  include Riemann::Tools

  # Counts the lines of `virsh list` that contain "running" and submits the
  # count as the 'KVM Running VMs' metric with state 'info'.
  def tick
    running_count = `LANG=C virsh list | grep -c running`.to_i

    report(service: 'KVM Running VMs', metric: running_count, state: 'info')
  end
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'riemann/tools'
|
4
|
+
|
5
|
+
# Gathers memcached STATS and submits them to Riemann.
|
6
|
+
module Riemann
|
7
|
+
module Tools
|
8
|
+
# Sends one event per memcached STAT line to Riemann.
class Memcached
  include Riemann::Tools
  require 'socket'

  opt :memcached_host, 'Memcached hostname', default: 'localhost'
  opt :memcached_port, 'Memcached port', default: 11_211

  # Connects to memcached, issues "stats" and reports every "STAT <name>
  # <value>" line as service "memcached <name>" until "END" (or end of
  # stream) is seen. The socket is always closed, even when reading or
  # reporting raises.
  def tick
    sock = TCPSocket.new(opts[:memcached_host], opts[:memcached_port])
    sock.print("stats\r\n")
    sock.flush

    # The first response line is read and discarded, as this tool has
    # always done.
    # NOTE(review): that line is normally a STAT line too, so one statistic
    # is never reported — confirm whether the skip is intentional.
    sock.gets

    # gets returns nil when the server closes the connection before END.
    while (line = sock.gets)
      break if line.strip == 'END'

      m = line.match(/STAT (\w+) (\S+)/)
      # Ignore anything that is not a STAT line instead of failing on nil.
      next unless m

      report(
        host: opts[:memcached_host].dup,
        service: "memcached #{m[1]}",
        metric: m[2].to_f,
        state: 'ok',
        tags: ['memcached'],
      )
    end
  ensure
    sock&.close
  end
end
|
37
|
+
end
|
38
|
+
end
|
data/lib/riemann/tools/net.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'riemann/tools'
|
4
4
|
|
5
|
+
# Gathers network interface statistics and submits them to Riemann.
|
5
6
|
module Riemann
|
6
7
|
module Tools
|
7
8
|
class Net
|
@@ -42,7 +43,7 @@ module Riemann
|
|
42
43
|
'tx bytes',
|
43
44
|
'tx packets',
|
44
45
|
'tx errs',
|
45
|
-
'tx
|
46
|
+
'tx drop',
|
46
47
|
'tx fifo',
|
47
48
|
'tx colls',
|
48
49
|
'tx carrier',
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'riemann/tools'
|
4
|
+
|
5
|
+
# Gathers nginx status stub statistics and submits them to Riemann.
|
6
|
+
# See http://wiki.nginx.org/HttpStubStatusModule for configuring Nginx appropriately
|
7
|
+
module Riemann
|
8
|
+
module Tools
|
9
|
+
# Reads the nginx stub status page and submits its counters to Riemann.
class NginxStatus
  include Riemann::Tools
  require 'net/http'
  require 'uri'

  opt :uri, 'Nginx Stub Status URI', default: 'http://localhost:8080/nginx_status'
  # NOTE(review): :checks is declared but not consulted by the code in this
  # class; every metric is always reported — confirm intent.
  opt :checks, 'Which metrics to report.', type: :strings,
                                           default: %w[active accepted handled requests reading writing waiting]
  opt :active_warning, 'Active connections warning threshold', default: 0
  opt :active_critical, 'Active connections critical threshold', default: 0
  opt :reading_warning, 'Reading connections warning threshold', default: 0
  opt :reading_critical, 'Reading connections critical threshold', default: 0
  opt :writing_warning, 'Writing connections warning threshold', default: 0
  opt :writing_critical, 'Writing connections critical threshold', default: 0
  opt :waiting_warning, 'Waiting connections warning threshold', default: 0
  opt :waiting_critical, 'Waiting connections critical threshold', default: 0

  def initialize
    @uri = URI.parse(opts[:uri])

    # sample response:
    #
    # Active connections: 1
    # server accepts handled requests
    #  39 39 39
    # Reading: 0 Writing: 1 Waiting: 0
    @keys = %w[active accepted handled requests reading writing waiting]
    @re = /Active connections: (\d+) \n.+\n (\d+) (\d+) (\d+) \nReading: (\d+) Writing: (\d+) Waiting: (\d+)/m
  end

  # Returns 'critical', 'warning' or 'ok' for a metric value. A threshold
  # only applies when the matching "<key>_critical" / "<key>_warning"
  # option exists and is greater than zero.
  def state(key, value)
    %w[critical warning].each do |level|
      option = "#{key}_#{level}".to_sym
      next unless opts.key?(option)

      threshold = opts[option]
      return level if threshold.positive? && (value >= threshold)
    end

    'ok'
  end

  # Fetches the status page and reports 'nginx health' plus one
  # "nginx <key>" event per counter. A connection failure or a response
  # that does not look like a stub status page is reported as a critical
  # 'nginx health' event and no counters are sent.
  def tick
    begin
      response = ::Net::HTTP.get(@uri)
    rescue StandardError => e
      report(
        service: 'nginx health',
        state: 'critical',
        description: "Connection error: #{e.class} - #{e.message}",
      )
      return
    end

    return if response.nil?

    match = @re.match(response)
    if match.nil?
      # Previously this case crashed with NoMethodError on nil.
      report(
        service: 'nginx health',
        state: 'critical',
        description: 'Unable to parse nginx status response',
      )
      return
    end

    report(
      service: 'nginx health',
      state: 'ok',
      description: 'Nginx status connection ok',
    )

    values = match.captures.map(&:to_i)

    @keys.zip(values).each do |key, value|
      report({
               service: "nginx #{key}",
               metric: value,
               state: state(key, value),
               tags: ['nginx'],
             })
    end
  end
end
|
85
|
+
end
|
86
|
+
end
|
data/lib/riemann/tools/ntp.rb
CHANGED
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'riemann/tools'
|
4
|
+
|
5
|
+
# Checks for open tcp ports.
|
6
|
+
# (c) Max Voit 2017
|
7
|
+
module Riemann
|
8
|
+
module Tools
|
9
|
+
# Checks whether TCP ports accept connections and reports one event per port.
class Portcheck
  include Riemann::Tools
  require 'socket'

  opt :hostname, 'Host, defaults to localhost', default: `hostname`.chomp
  opt :ports, "List of ports to check, e.g. '-r 80 443'", type: :ints

  def initialize
    @hostname = opts.fetch(:hostname)
    # :ports has no default, so it is nil when the option is not given;
    # treat that as "no ports" rather than failing in tick.
    @ports = Array(opts.fetch(:ports, nil))
  end

  # Reports service "port <n>" as 'ok' when a connection succeeds and
  # 'critical' otherwise, for every configured port.
  def tick
    @ports.each do |thisport|
      report(
        host: @hostname.to_s,
        service: "port #{thisport}",
        state: port_open?(thisport) ? 'ok' : 'critical',
        tags: ['portcheck'],
      )
    end
  end

  private

  # Tries to open a TCP connection with a 5 second connect timeout; any
  # error means the port is considered closed.
  def port_open?(port)
    Socket.tcp(@hostname, port, connect_timeout: 5) { true }
  rescue StandardError
    false
  end
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'riemann/tools'
|
4
|
+
|
5
|
+
# Reports running process count to riemann.
|
6
|
+
module Riemann
|
7
|
+
module Tools
|
8
|
+
# Reports how many processes match a pattern, plus per-process metrics.
class Proc
  include Riemann::Tools
  # DateTime and Shellwords are used below and were not loaded by this file.
  require 'date'
  require 'shellwords'

  opt :proc_regex, 'regular expression that matches the process to be monitored', type: :string, default: '.*'
  opt :proc_min_critical, 'running process count minimum', default: 0
  opt :proc_max_critical, 'running process count maximum', default: 65_536

  def initialize
    @limits = { critical: { min: opts[:proc_min_critical], max: opts[:proc_max_critical] } }

    abort 'FATAL: specify a process regular expression, see --help for usage' unless opts[:proc_regex]

    ostype = `uname -s`.chomp.downcase
    puts "WARNING: OS '#{ostype}' not explicitly supported. Falling back to Linux" unless ostype == 'linux'
    @check = method :linux_proc
  end

  # Sends one event; the metric is always converted to a Float.
  def alert(service, state, metric, description)
    report(
      service: service.to_s,
      state: state.to_s,
      metric: metric.to_f,
      description: description,
    )
  end

  # Lists processes with ps, keeps the lines matching the configured
  # pattern (excluding grep itself and riemann-proc), reports the match
  # count as "proc count/<pattern>" (critical outside the configured
  # min/max), then reports per-process metrics for each line.
  def linux_proc
    process = opts[:proc_regex]
    # The pattern is shell-escaped and passed with -e so quotes or a leading
    # dash in it cannot break or alter the command line.
    found = `ps axo pid=,rss=,vsize=,state=,cputime=,lstart=,command= | grep -e #{Shellwords.escape(process)} | grep -v grep | grep -v riemann-proc`
    count = found.count("\n")
    in_range = count.between?(@limits[:critical][:min], @limits[:critical][:max])
    alert "proc count/#{process}", (in_range ? :ok : :critical), count, "process #{process} is running #{count} instances.\n"

    found.each_line { |line| report_process(line) }
  end

  def tick
    @check.call
  end

  private

  # Reports four events for one ps line:
  #
  # proc <pid>-<start-time>/rss
  # proc <pid>-<start-time>/vsize
  # proc <pid>-<start-time>/running
  # proc <pid>-<start-time>/cputime
  #
  # The description is the command itself. The state is derived from the
  # process state letter: R and S are ok, I is warning, T/U/Z are critical,
  # anything else is unknown. "running" is 1 only for state R. Lines that
  # do not match the expected layout are skipped.
  def report_process(line)
    ps_regex = /([0-9]+) +([0-9]+) +([0-9]+) +([A-Z]) +([0-9:.-]+) +[A-Za-z]{3} +([A-Za-z]{3} {1,2}[0-9]+ [0-9:]+ [0-9]+) +(.*)/
    m = ps_regex.match(line)
    return if m.nil?

    pid, rss, vsize, state, cputime, start, command = m.captures
    # DateTime.parse takes no format string; its second argument is a
    # boolean flag, so the format previously passed there had no meaning.
    start_s = DateTime.parse(start).to_time.to_i

    state_s, running =
      case state[0]
      when 'R' then ['ok', 1]
      when 'S' then ['ok', 0]
      when 'I' then ['warning', 0]
      when 'T', 'U', 'Z' then ['critical', 0]
      else ['unknown', 0]
      end

    metrics = {
      'rss' => rss.to_f,
      'vsize' => vsize.to_f,
      'running' => running.to_f,
      'cputime' => cputime_to_seconds(cputime),
    }
    metrics.each do |name, metric|
      report(
        service: "proc #{pid}-#{start_s}/#{name}",
        state: state_s,
        metric: metric,
        description: command,
      )
    end
  end

  # Converts a ps cputime field to seconds. Accepts an optional "DD-" day
  # prefix followed by colon-separated fields (e.g. "1-02:03:04",
  # "30:00:00", "0:01.23"). Parsing it as a time of day, as before, failed
  # once the hour field reached 24. Returns an Integer when the result is
  # whole, a Float otherwise.
  def cputime_to_seconds(cputime)
    days, clock = cputime.include?('-') ? cputime.split('-', 2) : ['0', cputime]
    seconds = clock.split(':').inject(0) { |acc, part| (acc * 60) + part.to_f }
    seconds += days.to_i * 86_400
    (seconds % 1).zero? ? seconds.to_i : seconds
  end
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
|
5
|
+
require 'riemann/client'
|
6
|
+
|
7
|
+
module Riemann
|
8
|
+
module Tools
|
9
|
+
# Process-wide holder for a single Riemann client connection.
class RiemannClientWrapper
  include Singleton

  def initialize
    @client = nil
  end

  # Builds the client from the given options the first time it is called;
  # later calls leave the existing client untouched. The TCP transport is
  # used when :tcp or :tls is set. Always returns self.
  def configure(options)
    if @client.nil?
      # Riemann::Client setting => name of the option it is read from.
      option_names = {
        host: :host,
        port: :port,
        timeout: :timeout,
        ssl: :tls,
        key_file: :tls_key,
        cert_file: :tls_cert,
        ca_file: :tls_ca_cert,
        ssl_verify: :tls_verify,
      }
      settings = option_names.each_with_object({}) do |(setting, option), acc|
        acc[setting] = options[option]
      end

      connection = Riemann::Client.new(**settings)
      wants_tcp = options[:tcp] || options[:tls]
      @client = wants_tcp ? connection.tcp : connection
    end

    self
  end

  # Forwards an event to the configured client.
  def <<(event)
    @client << event
  end
end
|
42
|
+
end
|
43
|
+
end
|