remon 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +3 -0
- data/FEATURES.md +39 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +27 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/dev_exe/remon +4 -0
- data/exe/remon +101 -0
- data/lib/remon.rb +11 -0
- data/lib/remon/check.rb +145 -0
- data/lib/remon/check_dsl.rb +92 -0
- data/lib/remon/check_runner.rb +53 -0
- data/lib/remon/checks/consul.rb +41 -0
- data/lib/remon/checks/disk.rb +36 -0
- data/lib/remon/checks/http.rb +53 -0
- data/lib/remon/checks/oom.rb +26 -0
- data/lib/remon/checks/redis.rb +23 -0
- data/lib/remon/checks/salt.rb +27 -0
- data/lib/remon/checks/system.rb +96 -0
- data/lib/remon/checks/yum.rb +30 -0
- data/lib/remon/config.rb +101 -0
- data/lib/remon/custom_logger.rb +6 -0
- data/lib/remon/deduped_queue.rb +38 -0
- data/lib/remon/error.rb +4 -0
- data/lib/remon/event_processor.rb +33 -0
- data/lib/remon/ext/num_ext.rb +23 -0
- data/lib/remon/helper.rb +41 -0
- data/lib/remon/logger.rb +17 -0
- data/lib/remon/metrics/consul.rb +32 -0
- data/lib/remon/metrics/disk.rb +24 -0
- data/lib/remon/metrics/http.rb +40 -0
- data/lib/remon/metrics/oom.rb +32 -0
- data/lib/remon/metrics/salt.rb +18 -0
- data/lib/remon/metrics/system.rb +63 -0
- data/lib/remon/metrics/yum.rb +20 -0
- data/lib/remon/proc_check.rb +26 -0
- data/lib/remon/scheduler.rb +106 -0
- data/lib/remon/scripts/salt-status +24 -0
- data/lib/remon/scripts/yum-status +12 -0
- data/lib/remon/sysinfo.rb +69 -0
- data/lib/remon/version.rb +3 -0
- data/remon.gemspec +26 -0
- data/test_config.rb +44 -0
- metadata +146 -0
@@ -0,0 +1,92 @@
|
|
1
|
+
require_relative 'error'
|
2
|
+
require_relative 'check'
|
3
|
+
require_relative 'proc_check'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
module Remon
  # Evaluation context for check definition files.
  #
  # Check files are read from disk and evaluated with +instance_eval+ against
  # an instance of this class, which makes +defcheck+, +check+ and
  # +proc_check+ available to them as bare method calls. Named checks are
  # registered in an in-memory table keyed by their (string) name.
  class CheckDsl
    # Characters permitted in a check name; ":" separates namespace and name.
    NAME_PATTERN = /\A[a-zA-Z0-9_:]+\z/

    # @param load_paths [Array<String>] directories searched, in order, when a
    #   check that has not been defined yet is requested through {#check}
    def initialize(load_paths = [])
      @load_paths = load_paths
      @checks = {}
    end

    # Defines a new Check subclass from the given block.
    #
    # @param name [String, Symbol, nil] name to register the check under; when
    #   omitted the class is built but not registered
    # @return [Class] the newly created subclass of Check
    # @raise [Error] if the name is invalid or already registered
    def defcheck(name = nil, &block)
      return define_klass(&block) unless name

      name = name.to_s
      validate_name(name)
      raise Error, "check #{name} already defined" if @checks[name]

      klass = define_klass(&block)
      klass.name = name
      @checks[name] = klass
    end

    # Looks up a registered check, loading its definition file on a miss.
    #
    # @param name [String, Symbol]
    # @return [Class] the registered check class
    # @raise [Error] if the name is invalid or no definition can be found
    def check(name)
      name = name.to_s
      validate_name(name)
      @checks[name] || load_check(name)
    end

    # Wraps a block in a ProcCheck.
    #
    # @param name [Object, nil] passed through to ProcCheck unchanged
    # @return [ProcCheck]
    def proc_check(name = nil, &block)
      ProcCheck.new(name, block)
    end

    private

    # @raise [Error] if name has characters outside NAME_PATTERN or more than
    #   one ":" separator
    def validate_name(name)
      unless name =~ NAME_PATTERN
        raise Error, "only alphanumeric, _, : characters allowed for check name"
      end
      if name.count(":") > 1
        raise Error, "nested namespacing not allowed in check names"
      end
    end

    def define_klass(&block)
      Class.new(Check, &block)
    end

    # Evaluates the file in the context of this instance; the path is passed
    # along so backtraces point at the check file.
    #
    # NOTE(review): this executes arbitrary Ruby from every configured load
    # path, so load paths must only contain trusted files.
    def load_file(f)
      instance_eval File.read(f), f
    end

    # Loads the file expected to define +name+ and returns the check it
    # registered.
    #
    # @raise [Error] if no file is found, or the file did not define +name+
    def load_check(name)
      file = find_check_file(name)
      load_file file
      @checks[name] || raise(Error, "unable to find check: #{name} in #{file}")
    end

    # Returns the path of the first existing candidate file for +name+.
    # Candidates are tried in the order given by #check_files; for each
    # candidate the load paths are tried in their configured order.
    #
    # Existence is tested with File.file? rather than Dir.glob, so load paths
    # containing glob metacharacters ("[", "{", "*", "?") are matched
    # literally.
    #
    # @raise [Error] if no candidate exists in any load path
    def find_check_file(name)
      check_files(name).each do |file|
        @load_paths.each do |dir|
          path = "#{dir}/#{file}"
          return path if File.file?(path)
        end
      end
      raise Error, "unable to find check: #{name} in PATH: #{@load_paths.join(":")}"
    end

    # Candidate file names (relative to a load path) that may define +name+.
    # A namespaced name "ns:rest" maps to "ns/rest.rb" first, then "ns.rb".
    #
    # @return [Array<String>]
    def check_files(name)
      namespace, separator, rest = name.partition(":")
      return ["#{name}.rb"] if separator.empty?

      ["#{namespace}/#{rest}.rb", "#{namespace}.rb"]
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative 'logger'
|
2
|
+
|
3
|
+
module Remon
  # Runs jobs taken from a task queue on a fixed number of worker threads and
  # pushes whatever the jobs return onto a result queue.
  class CheckRunner
    include Logger

    # @param task_queue [#pop] source of jobs; each job must respond to +run+
    #   (and to +run_mutex+ when more than one worker is configured)
    # @param num_workers [Integer] number of worker threads to spawn on #start
    # @param result_queue [#<<] sink for job results
    def initialize(task_queue:, num_workers:, result_queue:)
      @task_queue = task_queue
      @num_workers = num_workers
      @result_queue = result_queue
      @workers = []
    end

    # Spawns +num_workers+ worker threads.
    def start
      logger.debug "starting runners"
      @num_workers.times { @workers << new_worker }
    end

    # Kills every live worker thread and forgets them, so that repeated
    # start/stop cycles do not accumulate dead Thread objects in @workers.
    def stop
      @workers.each { |worker| worker.kill if worker.alive? }
      @workers.clear
    end

    # @return [Thread] a thread that processes jobs forever
    def new_worker
      Thread.new do
        loop { process_job }
      end
    end

    # Takes one job from the task queue, runs it and forwards its result(s).
    # An Array result is forwarded element by element; nil/false results (or
    # elements) are dropped. Any StandardError is logged and swallowed so the
    # calling worker loop keeps running.
    def process_job
      job = @task_queue.pop
      logger.debug "running #{job}" if logger.debug?
      outcome = run_job job
      results = outcome.is_a?(Array) ? outcome : [outcome]
      results.each { |r| @result_queue << r if r }
    rescue => e
      logger.error "error while running job #{e.message}"
    end

    # Runs the job, going through +run_mutex+ when several workers exist.
    # NOTE(review): run_mutex is defined on the job class, not here —
    # presumably it serialises concurrent runs of the same job; confirm there.
    def run_job(j)
      if @num_workers > 1
        j.run_mutex
      else
        j.run
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'remon/metrics/consul'
|
2
|
+
|
3
|
+
# Check reporting whether the consul agent sees any failed member nodes.
defcheck :consul do

  # @param host [String] host passed to Metrics::Consul
  # @param port [Integer] port passed to Metrics::Consul
  def init(host: "127.0.0.1", port: 8500)
    @consul = Metrics::Consul.new(host: host, port: port)
  end

  # @return the "consul members" event
  def run
    members_status
  end

  private

  # Builds the event: "critical" as soon as at least one node is failed.
  def members_status
    failed = @consul.failed_nodes
    level = failed.size.zero? ? "ok" : "critical"
    event({
      service: "consul members",
      description: description(failed),
      state: level,
      metric: metric(level)
    })
  end

  # Summary line, followed by one "key: value" line per failed node when any.
  def description(failed_nodes)
    count = failed_nodes.size
    summary = "#{count} failed nodes"
    return summary unless count > 0

    details = failed_nodes.map { |key, value| "#{key}: #{value}" }.join("\n")
    "#{summary}\n#{details}"
  end

  # 0 for "ok", 1 for any other state.
  def metric(state)
    if state == "ok"
      0
    else
      1
    end
  end

end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'remon/metrics/disk'
|
2
|
+
# Check reporting the usage of the fullest disk.
defcheck :disk do

  opts ({
    warning: 85,
    critical: 95
  })

  def init
    @disk = Metrics::Disk.new
  end

  # @return the "disk" event
  def run
    disk
  end

  private

  # The event metric is the :percent value of the most-used disk; the state is
  # derived from that value multiplied by 100.
  # NOTE(review): :percent looks like a 0..1 fraction given the * 100 —
  # confirm against Metrics::Disk#disks_usage.
  def disk
    usages = @disk.disks_usage
    fullest = usages.max_by { |usage| usage[:percent] }
    ratio = fullest[:percent]
    level = service_state(ratio * 100)
    event({
      service: "disk",
      description: description(usages),
      state: level,
      metric: ratio
    })
  end

  # One "mount - NN.NN% size" line per disk.
  def description(disks)
    lines = disks.map do |usage|
      "#{usage[:mount]} - #{(usage[:percent] * 100).round(2)}% #{usage[:size]}"
    end
    lines.join("\n")
  end

end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'remon/metrics/http'
|
2
|
+
|
3
|
+
# Check that requests a URL and maps the response status to a state.
defcheck :http do

  opts ({
    error_statuses: [502, 500, 503],
    read_timeout: 1,
    open_timeout: 1
  })

  # @param url [String] URL to probe; also used in the event's service name
  def init(url)
    @url = url
    @http = Metrics::Http.new(url)
  end

  # @return the "http <url>" event
  def run
    http_status
  end

  private

  # Performs the request using the configured timeouts and builds the event.
  # The elapsed time is multiplied by 1000 for display as milliseconds.
  def http_status
    elapsed, code = @http.status(read_timeout: opts[:read_timeout], open_timeout: opts[:open_timeout])
    level = state(code)
    event({
      service: "http #{@url}",
      description: "#{code} in #{(elapsed * 1000).round(2)} ms",
      state: level,
      metric: metric(level)
    })
  end

  # Numeric metric for a state; nil for any state not listed.
  def metric(state)
    { "ok" => 0, "warning" => 0.9, "critical" => 1 }[state]
  end

  # "critical" for any status >= 500 and for 444, otherwise "ok".
  # NOTE(review): opts[:error_statuses] is declared above but not consulted
  # here — confirm whether that is intentional.
  def state(status)
    (status >= 500 || status == 444) ? "critical" : "ok"
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'remon/metrics/oom'
|
2
|
+
# Check reporting out-of-memory counts taken from a log file.
defcheck :oom do

  # @param log_file [String] path handed to Metrics::Oom
  def init(log_file: "/var/log/messages")
    @oom = Metrics::Oom.new(log_file)
  end

  # @return [Array] two events, for the :today and :total counts, in that order
  def run
    counts = @oom.stats
    %i[today total].map { |tag| oom_event(counts[tag], tag: tag) }
  end

  private

  # Builds one event; any positive count is reported as "warning".
  def oom_event(count, tag:)
    level = count > 0 ? "warning" : "ok"
    event({
      service: "oom log #{tag}",
      description: "#{count} times oom",
      state: level,
      metric: count
    })
  end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Redis check.
# NOTE(review): this reads like unfinished work — the metric is a random
# number, and `task`, `o`, `state` and `description` are not defined in this
# file. Confirm they exist on the Check base class before relying on it.
defcheck :redis do

  opts ({
    cpu_warn: 85,
    cpu_critical: 95
  })

  task def cpu
    # Placeholder value: a random integer between 1 and 100.
    value = rand(1..100)
    level = state(value, warn: o[:cpu_warn], critical: o[:cpu_critical])
    # NOTE(review): positional call whose result is discarded; `state` is
    # called without arguments here and `description` is not defined locally.
    event "cpu", state, value, description
    event ({
      service: "cpu",
      description: "cpu",
      metric: value,
      state: level
    })
  end

  # task def memory

  # end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'remon/metrics/salt'
|
2
|
+
|
3
|
+
# Check reporting the status returned by Metrics::Salt.
defcheck :salt do

  def init
    @salt = Metrics::Salt.new
  end

  # @return the "salt" event
  def run
    drift_status
  end

  private

  # Any reported state other than "ok" becomes a "warning" with metric 1.
  def drift_status
    report = @salt.status
    healthy = report[:state] == "ok"
    event({
      service: "salt",
      description: "#{report[:state]}: #{report[:ok]}/#{report[:total]}",
      state: healthy ? "ok" : "warning",
      metric: healthy ? 0 : 1
    })
  end

end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'remon/metrics/system'
|
2
|
+
# Check reporting cpu, iowait, load, memory and uptime of the local host.
defcheck "system" do

  opts ({
    cpu_warning: 85,
    cpu_critical: 95,

    iowait_warning: 30,
    iowait_critical: 70,

    load_warning: 3,
    load_critical: 8,

    memory_warning: 0.85,
    memory_critical: 0.95
  })

  def init
    @sys = Metrics::System.new
  end

  # @return [Array] the cpu and iowait events (absent on the first run, see
  #   #cpu_and_iowait) followed by the load, memory and uptime events
  def run
    [*cpu_and_iowait, loadavg, memory, uptime]
  end

  # Compares the current cpu sample with the one kept from the previous run.
  #
  # @return [Array, nil] [cpu_event, iowait_event], or nil on the first run
  #   when there is no previous sample to compare against
  def cpu_and_iowait
    old_cpu = @old_cpu
    new_cpu = @sys.cpu_stat

    if not new_cpu
      # NOTE(review): `e` is not defined in this file and every other event
      # here is built with `event`; confirm that the Check base class provides
      # `e` with this positional signature.
      return e 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
    end
    @old_cpu = new_cpu
    return nil if not old_cpu
    used, iowait = @sys.cpu_usage(old_cpu, new_cpu)
    [cpu_event(used), iowait_event(iowait)]
  end

  # @return the "load" event
  def loadavg
    metric = @sys.loadavg_normalized
    event({
      service: "load",
      metric: metric,
      description: "1-minute load average/core is #{metric}",
      state: service_state("load", metric)
    })
  end

  # @return the "memory" event; the description appends the output of a `ps`
  #   pipeline sorted on the pmem column and limited to 10 lines
  def memory
    metric = @sys.memory
    description = "#{(metric * 100).round(2)}% used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"

    event({
      service: "memory",
      metric: metric,
      description: description,
      state: service_state("memory", metric)
    })
  end

  # @return the "uptime" event; the metric is the uptime in days, rounded to
  #   2 decimals, and the state is always "ok"
  def uptime
    up_seconds = @sys.uptime
    # fdiv keeps the fractional part of the day even if up_seconds is an
    # Integer; plain `/` would truncate to whole days in that case.
    metric = up_seconds.fdiv(24 * 3600).round(2)
    # The joined IP list is computed once and reused on later runs.
    @ips ||= Sysinfo.ips.join(", ")
    description = <<~HEREDOC
      ip: "#{@ips}"
      instance_type: "#{Sysinfo.instance_type}"
    HEREDOC

    event({
      service: "uptime",
      metric: metric,
      description: description,
      state: "ok"
    })
  end

  private

  # Builds the "cpu" event; the state is evaluated on the metric times 100.
  # The description appends the output of a `ps` pipeline sorted on the pcpu
  # column and limited to 10 lines.
  def cpu_event(metric)
    description = "#{(metric * 100).round(2)}% user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    event service: "cpu",
      description: description,
      metric: metric,
      state: service_state("cpu", metric * 100)
  end

  # Builds the "iowait" event; the state is evaluated on the metric times 100.
  def iowait_event(metric)
    # Rounded to 2 decimals like the cpu and memory descriptions, so float
    # noise such as 12.000000000000002 does not end up in the text.
    description = "#{(metric * 100).round(2)}% iowait"
    event service: "iowait",
      description: description,
      metric: metric,
      state: service_state("iowait", metric * 100)
  end

end
|