remon 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +3 -0
- data/FEATURES.md +39 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +27 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/dev_exe/remon +4 -0
- data/exe/remon +101 -0
- data/lib/remon.rb +11 -0
- data/lib/remon/check.rb +145 -0
- data/lib/remon/check_dsl.rb +92 -0
- data/lib/remon/check_runner.rb +53 -0
- data/lib/remon/checks/consul.rb +41 -0
- data/lib/remon/checks/disk.rb +36 -0
- data/lib/remon/checks/http.rb +53 -0
- data/lib/remon/checks/oom.rb +26 -0
- data/lib/remon/checks/redis.rb +23 -0
- data/lib/remon/checks/salt.rb +27 -0
- data/lib/remon/checks/system.rb +96 -0
- data/lib/remon/checks/yum.rb +30 -0
- data/lib/remon/config.rb +101 -0
- data/lib/remon/custom_logger.rb +6 -0
- data/lib/remon/deduped_queue.rb +38 -0
- data/lib/remon/error.rb +4 -0
- data/lib/remon/event_processor.rb +33 -0
- data/lib/remon/ext/num_ext.rb +23 -0
- data/lib/remon/helper.rb +41 -0
- data/lib/remon/logger.rb +17 -0
- data/lib/remon/metrics/consul.rb +32 -0
- data/lib/remon/metrics/disk.rb +24 -0
- data/lib/remon/metrics/http.rb +40 -0
- data/lib/remon/metrics/oom.rb +32 -0
- data/lib/remon/metrics/salt.rb +18 -0
- data/lib/remon/metrics/system.rb +63 -0
- data/lib/remon/metrics/yum.rb +20 -0
- data/lib/remon/proc_check.rb +26 -0
- data/lib/remon/scheduler.rb +106 -0
- data/lib/remon/scripts/salt-status +24 -0
- data/lib/remon/scripts/yum-status +12 -0
- data/lib/remon/sysinfo.rb +69 -0
- data/lib/remon/version.rb +3 -0
- data/remon.gemspec +26 -0
- data/test_config.rb +44 -0
- metadata +146 -0
@@ -0,0 +1,92 @@
|
|
1
|
+
require_relative 'error'
|
2
|
+
require_relative 'check'
|
3
|
+
require_relative 'proc_check'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
module Remon
  # Registry and loader for check classes.
  #
  # Check definition files are evaluated in the context of a CheckDsl
  # instance (see #load_file), so the public methods of this class are the
  # vocabulary those files can use (+defcheck+, +check+, +proc_check+).
  class CheckDsl
    # Names may contain letters, digits, "_" and ":" only.
    NAME_PATTERN = /\A[a-zA-Z0-9_:]+\z/.freeze

    # @param load_paths [Array<String>] directories searched for check files
    def initialize(load_paths = [])
      @load_paths = load_paths
      @checks = {}
    end

    # Defines a new check class (a subclass of Check) from the given block.
    #
    # Without a name the class is returned but not registered. With a name
    # the class is registered so that #check can return it later.
    #
    # @param name [String, Symbol, nil] check name, optionally "namespace:name"
    # @return [Class] the newly created Check subclass
    # @raise [Error] if the name is invalid or already registered
    def defcheck(name = nil, &block)
      return define_klass(&block) unless name

      name = name.to_s
      validate_name(name)
      raise Error, "check #{name} already defined" if @checks[name]

      klass = define_klass(&block)
      klass.name = name
      @checks[name] = klass
    end

    # Returns the check class registered under +name+, loading its
    # definition file from the load paths on first use.
    #
    # @param name [String, Symbol]
    # @return [Class]
    # @raise [Error] if the name is invalid or the check cannot be found
    def check(name)
      name = name.to_s
      validate_name(name)
      @checks[name] || load_check(name)
    end

    # Wraps a block in a ProcCheck.
    #
    # @param name [String, Symbol, nil]
    # @return [ProcCheck]
    def proc_check(name = nil, &block)
      ProcCheck.new(name, block)
    end

    private

    # @raise [Error] unless +name+ matches NAME_PATTERN and has at most one ":"
    def validate_name(name)
      unless name =~ NAME_PATTERN
        raise Error, "only alphanumeric, _, : characters allowed for check name"
      end
      if name.count(":") > 1
        raise Error, "nested namespacing not allowed in check names"
      end
    end

    # Builds an anonymous Check subclass whose body is the given block.
    def define_klass(&block)
      Class.new(Check, &block)
    end

    # Evaluates a check definition file with this object as +self+.
    # NOTE(review): this executes arbitrary Ruby from the load paths; only
    # trusted directories should be configured.
    def load_file(f)
      instance_eval File.read(f), f
    end

    # Loads the file expected to define +name+ and returns the registered
    # class.
    #
    # @raise [Error] if no file is found, or the file did not define +name+
    def load_check(name)
      file = find_check_file(name)
      load_file file
      @checks[name] || raise(Error, "unable to find check: #{name} in #{file}")
    end

    # Finds the first existing candidate file for +name+.
    #
    # Candidates are tried file-major: every load path is checked for the
    # first candidate file name before the second candidate is considered.
    # Existence is tested with File.file? on the literal path so that load
    # paths containing glob metacharacters are handled correctly.
    #
    # @return [String] "<dir>/<file>" of the first match
    # @raise [Error] if no candidate exists in any load path
    def find_check_file(name)
      file, dir = check_files(name).product(@load_paths).find do |f, d|
        File.file?("#{d}/#{f}")
      end
      unless file
        raise Error, "unable to find check: #{name} in PATH: #{@load_paths.join(":")}"
      end
      "#{dir}/#{file}"
    end

    # Relative file names that may define +name+, in priority order.
    # "ns:foo" maps to "ns/foo.rb" and then "ns.rb"; "foo" maps to "foo.rb".
    #
    # @return [Array<String>]
    def check_files(name)
      return ["#{name}.rb"] unless name.include?(":")

      namespace, _sep, rest = name.partition(":")
      ["#{namespace}/#{rest}.rb", "#{namespace}.rb"]
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative 'logger'
|
2
|
+
|
3
|
+
module Remon
  # Runs jobs popped from a task queue on a fixed number of worker threads
  # and pushes whatever the jobs return onto a result queue.
  class CheckRunner
    include Logger

    # @param task_queue [#pop] source of jobs
    # @param num_workers [Integer] number of worker threads created by #start
    # @param result_queue [#<<] sink for job results
    def initialize(task_queue:, num_workers:, result_queue:)
      @workers = []
      @result_queue = result_queue
      @num_workers = num_workers
      @task_queue = task_queue
    end

    # Spawns +num_workers+ worker threads.
    def start
      logger.debug "starting runners"
      @num_workers.times do
        @workers << new_worker
      end
    end

    # Kills every worker thread that is still alive.
    def stop
      @workers.each do |worker|
        next unless worker && worker.alive?

        Thread.kill worker
      end
    end

    # @return [Thread] a thread that processes jobs forever
    def new_worker
      Thread.new { loop { process_job } }
    end

    # Pops one job, runs it and forwards its non-nil result(s).
    # An Array result is forwarded element by element; anything else is
    # forwarded as a single item. Errors are logged, not re-raised, so the
    # worker loop keeps going.
    def process_job
      job = @task_queue.pop
      logger.debug "running #{job}" if logger.debug?
      outcome = run_job(job)
      if Array === outcome
        outcome.each { |item| @result_queue << item if item }
      elsif outcome
        @result_queue << outcome
      end
    rescue => e
      logger.error "error while running job #{e.message}"
    end

    # Runs the job through +run_mutex+ when more than one worker exists,
    # otherwise through plain +run+.
    # NOTE(review): presumably run_mutex serialises runs of the same job
    # across workers; confirm against Check.
    def run_job(j)
      return j.run unless @num_workers > 1

      j.run_mutex
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'remon/metrics/consul'
|
2
|
+
|
3
|
+
# Check reporting whether consul has any failed member nodes.
defcheck :consul do
  # @param host [String] address passed to Metrics::Consul
  # @param port [Integer] port passed to Metrics::Consul
  def init(host: "127.0.0.1", port: 8500)
    @consul = Metrics::Consul.new(host: host, port: port)
  end

  # @return the "consul members" event
  def run
    members_status
  end

  private

  # Builds one event: "critical" when at least one failed node is reported,
  # "ok" otherwise.
  def members_status
    failed = @consul.failed_nodes
    state = if failed.size > 0
              "critical"
            else
              "ok"
            end
    event({
      service: "consul members",
      description: description(failed),
      state: state,
      metric: metric(state)
    })
  end

  # Summary line, followed by one "key: value" line per failed node when
  # there are any.
  def description(failed_nodes)
    count = failed_nodes.size
    summary = "#{count} failed nodes"
    return summary unless count > 0

    details = failed_nodes.map { |key, value| "#{key}: #{value}" }.join("\n")
    "#{summary}\n#{details}"
  end

  # 0 for "ok", 1 for anything else.
  def metric(state)
    if state == "ok"
      0
    else
      1
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'remon/metrics/disk'
|
2
|
+
# Check reporting the usage of the fullest disk.
defcheck :disk do
  # Thresholds handed to service_state; the metric is multiplied by 100
  # before the comparison.
  opts({
    warning: 85,
    critical: 95
  })

  def init
    @disk = Metrics::Disk.new
  end

  # @return the "disk" event
  def run
    disk
  end

  private

  # Picks the entry with the highest :percent and reports it as the metric;
  # the description lists every disk.
  def disk
    usage = @disk.disks_usage
    fullest = usage.max_by { |entry| entry[:percent] }
    percent = fullest[:percent]
    level = service_state(percent * 100)
    event({
      service: "disk",
      description: description(usage),
      state: level,
      metric: percent
    })
  end

  # One "<mount> - <percent>% <size>" line per disk.
  def description(disks)
    lines = disks.map do |entry|
      "#{entry[:mount]} - #{(entry[:percent] * 100).round(2)}% #{entry[:size]}"
    end
    lines.join("\n")
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'remon/metrics/http'
|
2
|
+
|
3
|
+
# Check reporting the HTTP status and response time of a URL.
defcheck :http do
  opts({
    # Extra status codes to treat as critical (see #state).
    error_statuses: [502, 500, 503],
    read_timeout: 1,
    open_timeout: 1
  })

  # @param url [String] URL to probe
  def init(url)
    @url = url
    @http = Metrics::Http.new(url)
  end

  # @return the "http <url>" event
  def run
    http_status
  end

  private

  # Probes the URL with the configured timeouts and builds the event.
  def http_status
    time, status = @http.status(read_timeout: opts[:read_timeout], open_timeout: opts[:open_timeout])
    level = state(status)
    event({
      service: "http #{@url}",
      description: "#{status} in #{(time * 1000).round(2)} ms",
      state: level,
      metric: metric(level)
    })
  end

  # Numeric metric for a state: 0 for "ok", 0.9 for "warning", 1 for
  # "critical", nil for anything else.
  def metric(state)
    case state
    when "ok"
      0
    when "warning"
      0.9
    when "critical"
      1
    end
  end

  # Maps an HTTP status code to a state.
  #
  # A status is "critical" when it is >= 500, equal to 444, or listed in the
  # error_statuses option. The option was previously declared but never
  # consulted; with its default value the result is unchanged.
  # NOTE(review): 444 is presumably the non-standard "no response" code;
  # confirm what Metrics::Http reports.
  def state(status)
    configured = Array(opts[:error_statuses])
    if status >= 500 || status == 444 || configured.include?(status)
      "critical"
    else
      "ok"
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'remon/metrics/oom'
|
2
|
+
# Check reporting out-of-memory occurrences counted from a log file.
defcheck :oom do
  # @param log_file [String] path handed to Metrics::Oom
  def init(log_file: "/var/log/messages")
    @oom = Metrics::Oom.new(log_file)
  end

  # @return [Array] two events, one for stats[:today] and one for stats[:total]
  def run
    stats = @oom.stats
    %i[today total].map { |tag| oom_event(stats[tag], tag: tag) }
  end

  private

  # Builds an "oom log <tag>" event; any positive count is a "warning".
  def oom_event(count, tag:)
    state = if count > 0
              "warning"
            else
              "ok"
            end
    event({
      service: "oom log #{tag}",
      description: "#{count} times oom",
      state: state,
      metric: count
    })
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Placeholder redis check.
# NOTE(review): this looks like an unfinished stub. The metric is a random
# number, and `task`, `o` and `state` are not defined in this file; confirm
# that Check provides them before enabling this check.
defcheck :redis do
  opts({
    cpu_warn: 85,
    cpu_critical: 95
  })

  # Emits a "cpu" event for a random metric between 1 and 100.
  #
  # A stale positional call (`event "cpu", state, metric, description`) was
  # removed from here: it referenced `description`, which does not exist in
  # this check, and called `state` without arguments. The hash-form call
  # below already reports the same event.
  task def cpu
    metric = rand(1..100)
    s = state(metric, warn: o[:cpu_warn], critical: o[:cpu_critical])
    event({
      service: "cpu",
      description: "cpu",
      metric: metric,
      state: s
    })
  end

  # task def memory

  # end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'remon/metrics/salt'
|
2
|
+
|
3
|
+
# Check reporting the status returned by Metrics::Salt.
defcheck :salt do
  def init
    @salt = Metrics::Salt.new
  end

  # @return the "salt" event
  def run
    drift_status
  end

  private

  # "ok" (metric 0) when status[:state] is "ok", otherwise "warning"
  # (metric 1). The description shows the raw state and the ok/total counts.
  def drift_status
    status = @salt.status
    healthy = status[:state] == "ok"
    event({
      service: "salt",
      description: "#{status[:state]}: #{status[:ok]}/#{status[:total]}",
      state: healthy ? "ok" : "warning",
      metric: healthy ? 0 : 1
    })
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'remon/metrics/system'
|
2
|
+
# Check reporting cpu, iowait, load, memory and uptime of the local host.
defcheck "system" do
  # cpu and iowait metrics are multiplied by 100 before being compared, so
  # their thresholds are percentages; load and memory are compared as-is.
  opts({
    cpu_warning: 85,
    cpu_critical: 95,

    iowait_warning: 30,
    iowait_critical: 70,

    load_warning: 3,
    load_critical: 8,

    memory_warning: 0.85,
    memory_critical: 0.95
  })

  def init
    @sys = Metrics::System.new
  end

  # @return [Array] all events of this check. When cpu_and_iowait returns
  #   nil (first run) the splat contributes nothing.
  def run
    [*cpu_and_iowait, loadavg, memory, uptime]
  end

  # Compares the current cpu sample with the one remembered from the
  # previous run.
  #
  # @return [Array, nil] cpu and iowait events, or nil on the first run when
  #   there is no previous sample yet
  def cpu_and_iowait
    old_cpu = @old_cpu
    new_cpu = @sys.cpu_stat

    unless new_cpu
      # NOTE(review): `e` is not defined in this file and every other event
      # here is built with `event`; confirm Check still provides `e`.
      return e 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
    end
    @old_cpu = new_cpu
    return nil unless old_cpu

    used, iowait = @sys.cpu_usage(old_cpu, new_cpu)
    [cpu_event(used), iowait_event(iowait)]
  end

  # @return the "load" event
  def loadavg
    metric = @sys.loadavg_normalized
    event({
      service: "load",
      metric: metric,
      description: "1-minute load average/core is #{metric}",
      state: service_state("load", metric)
    })
  end

  # @return the "memory" event; the description also lists the first ten
  #   lines of `ps` output sorted by %mem
  def memory
    metric = @sys.memory
    description = "#{(metric * 100).round(2)}% used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"

    event({
      service: "memory",
      metric: metric,
      description: description,
      state: service_state("memory", metric)
    })
  end

  # @return the "uptime" event (always "ok"); the metric is the uptime in
  #   seconds divided by 24 and by 3600, rounded to 2 places
  def uptime
    up_seconds = @sys.uptime
    metric = (up_seconds/24/3600).round(2)
    # The joined ip list is computed once and reused on later runs.
    @ips ||= Sysinfo.ips.join(", ")
    description = <<~HEREDOC
      ip: "#{@ips}"
      instance_type: "#{Sysinfo.instance_type}"
    HEREDOC

    event({
      service: "uptime",
      metric: metric,
      description: description,
      state: "ok"
    })
  end

  private

  # Builds the "cpu" event; the description also lists the first ten lines
  # of `ps` output sorted by %cpu.
  def cpu_event(metric)
    description = "#{(metric * 100).round(2)}% user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    event service: "cpu",
          description: description,
          metric: metric,
          state: service_state("cpu", metric * 100)
  end

  # Builds the "iowait" event. The percentage is rounded to 2 places, like
  # the cpu and memory descriptions, so float noise does not leak into the
  # text.
  def iowait_event(metric)
    description = "#{(metric * 100).round(2)}% iowait"
    event service: "iowait",
          description: description,
          metric: metric,
          state: service_state("iowait", metric * 100)
  end
end
|