remon 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +3 -0
  4. data/FEATURES.md +39 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +27 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/dev_exe/remon +4 -0
  12. data/exe/remon +101 -0
  13. data/lib/remon.rb +11 -0
  14. data/lib/remon/check.rb +145 -0
  15. data/lib/remon/check_dsl.rb +92 -0
  16. data/lib/remon/check_runner.rb +53 -0
  17. data/lib/remon/checks/consul.rb +41 -0
  18. data/lib/remon/checks/disk.rb +36 -0
  19. data/lib/remon/checks/http.rb +53 -0
  20. data/lib/remon/checks/oom.rb +26 -0
  21. data/lib/remon/checks/redis.rb +23 -0
  22. data/lib/remon/checks/salt.rb +27 -0
  23. data/lib/remon/checks/system.rb +96 -0
  24. data/lib/remon/checks/yum.rb +30 -0
  25. data/lib/remon/config.rb +101 -0
  26. data/lib/remon/custom_logger.rb +6 -0
  27. data/lib/remon/deduped_queue.rb +38 -0
  28. data/lib/remon/error.rb +4 -0
  29. data/lib/remon/event_processor.rb +33 -0
  30. data/lib/remon/ext/num_ext.rb +23 -0
  31. data/lib/remon/helper.rb +41 -0
  32. data/lib/remon/logger.rb +17 -0
  33. data/lib/remon/metrics/consul.rb +32 -0
  34. data/lib/remon/metrics/disk.rb +24 -0
  35. data/lib/remon/metrics/http.rb +40 -0
  36. data/lib/remon/metrics/oom.rb +32 -0
  37. data/lib/remon/metrics/salt.rb +18 -0
  38. data/lib/remon/metrics/system.rb +63 -0
  39. data/lib/remon/metrics/yum.rb +20 -0
  40. data/lib/remon/proc_check.rb +26 -0
  41. data/lib/remon/scheduler.rb +106 -0
  42. data/lib/remon/scripts/salt-status +24 -0
  43. data/lib/remon/scripts/yum-status +12 -0
  44. data/lib/remon/sysinfo.rb +69 -0
  45. data/lib/remon/version.rb +3 -0
  46. data/remon.gemspec +26 -0
  47. data/test_config.rb +44 -0
  48. metadata +146 -0
@@ -0,0 +1,92 @@
1
+ require_relative 'error'
2
+ require_relative 'check'
3
+ require_relative 'proc_check'
4
+ require 'set'
5
+
6
module Remon
  # DSL context in which check definition files are evaluated. It keeps a
  # registry of named check classes and, when asked for a check that is not
  # registered yet, looks for a definition file in the configured load paths.
  class CheckDsl

    # A check name must consist solely of these characters.
    NAME_PATTERN = /\A[a-zA-Z0-9_:]+\z/

    # @param load_paths [Array<String>] directories searched, in order, for
    #   check definition files
    def initialize(load_paths = [])
      @load_paths = load_paths
      @checks = {}
    end

    # Builds a check class (a subclass of Check) from the given block.
    #
    # Without a name the class is returned without being registered. With a
    # name, the class gets that name assigned and is stored in the registry.
    #
    # @param name [String, Symbol, nil] registry name; converted with #to_s
    # @return [Class] the newly created check class
    # @raise [Error] if the name is invalid or already registered
    def defcheck(name = nil, &block)
      return define_klass(&block) unless name

      name = name.to_s
      validate_name(name)
      raise Error, "check #{name} already defined" if @checks[name]

      klass = define_klass(&block)
      klass.name = name
      @checks[name] = klass
    end

    # Returns the check class registered under +name+, loading its definition
    # file from the load paths first if it is not registered yet.
    #
    # @param name [String, Symbol]
    # @return [Class]
    # @raise [Error] if the name is invalid, no definition file is found, or
    #   the file that was found does not define the check
    def check(name)
      name = name.to_s
      validate_name(name)
      @checks[name] || load_check(name)
    end

    # Wraps the given block in a ProcCheck.
    #
    # @param name [Object, nil] passed through to ProcCheck.new unchanged
    # @return [ProcCheck]
    def proc_check(name = nil, &block)
      ProcCheck.new(name, block)
    end

    private

    # Rejects names with characters outside NAME_PATTERN and names with more
    # than one ":" separator.
    def validate_name(name)
      unless name =~ NAME_PATTERN
        raise Error, "only alphanumeric, _, : characters allowed for check name"
      end
      if name.count(":") > 1
        raise Error, "nested namespacing not allowed in check names"
      end
    end

    def define_klass(&block)
      Class.new(Check, &block)
    end

    # Evaluates the file's contents in the context of this instance, so a
    # `defcheck` call inside the file registers on this object.
    def load_file(f)
      instance_eval File.read(f), f
    end

    def load_check(name)
      file = find_check_file(name)
      load_file file
      @checks[name] || raise(Error, "unable to find check: #{name} in #{file}")
    end

    # Returns the path of the first existing candidate file. Every load path
    # is tried for the first candidate before the next candidate is
    # considered.
    #
    # The literal path is tested with File.file? rather than globbed: the same
    # literal path is read afterwards, so a wildcard match (or a directory of
    # that name) would only defer the failure to File.read.
    def find_check_file(name)
      check_files(name).each do |file|
        @load_paths.each do |dir|
          path = "#{dir}/#{file}"
          return path if File.file?(path)
        end
      end
      raise Error, "unable to find check: #{name} in PATH: #{@load_paths.join(":")}"
    end

    # Candidate file names, relative to a load path, for the given check name.
    # "ns:rest" may live in "ns/rest.rb" or in "ns.rb"; a plain name maps to
    # "name.rb".
    def check_files(name)
      return ["#{name}.rb"] unless name.include?(":")

      namespace, _sep, rest = name.partition(":")
      ["#{namespace}/#{rest}.rb", "#{namespace}.rb"]
    end

  end
end
@@ -0,0 +1,53 @@
1
+ require_relative 'logger'
2
+
3
module Remon
  # Pops jobs from a task queue on a fixed number of worker threads and
  # pushes whatever the jobs return onto a result queue.
  class CheckRunner
    include Logger

    # @param task_queue [#pop] source of jobs
    # @param num_workers [Integer] number of threads spawned by #start
    # @param result_queue [#<<] receives every non-nil/non-false job result
    def initialize(task_queue:, num_workers:, result_queue:)
      @task_queue = task_queue
      @num_workers = num_workers
      @result_queue = result_queue
      @workers = []
    end

    # Spawns the worker threads.
    def start
      logger.debug "starting runners"
      @num_workers.times do
        @workers.push(new_worker)
      end
    end

    # Kills every worker thread that is still alive.
    def stop
      @workers.each do |worker|
        next unless worker && worker.alive?

        worker.kill
      end
    end

    # @return [Thread] a thread that processes jobs forever
    def new_worker
      Thread.new { loop { process_job } }
    end

    # Takes one job from the task queue, runs it and forwards its result. An
    # Array result is forwarded element by element; nil/false values are
    # dropped. Any StandardError is logged and swallowed, so the calling loop
    # keeps going.
    def process_job
      job = @task_queue.pop
      logger.debug "running #{job}" if logger.debug?
      outcome = run_job(job)
      items = outcome.is_a?(Array) ? outcome : [outcome]
      items.each { |item| @result_queue << item if item }
    rescue => e
      logger.error "error while running job #{e.message}"
    end


    # Runs the job through its `run_mutex` entry point when several workers
    # are configured, and through plain `run` otherwise.
    def run_job(j)
      return j.run unless @num_workers > 1

      j.run_mutex
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require 'remon/metrics/consul'
2
+
3
# Check that emits one "consul members" event based on the nodes reported by
# Metrics::Consul#failed_nodes.
defcheck :consul do

  # Builds the metrics client for the given host and port.
  def init(host: "127.0.0.1", port: 8500)
    @consul = Metrics::Consul.new(host: host, port: port)
  end

  def run
    members_status
  end

  private

  # State is "critical" as soon as at least one failed node is reported.
  def members_status
    failed = @consul.failed_nodes
    level = failed.size > 0 ? "critical" : "ok"
    event({
      service: "consul members",
      description: description(failed),
      state: level,
      metric: metric(level)
    })
  end

  # Summary line, followed by one "key: value" line per failed node when
  # there are any.
  def description(failed_nodes)
    count = failed_nodes.size
    summary = "#{count} failed nodes"
    return summary unless count > 0

    details = failed_nodes.map { |node, status| "#{node}: #{status}" }.join("\n")
    "#{summary}\n#{details}"
  end

  # 0 for "ok", 1 for any other state.
  def metric(state)
    return 0 if state == "ok"

    1
  end

end
@@ -0,0 +1,36 @@
1
+ require 'remon/metrics/disk'
2
# Check that emits one "disk" event whose metric is the highest usage ratio
# among the entries returned by Metrics::Disk#disks_usage.
defcheck :disk do

  opts({
    warning: 85,
    critical: 95
  })

  def init
    @disk = Metrics::Disk.new
  end

  def run
    disk
  end

  private

  def disk
    usages = @disk.disks_usage
    # NOTE(review): an empty usage list makes `fullest` nil and the next line
    # raise — confirm disks_usage always returns at least one entry.
    fullest = usages.max_by { |entry| entry[:percent] }
    ratio = fullest[:percent]
    # The ratio is scaled by 100 before being handed to service_state, while
    # the event metric keeps the raw ratio.
    level = service_state(ratio * 100)
    event({
      service: "disk",
      description: description(usages),
      state: level,
      metric: ratio
    })
  end

  # One "<mount> - <percent>% <size>" line per entry.
  def description(disks)
    lines = disks.map do |i|
      "#{i[:mount]} - #{(i[:percent]*100).round 2}% #{i[:size]}"
    end
    lines.join("\n")
  end

end
@@ -0,0 +1,53 @@
1
+ require 'remon/metrics/http'
2
+
3
# Check that requests a URL through Metrics::Http and emits one event
# describing the response status and elapsed time.
defcheck :http do

  # NOTE(review): error_statuses is not referenced anywhere in this block —
  # confirm whether it is consumed elsewhere.
  opts({
    error_statuses: [502, 500, 503],
    read_timeout: 1,
    open_timeout: 1
  })

  def init(url)
    @url = url
    @http = Metrics::Http.new(url)
  end

  def run
    http_status
  end

  private

  def http_status
    elapsed, code = @http.status(read_timeout: opts[:read_timeout], open_timeout: opts[:open_timeout])
    level = state(code)
    event({
      service: "http #{@url}",
      # elapsed is multiplied by 1000 and labelled "ms"
      description: "#{code} in #{(elapsed * 1000).round(2)} ms",
      state: level,
      metric: metric(level)
    })
  end

  # Numeric value for a state string; nil for any state not listed.
  def metric(state)
    { "ok" => 0, "warning" => 0.9, "critical" => 1 }[state]
  end

  # "critical" for status 444 and for anything from 500 upwards, "ok"
  # otherwise.
  def state(status)
    return "critical" if status >= 500 || status == 444

    "ok"
  end
end
@@ -0,0 +1,26 @@
1
+ require 'remon/metrics/oom'
2
# Check that emits two events from Metrics::Oom#stats: one for the :today
# count and one for the :total count.
defcheck :oom do

  def init(log_file: "/var/log/messages")
    @oom = Metrics::Oom.new(log_file)
  end

  # @return [Array] the :today event followed by the :total event
  def run
    counts = @oom.stats
    %i[today total].map { |period| oom_event(counts[period], tag: period) }
  end

  private

  # Any positive count is reported as "warning"; the count itself is the
  # metric.
  def oom_event(count, tag:)
    level = if count > 0
              "warning"
            else
              "ok"
            end
    event({
      service: "oom log #{tag}",
      description: "#{count} times oom",
      state: level,
      metric: count
    })
  end

end
@@ -0,0 +1,23 @@
1
# NOTE(review): this check reads like a placeholder — the metric is a random
# number and several calls below rely on helpers that are not visible in this
# block. Confirm before relying on it.
defcheck :redis do

  opts({
    cpu_warn: 85,
    cpu_critical: 95
  })

  task def cpu
    # Random integer between 1 and 100, not a real measurement.
    sample = rand(1..100)
    # NOTE(review): `o` and `state` are not defined in this block — presumably
    # provided by the surrounding check class; verify.
    level = state(sample, warn: o[:cpu_warn], critical: o[:cpu_critical])
    # NOTE(review): positional `event` call with argument-less `state` and
    # `description` — differs from the hash form used right below; confirm it
    # is intended.
    event("cpu", state, sample, description)
    event({
      service: "cpu",
      description: "cpu",
      metric: sample,
      state: level
    })
  end

  # task def memory

  # end
end
@@ -0,0 +1,27 @@
1
+ require 'remon/metrics/salt'
2
+
3
# Check that emits one "salt" event derived from Metrics::Salt#status.
defcheck :salt do

  def init
    @salt = Metrics::Salt.new
  end

  def run
    drift_status
  end

  private

  # Anything other than an "ok" status state is reported as "warning" with
  # metric 1; "ok" is reported with metric 0.
  def drift_status
    report = @salt.status
    healthy = report[:state] == "ok"
    event({
      service: "salt",
      description: "#{report[:state]}: #{report[:ok]}/#{report[:total]}",
      state: healthy ? "ok" : "warning",
      metric: healthy ? 0 : 1
    })
  end

end
@@ -0,0 +1,96 @@
1
+ require 'remon/metrics/system'
2
# Check that emits cpu, iowait, load, memory and uptime events using
# Metrics::System.
defcheck "system" do

  # cpu and iowait thresholds are compared against the metric multiplied by
  # 100; load and memory thresholds are compared against the raw metric.
  opts({
    cpu_warning: 85,
    cpu_critical: 95,

    iowait_warning: 30,
    iowait_critical: 70,

    load_warning: 3,
    load_critical: 8,

    memory_warning: 0.85,
    memory_critical: 0.95
  })

  def init
    @sys = Metrics::System.new
  end

  # @return [Array] cpu/iowait events (when available) followed by the load,
  #   memory and uptime events
  def run
    [*cpu_and_iowait, loadavg, memory, uptime]
  end

  # Compares the current cpu stat sample with the one remembered from the
  # previous call. Returns nil on the first call, when there is no previous
  # sample to compare with.
  def cpu_and_iowait
    old_cpu = @old_cpu
    new_cpu = @sys.cpu_stat

    if not new_cpu
      # NOTE(review): `e` is not defined in this block and every other event
      # here is built with `event(...)` — confirm this helper exists.
      return e 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
    end
    @old_cpu = new_cpu
    return nil if not old_cpu
    used, iowait = @sys.cpu_usage(old_cpu, new_cpu)
    [cpu_event(used), iowait_event(iowait)]
  end

  def loadavg
    metric = @sys.loadavg_normalized
    event({
      service: "load",
      metric: metric,
      description: "1-minute load average/core is #{metric}",
      state: service_state("load", metric)
    })
  end

  # The description lists the first ten lines of `ps` output sorted by memory
  # percentage.
  def memory
    metric = @sys.memory
    description = "#{(metric * 100).round(2)}% used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"

    event({
      service: "memory",
      metric: metric,
      description: description,
      state: service_state("memory", metric)
    })
  end

  # Always "ok"; the metric is the uptime divided by 24 and by 3600.
  def uptime
    up_seconds = @sys.uptime
    metric = (up_seconds/24/3600).round(2)
    # The joined ip list is computed once and reused on later runs.
    @ips ||= Sysinfo.ips.join(", ")
    description = <<~HEREDOC
      ip: "#{@ips}"
      instance_type: "#{Sysinfo.instance_type}"
    HEREDOC

    event({
      service: "uptime",
      metric: metric,
      description: description,
      state: "ok"
    })
  end

  private

  # The description lists the first ten lines of `ps` output sorted by cpu
  # percentage.
  def cpu_event(metric)
    description = "#{(metric * 100).round(2)}% user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    event service: "cpu",
      description: description,
      metric: metric,
      state: service_state("cpu", metric * 100)
  end

  def iowait_event(metric)
    # Rounded to two decimals like the cpu and memory descriptions, so float
    # noise does not leak into the text.
    description = "#{(metric * 100).round(2)}% iowait"
    event service: "iowait",
      description: description,
      metric: metric,
      state: service_state("iowait", metric * 100)
  end

end