remon 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +3 -0
  4. data/FEATURES.md +39 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +27 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/dev_exe/remon +4 -0
  12. data/exe/remon +101 -0
  13. data/lib/remon.rb +11 -0
  14. data/lib/remon/check.rb +145 -0
  15. data/lib/remon/check_dsl.rb +92 -0
  16. data/lib/remon/check_runner.rb +53 -0
  17. data/lib/remon/checks/consul.rb +41 -0
  18. data/lib/remon/checks/disk.rb +36 -0
  19. data/lib/remon/checks/http.rb +53 -0
  20. data/lib/remon/checks/oom.rb +26 -0
  21. data/lib/remon/checks/redis.rb +23 -0
  22. data/lib/remon/checks/salt.rb +27 -0
  23. data/lib/remon/checks/system.rb +96 -0
  24. data/lib/remon/checks/yum.rb +30 -0
  25. data/lib/remon/config.rb +101 -0
  26. data/lib/remon/custom_logger.rb +6 -0
  27. data/lib/remon/deduped_queue.rb +38 -0
  28. data/lib/remon/error.rb +4 -0
  29. data/lib/remon/event_processor.rb +33 -0
  30. data/lib/remon/ext/num_ext.rb +23 -0
  31. data/lib/remon/helper.rb +41 -0
  32. data/lib/remon/logger.rb +17 -0
  33. data/lib/remon/metrics/consul.rb +32 -0
  34. data/lib/remon/metrics/disk.rb +24 -0
  35. data/lib/remon/metrics/http.rb +40 -0
  36. data/lib/remon/metrics/oom.rb +32 -0
  37. data/lib/remon/metrics/salt.rb +18 -0
  38. data/lib/remon/metrics/system.rb +63 -0
  39. data/lib/remon/metrics/yum.rb +20 -0
  40. data/lib/remon/proc_check.rb +26 -0
  41. data/lib/remon/scheduler.rb +106 -0
  42. data/lib/remon/scripts/salt-status +24 -0
  43. data/lib/remon/scripts/yum-status +12 -0
  44. data/lib/remon/sysinfo.rb +69 -0
  45. data/lib/remon/version.rb +3 -0
  46. data/remon.gemspec +26 -0
  47. data/test_config.rb +44 -0
  48. metadata +146 -0
@@ -0,0 +1,92 @@
1
+ require_relative 'error'
2
+ require_relative 'check'
3
+ require_relative 'proc_check'
4
+ require 'set'
5
+
6
module Remon
  # Evaluation context for check definition files.
  #
  # Check files are read and instance_eval'd against an instance of this class,
  # so the public methods here (`defcheck`, `check`, `proc_check`) are the
  # vocabulary available inside those files. Defined checks are kept in an
  # in-memory registry keyed by name; unknown names are resolved lazily by
  # searching the configured load paths.
  class CheckDsl

    # Characters permitted in a check name (":" separates namespace from name).
    VALID_NAME = /\A[a-zA-Z0-9_:]+\z/

    # @param load_paths [Array<String>] directories searched, in order, for check files
    def initialize(load_paths = [])
      @load_paths = load_paths
      @checks = {}
    end

    # Defines a new Check subclass whose body is the given block.
    #
    # Without a name the anonymous class is returned and NOT registered.
    # With a name the class is registered so that `check(name)` can find it.
    #
    # @param name [String, Symbol, nil] check name, at most one ":" namespace separator
    # @return [Class] the newly created Check subclass
    # @raise [Error] if the name is invalid or already registered
    def defcheck(name = nil, &block)
      return define_klass(&block) if not name

      name = name.to_s
      validate_name(name)
      raise Error, "check #{name} already defined" if @checks[name]

      klass = define_klass(&block)
      klass.name = name
      @checks[name] = klass
    end

    # Looks up a registered check, loading its definition file on first use.
    #
    # @param name [String, Symbol]
    # @return [Class] the Check subclass registered under +name+
    # @raise [Error] if the name is invalid or no file defines the check
    def check(name)
      name = name.to_s
      validate_name(name)
      @checks[name] || load_check(name)
    end

    # Wraps a block in a ProcCheck.
    #
    # @param name [String, Symbol, nil] passed through to ProcCheck.new
    # @return [ProcCheck]
    def proc_check(name = nil, &block)
      ProcCheck.new(name, block)
    end

    private

    # @raise [Error] when +name+ has illegal characters or more than one ":"
    def validate_name(name)
      unless name =~ VALID_NAME
        raise Error, "only alphanumeric, _, : characters allowed for check name"
      end
      if name.count(":") > 1
        raise Error, "nested namespacing not allowed in check names"
      end
    end

    # Builds an anonymous Check subclass, evaluating +block+ as its class body.
    def define_klass(&block)
      Class.new(Check, &block)
    end

    # Evaluates the file against this DSL instance; the path is passed along so
    # backtraces point at the check file.
    def load_file(f)
      instance_eval File.read(f), f
    end

    # Loads the file expected to define +name+ and returns the registered class.
    #
    # @raise [Error] if no file is found, or the file did not define +name+
    def load_check(name)
      file = find_check_file(name)
      load_file file
      @checks.fetch(name) do
        raise Error, "unable to find check: #{name} in #{file}"
      end
    end

    # Returns the first existing "<load_path>/<candidate>" path.
    #
    # Candidates are tried in the order given by `check_files`; for each
    # candidate every load path is probed in order. The probe is a literal
    # `File.file?` test, so load paths containing glob metacharacters work and
    # a directory that happens to be named like the check file is skipped.
    #
    # @raise [Error] when no candidate exists in any load path
    def find_check_file(name)
      check_files(name).each do |file|
        @load_paths.each do |dir|
          path = "#{dir}/#{file}"
          return path if File.file?(path)
        end
      end
      raise Error, "unable to find check: #{name} in PATH: #{@load_paths.join(":")}"
    end

    # Candidate file names (relative to a load path) for a check name.
    # "ns:foo" may live in "ns/foo.rb" or in the shared "ns.rb"; "foo" only in "foo.rb".
    def check_files(name)
      return ["#{name}.rb"] unless name.include?(":")

      namespace, _, rest = name.partition(":")
      ["#{namespace}/#{rest}.rb", "#{namespace}.rb"]
    end

  end
end
@@ -0,0 +1,53 @@
1
+ require_relative 'logger'
2
+
3
module Remon
  # Runs queued jobs on a fixed number of worker threads.
  #
  # Each worker repeatedly pops a job from the task queue, runs it, and pushes
  # every truthy result onto the result queue.
  class CheckRunner
    include Logger

    # @param task_queue   source of jobs; must respond to `pop`
    # @param num_workers  [Integer] number of worker threads started by `start`
    # @param result_queue sink for job results; must respond to `<<`
    def initialize(task_queue:, num_workers:, result_queue:)
      @workers = []
      @task_queue = task_queue
      @result_queue = result_queue
      @num_workers = num_workers
    end

    # Spawns `num_workers` worker threads and remembers them for `stop`.
    def start
      logger.debug "starting runners"
      @num_workers.times do
        @workers << new_worker
      end
    end

    # Kills every worker thread that is still alive.
    def stop
      @workers.each do |worker|
        next unless worker && worker.alive?
        Thread.kill worker
      end
    end

    # @return [Thread] a thread that processes jobs forever
    def new_worker
      Thread.new { loop { process_job } }
    end

    # Pops one job, runs it and forwards its result(s).
    #
    # An Array result is forwarded element by element; nil/false results are
    # dropped. Any StandardError raised along the way is logged, not re-raised,
    # so the calling loop keeps going.
    def process_job
      job = @task_queue.pop
      logger.debug "running #{job}" if logger.debug?
      outcome = run_job(job)
      if Array === outcome
        outcome.each { |item| @result_queue << item if item }
      elsif outcome
        @result_queue << outcome
      end
    rescue => err
      logger.error "error while running job #{err.message}"
    end


    # Runs the job via `run_mutex` when more than one worker is configured,
    # and via plain `run` otherwise.
    def run_job(j)
      return j.run unless @num_workers > 1
      j.run_mutex
    end

  end
end
@@ -0,0 +1,41 @@
1
+ require 'remon/metrics/consul'
2
+
3
# Check that reports whether consul sees any failed cluster members.
defcheck :consul do

  # @param host [String] consul agent host
  # @param port [Integer] consul agent port
  def init(host: "127.0.0.1", port: 8500)
    @consul = Metrics::Consul.new(host: host, port: port)
  end

  # @return the "consul members" event
  def run
    members_status
  end

  private

  # Emits a single event: "critical" when at least one node has failed, "ok" otherwise.
  def members_status
    failed = @consul.failed_nodes
    state =
      if failed.size > 0
        "critical"
      else
        "ok"
      end
    text = description(failed)
    event({
      service: "consul members",
      description: text,
      state: state,
      metric: metric(state)
    })
  end

  # Summary line, followed by one "name: value" line per failed node when there are any.
  def description(failed_nodes)
    count = failed_nodes.size
    summary = "#{count} failed nodes"
    return summary unless count > 0

    details = failed_nodes.map { |node, status| "#{node}: #{status}" }.join("\n")
    "#{summary}\n#{details}"
  end

  # 0 for "ok", 1 for anything else.
  def metric(state)
    return 0 if state == "ok"
    1
  end

end
@@ -0,0 +1,36 @@
1
+ require 'remon/metrics/disk'
2
# Check that reports the usage of the fullest disk, listing every disk in the description.
defcheck :disk do

  # Thresholds handed to service_state; the metric is scaled by 100 before
  # comparison, so these read as percentages.
  opts({
    warning: 85,
    critical: 95
  })

  def init
    @disk = Metrics::Disk.new
  end

  # @return the "disk" event
  def run
    disk
  end

  private

  # Picks the disk with the highest :percent and reports it as the metric.
  def disk
    usage = @disk.disks_usage
    fullest = usage.max_by { |entry| entry[:percent] }
    percent = fullest[:percent]
    level = service_state(percent * 100)
    event({
      service: "disk",
      description: description(usage),
      state: level,
      metric: percent
    })
  end

  # One "mount - NN.NN% size" line per disk.
  def description(disks)
    lines = disks.map do |d|
      used = (d[:percent] * 100).round(2)
      "#{d[:mount]} - #{used}% #{d[:size]}"
    end
    lines.join("\n")
  end

end
@@ -0,0 +1,53 @@
1
+ require 'remon/metrics/http'
2
+
3
# Check that requests a URL and maps the response status to a service state.
defcheck :http do

  # NOTE(review): error_statuses is declared here but not read anywhere in this
  # block; the state is derived from hard-coded comparisons below.
  opts({
    error_statuses: [502, 500, 503],
    read_timeout: 1,
    open_timeout: 1
  })

  # @param url [String] URL to probe
  def init(url)
    @url = url
    @http = Metrics::Http.new(url)
  end

  # @return the "http <url>" event
  def run
    http_status
  end

  private

  # Performs the request with the configured timeouts and emits one event
  # describing the status and the elapsed time in milliseconds.
  def http_status
    elapsed, code = @http.status(read_timeout: opts[:read_timeout], open_timeout: opts[:open_timeout])
    level = state(code)
    event({
      service: "http #{@url}",
      description: "#{code} in #{(elapsed * 1000).round(2)} ms",
      state: level,
      metric: metric(level)
    })
  end

  # Numeric metric for a state name; nil for an unrecognised state.
  def metric(state)
    { "ok" => 0, "warning" => 0.9, "critical" => 1 }[state]
  end

  # "critical" for any status >= 500 and for 444, "ok" otherwise.
  def state(status)
    return "critical" if status >= 500 || status == 444
    "ok"
  end
end
@@ -0,0 +1,26 @@
1
+ require 'remon/metrics/oom'
2
# Check that reports out-of-memory counts taken from a log file.
defcheck :oom do

  # @param log_file [String] log file handed to Metrics::Oom
  def init(log_file: "/var/log/messages")
    @oom = Metrics::Oom.new(log_file)
  end

  # @return [Array] two events, one for the :today count and one for the :total count
  def run
    stats = @oom.stats
    %i[today total].map { |tag| oom_event(stats[tag], tag: tag) }
  end

  private

  # Builds one event; any positive count is a "warning".
  def oom_event(count, tag:)
    state =
      if count > 0
        "warning"
      else
        "ok"
      end
    event({
      service: "oom log #{tag}",
      description: "#{count} times oom",
      state: state,
      metric: count
    })
  end

end
@@ -0,0 +1,23 @@
1
# Placeholder redis check: the "cpu" metric is a random number, not a real
# measurement.
#
# NOTE(review): this block relies on `task`, `o` and a three-argument `state`
# helper, none of which are defined in the code visible here, and it has no
# `run` method unlike the other checks — confirm against Check before relying
# on it.
defcheck :redis do

  opts({
    cpu_warn: 85,
    cpu_critical: 95
  })

  # Emits a single "cpu" event for a random metric in 1..100.
  task def cpu
    metric = rand(1..100)
    s = state(metric, warn: o[:cpu_warn], critical: o[:cpu_critical])
    # A positional `event "cpu", state, metric, description` call used to sit
    # here. It called `state` without the metric and referenced `description`,
    # which is not defined in this block, and it duplicated the event below.
    event({
      service: "cpu",
      description: "cpu",
      metric: metric,
      state: s
    })
  end

  # task def memory

  # end
end
@@ -0,0 +1,27 @@
1
+ require 'remon/metrics/salt'
2
+
3
# Check that reports the salt status as a single "salt" event.
defcheck :salt do

  def init
    @salt = Metrics::Salt.new
  end

  # @return the "salt" event
  def run
    drift_status
  end

  private

  # Anything other than an "ok" salt state is reported as a "warning" with
  # metric 1; "ok" is reported with metric 0.
  def drift_status
    report = @salt.status
    healthy = report[:state] == "ok"
    level = healthy ? "ok" : "warning"
    value = healthy ? 0 : 1
    event({
      service: "salt",
      description: "#{report[:state]}: #{report[:ok]}/#{report[:total]}",
      state: level,
      metric: value
    })
  end

end
@@ -0,0 +1,96 @@
1
+ require 'remon/metrics/system'
2
# Host-level check: cpu, iowait, load average, memory and uptime.
defcheck "system" do

  # Thresholds looked up by service_state under the "<service>_warning" /
  # "<service>_critical" keys. cpu and iowait metrics are scaled by 100 before
  # comparison; load and memory are compared as-is.
  opts ({
    cpu_warning: 85,
    cpu_critical: 95,

    iowait_warning: 30,
    iowait_critical: 70,

    load_warning: 3,
    load_critical: 8,

    memory_warning: 0.85,
    memory_critical: 0.95
  })

  def init
    @sys = Metrics::System.new
  end

  # @return [Array] all events for this run; cpu/iowait events are absent on
  #   the first run because there is no previous sample to diff against
  def run
    [*cpu_and_iowait, loadavg, memory, uptime]
  end

  # Computes cpu and iowait usage from the previous and the current cpu sample.
  #
  # @return [Array, nil] [cpu_event, iowait_event]; a one-element array holding
  #   an :unknown event when no cpu sample is available; nil on the first call
  def cpu_and_iowait
    old_cpu = @old_cpu
    new_cpu = @sys.cpu_stat

    if not new_cpu
      # Wrapped in an array because `run` splats this method's result: a bare
      # event that responds to to_a (e.g. a Hash) would be exploded into its
      # members instead of being passed through as one event.
      return [e('cpu', :unknown, nil, "/proc/stat doesn't include a CPU line")]
    end
    @old_cpu = new_cpu
    return nil if not old_cpu
    used, iowait = @sys.cpu_usage(old_cpu, new_cpu)
    [cpu_event(used), iowait_event(iowait)]
  end

  # Event for the normalized 1-minute load average.
  def loadavg
    metric = @sys.loadavg_normalized
    event({
      service: "load",
      metric: metric,
      description: "1-minute load average/core is #{metric}",
      state: service_state("load", metric)
    })
  end

  # Event for memory usage; the description lists the top 10 processes by %mem.
  def memory
    metric = @sys.memory
    description = "#{(metric * 100).round(2)}% used\n\n#{`ps -eo pmem,pid,comm | sort -nrb -k1 | head -10`.chomp}"

    event({
      service: "memory",
      metric: metric,
      description: description,
      state: service_state("memory", metric)
    })
  end

  # Event whose metric is the uptime in days, rounded to 2 decimals.
  def uptime
    up_seconds = @sys.uptime
    # to_f keeps the fractional days even if the uptime is reported as an
    # Integer; integer division would truncate to whole days.
    metric = (up_seconds.to_f / 24 / 3600).round(2)
    @ips ||= Sysinfo.ips.join(", ")
    description = <<~HEREDOC
      ip: "#{@ips}"
      instance_type: "#{Sysinfo.instance_type}"
    HEREDOC

    event({
      service: "uptime",
      metric: metric,
      description: description,
      state: "ok"
    })
  end

  private

  # Event for cpu usage; the description lists the top 10 processes by %cpu.
  def cpu_event(metric)
    description = "#{(metric * 100).round(2)}% user+nice+system\n\n#{`ps -eo pcpu,pid,comm | sort -nrb -k1 | head -10`.chomp}"
    event service: "cpu",
      description: description,
      metric: metric,
      state: service_state("cpu", metric * 100)
  end

  # Event for iowait; percentage rounded to 2 decimals like the cpu and memory
  # descriptions.
  def iowait_event(metric)
    description = "#{(metric * 100).round(2)}% iowait"
    event service: "iowait",
      description: description,
      metric: metric,
      state: service_state("iowait", metric * 100)
  end

end