remon 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +3 -0
  4. data/FEATURES.md +39 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +41 -0
  8. data/Rakefile +27 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/dev_exe/remon +4 -0
  12. data/exe/remon +101 -0
  13. data/lib/remon.rb +11 -0
  14. data/lib/remon/check.rb +145 -0
  15. data/lib/remon/check_dsl.rb +92 -0
  16. data/lib/remon/check_runner.rb +53 -0
  17. data/lib/remon/checks/consul.rb +41 -0
  18. data/lib/remon/checks/disk.rb +36 -0
  19. data/lib/remon/checks/http.rb +53 -0
  20. data/lib/remon/checks/oom.rb +26 -0
  21. data/lib/remon/checks/redis.rb +23 -0
  22. data/lib/remon/checks/salt.rb +27 -0
  23. data/lib/remon/checks/system.rb +96 -0
  24. data/lib/remon/checks/yum.rb +30 -0
  25. data/lib/remon/config.rb +101 -0
  26. data/lib/remon/custom_logger.rb +6 -0
  27. data/lib/remon/deduped_queue.rb +38 -0
  28. data/lib/remon/error.rb +4 -0
  29. data/lib/remon/event_processor.rb +33 -0
  30. data/lib/remon/ext/num_ext.rb +23 -0
  31. data/lib/remon/helper.rb +41 -0
  32. data/lib/remon/logger.rb +17 -0
  33. data/lib/remon/metrics/consul.rb +32 -0
  34. data/lib/remon/metrics/disk.rb +24 -0
  35. data/lib/remon/metrics/http.rb +40 -0
  36. data/lib/remon/metrics/oom.rb +32 -0
  37. data/lib/remon/metrics/salt.rb +18 -0
  38. data/lib/remon/metrics/system.rb +63 -0
  39. data/lib/remon/metrics/yum.rb +20 -0
  40. data/lib/remon/proc_check.rb +26 -0
  41. data/lib/remon/scheduler.rb +106 -0
  42. data/lib/remon/scripts/salt-status +24 -0
  43. data/lib/remon/scripts/yum-status +12 -0
  44. data/lib/remon/sysinfo.rb +69 -0
  45. data/lib/remon/version.rb +3 -0
  46. data/remon.gemspec +26 -0
  47. data/test_config.rb +44 -0
  48. metadata +146 -0
@@ -0,0 +1,30 @@
1
+ require 'remon/metrics/yum'
2
+
3
# Check reporting whether yum has package updates pending.
defcheck :yum do

  # Creates the metrics source that is queried on every run.
  def init
    @yum = Metrics::Yum.new
  end

  # Entry point of the check.
  def run
    updates_available
  end

  private

  # Emits a "yum updates" event: state "ok" with metric 0 when no updates are
  # pending, state "warning" with metric 1 otherwise. Any StandardError raised
  # in the body is logged and reported through warning_event instead.
  def updates_available
    service = "yum updates"
    pending = @yum.updates_available
    if pending > 0
      state = "warning"
      metric = 1
    else
      state = "ok"
      metric = 0
    end
    event({
      service: service,
      description: "#{pending} updates available",
      state: state,
      metric: metric
    })
  rescue => e
    logger.error "#{e.class}: #{e.message}"
    warning_event service
  end
end
@@ -0,0 +1,101 @@
1
+ require 'set'
2
+ require_relative 'error'
3
+ require_relative 'ext/num_ext'
4
+ require_relative 'check_dsl'
5
+ require 'forwardable'
6
+
7
module Remon
  # Evaluates remon's Ruby configuration DSL (a single file and/or every *.rb
  # file of a directory) and exposes the collected settings through #config.
  class Config
    using NumExt

    LOAD_PATHS = ["#{__dir__}/checks"]

    extend Forwardable

    # config_file: path of one config file to evaluate (optional)
    # config_dir:  directory whose *.rb files are all evaluated (optional)
    # load_paths:  extra check directories, merged with LOAD_PATHS and handed
    #              to CheckDsl
    def initialize(config_file: nil, config_dir: nil, load_paths: [])
      @config_file = config_file
      @config_dir = config_dir
      @schedule = {}
      @scheduler_offset = 0
      @workers = 1
      @process_proc = nil
      @task_group = { interval: 0, offset: 15, randomize: false }
      load_paths = Set.new(load_paths).merge(LOAD_PATHS)
      @dsl = CheckDsl.new load_paths.to_a
    end

    # Reads the configuration on first call (later calls reuse the result) and
    # returns a hash with :schedule, :scheduler_offset, :process_proc and
    # :workers.
    def config
      @config_read ||= begin
        read_config
        true
      end
      {
        schedule: @schedule,
        scheduler_offset: @scheduler_offset,
        process_proc: @process_proc,
        workers: @workers
      }
    end

    private

    def_delegators :@dsl, :check, :defcheck, :proc_check

    # DSL: sets the host on the Remon module.
    def host(host)
      Remon.host = host
    end

    # DSL: checks scheduled inside the block belong to a task group with the
    # given interval, offset and randomize flag. The previous task group is
    # restored when the block finishes, even if it raises.
    #
    # Raises Error when offset is greater than secs.
    def every(secs, randomize: false, offset: 0, &block)
      raise Error, "offset must be less than interval" if offset > secs
      before = @task_group
      # The ensure is scoped to the block call only. A method-level ensure also
      # ran when the validation above raised, while `before` was still nil, and
      # wiped @task_group.
      begin
        @task_group = { interval: secs, offset: offset, randomize: randomize }
        yield
      ensure
        @task_group = before
      end
    end

    # DSL: stores the scheduler offset returned by #config.
    def scheduler_offset(offset)
      @scheduler_offset = offset
    end

    # DSL: stores the block returned by #config as :process_proc.
    def process_event(&block)
      @process_proc = block
    end

    # DSL: stores the worker count returned by #config.
    def workers(workers)
      @workers = workers
    end

    # DSL: adds a check to the current task group. `check` is either a Check
    # instance, used as is, or a name that is resolved through the DSL and
    # instantiated with args/kwargs. A missing :ttl defaults to three times
    # the group's interval.
    def schedule_check(check, args = [], kwargs = {})
      @schedule[@task_group] ||= Set.new
      unless check.is_a? Check
        # merge instead of assigning so the caller's hash is left untouched
        kwargs = kwargs.merge(ttl: default_ttl(@task_group[:interval])) unless kwargs[:ttl]
        klass = self.check(check)
        check = klass.new(*args, **kwargs)
      end
      @schedule[@task_group] << check
    end

    def default_ttl(interval)
      3 * interval
    end

    # Evaluates the configured file first, then every *.rb file of the
    # configured directory.
    def read_config
      read_config_file @config_file if @config_file
      if @config_dir
        Dir.glob("#{@config_dir}/*.rb").each { |f| read_config_file f }
      end
    end

    # Evaluates one config file in the context of this object.
    # Raises Error when the file is unreadable or uses an unknown option.
    def read_config_file(file)
      unless File.readable? file
        raise Error, "config #{file} not readable"
      end
      # Passing the file name makes syntax errors and backtraces point at the
      # config file instead of "(eval)".
      instance_eval(File.read(file), file)
    rescue NoMethodError => e
      raise Error, "invalid option used in config: #{e.name}"
    end

    alias_method :run_check, :schedule_check

  end
end
@@ -0,0 +1,6 @@
1
+ require 'logger'
2
+
3
module Remon
  # Placeholder class: it defines no behaviour of its own yet.
  class CustomLogger; end
end
@@ -0,0 +1,38 @@
1
+ require_relative 'logger'
2
+
3
module Remon
  # Queue wrapper that refuses a task while an equal task is still pending.
  class DedupedQueue

    include Logger

    def initialize
      @queue = Queue.new
      @mutex = Mutex.new
      @hash = {}
    end

    # Pushes task onto the queue unless it is already marked as pending.
    # Returns nil for a duplicate (logged at debug level), otherwise the
    # result of pushing onto the underlying queue.
    def enqueue(task)
      accepted = @mutex.synchronize do
        if @hash[task]
          logger.debug "duplicate item #{task}" if logger.debug?
          false
        else
          @hash[task] = true
        end
      end
      return unless accepted
      @queue << task
    end

    # Pops the next task (optionally non-blocking, as with Queue#pop) and
    # clears its pending mark so it can be enqueued again.
    def dequeue(non_block = false)
      @queue.pop(non_block).tap do |task|
        @mutex.synchronize { @hash.delete task }
      end
    end

    alias_method :<<, :enqueue
    alias_method :pop, :dequeue
    alias_method :shift, :dequeue

  end
end
@@ -0,0 +1,4 @@
1
module Remon
  # Error type raised by remon code; a plain StandardError subclass.
  class Error < StandardError; end
end
@@ -0,0 +1,33 @@
1
+ require_relative 'logger'
2
+
3
module Remon
  # Drains a queue of events on a background thread, handing each event to
  # the callable supplied at construction.
  class EventProcessor

    include Logger
    attr_reader :queue

    # pr: callable invoked with every event popped from #queue
    def initialize(pr)
      @proc = pr
      @queue = Queue.new
    end

    # Starts the processing thread unless one is already running and returns
    # it.
    def start
      @thread ||= Thread.new do
        logger.debug { "starting event processor" }
        loop { process_event }
      end
    end

    # Blocks until an event is available and processes it. A StandardError
    # raised by the callable is logged and swallowed so the loop keeps going.
    def process_event
      event = @queue.pop
      @proc.call event
    rescue => e
      logger.warn "warn error #{e.message}"
    end

    # Kills the processing thread, if any, and returns it. The reference is
    # cleared so a later #start spawns a fresh thread instead of handing back
    # the dead one.
    def stop
      return unless @thread
      thread, @thread = @thread, nil
      Thread.kill thread
    end

  end
end
@@ -0,0 +1,23 @@
1
module Remon
  # Refinement adding small duration helpers to Integer. `seconds`/`second`
  # return the receiver unchanged; `minutes`/`minute` multiply it by 60.
  module NumExt
    refine Integer do

      def seconds; self; end

      def second; self; end

      def minutes; 60 * self; end

      def minute; 60 * self; end

    end
  end
end
@@ -0,0 +1,41 @@
1
+ require_relative 'logger'
2
module Remon
  # Helpers for running external commands.
  module Helper

    include Logger

    # Runs an external command.
    #
    # command:       String or Array of arguments. A String is split on
    #                whitespace unless shell is true.
    # error_msg:     message of the Error raised on failure (a default naming
    #                the command is used when nil)
    # return_output: true returns the command's stdout as a String; false
    #                returns the result of Kernel#system, with stderr merged
    #                into stdout
    # env:           extra environment variables for the child process
    # shell:         true passes the command as one string, so a shell may
    #                interpret it
    #
    # Raises Error when the exit status is not 0.
    def cmd(command, error_msg: nil, return_output: true, env: {}, shell: false)
      if command.is_a? Array
        command_arr = command
        command_str = command.join(" ")
      else
        command_arr = command.split
        command_str = command
      end
      logger.debug command_str

      output =
        if return_output
          IO.popen(env, shell ? command_str : command_arr) { |f| f.read }
        elsif shell
          system(env, command_str, 2 => 1)
        else
          # Kernel#system reads a lone Array argument as a [command, argv0]
          # pair, so the argument list has to be splatted. Spelling the
          # program as [name, name] keeps the shell out of it even when there
          # are no further arguments.
          program = command_arr.first
          system(env, [program, program], *command_arr.drop(1), 2 => 1)
        end
      exitstatus = $?.exitstatus

      if exitstatus != 0
        error_msg ||= "non zero exit for \"#{command_str}\""
        raise Error, error_msg
      end
      return output
    end

    # Like #cmd, but never raises a StandardError: returns
    # [exit_status, output] on success and [-1, nil] when #cmd failed (the
    # failure is logged at debug level).
    def safe_cmd(*args, **kwargs)
      output = cmd(*args, **kwargs)
      return $?.exitstatus, output
    rescue => e
      logger.debug e.message
      return -1, nil
    end

  end
end
@@ -0,0 +1,17 @@
1
+ require 'logger'
2
module Remon
  # Mixin giving including classes a `logger` method backed by one shared,
  # lazily created ::Logger.
  module Logger

    class << self
      # The shared logger: writes to STDOUT at WARN level and is built on
      # first use.
      def logger
        @logger ||= ::Logger.new(STDOUT).tap { |l| l.level = ::Logger::WARN }
      end
    end

    # Instance-level access to the shared logger.
    def logger
      ::Remon::Logger.logger
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'open-uri'
2
+ require 'json'
3
+
4
module Remon
  module Metrics
    # Reads health information from a Consul agent's HTTP API.
    class Consul

      # host/port: address of the Consul HTTP API
      def initialize(host: "127.0.0.1", port: 8500)
        @host = host
        @port = port
      end

      # Returns a hash mapping node name => check output for every critical
      # check whose CheckID is "serfHealth".
      def failed_nodes
        critical_checks.each_with_object({}) do |check, nodes|
          nodes[check["Node"]] = check["Output"] if check["CheckID"] == "serfHealth"
        end
      end

      private

      def consul_url(path)
        "http://#{@host}:#{@port}#{path}"
      end

      # Fetches and parses /v1/health/state/critical.
      def critical_checks
        url = consul_url("/v1/health/state/critical")
        # Kernel#open stopped handling URLs in Ruby 3.0 (the string would be
        # opened as a local file), so read through the URI object instead.
        JSON.parse(URI.parse(url).read)
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Remon
  module Metrics
    # Disk usage as reported by `df`.
    class Disk

      # Returns an array with one hash per `df` row whose first field contains
      # a slash:
      #   :mount   - sixth field (mount point)
      #   :percent - fifth field (use%) as a fraction, rounded to 2 places
      #   :size    - second field (human readable size)
      def disks_usage
        disks = []
        # -P selects the POSIX output format, which keeps every filesystem on
        # a single line; without it df may wrap long device names onto their
        # own line and the field positions below no longer line up.
        IO.popen(['df', '-hP']) do |io|
          io.each_line do |line|
            fields = line.split(/\s+/)
            device = fields[0].to_s
            next if device == 'Filesystem'
            next unless device.include?('/') # skips rows such as tmpfs

            disks << {
              mount: fields[5],
              percent: (fields[4].to_f / 100).round(2),
              size: fields[1]
            }
          end
        end
        disks
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
+ require 'open-uri'
2
+ require 'benchmark'
3
+
4
module Remon
  module Metrics
    # Probes a URL and reports a status code together with the time taken.
    class Http

      def initialize(url)
        @uri = URI.parse(url)
      end

      # Returns [elapsed_seconds, status_code] for one request.
      def status(read_timeout: 1, open_timeout: 1)
        code = nil
        elapsed = Benchmark.realtime { code = get_status(read_timeout, open_timeout) }
        [elapsed, code]
      end

      private

      # Opens the URI without following redirects and returns the response
      # status as an Integer. Selected failures are mapped to codes:
      # EOFError => 444, connection refused => 502, open/read timeout => 504,
      # redirect => 301. Other exceptions propagate.
      def get_status(read_timeout, open_timeout)
        @uri.open(read_timeout: read_timeout, open_timeout: open_timeout, redirect: false) do |response|
          response.status.first.to_i
        end
      rescue EOFError
        444
      rescue Errno::ECONNREFUSED
        502
      rescue Net::OpenTimeout, Net::ReadTimeout
        504
      rescue OpenURI::HTTPRedirect
        301
      end

    end
  end
end
@@ -0,0 +1,32 @@
1
module Remon
  module Metrics
    # Counts kernel OOM-killer invocations recorded in a log file whose lines
    # start with a month and a day field (e.g. "Jan  5 12:00:00 ...").
    class Oom

      OOM_MARKER = 'invoked oom-killer'

      # log_file: path of the log to scan.
      # Raises Error when the file is not readable.
      def initialize(log_file)
        @log_file = log_file
        raise Error, "#{log_file} not readable" if not File.readable? log_file
      end

      # Returns { today: count_for_current_date, total: sum_of_all_counts }.
      # Both values are 0 when the log holds no OOM lines.
      def stats
        counts = oom_counts
        # Keys are month + day with no padding ("Jan5"), so the day must be
        # formatted with %-d; %e pads single-digit days with a space and
        # never matched.
        today = Time.now.strftime("%b%-d")
        { today: counts[today], total: counts.values.reduce(0, :+) }
      end

      private

      # Returns a Hash (default 0) mapping "<month><day>" to the number of OOM
      # lines in the most recent consecutive run for that date, matching what
      # `grep | awk '{print $1 $2}' | uniq -c` produced. The file is read
      # directly, so no shell is involved and the path needs no quoting.
      def oom_counts
        counts = Hash.new(0)
        previous = nil
        # binary mode: stray non-UTF-8 bytes in the log must not break parsing
        File.foreach(@log_file, mode: "rb") do |line|
          next unless line.include?(OOM_MARKER)
          month, day = line.split
          key = "#{month}#{day}"
          counts[key] = 0 unless key == previous # a new run starts its own count
          counts[key] += 1
          previous = key
        end
        counts
      end

    end
  end
end
@@ -0,0 +1,18 @@
1
module Remon
  module Metrics
    # Reports the result of the bundled salt-status script.
    class Salt

      # timeout: value passed to the script as its only argument
      def initialize(timeout: 240)
        @timeout = timeout
      end

      # Runs the script and parses its "state:ok:total" output.
      # Returns { state: String or nil, ok: Integer, total: Integer }; missing
      # fields come back as nil / 0.
      def status
        script = File.expand_path("../scripts/salt-status", __dir__)
        state, ok, total = run_script(script).chomp.split(":")
        { state: state, ok: ok.to_i, total: total.to_i }
      end

      private

      # Returns the script's stdout; its stderr is discarded. The command is
      # passed as an argv list so neither the script path nor the timeout is
      # interpreted by a shell.
      def run_script(script)
        IO.popen(["ruby", script, @timeout.to_s], err: File::NULL, &:read)
      rescue SystemCallError
        # e.g. no `ruby` on PATH: the shell version produced empty output here
        ""
      end

    end
  end
end