remon 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +3 -0
- data/FEATURES.md +39 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +41 -0
- data/Rakefile +27 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/dev_exe/remon +4 -0
- data/exe/remon +101 -0
- data/lib/remon.rb +11 -0
- data/lib/remon/check.rb +145 -0
- data/lib/remon/check_dsl.rb +92 -0
- data/lib/remon/check_runner.rb +53 -0
- data/lib/remon/checks/consul.rb +41 -0
- data/lib/remon/checks/disk.rb +36 -0
- data/lib/remon/checks/http.rb +53 -0
- data/lib/remon/checks/oom.rb +26 -0
- data/lib/remon/checks/redis.rb +23 -0
- data/lib/remon/checks/salt.rb +27 -0
- data/lib/remon/checks/system.rb +96 -0
- data/lib/remon/checks/yum.rb +30 -0
- data/lib/remon/config.rb +101 -0
- data/lib/remon/custom_logger.rb +6 -0
- data/lib/remon/deduped_queue.rb +38 -0
- data/lib/remon/error.rb +4 -0
- data/lib/remon/event_processor.rb +33 -0
- data/lib/remon/ext/num_ext.rb +23 -0
- data/lib/remon/helper.rb +41 -0
- data/lib/remon/logger.rb +17 -0
- data/lib/remon/metrics/consul.rb +32 -0
- data/lib/remon/metrics/disk.rb +24 -0
- data/lib/remon/metrics/http.rb +40 -0
- data/lib/remon/metrics/oom.rb +32 -0
- data/lib/remon/metrics/salt.rb +18 -0
- data/lib/remon/metrics/system.rb +63 -0
- data/lib/remon/metrics/yum.rb +20 -0
- data/lib/remon/proc_check.rb +26 -0
- data/lib/remon/scheduler.rb +106 -0
- data/lib/remon/scripts/salt-status +24 -0
- data/lib/remon/scripts/yum-status +12 -0
- data/lib/remon/sysinfo.rb +69 -0
- data/lib/remon/version.rb +3 -0
- data/remon.gemspec +26 -0
- data/test_config.rb +44 -0
- metadata +146 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'remon/metrics/yum'
|
2
|
+
|
3
|
+
# Check that reports whether yum has package updates waiting to be installed.
defcheck :yum do

  # Builds the yum metrics collector and keeps it in @yum.
  def init
    @yum = Metrics::Yum.new
  end

  # Entry point of the check: emits the pending-updates event.
  def run
    updates_available
  end

  private

  # Emits an "ok" event (metric 0) when no updates are pending and a
  # "warning" event (metric 1) otherwise. Any StandardError raised while
  # collecting or emitting is logged and the service name is handed to
  # warning_event instead.
  def updates_available
    service = "yum updates"
    pending = @yum.updates_available
    state, metric = pending > 0 ? ["warning", 1] : ["ok", 0]
    payload = {
      service: service,
      description: "#{pending} updates available",
      state: state,
      metric: metric
    }
    event(payload)
  rescue => err
    logger.error "#{err.class}: #{err.message}"
    warning_event service
  end
end
|
data/lib/remon/config.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'set'
|
2
|
+
require_relative 'error'
|
3
|
+
require_relative 'ext/num_ext'
|
4
|
+
require_relative 'check_dsl'
|
5
|
+
require 'forwardable'
|
6
|
+
|
7
|
+
module Remon
  # Evaluates remon configuration files (plain Ruby, run with instance_eval in
  # the context of this object) and exposes the collected settings as a hash.
  # The private methods below form the vocabulary available to config files.
  class Config
    using NumExt

    # Check directories that are always handed to CheckDsl, in addition to any
    # load paths supplied by the caller.
    LOAD_PATHS = ["#{__dir__}/checks"]

    extend Forwardable

    # config_file:: path of a single config file to evaluate (optional)
    # config_dir::  directory whose *.rb files are evaluated (optional)
    # load_paths::  extra directories merged with LOAD_PATHS for CheckDsl
    def initialize(config_file: nil, config_dir: nil, load_paths: [])
      @config_file = config_file
      @config_dir = config_dir
      @schedule = {}
      @scheduler_offset = 0
      @workers = 1
      @task_group = { interval: 0, offset: 15, randomize: false }
      load_paths = Set.new(load_paths).merge(LOAD_PATHS)
      @dsl = CheckDsl.new load_paths.to_a
    end

    # Reads the configuration on first use (later calls reuse the already
    # populated state) and returns the settings hash with the keys
    # :schedule, :scheduler_offset, :process_proc and :workers.
    # Raises Error when a config file is unreadable or uses an unknown option.
    def config
      @config_read ||= begin
        read_config
        true
      end
      {
        schedule: @schedule,
        scheduler_offset: @scheduler_offset,
        process_proc: @process_proc,
        workers: @workers
      }
    end

    private

    def_delegators :@dsl, :check, :defcheck, :proc_check

    # Sets the host name on the Remon module.
    def host(host)
      Remon.host = host
    end

    # Runs the given block with a task group of the given interval, offset and
    # randomize flag; checks scheduled inside the block are filed under that
    # group. The previous task group is restored afterwards, even on error.
    # Raises Error when offset is greater than secs.
    def every(secs, randomize: false, offset: 0, &block)
      raise Error, "offset must be less than interval" if offset > secs
      before = @task_group
      @task_group = { interval: secs, offset: offset, randomize: randomize }
      yield
    ensure
      @task_group = before
    end

    # Stores the scheduler offset reported by #config.
    def scheduler_offset(offset)
      @scheduler_offset = offset
    end

    # Stores the block reported by #config as :process_proc.
    def process_event(&block)
      @process_proc = block
    end

    # Stores the worker count reported by #config.
    def workers(workers)
      @workers = workers
    end

    # Adds a check to the set of the current task group. Anything that is not
    # already a Check is resolved to a class through the DSL's #check and
    # instantiated with args/kwargs; :ttl defaults to default_ttl(interval).
    def schedule_check(check, args = [], kwargs = {})
      @schedule[@task_group] ||= Set.new
      if not check.is_a? Check
        kwargs[:ttl] ||= default_ttl(@task_group[:interval])
        klass = self.check(check)
        check = klass.new(*args, **kwargs)
      end
      @schedule[@task_group] << check
    end

    # Default time-to-live for a check: three times its interval.
    def default_ttl(interval)
      3 * interval
    end

    # Evaluates the single config file (if any) followed by every *.rb file of
    # the config directory (if any).
    def read_config
      read_config_file @config_file if @config_file
      if @config_dir
        # Sorted so the load order is the same on every filesystem; Dir.glob
        # only guarantees a sorted result from Ruby 3.0 on.
        Dir.glob("#{@config_dir}/*.rb").sort.each { |f| read_config_file f }
      end
    end

    # Evaluates one config file in the context of this object.
    # Raises Error when the file is unreadable, or when the evaluated code
    # raises NoMethodError (reported as an invalid option).
    def read_config_file(file)
      if not File.readable? file
        raise Error, "config #{file} not readable"
      end
      # Passing the path makes syntax errors and backtraces point at the config
      # file and line instead of "(eval)".
      instance_eval(File.read(file), file)
    rescue NoMethodError => e
      raise Error, "invalid option used in config: #{e.name}"
    end

    alias_method :run_check, :schedule_check

  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require_relative 'logger'
|
2
|
+
|
3
|
+
module Remon
  # Queue wrapper that drops a task while an equal task is still registered as
  # pending. Pending tasks are tracked in a hash guarded by a Mutex; the tasks
  # themselves travel through a ::Queue.
  class DedupedQueue

    include Logger

    def initialize
      @queue = Queue.new
      @mutex = Mutex.new
      @hash = {}
    end

    # Registers the task as pending and pushes it onto the queue. Returns nil
    # without pushing when the task is already pending.
    def enqueue(task)
      accepted = @mutex.synchronize do
        if @hash[task]
          logger.debug "duplicate item #{task}" if logger.debug?
          false
        else
          @hash[task] = true
        end
      end
      return unless accepted

      @queue << task
    end

    # Pops the next task (non_block is forwarded to Queue#pop), removes it
    # from the pending set and returns it.
    def dequeue(non_block = false)
      @queue.pop(non_block).tap do |task|
        @mutex.synchronize { @hash.delete task }
      end
    end

    alias_method :'<<', :enqueue
    alias_method :pop, :dequeue
    alias_method :shift, :dequeue

  end
end
|
data/lib/remon/error.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require_relative 'logger'
|
2
|
+
|
3
|
+
module Remon
  # Consumes events from a queue on a background thread and hands each one to
  # the proc supplied at construction time.
  class EventProcessor

    include Logger
    attr_reader :queue

    # pr:: callable invoked with every event popped from #queue
    def initialize(pr)
      @proc = pr
      @queue = Queue.new
    end

    # Starts the processing thread unless one is already registered and
    # returns it.
    def start
      @thread ||= Thread.new do
        logger.debug { "starting event processor" }
        loop { process_event }
      end
    end

    # Pops one event (blocking until one is available) and passes it to the
    # proc. StandardErrors are logged as warnings so the loop keeps running.
    def process_event
      event = @queue.pop
      @proc.call event
    rescue => e
      logger.warn "warn error #{e.message}"
    end

    # Kills the processing thread, if any, and returns it (nil otherwise).
    # The reference is cleared so a later #start spawns a fresh thread;
    # previously the dead thread stayed in @thread and #start did nothing.
    def stop
      thread, @thread = @thread, nil
      Thread.kill thread if thread
    end

  end
end
|
data/lib/remon/helper.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'logger'
|
2
|
+
module Remon
  # Helpers for running external commands.
  module Helper

    include Logger

    # Runs an external command and raises Error when it exits non-zero.
    #
    # command::       String (split on whitespace unless shell is true) or
    #                 Array of program name and arguments
    # error_msg::     message of the raised Error (a default naming the
    #                 command is used when nil)
    # return_output:: true  -> run through IO.popen and return its stdout;
    #                 false -> run through Kernel#system with stderr merged
    #                 into stdout and return system's result
    # env::           environment variables passed to the child process
    # shell::         true -> pass the command as one string (a shell may
    #                 interpret it); false -> pass program and arguments
    #                 separately
    def cmd(command, error_msg: nil, return_output: true, env: {}, shell: false)
      if command.is_a? Array
        command_arr = command
        command_str = command.join(" ")
      else
        command_arr = command.split
        command_str = command
      end
      logger.debug command_str

      run_command = shell ? command_str : command_arr
      output = if return_output
        IO.popen(env, run_command) { |f| f.read }
      elsif shell
        system(env, command_str, 2 => 1)
      else
        # Kernel#system treats a single Array argument as [cmdname, argv0], so
        # the argv array must be spread out. The [program, program] form keeps
        # the call shell-free even when there are no arguments.
        program, *arguments = command_arr
        system(env, [program, program], *arguments, 2 => 1)
      end
      exitstatus = $?.exitstatus

      if exitstatus != 0
        error_msg ||= "non zero exit for \"#{command_str}\""
        raise Error, error_msg
      end
      return output
    end

    # Same as #cmd but never raises StandardError: returns [exitstatus, output]
    # on success and [-1, nil] when #cmd raised (the error is logged at debug
    # level).
    def safe_cmd(*args, **kwargs)
      output = cmd(*args, **kwargs)
      return $?.exitstatus, output
    rescue => e
      logger.debug e.message
      return -1, nil
    end

  end
end
|
data/lib/remon/logger.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module Remon
  module Metrics
    # Queries the HTTP API of a Consul agent.
    class Consul

      # host:: address of the Consul agent
      # port:: HTTP port of the Consul agent
      def initialize(host: "127.0.0.1", port: 8500)
        @host = host
        @port = port
      end

      # Returns a hash mapping node name => check output for every critical
      # check whose CheckID is "serfHealth".
      def failed_nodes
        critical_checks.each_with_object({}) do |check, nodes|
          nodes[check["Node"]] = check["Output"] if check["CheckID"] == "serfHealth"
        end
      end

      private

      # Builds the full URL of an API path on the configured agent.
      def consul_url(path)
        "http://#{@host}:#{@port}#{path}"
      end

      # Fetches and parses the list of checks in critical state.
      def critical_checks
        url = consul_url("/v1/health/state/critical")
        # Kernel#open no longer accepts URLs on Ruby 3.0+; reading through the
        # URI object works on old and new Rubies and closes the connection.
        JSON.parse(URI.parse(url).read)
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Remon
  module Metrics
    # Reports disk usage by parsing the output of `df`.
    class Disk

      # Returns an array with one hash per listed filesystem whose device
      # field contains a slash:
      #   { mount: <mount point>, percent: <used fraction, 0.0..1.0>, size: <size string> }
      def disks_usage
        disks = []
        # -P keeps every filesystem on a single line; without it older df
        # versions wrap long device names onto a second line. -h comes last so
        # implementations where the last unit flag wins still print
        # human-readable sizes.
        IO.popen(['df', '-P', '-h']) do |io|
          io.each_line do |line|
            info = parse_df_line(line)
            disks << info if info
          end
        end
        disks
      end

      private

      # Turns one line of df output into a usage hash, or returns nil for the
      # header, for devices without a slash and for incomplete lines.
      def parse_df_line(line)
        fields = line.split(/\s+/)
        return if fields[0] == 'Filesystem'
        return unless fields[0] =~ /\// # device field needs at least one slash
        return if fields.size < 6       # not a complete df record

        {
          # A mount point containing spaces is spread over several fields.
          mount: fields[5..-1].join(' '),
          percent: (fields[4].to_f / 100).round(2),
          size: fields[1]
        }
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'benchmark'
|
3
|
+
|
4
|
+
module Remon
  module Metrics
    # Measures the HTTP status code and response time of a URL.
    class Http

      # url:: address to probe; parsed once with URI.parse
      def initialize(url)
        @uri = URI.parse(url)
      end

      # Requests the URL and returns [elapsed_seconds, status_code].
      #
      # read_timeout:: seconds, forwarded to open-uri
      # open_timeout:: seconds, forwarded to open-uri
      def status(read_timeout: 1, open_timeout: 1)
        code = nil
        elapsed = Benchmark.realtime { code = get_status(read_timeout, open_timeout) }
        [elapsed, code]
      end

      private

      # Opens the URL without following redirects and returns the numeric
      # status code. Some failures are translated into status codes:
      # EOFError -> 444, connection refused -> 502, open/read timeout -> 504,
      # redirect -> 301. Other errors propagate to the caller.
      def get_status(read_timeout, open_timeout)
        @uri.open(read_timeout: read_timeout, open_timeout: open_timeout, redirect: false) do |response|
          response.status[0].to_i
        end
      rescue EOFError
        444
      rescue Errno::ECONNREFUSED
        502
      rescue Net::OpenTimeout, Net::ReadTimeout
        504
      rescue OpenURI::HTTPRedirect
        301
      end

    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Remon
  module Metrics
    # Counts "invoked oom-killer" entries in a log file.
    class Oom

      # log_file:: path of the log to scan
      # Raises Error when the file is not readable.
      def initialize(log_file)
        @log_file = log_file
        raise Error, "#{log_file} not readable" if not File.readable? log_file
      end

      # Returns { today: <entries dated today>, total: <all entries> }.
      # Both values are integers; they are 0 when nothing matched.
      def stats
        counts = oom_counts
        # An explicit 0 seed keeps the total an Integer when there are no
        # matches (reduce without a seed returns nil for an empty list).
        total_count = counts.values.reduce(0, :+)
        # %-e is the unpadded day of month, matching the keys built in
        # oom_counts ("Jan5"); %e would give "Jan 5" on days 1-9.
        todays_count = counts[Time.now.strftime("%b%-e")]
        {today: todays_count, total: total_count}
      end

      private

      # Returns a hash (default 0) mapping the first two whitespace-separated
      # fields of each matching line, joined without a separator, to the
      # number of matching lines. The file is scanned in Ruby, so the path is
      # never handed to a shell.
      def oom_counts
        counts = Hash.new(0)
        # Binary mode: stray non-UTF-8 bytes in a log must not abort the scan.
        File.open(@log_file, 'rb') do |f|
          f.each_line do |line|
            next unless line.include?('invoked oom-killer')
            first, second = line.split
            counts["#{first}#{second}"] += 1
          end
        end
        counts
      end

    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Remon
  module Metrics
    # Collects salt status by running the bundled salt-status script.
    class Salt

      # timeout:: value passed to the script as its only argument
      def initialize(timeout: 240)
        @timeout = timeout
      end

      # Runs the script (stderr discarded) and parses its colon-separated
      # output into { state:, ok:, total: }. The first field is kept as a
      # string (nil when absent); the second and third are converted with
      # to_i, so missing fields become 0.
      def status
        script = File.expand_path("#{__dir__}/../scripts/salt-status")
        raw = `ruby #{script} #{@timeout} 2>/dev/null`
        state, ok, total = raw.chomp.split(":")
        { state: state, ok: ok.to_i, total: total.to_i }
      end

    end
  end
end
|