bosh-monitor 1.5.0.pre.1113
Sign up to get free protection for your applications and to get access to all the features.
- data/README +80 -0
- data/bin/bosh-monitor +30 -0
- data/bin/bosh-monitor-console +51 -0
- data/bin/listener +58 -0
- data/lib/bosh/monitor.rb +72 -0
- data/lib/bosh/monitor/agent.rb +51 -0
- data/lib/bosh/monitor/agent_manager.rb +295 -0
- data/lib/bosh/monitor/api_controller.rb +18 -0
- data/lib/bosh/monitor/config.rb +71 -0
- data/lib/bosh/monitor/core_ext.rb +8 -0
- data/lib/bosh/monitor/director.rb +76 -0
- data/lib/bosh/monitor/director_monitor.rb +33 -0
- data/lib/bosh/monitor/errors.rb +19 -0
- data/lib/bosh/monitor/event_processor.rb +109 -0
- data/lib/bosh/monitor/events/alert.rb +92 -0
- data/lib/bosh/monitor/events/base.rb +70 -0
- data/lib/bosh/monitor/events/heartbeat.rb +139 -0
- data/lib/bosh/monitor/metric.rb +16 -0
- data/lib/bosh/monitor/plugins/base.rb +27 -0
- data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
- data/lib/bosh/monitor/plugins/datadog.rb +78 -0
- data/lib/bosh/monitor/plugins/dummy.rb +20 -0
- data/lib/bosh/monitor/plugins/email.rb +135 -0
- data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
- data/lib/bosh/monitor/plugins/logger.rb +13 -0
- data/lib/bosh/monitor/plugins/nats.rb +43 -0
- data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
- data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
- data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
- data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
- data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
- data/lib/bosh/monitor/plugins/varz.rb +17 -0
- data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
- data/lib/bosh/monitor/runner.rb +162 -0
- data/lib/bosh/monitor/version.rb +5 -0
- data/lib/bosh/monitor/yaml_helper.rb +18 -0
- metadata +246 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module Bosh::Monitor::Plugins
  # Mixin that lets a plugin deliver HTTP requests through EM::HttpRequest.
  # The including object must respond to +logger+.
  module HttpRequestHelper
    # Sends +request+ to +uri+ using the POST verb.
    def send_http_post_request(uri, request)
      send_http_request(:post, uri, request)
    end

    # Sends +request+ to +uri+ using the PUT verb.
    def send_http_put_request(uri, request)
      send_http_request(:put, uri, request)
    end

    # Dispatches +request+ to +uri+ by invoking the EM::HttpRequest method
    # named by +method+ (e.g. :post, :put), then registers callbacks that log
    # the response status and elapsed time on success, or the error on failure.
    def send_http_request(method, uri, request)
      plugin_name = self.class.name
      logger.debug("sending HTTP #{method.to_s.upcase} to: #{uri}")
      sent_at = Time.now

      http = EM::HttpRequest.new(uri).send(method, request)

      http.callback do
        elapsed = Time.now - sent_at
        logger.debug("#{plugin_name} event sent (took #{elapsed} seconds): #{http.response_header.status}")
      end

      http.errback { |failed| logger.error("Failed to send #{plugin_name} event: #{failed.error}") }
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Plugin that publishes every processed event, serialized with +to_json+,
    # to a NATS subject.
    class Nats < Base
      # Subject used when the "subject" option is not provided.
      SUBJECT = "bosh.hm.events"

      # Options must be a Hash with a truthy "endpoint" and with "user" and
      # "password" keys present (their values may be nil).
      def validate_options
        return false unless options.kind_of?(Hash)

        options["endpoint"] &&
          options.key?("user") &&
          options.key?("password")
      end

      # Opens the NATS connection. Returns false (after logging an error) when
      # the EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("NATS delivery agent can only be started when event loop is running")
          return false
        end

        connection_settings = {
          :uri => options["endpoint"],
          :user => options["user"],
          :pass => options["password"],
          :autostart => false
        }

        @nats = NATS.connect(connection_settings) do
          logger.info("Ready to publish alerts to NATS at `#{options["endpoint"]}'")
        end
      end

      # Publishes +event+ as JSON. Returns true once the publish call has been
      # made, or false when #run has not yet set up the client.
      def process(event)
        if @nats.nil?
          @logger.error("Cannot deliver event, NATS not initialized")
          return false
        end

        @nats.publish(options["subject"] || SUBJECT, event.to_json)
        true
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Plugin that turns each processed event into a PagerDuty "trigger" event
    # posted to the generic events API.
    class Pagerduty < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      API_URI = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"

      # Logs that the plugin is running. Returns false (after logging an
      # error) when the EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("Pagerduty plugin can only be started when event loop is running")
          return false
        end

        logger.info("Pagerduty delivery agent is running...")
      end

      # Options must be a Hash whose "service_key" is a String.
      def validate_options
        options.kind_of?(Hash) &&
          options["service_key"].kind_of?(String)
      end

      # Posts +event+ to PagerDuty, using the event id as the incident key.
      # When the "http_proxy" option is set, its host and port are passed along
      # as the request's :proxy setting. Any StandardError raised while
      # building or sending the request is logged rather than propagated.
      def process(event)
        payload = {
          :service_key => options["service_key"],
          :event_type => "trigger",
          :incident_key => event.id,
          :description => event.short_description,
          :details => event.to_hash
        }

        request = {
          :body => Yajl::Encoder.encode(payload)
        }

        if options["http_proxy"]
          proxy = URI.parse(options["http_proxy"])
          request[:proxy] = { :host => proxy.host, :port => proxy.port }
        end

        send_http_post_request(API_URI, request)
      rescue => e
        logger.error("Error sending pagerduty event: #{e}")
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Wrapper around a Datadog client that appends an "@recipient" mention to the
# message text of events whose priority is "normal"; metric points are passed
# through to the wrapped client unchanged.
class PagingDatadogClient
  attr_reader :datadog_recipient

  # datadog_recipient - handle appended (as "@handle") to "normal" priority events.
  # datadog_client    - object responding to emit_points and emit_event.
  def initialize(datadog_recipient, datadog_client)
    @datadog_recipient = datadog_recipient
    @datadog_client = datadog_client
  end

  # Forwards the metric points to the wrapped client as-is.
  def emit_points(metric, points, options={})
    @datadog_client.emit_points(metric, points, options)
  end

  # Rebuilds +event+ as a new Dogapi::Event (with the recipient mention added
  # for "normal" priority) and emits it through the wrapped client.
  def emit_event(event)
    attributes = event.to_hash
    should_page = event.priority == "normal"
    text = should_page ? "#{event.msg_text} @#{@datadog_recipient}" : event.msg_text

    @datadog_client.emit_event(Dogapi::Event.new(text, attributes))
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# This health monitor plugin should be used in conjunction with another plugin that
# alerts when a VM is unresponsive, as this plugin will try to automatically fix the
# problem by recreating the VM
module Bosh::Monitor
  module Plugins
    class Resurrector < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      attr_reader :url

      # Reads the director connection settings from the 'director' option
      # ('endpoint', 'user', 'password') and builds the alert tracker from the
      # full option set.
      #
      # Raises ArgumentError when the 'director' option is missing.
      def initialize(options={})
        super(options)
        director = @options['director']
        # The comma matters: without it Ruby parses this as a call to a method
        # named ArgumentError and raises NoMethodError instead.
        raise ArgumentError, 'director options not set' unless director

        @url = URI(director['endpoint'])
        @user = director['user']
        @password = director['password']
        @processor = Bhm.event_processor
        @alert_tracker = ResurrectorHelper::AlertTracker.new(@options)
      end

      # Logs that the plugin is running. Returns false (after logging an
      # error) when the EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("Resurrector plugin can only be started when event loop is running")
          return false
        end

        logger.info("Resurrector is running...")
      end

      # Records the alert against its job instance and then either asks the
      # director to scan-and-fix that instance (PUT to
      # /deployments/<deployment>/scan_and_fix) or, when the alert tracker
      # reports a meltdown for the deployment, raises a severity 1 alert
      # instead of contacting the director.
      #
      # Alerts lacking any of the 'deployment', 'job' or 'index' attributes are
      # only logged.
      def process(alert)
        deployment = alert.attributes['deployment']
        job = alert.attributes['job']
        index = alert.attributes['index']

        # only when the agent times out do we add deployment, job & index to the alert
        # attributes, so this won't trigger a recreate for other types of alerts
        if deployment && job && index
          agent_key = ResurrectorHelper::JobInstanceKey.new(deployment, job, index)
          @alert_tracker.record(agent_key, alert.created_at)

          payload = {'jobs' => {job => [index]}}
          request = {
            head: {
              'Content-Type' => 'application/json',
              # director user/password pair passed as the authorization header value
              'authorization' => [@user, @password]
            },
            body: Yajl::Encoder.encode(payload)
          }

          # NOTE: this mutates the shared @url, so #url reflects the most
          # recently processed deployment.
          @url.path = "/deployments/#{deployment}/scan_and_fix"

          if @alert_tracker.melting_down?(deployment)
            # freak out
            ts = Time.now.to_i
            @processor.process(:alert,
                               severity: 1,
                               source: "HM plugin resurrector",
                               title: "We are in meltdown.",
                               created_at: ts)

            logger.error("(Resurrector) we are in meltdown.")
          else
            # queue instead, and only queue if it isn't already in the queue
            # what if we can't keep up with the failure rate?
            # - maybe not, maybe the meltdown detection takes care of the rate issue
            logger.warn("(Resurrector) notifying director to recreate unresponsive VM: #{deployment} #{job}/#{index}")

            send_http_put_request(url.to_s, request)
          end
        else
          logger.warn("(Resurrector) event did not have deployment, job and index: #{alert}")
        end
      end
    end
  end
end
|
82
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module Bosh::Monitor::Plugins
  module ResurrectorHelper

    # Hashable tuple of the identifying properties of a job
    class JobInstanceKey
      attr_accessor :deployment, :job, :index

      def initialize(deployment, job, index)
        @deployment = deployment
        @job = job
        @index = index
      end

      # Hashes the three fields as separate array elements. Concatenating them
      # into one string made ("a", "bc", 1) and ("ab", "c", 1) collide, and
      # also collided with the plain String "abc1".
      def hash
        [deployment.to_s, job.to_s, index.to_s].hash
      end

      # Two keys are equal when all three fields are ==. Objects of any other
      # class are never equal (rather than raising NoMethodError when a Hash
      # compares this key against an unrelated key with the same hash).
      def eql?(other)
        other.is_a?(JobInstanceKey) &&
          other.deployment == deployment &&
          other.job == job &&
          other.index == index
      end

      # Keep == in agreement with eql?/hash, since instances are used as Hash keys.
      alias_method :==, :eql?

      def to_s
        [deployment, job, index].join('/')
      end
    end

    # Service which tracks alerts and decides whether or not the cluster is melting down.
    # When the cluster is melting down, the resurrector backs off on fixing instances.
    class AlertTracker

      # Below this number of down agents we don't consider a meltdown occurring
      attr_accessor :minimum_down_jobs

      # Number of seconds at which an alert is considered "current"; alerts older than
      # this are ignored. Integer number of seconds.
      attr_accessor :time_threshold

      # Percentage of the cluster which must be down for scanning to stop. Float fraction
      # between 0 and 1.
      attr_accessor :percent_threshold

      # args - Hash of options; reads 'minimum_down_jobs' (default 5),
      #        'percent_threshold' (default 0.2) and 'time_threshold'
      #        (default 600).
      def initialize(args={})
        @agent_manager = Bhm.agent_manager
        @alert_times = {} # maps JobInstanceKey to time of last Alert
        @minimum_down_jobs = args.fetch('minimum_down_jobs', 5)
        @percent_threshold = args.fetch('percent_threshold', 0.2)
        @time_threshold = args.fetch('time_threshold', 600)
      end

      # "Melting down" means a large part of the cluster is offline and manual intervention
      # may be required to fix.
      #
      # Returns true when at least minimum_down_jobs agents of +deployment+
      # have an alert newer than time_threshold seconds AND those agents make
      # up at least percent_threshold of the deployment's agents.
      def melting_down?(deployment)
        agent_alerts = alerts_for_deployment(deployment)
        total_number_of_agents = agent_alerts.size

        # Computed once so every alert is compared against the same instant.
        cutoff = Time.now - time_threshold
        number_of_down_agents = agent_alerts.count { |_, alert_time| alert_time > cutoff }

        return false if number_of_down_agents < minimum_down_jobs

        (number_of_down_agents.to_f / total_number_of_agents) >= percent_threshold
      end

      # Remembers +alert_time+ as the time of the latest alert for +agent_key+.
      def record(agent_key, alert_time)
        @alert_times[agent_key] = alert_time
      end

      private

      # Maps the JobInstanceKey of every agent the agent manager reports for
      # +deployment+ to its last recorded alert time; agents with no recorded
      # alert get Time.at(0) so they never count as recently down.
      def alerts_for_deployment(deployment)
        agents = @agent_manager.get_agents_for_deployment(deployment)

        agents.values.each_with_object({}) do |agent, result|
          key = JobInstanceKey.new(agent.deployment, agent.job, agent.index)
          result[key] = @alert_times.fetch(key, Time.at(0))
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Plugin that forwards every metric of a processed event over a
    # Bhm::TsdbConnection.
    class Tsdb < Base

      # Options must be a Hash with truthy "host" and "port" entries.
      def validate_options
        return false unless options.kind_of?(Hash)

        options["host"] && options["port"] && true
      end

      # Opens the TSDB connection through EventMachine. Returns false (after
      # logging an error) when the reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("TSDB delivery agent can only be started when event loop is running")
          return false
        end

        tsdb_host = options["host"]
        tsdb_port = options["port"]
        @tsdb = EM.connect(tsdb_host, tsdb_port, Bhm::TsdbConnection, tsdb_host, tsdb_port)
      end

      # Sends each of the event's metrics to TSDB. Returns true afterwards, or
      # false when #run has not yet set up the connection.
      #
      # Raises PluginError when event.metrics is not Enumerable.
      def process(event)
        if @tsdb.nil?
          @logger.error("Cannot deliver event, TSDB connection is not initialized")
          return false
        end

        metrics = event.metrics

        unless metrics.kind_of?(Enumerable)
          raise PluginError, "Invalid event metrics: Enumerable expected, #{metrics.class} given"
        end

        metrics.each { |m| @tsdb.send_metric(m.name, m.timestamp, m.value, m.tags) }

        true
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Plugin that keeps, per event kind, the latest event hash seen for each
    # agent and exposes it through Bhm.set_varz.
    class Varz < Base
      def run
        logger.info("Varz plugin is running...")
      end

      # Stores the event's hash under its kind and agent id ("unknown" when the
      # event carries no "agent_id" attribute), then republishes the whole
      # per-kind table as the "last_agents_<kind>" varz entry.
      def process(event)
        kind = event.kind
        @agents ||= {}
        latest_by_agent = (@agents[kind] ||= {})

        agent_id = event.attributes["agent_id"] || "unknown"
        latest_by_agent[agent_id.to_s] = event.to_hash

        Bhm.set_varz("last_agents_" + kind.to_s, latest_by_agent)
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Bosh::Monitor
  # EventMachine connection that writes "put" commands to a TSDB server and,
  # whenever the connection is unbound, schedules a reconnect with an
  # exponentially growing delay.
  class TsdbConnection < EventMachine::Connection

    # Largest exponent used for the reconnect delay (2**9 - 1 seconds).
    BACKOFF_CEILING = 9
    # Number of consecutive failed attempts after which #unbind raises.
    MAX_RETRIES = 35

    attr_reader :retries

    def initialize(host, port)
      @host = host
      @port = port
      @logger = Bhm.logger
      reset_retries
    end

    def reset_retries
      @retries = 0
    end

    def increment_retries
      @retries += 1
    end

    # Writes one "put <name> <timestamp> <value> <k=v ...>" line; tags are
    # rendered as key=value pairs, sorted and separated by spaces.
    def send_metric(name, timestamp, value, tags = {})
      tag_list = tags.map { |pair| pair.join("=") }.sort.join(" ")
      line = "put #{name} #{timestamp} #{value} #{tag_list}\n"
      @logger.debug("[TSDB] >> #{line.chomp}")
      send_data(line)
    end

    # Called once the connection is established: clears the retry counter and
    # marks the connection as up.
    def connection_completed
      reset_retries
      @reconnecting = false
      @connected = true
      @logger.info("Connected to TSDB server at #{@host}:#{@port}")
    end

    # Called when the connection is closed or an attempt fails. Schedules a
    # reconnect after 2**min(retries, BACKOFF_CEILING) - 1 seconds.
    #
    # Raises RuntimeError once more than MAX_RETRIES attempts have failed.
    def unbind
      @logger.warn("Lost connection to TSDB server at #{@host}:#{@port}") if @connected
      @connected = false

      # Delay is computed from the retry count *before* it is incremented, so
      # the first reconnect is immediate (2**0 - 1 == 0).
      exponent = [retries, BACKOFF_CEILING].min
      retry_in = 2**exponent - 1
      increment_retries

      raise "Failed to reconnect to TSDB after #{MAX_RETRIES} retries" if retries > MAX_RETRIES

      @logger.info("Failed to reconnect to TSDB, will try again in #{retry_in} seconds...") if retries > 1

      EM.add_timer(retry_in) { tsdb_reconnect }
    end

    def tsdb_reconnect
      @logger.info("Trying to reconnect to TSDB server at #{@host}:#{@port} (#{retries})...")
      reconnect(@host, @port)
    end

    # Logs whatever the server sends back.
    def receive_data(data)
      @logger.info("[TSDB] << #{data.chomp}")
    end

  end
end
|