bosh-monitor 1.5.0.pre.1113
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +80 -0
- data/bin/bosh-monitor +30 -0
- data/bin/bosh-monitor-console +51 -0
- data/bin/listener +58 -0
- data/lib/bosh/monitor.rb +72 -0
- data/lib/bosh/monitor/agent.rb +51 -0
- data/lib/bosh/monitor/agent_manager.rb +295 -0
- data/lib/bosh/monitor/api_controller.rb +18 -0
- data/lib/bosh/monitor/config.rb +71 -0
- data/lib/bosh/monitor/core_ext.rb +8 -0
- data/lib/bosh/monitor/director.rb +76 -0
- data/lib/bosh/monitor/director_monitor.rb +33 -0
- data/lib/bosh/monitor/errors.rb +19 -0
- data/lib/bosh/monitor/event_processor.rb +109 -0
- data/lib/bosh/monitor/events/alert.rb +92 -0
- data/lib/bosh/monitor/events/base.rb +70 -0
- data/lib/bosh/monitor/events/heartbeat.rb +139 -0
- data/lib/bosh/monitor/metric.rb +16 -0
- data/lib/bosh/monitor/plugins/base.rb +27 -0
- data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
- data/lib/bosh/monitor/plugins/datadog.rb +78 -0
- data/lib/bosh/monitor/plugins/dummy.rb +20 -0
- data/lib/bosh/monitor/plugins/email.rb +135 -0
- data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
- data/lib/bosh/monitor/plugins/logger.rb +13 -0
- data/lib/bosh/monitor/plugins/nats.rb +43 -0
- data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
- data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
- data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
- data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
- data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
- data/lib/bosh/monitor/plugins/varz.rb +17 -0
- data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
- data/lib/bosh/monitor/runner.rb +162 -0
- data/lib/bosh/monitor/version.rb +5 -0
- data/lib/bosh/monitor/yaml_helper.rb +18 -0
- metadata +246 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module Bosh::Monitor::Plugins
  # Shared mixin for plugins that deliver events over HTTP using
  # EventMachine's asynchronous HTTP client. Delivery is best-effort:
  # success and failure are only logged, never raised to the caller.
  module HttpRequestHelper
    # Fire-and-forget HTTP POST of +request+ options to +uri+.
    def send_http_post_request(uri, request)
      send_http_request(:post, uri, request)
    end

    # Fire-and-forget HTTP PUT of +request+ options to +uri+.
    def send_http_put_request(uri, request)
      send_http_request(:put, uri, request)
    end

    # Issues the asynchronous request and wires up logging callbacks.
    # +method+ is :post or :put; +request+ is the em-http options hash.
    def send_http_request(method, uri, request)
      plugin_name = self.class.name
      logger.debug("sending HTTP #{method.to_s.upcase} to: #{uri}")
      started_at = Time.now

      connection = EM::HttpRequest.new(uri).send(method, request)

      connection.callback do
        status = connection.response_header.status
        logger.debug("#{plugin_name} event sent (took #{Time.now - started_at} seconds): #{status}")
      end

      connection.errback do |error|
        logger.error("Failed to send #{plugin_name} event: #{error.error}")
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Publishes monitor events to a NATS message bus as JSON.
    class Nats < Base
      # Default subject used when options do not provide one.
      SUBJECT = "bosh.hm.events"

      # Requires an "endpoint" plus explicitly-present "user" and
      # "password" keys (their values may be nil).
      def validate_options
        options.kind_of?(Hash) &&
          options["endpoint"] &&
          options.has_key?("user") &&
          options.has_key?("password")
      end

      # Connects to NATS; the client is EventMachine-based, so the reactor
      # must already be running. Returns false when it is not.
      def run
        unless EM.reactor_running?
          logger.error("NATS delivery agent can only be started when event loop is running")
          return false
        end

        nats_client_options = {
          :uri => options["endpoint"],
          :user => options["user"],
          :pass => options["password"],
          :autostart => false
        }

        @nats = NATS.connect(nats_client_options) do
          logger.info("Ready to publish alerts to NATS at `#{options["endpoint"]}'")
        end
      end

      # Publishes the event; returns false when the connection was never
      # established (run skipped or failed), true otherwise.
      def process(event)
        if @nats.nil?
          # Consistency fix: was `@logger.error(...)`, but @logger is never
          # assigned in this class — every other method uses the `logger`
          # accessor, so use it here as well.
          logger.error("Cannot deliver event, NATS not initialized")
          return false
        end

        nats_subject = options["subject"] || SUBJECT
        @nats.publish(nats_subject, event.to_json)
        true
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Delivers alerts to PagerDuty through its generic events API,
    # triggering one incident per processed event.
    class Pagerduty < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      API_URI = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"

      # Requires a running EventMachine reactor (HTTP delivery is async).
      def run
        unless EM.reactor_running?
          logger.error("Pagerduty plugin can only be started when event loop is running")
          return false
        end

        logger.info("Pagerduty delivery agent is running...")
      end

      # A String "service_key" is mandatory.
      def validate_options
        options.kind_of?(Hash) &&
          options["service_key"].kind_of?(String)
      end

      # Triggers a PagerDuty incident keyed by the event id. Any error is
      # logged and swallowed so one bad event cannot kill the plugin.
      # (Also removed an unused `started = Time.now` local; the HTTP helper
      # does its own timing/logging.)
      def process(event)
        payload = {
          :service_key => options["service_key"],
          :event_type => "trigger",
          :incident_key => event.id,
          :description => event.short_description,
          :details => event.to_hash
        }

        request = {
          :body => Yajl::Encoder.encode(payload)
        }

        # Optional outbound HTTP proxy support.
        if options["http_proxy"]
          proxy = URI.parse(options["http_proxy"])
          request[:proxy] = { :host => proxy.host, :port => proxy.port }
        end

        send_http_post_request(API_URI, request)
      rescue => e
        logger.error("Error sending pagerduty event: #{e}")
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Decorator around a Datadog client that pages a configured recipient:
# normal-priority events get an @-mention appended to their message text
# before being forwarded; metrics pass straight through.
class PagingDatadogClient
  attr_reader :datadog_recipient

  # datadog_recipient - handle to @-mention on normal-priority events
  # datadog_client    - wrapped client responding to emit_points/emit_event
  def initialize(datadog_recipient, datadog_client)
    @datadog_recipient = datadog_recipient
    @datadog_client = datadog_client
  end

  # Forwards metric points unchanged to the wrapped client.
  def emit_points(metric, points, options = {})
    @datadog_client.emit_points(metric, points, options)
  end

  # Rebuilds the event (mentioning the recipient when priority is
  # "normal") and forwards it to the wrapped client.
  def emit_event(event)
    message =
      if event.priority == "normal"
        "#{event.msg_text} @#{@datadog_recipient}"
      else
        event.msg_text
      end

    @datadog_client.emit_event(Dogapi::Event.new(message, event.to_hash))
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# This health monitor plugin should be used in conjunction with another plugin that
# alerts when a VM is unresponsive, as this plugin will try to automatically fix the
# problem by recreating the VM
module Bosh::Monitor
  module Plugins
    class Resurrector < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      attr_reader :url

      # Requires options['director'] with 'endpoint', 'user' and 'password'.
      # Raises ArgumentError when the director section is missing.
      def initialize(options={})
        super(options)
        director = @options['director']
        # Bug fix: original read `raise ArgumentError 'director options not set'`
        # (missing comma), which Ruby parses as a call to a non-existent method
        # `ArgumentError` — raising NoMethodError instead of the intended
        # ArgumentError with its message.
        raise ArgumentError, 'director options not set' unless director

        @url = URI(director['endpoint'])
        @user = director['user']
        @password = director['password']
        @processor = Bhm.event_processor
        @alert_tracker = ResurrectorHelper::AlertTracker.new(@options)
      end

      def run
        unless EM.reactor_running?
          logger.error("Resurrector plugin can only be started when event loop is running")
          return false
        end

        logger.info("Resurrector is running...")
      end

      # Records the alert and either asks the director to scan-and-fix the
      # instance, or — when the deployment is melting down — emits a
      # severity-1 alert instead of recreating anything.
      def process(alert)
        deployment = alert.attributes['deployment']
        job = alert.attributes['job']
        index = alert.attributes['index']

        # only when the agent times out do we add deployment, job & index to the alert
        # attributes, so this won't trigger a recreate for other types of alerts
        if deployment && job && index
          agent_key = ResurrectorHelper::JobInstanceKey.new(deployment, job, index)
          @alert_tracker.record(agent_key, alert.created_at)

          payload = {'jobs' => {job => [index]}}
          request = {
            head: {
              'Content-Type' => 'application/json',
              'authorization' => [@user, @password]
            },
            body: Yajl::Encoder.encode(payload)
          }

          @url.path = "/deployments/#{deployment}/scan_and_fix"

          if @alert_tracker.melting_down?(deployment)
            # freak out
            ts = Time.now.to_i
            @processor.process(:alert,
                               severity: 1,
                               source: "HM plugin resurrector",
                               title: "We are in meltdown.",
                               created_at: ts)

            logger.error("(Resurrector) we are in meltdown.")
          else
            # queue instead, and only queue if it isn't already in the queue
            # what if we can't keep up with the failure rate?
            # - maybe not, maybe the meltdown detection takes care of the rate issue
            logger.warn("(Resurrector) notifying director to recreate unresponsive VM: #{deployment} #{job}/#{index}")

            send_http_put_request(url.to_s, request)
          end
        else
          logger.warn("(Resurrector) event did not have deployment, job and index: #{alert}")
        end
      end
    end
  end
end
|
82
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module Bosh::Monitor::Plugins
  module ResurrectorHelper

    # Hashable tuple of the identifying properties of a job instance,
    # suitable for use as a Hash key (defines both hash and eql?).
    class JobInstanceKey
      attr_accessor :deployment, :job, :index

      def initialize(deployment, job, index)
        @deployment = deployment
        @job = job
        @index = index
      end

      # Keys with the same deployment/job/index hash identically.
      def hash
        "#{deployment}#{job}#{index}".hash
      end

      def eql?(other)
        deployment == other.deployment &&
          job == other.job &&
          index == other.index
      end

      def to_s
        "#{deployment}/#{job}/#{index}"
      end
    end

    # Service which tracks alerts and decides whether or not the cluster is melting down.
    # When the cluster is melting down, the resurrector backs off on fixing instances.
    class AlertTracker

      # Below this number of down agents we don't consider a meltdown occurring
      attr_accessor :minimum_down_jobs

      # Number of seconds at which an alert is considered "current"; alerts older than
      # this are ignored. Integer number of seconds.
      attr_accessor :time_threshold

      # Percentage of the cluster which must be down for scanning to stop. Float fraction
      # between 0 and 1.
      attr_accessor :percent_threshold

      def initialize(args={})
        @agent_manager = Bhm.agent_manager
        # Maps JobInstanceKey => Time of the most recent alert for that instance.
        @alert_times = {}
        @minimum_down_jobs = args.fetch('minimum_down_jobs', 5)
        @percent_threshold = args.fetch('percent_threshold', 0.2)
        @time_threshold = args.fetch('time_threshold', 600)
      end

      # "Melting down" means a large part of the cluster is offline and manual
      # intervention may be required to fix.
      def melting_down?(deployment)
        agent_alerts = alerts_for_deployment(deployment)
        down_count = agent_alerts.count { |_, alert_time| alert_time > (Time.now - time_threshold) }

        return false if down_count < minimum_down_jobs

        (down_count.to_f / agent_alerts.size) >= percent_threshold
      end

      def record(agent_key, alert_time)
        @alert_times[agent_key] = alert_time
      end

      private

      # Returns { JobInstanceKey => last alert Time } for every agent in the
      # deployment; agents that never alerted map to the epoch (Time.at(0)).
      def alerts_for_deployment(deployment)
        agents = @agent_manager.get_agents_for_deployment(deployment)
        agents.values.each_with_object({}) do |agent, times|
          key = JobInstanceKey.new(agent.deployment, agent.job, agent.index)
          times[key] = @alert_times.fetch(key, Time.at(0))
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Forwards event metrics to an OpenTSDB server over a persistent
    # EventMachine connection (Bhm::TsdbConnection).
    class Tsdb < Base

      # Requires "host" and "port"; trailing `&& true` normalizes the
      # result to a boolean instead of leaking the port value.
      def validate_options
        options.kind_of?(Hash) &&
          options["host"] &&
          options["port"] &&
          true
      end

      # Opens the TSDB connection; the reactor must already be running.
      def run
        unless EM.reactor_running?
          logger.error("TSDB delivery agent can only be started when event loop is running")
          return false
        end

        host = options["host"]
        port = options["port"]
        @tsdb = EM.connect(host, port, Bhm::TsdbConnection, host, port)
      end

      # Sends every metric of the event to TSDB. Returns false when the
      # connection was never established; raises PluginError when the event
      # exposes a non-enumerable metrics collection.
      def process(event)
        if @tsdb.nil?
          # Consistency fix: was `@logger.error(...)`, but @logger is never
          # assigned in this class — #run uses the `logger` accessor, so use
          # it here as well.
          logger.error("Cannot deliver event, TSDB connection is not initialized")
          return false
        end

        metrics = event.metrics

        unless metrics.kind_of?(Enumerable)
          raise PluginError, "Invalid event metrics: Enumerable expected, #{metrics.class} given"
        end

        metrics.each do |metric|
          @tsdb.send_metric(metric.name, metric.timestamp, metric.value, metric.tags)
        end

        true
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Keeps the most recent event per agent, bucketed by event kind, and
    # exposes each bucket through Bhm's varz under "last_agents_<kind>".
    class Varz < Base
      def run
        logger.info("Varz plugin is running...")
      end

      # Stores the event hash under its kind and agent id (falling back to
      # "unknown" when the event carries no agent_id), then republishes the
      # whole bucket to varz.
      def process(event)
        @agents ||= {}
        bucket = (@agents[event.kind] ||= {})
        agent_id = event.attributes["agent_id"] || "unknown"
        bucket[agent_id.to_s] = event.to_hash
        Bhm.set_varz("last_agents_#{event.kind}", bucket)
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Bosh::Monitor
  # EventMachine connection to an OpenTSDB server that reconnects with
  # exponential backoff and gives up after MAX_RETRIES attempts.
  class TsdbConnection < EventMachine::Connection

    # Exponent cap for the backoff delay (2**9 - 1 = 511 seconds max).
    BACKOFF_CEILING = 9
    MAX_RETRIES = 35

    attr_reader :retries

    def initialize(host, port)
      @host = host
      @port = port
      @logger = Bhm.logger
      reset_retries
    end

    def reset_retries
      @retries = 0
    end

    def increment_retries
      @retries += 1
    end

    # Writes one metric in the TSDB line protocol:
    #   put <name> <timestamp> <value> <tag1=v1 tag2=v2 ...>
    # Tags are sorted for a stable on-the-wire representation.
    def send_metric(name, timestamp, value, tags = {})
      tag_list = tags.map { |pair| pair.join("=") }.sort.join(" ")
      line = "put #{name} #{timestamp} #{value} #{tag_list}\n"
      @logger.debug("[TSDB] >> #{line.chomp}")
      send_data(line)
    end

    def connection_completed
      reset_retries
      @reconnecting = false
      @connected = true
      @logger.info("Connected to TSDB server at #{@host}:#{@port}")
    end

    # Invoked on disconnect; schedules a reconnect attempt with
    # exponentially-growing delay, raising once MAX_RETRIES is exceeded.
    def unbind
      @logger.warn("Lost connection to TSDB server at #{@host}:#{@port}") if @connected
      @connected = false

      retry_in = 2**[retries, BACKOFF_CEILING].min - 1
      increment_retries

      raise "Failed to reconnect to TSDB after #{MAX_RETRIES} retries" if retries > MAX_RETRIES

      @logger.info("Failed to reconnect to TSDB, will try again in #{retry_in} seconds...") if retries > 1

      EM.add_timer(retry_in) { tsdb_reconnect }
    end

    def tsdb_reconnect
      @logger.info("Trying to reconnect to TSDB server at #{@host}:#{@port} (#{retries})...")
      reconnect(@host, @port)
    end

    def receive_data(data)
      @logger.info("[TSDB] << #{data.chomp}")
    end

  end
end
|