bosh-monitor 1.5.0.pre.1113

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/README +80 -0
  2. data/bin/bosh-monitor +30 -0
  3. data/bin/bosh-monitor-console +51 -0
  4. data/bin/listener +58 -0
  5. data/lib/bosh/monitor.rb +72 -0
  6. data/lib/bosh/monitor/agent.rb +51 -0
  7. data/lib/bosh/monitor/agent_manager.rb +295 -0
  8. data/lib/bosh/monitor/api_controller.rb +18 -0
  9. data/lib/bosh/monitor/config.rb +71 -0
  10. data/lib/bosh/monitor/core_ext.rb +8 -0
  11. data/lib/bosh/monitor/director.rb +76 -0
  12. data/lib/bosh/monitor/director_monitor.rb +33 -0
  13. data/lib/bosh/monitor/errors.rb +19 -0
  14. data/lib/bosh/monitor/event_processor.rb +109 -0
  15. data/lib/bosh/monitor/events/alert.rb +92 -0
  16. data/lib/bosh/monitor/events/base.rb +70 -0
  17. data/lib/bosh/monitor/events/heartbeat.rb +139 -0
  18. data/lib/bosh/monitor/metric.rb +16 -0
  19. data/lib/bosh/monitor/plugins/base.rb +27 -0
  20. data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
  21. data/lib/bosh/monitor/plugins/datadog.rb +78 -0
  22. data/lib/bosh/monitor/plugins/dummy.rb +20 -0
  23. data/lib/bosh/monitor/plugins/email.rb +135 -0
  24. data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
  25. data/lib/bosh/monitor/plugins/logger.rb +13 -0
  26. data/lib/bosh/monitor/plugins/nats.rb +43 -0
  27. data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
  28. data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
  29. data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
  30. data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
  31. data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
  32. data/lib/bosh/monitor/plugins/varz.rb +17 -0
  33. data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
  34. data/lib/bosh/monitor/runner.rb +162 -0
  35. data/lib/bosh/monitor/version.rb +5 -0
  36. data/lib/bosh/monitor/yaml_helper.rb +18 -0
  37. metadata +246 -0
@@ -0,0 +1,25 @@
module Bosh::Monitor::Plugins
  # Mixin that fires HTTP requests through EM::HttpRequest and logs the outcome.
  # The including object must respond to `logger`.
  module HttpRequestHelper
    # Sends +request+ to +uri+ using the :post verb (see #send_http_request).
    def send_http_post_request(uri, request)
      send_http_request(:post, uri, request)
    end

    # Sends +request+ to +uri+ using the :put verb (see #send_http_request).
    def send_http_put_request(uri, request)
      send_http_request(:put, uri, request)
    end

    # Issues the request and registers success/failure handlers on it.
    #
    # @param method  name of the EM::HttpRequest instance method to invoke (e.g. :post, :put)
    # @param uri     target passed to EM::HttpRequest.new
    # @param request options hash handed to the verb method unchanged
    def send_http_request(method, uri, request)
      plugin_name = self.class.name
      logger.debug("sending HTTP #{method.to_s.upcase} to: #{uri}")
      sent_at = Time.now
      pending = EM::HttpRequest.new(uri).send(method, request)

      # Success handler: report how long the round trip took and the response status.
      pending.callback do
        elapsed = Time.now - sent_at
        logger.debug("#{plugin_name} event sent (took #{elapsed} seconds): #{pending.response_header.status}")
      end

      # Failure handler: log the error carried by the object passed to the errback.
      pending.errback do |failure|
        logger.error("Failed to send #{plugin_name} event: #{failure.error}")
      end
    end
  end
end
@@ -0,0 +1,13 @@
module Bosh::Monitor
  module Plugins
    # Plugin that writes every event it is given to the log at info level.
    class Logger < Base
      # Announces that the plugin has started.
      def run
        logger.info("Logging delivery agent is running...")
      end

      # Logs the event prefixed with its upper-cased kind, e.g. "[ALERT] ...".
      def process(event)
        kind_label = event.kind.to_s.upcase
        logger.info("[#{kind_label}] #{event}")
      end
    end
  end
end
@@ -0,0 +1,43 @@
module Bosh::Monitor
  module Plugins
    # Plugin that publishes each event, serialized with #to_json, to a NATS subject.
    class Nats < Base
      # Subject used when the options do not provide a "subject" entry.
      SUBJECT = "bosh.hm.events"

      # Options must be a Hash with a truthy "endpoint" and with both the
      # "user" and "password" keys present (their values are not inspected).
      def validate_options
        return false unless options.kind_of?(Hash)

        endpoint = options["endpoint"]
        return endpoint unless endpoint

        options.has_key?("user") && options.has_key?("password")
      end

      # Connects to NATS. Returns false without connecting when the
      # EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("NATS delivery agent can only be started when event loop is running")
          return false
        end

        nats_client_options = {
          uri: options["endpoint"],
          user: options["user"],
          pass: options["password"],
          autostart: false
        }

        @nats = NATS.connect(nats_client_options) do
          logger.info("Ready to publish alerts to NATS at `#{options["endpoint"]}'")
        end
      end

      # Publishes the event; returns true on publish, false when #run has not
      # established a client yet.
      def process(event)
        if @nats.nil?
          @logger.error("Cannot deliver event, NATS not initialized")
          return false
        end

        @nats.publish(options["subject"] || SUBJECT, event.to_json)
        true
      end
    end
  end
end
@@ -0,0 +1,48 @@
module Bosh::Monitor
  module Plugins
    # Plugin that turns each event into a PagerDuty "trigger" request and POSTs
    # it to API_URI through HttpRequestHelper.
    class Pagerduty < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      API_URI = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"

      # Returns false (after logging) when the EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("Pagerduty plugin can only be started when event loop is running")
          return false
        end

        logger.info("Pagerduty delivery agent is running...")
      end

      # Options must be a Hash whose "service_key" entry is a String.
      def validate_options
        options.kind_of?(Hash) &&
          options["service_key"].kind_of?(String)
      end

      # Builds the JSON payload for +event+ and sends it. Any StandardError raised
      # while building or sending is logged rather than propagated.
      def process(event)
        payload = {
          :service_key => options["service_key"],
          :event_type => "trigger",
          :incident_key => event.id,
          :description => event.short_description,
          :details => event.to_hash
        }

        request = {
          :body => Yajl::Encoder.encode(payload)
        }

        # Route through a proxy only when one is configured.
        if options["http_proxy"]
          proxy = URI.parse(options["http_proxy"])
          request[:proxy] = { :host => proxy.host, :port => proxy.port }
        end

        send_http_post_request(API_URI, request)
      rescue => e
        logger.error("Error sending pagerduty event: #{e}")
      end
    end
  end
end
@@ -0,0 +1,24 @@
# Wrapper around a Datadog client that appends " @<recipient>" to the message
# of every event whose priority is "normal" before forwarding it.
class PagingDatadogClient
  attr_reader :datadog_recipient

  # @param datadog_recipient handle appended (after "@") to normal-priority messages
  # @param datadog_client    object that receives the forwarded emit_points / emit_event calls
  def initialize(datadog_recipient, datadog_client)
    @datadog_client = datadog_client
    @datadog_recipient = datadog_recipient
  end

  # Forwards metric points to the wrapped client unchanged.
  def emit_points(metric, points, options={})
    @datadog_client.emit_points(metric, points, options)
  end

  # Rebuilds +event+ as a new Dogapi::Event (same attributes, possibly amended
  # message text) and forwards it to the wrapped client.
  def emit_event(event)
    attributes = event.to_hash
    paged = event.priority == "normal"
    text = paged ? "#{event.msg_text} @#{@datadog_recipient}" : event.msg_text

    @datadog_client.emit_event(Dogapi::Event.new(text, attributes))
  end
end
@@ -0,0 +1,82 @@
# This health monitor plugin should be used in conjunction with another plugin that
# alerts when a VM is unresponsive, as this plugin will try to automatically fix the
# problem by recreating the VM
module Bosh::Monitor
  module Plugins
    class Resurrector < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      attr_reader :url

      # @param options [Hash] must contain a 'director' entry providing
      #   'endpoint', 'user' and 'password'
      # @raise [ArgumentError] when the 'director' entry is missing
      def initialize(options={})
        super(options)
        director = @options['director']
        # The comma matters: without it Ruby parses `ArgumentError '...'` as a
        # method call and raises NoMethodError instead of ArgumentError.
        raise ArgumentError, 'director options not set' unless director

        @url = URI(director['endpoint'])
        @user = director['user']
        @password = director['password']
        @processor = Bhm.event_processor
        @alert_tracker = ResurrectorHelper::AlertTracker.new(@options)
      end

      # Returns false (after logging) when the EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("Resurrector plugin can only be started when event loop is running")
          return false
        end

        logger.info("Resurrector is running...")
      end

      # Records the alert and, unless the deployment is melting down, asks the
      # director to scan and fix the affected job instance. During a meltdown a
      # new alert is raised through the event processor instead.
      def process(alert)
        deployment = alert.attributes['deployment']
        job = alert.attributes['job']
        index = alert.attributes['index']

        # only when the agent times out do we add deployment, job & index to the alert
        # attributes, so this won't trigger a recreate for other types of alerts
        if deployment && job && index
          agent_key = ResurrectorHelper::JobInstanceKey.new(deployment, job, index)
          @alert_tracker.record(agent_key, alert.created_at)

          payload = {'jobs' => {job => [index]}}
          request = {
            head: {
              'Content-Type' => 'application/json',
              'authorization' => [@user, @password]
            },
            body: Yajl::Encoder.encode(payload)
          }

          # Point the shared director URL at this deployment's scan_and_fix resource.
          @url.path = "/deployments/#{deployment}/scan_and_fix"

          if @alert_tracker.melting_down?(deployment)
            # Too much of the deployment is down: raise an alert instead of
            # sending a recreate request.
            ts = Time.now.to_i
            @processor.process(:alert,
                               severity: 1,
                               source: "HM plugin resurrector",
                               title: "We are in meltdown.",
                               created_at: ts)

            logger.error("(Resurrector) we are in meltdown.")
          else
            # queue instead, and only queue if it isn't already in the queue
            # what if we can't keep up with the failure rate?
            # - maybe not, maybe the meltdown detection takes care of the rate issue
            logger.warn("(Resurrector) notifying director to recreate unresponsive VM: #{deployment} #{job}/#{index}")

            send_http_put_request(url.to_s, request)
          end
        else
          logger.warn("(Resurrector) event did not have deployment, job and index: #{alert}")
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
module Bosh::Monitor::Plugins
  module ResurrectorHelper

    # Hashable tuple of the identifying properties of a job
    class JobInstanceKey
      attr_accessor :deployment, :job, :index

      def initialize(deployment, job, index)
        @deployment = deployment
        @job = job
        @index = index
      end

      # Hashes the three fields as separate elements so that, for example,
      # ("a", "bc") and ("ab", "c") no longer produce the same hash input.
      def hash
        [deployment.to_s, job.to_s, index.to_s].hash
      end

      # Two keys are equal when all three fields are ==. Objects of any other
      # class compare as unequal instead of raising NoMethodError.
      def eql?(other)
        other.is_a?(JobInstanceKey) &&
          other.deployment == deployment &&
          other.job == job &&
          other.index == index
      end

      # Keep == in agreement with eql?/hash so value comparison and Hash lookup
      # give the same answer.
      alias_method :==, :eql?

      def to_s
        [deployment, job, index].join('/')
      end
    end

    # Service which tracks alerts and decides whether or not the cluster is melting down.
    # When the cluster is melting down, the resurrector backs off on fixing instances.
    class AlertTracker

      # Below this number of down agents we don't consider a meltdown occurring
      attr_accessor :minimum_down_jobs

      # Number of seconds at which an alert is considered "current"; alerts older than
      # this are ignored. Integer number of seconds.
      attr_accessor :time_threshold

      # Percentage of the cluster which must be down for scanning to stop. Float fraction
      # between 0 and 1.
      attr_accessor :percent_threshold

      # @param args [Hash] optional 'minimum_down_jobs' (default 5),
      #   'percent_threshold' (default 0.2) and 'time_threshold' (default 600)
      def initialize(args={})
        @agent_manager = Bhm.agent_manager
        @alert_times = {} # maps JobInstanceKey to time of last Alert
        @minimum_down_jobs = args.fetch('minimum_down_jobs', 5)
        @percent_threshold = args.fetch('percent_threshold', 0.2)
        @time_threshold = args.fetch('time_threshold', 600)
      end

      # "Melting down" means a large part of the cluster is offline and manual intervention
      # may be required to fix.
      #
      # Returns true when at least minimum_down_jobs agents of the deployment have an
      # alert newer than time_threshold seconds AND those agents make up at least
      # percent_threshold of the deployment's agents.
      def melting_down?(deployment)
        agent_alerts = alerts_for_deployment(deployment)
        total_number_of_agents = agent_alerts.size
        # No agents means nothing can be down; also avoids dividing by zero below.
        return false if total_number_of_agents.zero?

        # Compute the cutoff once so every alert is compared against the same instant.
        cutoff = Time.now - time_threshold
        number_of_down_agents = agent_alerts.count { |_, alert_time| alert_time > cutoff }

        return false if number_of_down_agents < minimum_down_jobs

        (number_of_down_agents.to_f / total_number_of_agents) >= percent_threshold
      end

      # Remembers +alert_time+ as the most recent alert for +agent_key+.
      def record(agent_key, alert_time)
        @alert_times[agent_key] = alert_time
      end

      private

      # Maps every agent of the deployment (as a JobInstanceKey) to the time of its
      # last recorded alert; agents without one get Time.at(0).
      def alerts_for_deployment(deployment)
        agents = @agent_manager.get_agents_for_deployment(deployment)

        agents.values.each_with_object({}) do |agent, result|
          key = JobInstanceKey.new(agent.deployment, agent.job, agent.index)
          result[key] = @alert_times.fetch(key, Time.at(0))
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
module Bosh::Monitor
  module Plugins
    # Plugin that forwards every metric of an event over a Bhm::TsdbConnection.
    class Tsdb < Base

      # Options must be a Hash with truthy "host" and "port" entries.
      def validate_options
        return false unless options.kind_of?(Hash)

        options["host"] && options["port"] && true
      end

      # Opens the connection. Returns false without connecting when the
      # EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("TSDB delivery agent can only be started when event loop is running")
          return false
        end

        tsdb_host = options["host"]
        tsdb_port = options["port"]
        @tsdb = EM.connect(tsdb_host, tsdb_port, Bhm::TsdbConnection, tsdb_host, tsdb_port)
      end

      # Sends each metric of +event+; returns true afterwards, or false when
      # #run has not created the connection.
      #
      # Raises PluginError when event.metrics is not Enumerable.
      def process(event)
        if @tsdb.nil?
          @logger.error("Cannot deliver event, TSDB connection is not initialized")
          return false
        end

        metrics = event.metrics

        unless metrics.kind_of?(Enumerable)
          raise PluginError, "Invalid event metrics: Enumerable expected, #{metrics.class} given"
        end

        metrics.each do |point|
          @tsdb.send_metric(point.name, point.timestamp, point.value, point.tags)
        end

        true
      end
    end
  end
end
@@ -0,0 +1,17 @@
module Bosh::Monitor
  module Plugins
    # Plugin that keeps, per event kind, the latest event hash seen for each
    # agent and hands that table to Bhm.set_varz.
    class Varz < Base
      # Announces that the plugin has started.
      def run
        logger.info("Varz plugin is running...")
      end

      # Stores event.to_hash under the event's kind and agent id (or "unknown"
      # when the event has no "agent_id" attribute), then updates the
      # "last_agents_<kind>" entry.
      def process(event)
        @agents ||= {}
        by_agent = (@agents[event.kind] ||= {})
        agent_id = event.attributes["agent_id"] || "unknown"
        by_agent[agent_id.to_s] = event.to_hash
        Bhm.set_varz("last_agents_#{event.kind}", by_agent)
      end
    end
  end
end
@@ -0,0 +1,68 @@
module Bosh::Monitor
  # EventMachine connection that writes "put" commands to a TSDB server and
  # schedules reconnect attempts with a capped exponential delay whenever the
  # connection is unbound.
  class TsdbConnection < EventMachine::Connection

    # Largest exponent used for the delay: 2**9 - 1 seconds.
    BACKOFF_CEILING = 9
    # Number of consecutive failed attempts after which #unbind raises.
    MAX_RETRIES = 35

    attr_reader :retries

    def initialize(host, port)
      @host = host
      @port = port
      @logger = Bhm.logger
      reset_retries
    end

    def reset_retries
      @retries = 0
    end

    def increment_retries
      @retries += 1
    end

    # Writes one "put <name> <timestamp> <value> <tags>" line. Each tag entry is
    # joined with "=", and the entries are sorted and separated by spaces.
    def send_metric(name, timestamp, value, tags = {})
      tag_pairs = tags.map { |pair| pair.join("=") }
      formatted_tags = tag_pairs.sort.join(" ")
      command = "put #{name} #{timestamp} #{value} #{formatted_tags}\n"
      @logger.debug("[TSDB] >> #{command.chomp}")
      send_data(command)
    end

    # Resets the retry counter and marks the connection as established.
    def connection_completed
      reset_retries
      @reconnecting = false
      @connected = true
      @logger.info("Connected to TSDB server at #{@host}:#{@port}")
    end

    # Schedules a reconnect after 2**min(retries, BACKOFF_CEILING) - 1 seconds;
    # raises once more than MAX_RETRIES attempts have accumulated.
    def unbind
      @logger.warn("Lost connection to TSDB server at #{@host}:#{@port}") if @connected
      @connected = false

      # Delay is derived from the retry count BEFORE it is incremented.
      delay = 2**[retries, BACKOFF_CEILING].min - 1
      increment_retries

      raise "Failed to reconnect to TSDB after #{MAX_RETRIES} retries" if retries > MAX_RETRIES

      @logger.info("Failed to reconnect to TSDB, will try again in #{delay} seconds...") if retries > 1

      EM.add_timer(delay) { tsdb_reconnect }
    end

    def tsdb_reconnect
      @logger.info("Trying to reconnect to TSDB server at #{@host}:#{@port} (#{retries})...")
      reconnect(@host, @port)
    end

    # Logs whatever the server sends back.
    def receive_data(data)
      @logger.info("[TSDB] << #{data.chomp}")
    end

  end
end