bosh-monitor 1.5.0.pre.1113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/README +80 -0
  2. data/bin/bosh-monitor +30 -0
  3. data/bin/bosh-monitor-console +51 -0
  4. data/bin/listener +58 -0
  5. data/lib/bosh/monitor.rb +72 -0
  6. data/lib/bosh/monitor/agent.rb +51 -0
  7. data/lib/bosh/monitor/agent_manager.rb +295 -0
  8. data/lib/bosh/monitor/api_controller.rb +18 -0
  9. data/lib/bosh/monitor/config.rb +71 -0
  10. data/lib/bosh/monitor/core_ext.rb +8 -0
  11. data/lib/bosh/monitor/director.rb +76 -0
  12. data/lib/bosh/monitor/director_monitor.rb +33 -0
  13. data/lib/bosh/monitor/errors.rb +19 -0
  14. data/lib/bosh/monitor/event_processor.rb +109 -0
  15. data/lib/bosh/monitor/events/alert.rb +92 -0
  16. data/lib/bosh/monitor/events/base.rb +70 -0
  17. data/lib/bosh/monitor/events/heartbeat.rb +139 -0
  18. data/lib/bosh/monitor/metric.rb +16 -0
  19. data/lib/bosh/monitor/plugins/base.rb +27 -0
  20. data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
  21. data/lib/bosh/monitor/plugins/datadog.rb +78 -0
  22. data/lib/bosh/monitor/plugins/dummy.rb +20 -0
  23. data/lib/bosh/monitor/plugins/email.rb +135 -0
  24. data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
  25. data/lib/bosh/monitor/plugins/logger.rb +13 -0
  26. data/lib/bosh/monitor/plugins/nats.rb +43 -0
  27. data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
  28. data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
  29. data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
  30. data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
  31. data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
  32. data/lib/bosh/monitor/plugins/varz.rb +17 -0
  33. data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
  34. data/lib/bosh/monitor/runner.rb +162 -0
  35. data/lib/bosh/monitor/version.rb +5 -0
  36. data/lib/bosh/monitor/yaml_helper.rb +18 -0
  37. metadata +246 -0
@@ -0,0 +1,25 @@
1
module Bosh::Monitor::Plugins
  # Mixin for plugins that deliver events over HTTP via EM::HttpRequest.
  # The including object is expected to respond to `logger`.
  module HttpRequestHelper
    # Sends `request` to `uri` using the POST verb.
    def send_http_post_request(uri, request)
      send_http_request(:post, uri, request)
    end

    # Sends `request` to `uri` using the PUT verb.
    def send_http_put_request(uri, request)
      send_http_request(:put, uri, request)
    end

    # Issues the request and registers success/failure handlers that log the outcome.
    #
    # method  - Symbol naming the EM::HttpRequest verb method to invoke (e.g. :post).
    # uri     - target of the request.
    # request - options passed straight through to the verb method.
    def send_http_request(method, uri, request)
      plugin_name = self.class.name
      verb = method.to_s.upcase
      logger.debug("sending HTTP #{verb} to: #{uri}")
      sent_at = Time.now

      client = EM::HttpRequest.new(uri).send(method, request)

      client.callback {
        elapsed = Time.now - sent_at
        logger.debug("#{plugin_name} event sent (took #{elapsed} seconds): #{client.response_header.status}")
      }

      client.errback { |failed| logger.error("Failed to send #{plugin_name} event: #{failed.error}") }
    end
  end
end
@@ -0,0 +1,13 @@
1
module Bosh::Monitor
  module Plugins
    # Plugin that writes every event it is handed to the logger at info level.
    class Logger < Base
      # Announces that the plugin has started.
      def run
        logger.info("Logging delivery agent is running...")
      end

      # Logs the event, prefixed with its upper-cased kind in square brackets.
      def process(event)
        kind_label = event.kind.to_s.upcase
        logger.info("[#{kind_label}] #{event}")
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Bosh::Monitor
  module Plugins
    # Plugin that publishes each event, serialized as JSON, to a NATS subject.
    class Nats < Base
      # Subject used when the "subject" option is not supplied.
      SUBJECT = "bosh.hm.events"

      # Options must be a Hash with a truthy "endpoint" and with "user" and
      # "password" keys present (their values are not checked).
      def validate_options
        options.kind_of?(Hash) &&
          options["endpoint"] &&
          options.has_key?("user") &&
          options.has_key?("password")
      end

      # Connects to NATS. Returns false without connecting when the
      # EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("NATS delivery agent can only be started when event loop is running")
          return false
        end

        nats_client_options = {
          :uri => options["endpoint"],
          :user => options["user"],
          :pass => options["password"],
          :autostart => false
        }

        @nats = NATS.connect(nats_client_options) do
          logger.info("Ready to publish alerts to NATS at `#{options["endpoint"]}'")
        end
      end

      # Publishes the event to the configured subject (or SUBJECT).
      # Returns true once the publish call has been made, false when no
      # NATS client has been set up by `run`.
      def process(event)
        if @nats.nil?
          # Go through the `logger` accessor, as `run` does, instead of
          # reaching for the instance variable directly.
          logger.error("Cannot deliver event, NATS not initialized")
          return false
        end

        nats_subject = options["subject"] || SUBJECT
        @nats.publish(nats_subject, event.to_json)
        true
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
module Bosh::Monitor
  module Plugins
    # Plugin that turns each event into a PagerDuty "trigger" request.
    class Pagerduty < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      API_URI = "https://events.pagerduty.com/generic/2010-04-15/create_event.json"

      # Returns false when the EventMachine reactor is not running;
      # otherwise just logs that the plugin is up.
      def run
        unless EM.reactor_running?
          logger.error("Pagerduty plugin can only be started when event loop is running")
          return false
        end

        logger.info("Pagerduty delivery agent is running...")
      end

      # Options must be a Hash whose "service_key" is a String.
      def validate_options
        options.kind_of?(Hash) &&
          options["service_key"].kind_of?(String)
      end

      # Builds the PagerDuty payload for `event` and POSTs it to API_URI,
      # routing through the "http_proxy" option when one is configured.
      # Any StandardError raised while building or sending is logged, not re-raised.
      def process(event)
        payload = {
          :service_key => options["service_key"],
          :event_type => "trigger",
          :incident_key => event.id,
          :description => event.short_description,
          :details => event.to_hash
        }

        request = {
          :body => Yajl::Encoder.encode(payload)
        }

        if options["http_proxy"]
          proxy = URI.parse(options["http_proxy"])
          request[:proxy] = { :host => proxy.host, :port => proxy.port }
        end

        send_http_post_request(API_URI, request)
      rescue => e
        logger.error("Error sending pagerduty event: #{e}")
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
# Wraps a Datadog client so that events with "normal" priority have a
# "@recipient" mention appended to their message text before being emitted.
class PagingDatadogClient
  attr_reader :datadog_recipient

  # datadog_recipient - name appended after "@" on normal-priority events.
  # datadog_client    - wrapped client; receives emit_points / emit_event calls.
  def initialize(datadog_recipient, datadog_client)
    @datadog_recipient = datadog_recipient
    @datadog_client = datadog_client
  end

  # Passes the call through to the wrapped client unchanged.
  def emit_points(metric, points, options={})
    @datadog_client.emit_points(metric, points, options)
  end

  # Re-wraps `event` in a new Dogapi::Event (same attributes, possibly
  # amended message) and emits it through the wrapped client.
  def emit_event(event)
    attributes = event.to_hash
    paged = event.priority == "normal"
    text = paged ? "#{event.msg_text} @#{@datadog_recipient}" : event.msg_text

    @datadog_client.emit_event(Dogapi::Event.new(text, attributes))
  end
end
@@ -0,0 +1,82 @@
1
# This health monitor plugin should be used in conjunction with another plugin that
# alerts when a VM is unresponsive, as this plugin will try to automatically fix the
# problem by recreating the VM
module Bosh::Monitor
  module Plugins
    class Resurrector < Base
      include Bosh::Monitor::Plugins::HttpRequestHelper

      attr_reader :url

      # options - Hash that must contain a 'director' entry providing
      #           'endpoint', 'user' and 'password'. The whole Hash is also
      #           handed to the AlertTracker.
      #
      # Raises ArgumentError when the 'director' entry is missing.
      def initialize(options={})
        super(options)
        director = @options['director']
        # The comma matters: without it Ruby parses this as a call to a
        # method named `ArgumentError` and raises NoMethodError instead.
        raise ArgumentError, 'director options not set' unless director

        @url = URI(director['endpoint'])
        @user = director['user']
        @password = director['password']
        @processor = Bhm.event_processor
        @alert_tracker = ResurrectorHelper::AlertTracker.new(@options)
      end

      # Returns false when the EventMachine reactor is not running;
      # otherwise just logs that the plugin is up.
      def run
        unless EM.reactor_running?
          logger.error("Resurrector plugin can only be started when event loop is running")
          return false
        end

        logger.info("Resurrector is running...")
      end

      # Records the alert against its job instance and then either asks the
      # director to scan-and-fix that instance or, when the tracker reports a
      # meltdown for the deployment, raises a severity-1 alert instead.
      # Alerts without deployment, job and index attributes are only logged.
      def process(alert)
        deployment = alert.attributes['deployment']
        job = alert.attributes['job']
        index = alert.attributes['index']

        # only when the agent times out do we add deployment, job & index to the alert
        # attributes, so this won't trigger a recreate for other types of alerts
        if deployment && job && index
          agent_key = ResurrectorHelper::JobInstanceKey.new(deployment, job, index)
          @alert_tracker.record(agent_key, alert.created_at)

          payload = {'jobs' => {job => [index]}}
          request = {
            head: {
              'Content-Type' => 'application/json',
              'authorization' => [@user, @password]
            },
            body: Yajl::Encoder.encode(payload)
          }

          # NOTE: this mutates the shared URI object exposed through `url`.
          @url.path = "/deployments/#{deployment}/scan_and_fix"

          if @alert_tracker.melting_down?(deployment)
            # freak out
            ts = Time.now.to_i
            @processor.process(:alert,
                               severity: 1,
                               source: "HM plugin resurrector",
                               title: "We are in meltdown.",
                               created_at: ts)

            logger.error("(Resurrector) we are in meltdown.")
          else
            # queue instead, and only queue if it isn't already in the queue
            # what if we can't keep up with the failure rate?
            # - maybe not, maybe the meltdown detection takes care of the rate issue
            logger.warn("(Resurrector) notifying director to recreate unresponsive VM: #{deployment} #{job}/#{index}")

            send_http_put_request(url.to_s, request)
          end
        else
          logger.warn("(Resurrector) event did not have deployment, job and index: #{alert}")
        end
      end
    end
  end
end
82
+
@@ -0,0 +1,84 @@
1
module Bosh::Monitor::Plugins
  module ResurrectorHelper

    # Hashable tuple of the identifying properties of a job
    class JobInstanceKey
      attr_accessor :deployment, :job, :index

      def initialize(deployment, job, index)
        @deployment = deployment
        @job = job
        @index = index
      end

      # Hash over the field tuple rather than the concatenated strings, so
      # that ("a", "bc", 1) and ("ab", "c", 1) no longer share a hash value.
      def hash
        [deployment, job, index].hash
      end

      # Two keys are equal when all three fields are equal. Anything that is
      # not a JobInstanceKey compares unequal instead of raising NoMethodError.
      def eql?(other)
        other.is_a?(JobInstanceKey) &&
          other.deployment == deployment &&
          other.job == job &&
          other.index == index
      end

      # Keep `==` in agreement with `eql?`/`hash`.
      alias_method :==, :eql?

      def to_s
        [deployment, job, index].join('/')
      end
    end

    # Service which tracks alerts and decides whether or not the cluster is melting down.
    # When the cluster is melting down, the resurrector backs off on fixing instances.
    class AlertTracker

      # Below this number of down agents we don't consider a meltdown occurring
      attr_accessor :minimum_down_jobs

      # Number of seconds at which an alert is considered "current"; alerts older than
      # this are ignored. Integer number of seconds.
      attr_accessor :time_threshold

      # Percentage of the cluster which must be down for scanning to stop. Float fraction
      # between 0 and 1.
      attr_accessor :percent_threshold

      # args - Hash with optional 'minimum_down_jobs' (default 5),
      #        'percent_threshold' (default 0.2) and 'time_threshold'
      #        (default 600) entries; other keys are ignored.
      def initialize(args={})
        @agent_manager = Bhm.agent_manager
        @alert_times = {} # maps JobInstanceKey to time of last Alert
        @minimum_down_jobs = args.fetch('minimum_down_jobs', 5)
        @percent_threshold = args.fetch('percent_threshold', 0.2)
        @time_threshold = args.fetch('time_threshold', 600)
      end

      # "Melting down" means a large part of the cluster is offline and manual intervention
      # may be required to fix.
      #
      # Returns true when at least `minimum_down_jobs` agents of the deployment
      # have an alert newer than `time_threshold` seconds and those agents make
      # up at least `percent_threshold` of the deployment's agents.
      def melting_down?(deployment)
        agent_alerts = alerts_for_deployment(deployment)
        # No known agents means nothing can be down; also avoids 0.0 / 0.
        return false if agent_alerts.empty?

        # Compute the cutoff once so every alert is judged against the same instant.
        cutoff = Time.now - time_threshold
        number_of_down_agents = agent_alerts.count { |_, alert_time| alert_time > cutoff }

        return false if number_of_down_agents < minimum_down_jobs

        (number_of_down_agents.to_f / agent_alerts.size) >= percent_threshold
      end

      # Remembers `alert_time` as the most recent alert for `agent_key`.
      def record(agent_key, alert_time)
        @alert_times[agent_key] = alert_time
      end

      private

      # Maps the key of every agent the agent manager reports for `deployment`
      # to its last recorded alert time, or Time.at(0) when none was recorded.
      def alerts_for_deployment(deployment)
        agents = @agent_manager.get_agents_for_deployment(deployment)

        agents.values.each_with_object({}) do |agent, result|
          key = JobInstanceKey.new(agent.deployment, agent.job, agent.index)
          result[key] = @alert_times.fetch(key, Time.at(0))
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Bosh::Monitor
  module Plugins
    # Plugin that forwards the metrics carried by each event over a TSDB connection.
    class Tsdb < Base

      # Options must be a Hash with truthy "host" and "port" entries.
      def validate_options
        options.kind_of?(Hash) &&
          options["host"] &&
          options["port"] &&
          true
      end

      # Opens the TSDB connection. Returns false without connecting when the
      # EventMachine reactor is not running.
      def run
        unless EM.reactor_running?
          logger.error("TSDB delivery agent can only be started when event loop is running")
          return false
        end

        host = options["host"]
        port = options["port"]
        @tsdb = EM.connect(host, port, Bhm::TsdbConnection, host, port)
      end

      # Sends every metric of `event` over the connection.
      #
      # Returns true after all metrics were handed to the connection, false
      # when `run` has not set up a connection.
      # Raises PluginError when `event.metrics` is not Enumerable.
      def process(event)
        if @tsdb.nil?
          # Go through the `logger` accessor, as `run` does, instead of
          # reaching for the instance variable directly.
          logger.error("Cannot deliver event, TSDB connection is not initialized")
          return false
        end

        metrics = event.metrics

        unless metrics.kind_of?(Enumerable)
          raise PluginError, "Invalid event metrics: Enumerable expected, #{metrics.class} given"
        end

        metrics.each do |metric|
          @tsdb.send_metric(metric.name, metric.timestamp, metric.value, metric.tags)
        end

        true
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Bosh::Monitor
  module Plugins
    # Plugin that keeps, per event kind, the latest event hash seen for each
    # agent id and publishes that table through Bhm.set_varz.
    class Varz < Base
      # Announces that the plugin has started.
      def run
        logger.info("Varz plugin is running...")
      end

      # Stores `event.to_hash` under its kind and agent id ("unknown" when the
      # event has no "agent_id" attribute), then republishes the table for that kind.
      def process(event)
        kind = event.kind
        @agents ||= {}
        latest_for_kind = (@agents[kind] ||= {})

        agent_id = event.attributes["agent_id"] || "unknown"
        latest_for_kind[agent_id.to_s] = event.to_hash

        Bhm.set_varz("last_agents_#{kind}", latest_for_kind)
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
module Bosh::Monitor
  # EventMachine connection that writes "put" commands to a TSDB server and
  # schedules a reconnect with exponential back-off each time it is unbound.
  class TsdbConnection < EventMachine::Connection

    # Largest exponent used when computing the reconnect delay.
    BACKOFF_CEILING = 9
    # Number of reconnect attempts after which `unbind` raises.
    MAX_RETRIES = 35

    attr_reader :retries

    def initialize(host, port)
      @host = host
      @port = port
      @logger = Bhm.logger
      reset_retries
    end

    def reset_retries
      @retries = 0
    end

    def increment_retries
      @retries += 1
    end

    # Writes one "put" line; tags are rendered as sorted, space-separated key=value pairs.
    def send_metric(name, timestamp, value, tags = {})
      tag_list = tags.map { |pair| pair.join("=") }.sort.join(" ")
      line = "put #{name} #{timestamp} #{value} #{tag_list}\n"
      @logger.debug("[TSDB] >> #{line.chomp}")
      send_data(line)
    end

    # Clears the retry counter and marks the connection as established.
    def connection_completed
      reset_retries
      @reconnecting = false
      @connected = true
      @logger.info("Connected to TSDB server at #{@host}:#{@port}")
    end

    # Schedules a reconnect after 2**min(retries, BACKOFF_CEILING) - 1 seconds.
    # Raises a RuntimeError once more than MAX_RETRIES attempts have been made.
    def unbind
      @logger.warn("Lost connection to TSDB server at #{@host}:#{@port}") if @connected
      @connected = false

      # The delay is based on the retry count *before* this attempt is counted.
      exponent = [retries, BACKOFF_CEILING].min
      retry_in = (2**exponent) - 1
      increment_retries

      raise "Failed to reconnect to TSDB after #{MAX_RETRIES} retries" if retries > MAX_RETRIES

      @logger.info("Failed to reconnect to TSDB, will try again in #{retry_in} seconds...") if retries > 1

      EM.add_timer(retry_in) { tsdb_reconnect }
    end

    def tsdb_reconnect
      @logger.info("Trying to reconnect to TSDB server at #{@host}:#{@port} (#{retries})...")
      reconnect(@host, @port)
    end

    # Logs whatever the server sends back.
    def receive_data(data)
      @logger.info("[TSDB] << #{data.chomp}")
    end

  end
end