bosh-monitor 1.5.0.pre.1113

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/README +80 -0
  2. data/bin/bosh-monitor +30 -0
  3. data/bin/bosh-monitor-console +51 -0
  4. data/bin/listener +58 -0
  5. data/lib/bosh/monitor.rb +72 -0
  6. data/lib/bosh/monitor/agent.rb +51 -0
  7. data/lib/bosh/monitor/agent_manager.rb +295 -0
  8. data/lib/bosh/monitor/api_controller.rb +18 -0
  9. data/lib/bosh/monitor/config.rb +71 -0
  10. data/lib/bosh/monitor/core_ext.rb +8 -0
  11. data/lib/bosh/monitor/director.rb +76 -0
  12. data/lib/bosh/monitor/director_monitor.rb +33 -0
  13. data/lib/bosh/monitor/errors.rb +19 -0
  14. data/lib/bosh/monitor/event_processor.rb +109 -0
  15. data/lib/bosh/monitor/events/alert.rb +92 -0
  16. data/lib/bosh/monitor/events/base.rb +70 -0
  17. data/lib/bosh/monitor/events/heartbeat.rb +139 -0
  18. data/lib/bosh/monitor/metric.rb +16 -0
  19. data/lib/bosh/monitor/plugins/base.rb +27 -0
  20. data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
  21. data/lib/bosh/monitor/plugins/datadog.rb +78 -0
  22. data/lib/bosh/monitor/plugins/dummy.rb +20 -0
  23. data/lib/bosh/monitor/plugins/email.rb +135 -0
  24. data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
  25. data/lib/bosh/monitor/plugins/logger.rb +13 -0
  26. data/lib/bosh/monitor/plugins/nats.rb +43 -0
  27. data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
  28. data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
  29. data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
  30. data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
  31. data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
  32. data/lib/bosh/monitor/plugins/varz.rb +17 -0
  33. data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
  34. data/lib/bosh/monitor/runner.rb +162 -0
  35. data/lib/bosh/monitor/version.rb +5 -0
  36. data/lib/bosh/monitor/yaml_helper.rb +18 -0
  37. metadata +246 -0
@@ -0,0 +1,139 @@
module Bosh::Monitor
  module Events
    # Heartbeat event built from an attributes hash.
    #
    # Besides exposing the identifying attributes (agent id, deployment, job,
    # index), the constructor reads the "vitals" section of the attributes and
    # converts it into a list of Metric objects available through #metrics.
    class Heartbeat < Base

      # Job names that are reported with the "core" role.
      CORE_JOBS = Set.new(%w(cloud_controller dea health_manager nats router routerv2 stager uaa vcap_redis)).freeze

      # Alternation of job name prefixes used by the two regexes below.
      SERVICE_JOBS_PREFIXES = %w(mysql mongodb redis rabbit postgresql vblob).join("|").freeze
      SERVICE_JOBS_GATEWAY_REGEX = /(#{SERVICE_JOBS_PREFIXES})_gateway$/i
      SERVICE_JOBS_NODE_REGEX = /(#{SERVICE_JOBS_PREFIXES})_node(.*)/i

      # Job names that are reported with the "service" role regardless of prefix.
      SERVICE_AUXILIARY_JOBS = Set.new(%w(serialization_data_server backup_manager)).freeze

      attr_reader :agent_id, :deployment, :job, :index, :metrics

      # @param attributes [Hash] heartbeat payload; the keys read here are
      #   "id", "timestamp", "deployment", "agent_id", "job", "index",
      #   "job_state" and "vitals"
      def initialize(attributes = {})
        super
        @kind = :heartbeat
        @metrics = []

        @id = @attributes["id"]
        @timestamp = parse_timestamp(@attributes["timestamp"])

        @deployment = @attributes["deployment"]
        @agent_id = @attributes["agent_id"]
        @job = @attributes["job"]
        @index = @attributes["index"].to_s
        @job_state = @attributes["job_state"]

        @tags = {}
        @tags["job"] = @job if @job
        # @index has been through #to_s, so it is never nil; check for an
        # empty string so that a missing index does not produce an empty tag.
        @tags["index"] = @index unless @index.empty?
        @tags["role"] = guess_role

        @vitals = @attributes["vitals"] || {}
        @load = @vitals["load"] || []
        @cpu = @vitals["cpu"] || {}
        @mem = @vitals["mem"] || {}
        @swap = @vitals["swap"] || {}
        @disk = @vitals["disk"] || {}
        @system_disk = @disk["system"] || {}
        @ephemeral_disk = @disk["ephemeral"] || {}
        @persistent_disk = @disk["persistent"] || {}

        populate_metrics
      end

      # Records an error when the id or timestamp is missing, or when the
      # timestamp could not be converted to a Time.
      def validate
        add_error("id is missing") if @id.nil?
        add_error("timestamp is missing") if @timestamp.nil?

        if @timestamp && !@timestamp.kind_of?(Time)
          add_error("timestamp is invalid")
        end
      end

      # Appends a Metric stamped with this heartbeat's timestamp and tags.
      # Nil (or false) values are skipped.
      #
      # @param name [String] metric name
      # @param value [Object] metric value
      def add_metric(name, value)
        @metrics << Metric.new(name, value, @timestamp.to_i, @tags) if value
      end

      # @return [String] one-line summary of the heartbeat
      def short_description
        "Heartbeat from #{@job}/#{@index} (#{@agent_id}) @ #{timestamp_for_display}"
      end

      def to_s
        short_description
      end

      # @return [Hash] symbol-keyed representation of the heartbeat
      def to_hash
        {
          :kind => "heartbeat",
          :id => @id,
          :timestamp => @timestamp.to_i,
          :deployment => @deployment,
          :agent_id => @agent_id,
          :job => @job,
          :index => @index,
          :job_state => @job_state,
          :vitals => @vitals
        }
      end

      # @return [String] JSON encoding of #to_hash
      def to_json
        Yajl::Encoder.encode(to_hash)
      end

      def to_plain_text
        short_description
      end

      private

      # Converts the raw timestamp to a Time. When the conversion fails the
      # raw value is returned unchanged so that #validate can report it.
      def parse_timestamp(raw)
        Time.at(raw)
      rescue StandardError
        raw
      end

      # The timestamp may be nil or an unconverted raw value (see
      # #parse_timestamp); only call #utc when the object supports it so that
      # describing an invalid heartbeat does not raise.
      def timestamp_for_display
        @timestamp.respond_to?(:utc) ? @timestamp.utc : @timestamp
      end

      # Translates the collected vitals into metrics. Entries that are absent
      # from the vitals are skipped by #add_metric.
      def populate_metrics
        add_metric("system.load.1m", @load[0]) if @load.kind_of?(Array)
        add_metric("system.cpu.user", @cpu["user"])
        add_metric("system.cpu.sys", @cpu["sys"])
        add_metric("system.cpu.wait", @cpu["wait"])
        add_metric("system.mem.percent", @mem["percent"])
        add_metric("system.mem.kb", @mem["kb"])
        add_metric("system.swap.percent", @swap["percent"])
        add_metric("system.swap.kb", @swap["kb"])
        add_metric("system.disk.system.percent", @system_disk["percent"])
        add_metric("system.disk.system.inode_percent", @system_disk["inode_percent"])
        add_metric("system.disk.ephemeral.percent", @ephemeral_disk["percent"])
        add_metric("system.disk.ephemeral.inode_percent", @ephemeral_disk["inode_percent"])
        add_metric("system.disk.persistent.percent", @persistent_disk["percent"])
        add_metric("system.disk.persistent.inode_percent", @persistent_disk["inode_percent"])
        add_metric("system.healthy", @job_state == "running" ? 1 : 0)
      end

      # Dashboard might want to partition jobs into several buckets,
      # so let's help it by applying a couple of heuristics.
      #
      # @return [String] "core", "service" or "unknown"
      def guess_role
        job_name = @job.to_s.downcase

        return "core" if CORE_JOBS.include?(job_name)
        return "service" if SERVICE_AUXILIARY_JOBS.include?(job_name)

        # job name prefixed by "service"
        return "service" if job_name.start_with?("service")

        # job name suffixed by "_gateway"
        return "service" if job_name =~ SERVICE_JOBS_GATEWAY_REGEX

        # job name contains "_node"
        return "service" if job_name =~ SERVICE_JOBS_NODE_REGEX

        "unknown"
      end

    end
  end
end
@@ -0,0 +1,16 @@
module Bosh::Monitor
  # Plain value holder for one measurement: a name, its value, the timestamp
  # it was taken at and the tags attached to it. All four fields are readable
  # and writable.
  class Metric

    attr_accessor :name, :value, :timestamp, :tags

    # Stores the four arguments as given; nothing is converted or validated.
    def initialize(name, value, timestamp, tags)
      @name, @value, @timestamp, @tags = name, value, timestamp, tags
    end
  end
end
@@ -0,0 +1,27 @@
module Bosh::Monitor
  module Plugins
    # Superclass for plugins. Subclasses are expected to override #run and
    # #process; the implementations here raise FatalError.
    class Base
      attr_reader :logger, :options, :event_kinds

      # Keeps a copy of the given options (an empty hash when none are
      # given) and picks up the logger from Bhm.
      def initialize(options = {})
        @logger = Bhm.logger
        @options = options ? options.dup : {}
        @event_kinds = []
      end

      # Options are accepted as-is by default.
      def validate_options
        true
      end

      def run
        raise FatalError.new("`run' method is not implemented in `#{self.class}'")
      end

      def process(event)
        raise FatalError.new("`process' method is not implemented in `#{self.class}'")
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ require 'aws-sdk'
2
+
module Bosh::Monitor
  module Plugins
    # Plugin that publishes the metrics of every Heartbeat event to
    # CloudWatch under the "BOSH/HealthMonitor" namespace. Other event kinds
    # are ignored.
    class CloudWatch < Base
      # @param options [Hash] passed on to AWS::CloudWatch.new
      def initialize(options={})
        # Run the Base initializer so that logger and event_kinds are set up
        # like in every other plugin (it also stores a copy of the options).
        super
      end

      # Lazily created CloudWatch client.
      def aws_cloud_watch
        @aws_cloud_watch ||= AWS::CloudWatch.new(@options)
      end

      # Nothing to start for this plugin.
      def run
      end

      # @param event [Object] only Heartbeat events are published
      def process(event)
        if event.is_a? Bosh::Monitor::Events::Heartbeat
          aws_cloud_watch.put_metric_data(heartbeat_to_cloudwatch_metric(event))
        end
      end

      private

      # Builds the put_metric_data payload for one heartbeat.
      def heartbeat_to_cloudwatch_metric(heartbeat)
        # Same dimensions for every metric of this heartbeat: build them once.
        heartbeat_dimensions = dimensions(heartbeat)

        {
          namespace: "BOSH/HealthMonitor",
          metric_data: heartbeat.metrics.map { |metric| build_metric(metric, heartbeat_dimensions) }
        }
      end

      # Dimensions identifying the sender of the heartbeat.
      #
      # These must be computed for each heartbeat. Caching them in an
      # instance variable would attribute every later heartbeat to whichever
      # job/index/agent happened to report first.
      def dimensions(heartbeat)
        [
          {name: "job", value: heartbeat.job},
          {name: "index", value: heartbeat.index},
          {name: "name", value: "#{heartbeat.job}/#{heartbeat.index}"},
          {name: "deployment", value: heartbeat.deployment},
          {name: "agent_id", value: heartbeat.agent_id}
        ]
      end

      # Converts one Metric into a CloudWatch metric datum with a UTC
      # ISO 8601 timestamp and stringified name and value.
      def build_metric(metric, dimensions)
        timestamp = Time.at(metric.timestamp).utc.iso8601

        {
          metric_name: metric.name.to_s,
          value: metric.value.to_s,
          timestamp: timestamp,
          dimensions: dimensions
        }
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'dogapi'
2
+
module Bosh::Monitor
  module Plugins
    # Plugin that forwards heartbeat metrics and alerts to DataDog. The
    # actual API calls are handed to EM.defer.
    class DataDog < Base

      # Alert severities that are reported with "normal" priority.
      NORMAL_PRIORITY = [:alert, :critical, :error]

      # Valid options are a Hash carrying both "api_key" and
      # "application_key".
      def validate_options
        return false unless options.kind_of?(Hash)

        !!(options["api_key"] && options["application_key"])
      end

      # Reads the credentials (and the optional PagerDuty service name)
      # from the options.
      def run
        @api_key = options["api_key"]
        @application_key = options["application_key"]
        @pagerduty_service_name = options["pagerduty_service_name"]

        logger.info("DataDog plugin is running...")
      end

      # Memoized API client. It is wrapped in a PagingDatadogClient when a
      # PagerDuty service name has been configured.
      def dog_client
        @dog_client ||= begin
          api_client = Dogapi::Client.new(@api_key, @application_key)

          if @pagerduty_service_name
            PagingDatadogClient.new(@pagerduty_service_name, api_client)
          else
            api_client
          end
        end
      end

      # Dispatches heartbeats and alerts; any other event is ignored.
      def process(event)
        if event.is_a?(Bosh::Monitor::Events::Heartbeat)
          EM.defer { process_heartbeat(event) }
        elsif event.is_a?(Bosh::Monitor::Events::Alert)
          EM.defer { process_alert(event) }
        end
      end

      private

      # Emits one point per heartbeat metric, tagged with the sender's
      # job, index, deployment and agent id.
      def process_heartbeat(heartbeat)
        tags = [
          "job:#{heartbeat.job}",
          "index:#{heartbeat.index}",
          "deployment:#{heartbeat.deployment}",
          "agent:#{heartbeat.agent_id}"
        ]

        heartbeat.metrics.each do |metric|
          series = "bosh.healthmonitor.#{metric.name}"
          sample = [Time.at(metric.timestamp), metric.value]
          dog_client.emit_points(series, [sample], tags: tags)
        end
      end

      # Emits the alert as a DataDog event.
      def process_alert(alert)
        details = alert.to_hash

        # DataDog only supports "low" and "normal" priority
        priority = normal_priority?(alert.severity) ? "normal" : "low"

        client = dog_client
        dog_event = Dogapi::Event.new(
          details[:summary],
          msg_title: details[:title],
          date_happened: details[:created_at],
          tags: ["source:#{details[:source]}"],
          priority: priority
        )

        client.emit_event(dog_event)
      end

      def normal_priority?(severity)
        NORMAL_PRIORITY.include?(severity)
      end
    end
  end
end
@@ -0,0 +1,20 @@
module Bosh::Monitor
  module Plugins
    # Plugin that logs every event it receives and keeps it in memory.
    class Dummy < Base
      # Events processed so far; nil until the first event has been processed.
      attr_reader :events

      def run
        logger.info("Dummy delivery agent is running...")
      end

      # Logs the event and appends it to the in-memory list.
      def process(event)
        logger.info("Processing event!")
        logger.info(event)
        (@events ||= []) << event
      end
    end
  end
end
@@ -0,0 +1,135 @@
module Bosh::Monitor
  module Plugins
    # Plugin that queues incoming events per event kind and, on a periodic
    # EventMachine timer, sends one email per kind containing the plain-text
    # form of every queued event.
    class Email < Base
      # Delivery interval used when the "interval" option is absent.
      DEFAULT_INTERVAL = 10

      # @param options [Hash] plugin options; "interval" overrides the
      #   delivery interval, "recipients" and "smtp" are read when sending
      def initialize(options = {})
        @queues = {}
        @lock = Mutex.new

        # Base accepts nil options, so do not assume a Hash here.
        if options.respond_to?(:has_key?) && options.has_key?("interval")
          @delivery_interval = options["interval"].to_f
        else
          @delivery_interval = DEFAULT_INTERVAL
        end

        @started = false
        super
      end

      # @return [Integer] number of queued events of the given kind
      def queue_size(kind)
        queue = @queues[kind]
        queue ? queue.size : 0
      end

      # Installs the periodic delivery timer once.
      #
      # @return [Boolean] false when the event loop is not running,
      #   true otherwise
      def run
        unless EM.reactor_running?
          logger.error("Email plugin can only be started when event loop is running")
          return false
        end

        return true if @started
        logger.info("Email plugin is running...")

        EM.add_periodic_timer(@delivery_interval) do
          begin
            process_queues
          rescue => e
            logger.error("Problem processing email queues: #{e}")
          end
        end
        @started = true
      end

      # @return [Boolean] whether recipients and the smtp host, port and
      #   from address are all configured
      def validate_options
        # Double negation so that a missing smtp key yields false, not nil.
        !!(options.kind_of?(Hash) &&
           options["recipients"].kind_of?(Array) &&
           options["smtp"].kind_of?(Hash) &&
           options["smtp"]["host"] &&
           options["smtp"]["port"] &&
           options["smtp"]["from"])
      end

      def recipients
        options["recipients"]
      end

      def smtp_options
        options["smtp"]
      end

      # Queues the event under its kind for the next delivery.
      def process(event)
        @lock.synchronize do
          @queues[event.kind] ||= []
          @queues[event.kind] << event
        end
      end

      # Sends one email per event kind that has queued events.
      #
      # Each queue is emptied in a single step while holding the lock, and
      # the subject count is taken from the events actually removed, so the
      # subject always matches the body even if events are queued meanwhile.
      def process_queues
        kinds = @lock.synchronize { @queues.keys }

        kinds.each do |kind|
          events = @lock.synchronize do
            queue = @queues[kind]
            queue.shift(queue.size)
          end
          next if events.empty?

          email_subject = "%s from BOSH Health Monitor" % [ pluralize(events.size, kind) ]
          email_body = events.map { |event| "#{event.to_plain_text}\n" }.join

          send_email_async(email_subject, email_body)
        end
      end

      # Hands the message to the EventMachine SMTP client and logs the
      # outcome from its callbacks. Errors raised while setting up the
      # delivery are logged, not propagated.
      #
      # @param subject [String]
      # @param body [String]
      # @param date [Time] value used for the "Date" header
      def send_email_async(subject, body, date = Time.now)
        started = Time.now
        logger.debug("Sending email...")

        # NOTE(review): the Date header receives the Time object as-is;
        # confirm the SMTP client renders it in the format mail servers expect.
        headers = {
          "From" => smtp_options["from"],
          "To" => recipients.join(", "),
          "Subject" => subject,
          "Date" => date,
          "Content-Type" => "text/plain; charset=\"iso-8859-1\""
        }

        smtp_client_options = {
          :domain => smtp_options["domain"],
          :host => smtp_options["host"],
          :port => smtp_options["port"],
          :from => smtp_options["from"],
          :to => recipients,
          :header => headers,
          :body => body
        }

        if smtp_options["tls"]
          smtp_client_options[:starttls] = true
        end

        if smtp_options["auth"]
          smtp_client_options[:auth] = {
            # FIXME: EM SMTP client will only work with plain auth
            :type => smtp_options["auth"].to_sym,
            :username => smtp_options["user"],
            :password => smtp_options["password"]
          }
        end

        email = EM::Protocols::SmtpClient.send(smtp_client_options)

        email.callback do
          logger.debug("Email sent (took #{Time.now - started} seconds)")
        end

        email.errback do |e|
          logger.error("Failed to send email: #{e}")
        end

      rescue => e
        logger.error("Error sending email: #{e}")
      end
    end
  end
end