bosh-monitor 1.5.0.pre.1113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/README +80 -0
  2. data/bin/bosh-monitor +30 -0
  3. data/bin/bosh-monitor-console +51 -0
  4. data/bin/listener +58 -0
  5. data/lib/bosh/monitor.rb +72 -0
  6. data/lib/bosh/monitor/agent.rb +51 -0
  7. data/lib/bosh/monitor/agent_manager.rb +295 -0
  8. data/lib/bosh/monitor/api_controller.rb +18 -0
  9. data/lib/bosh/monitor/config.rb +71 -0
  10. data/lib/bosh/monitor/core_ext.rb +8 -0
  11. data/lib/bosh/monitor/director.rb +76 -0
  12. data/lib/bosh/monitor/director_monitor.rb +33 -0
  13. data/lib/bosh/monitor/errors.rb +19 -0
  14. data/lib/bosh/monitor/event_processor.rb +109 -0
  15. data/lib/bosh/monitor/events/alert.rb +92 -0
  16. data/lib/bosh/monitor/events/base.rb +70 -0
  17. data/lib/bosh/monitor/events/heartbeat.rb +139 -0
  18. data/lib/bosh/monitor/metric.rb +16 -0
  19. data/lib/bosh/monitor/plugins/base.rb +27 -0
  20. data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
  21. data/lib/bosh/monitor/plugins/datadog.rb +78 -0
  22. data/lib/bosh/monitor/plugins/dummy.rb +20 -0
  23. data/lib/bosh/monitor/plugins/email.rb +135 -0
  24. data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
  25. data/lib/bosh/monitor/plugins/logger.rb +13 -0
  26. data/lib/bosh/monitor/plugins/nats.rb +43 -0
  27. data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
  28. data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
  29. data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
  30. data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
  31. data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
  32. data/lib/bosh/monitor/plugins/varz.rb +17 -0
  33. data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
  34. data/lib/bosh/monitor/runner.rb +162 -0
  35. data/lib/bosh/monitor/version.rb +5 -0
  36. data/lib/bosh/monitor/yaml_helper.rb +18 -0
  37. metadata +246 -0
@@ -0,0 +1,139 @@
module Bosh::Monitor
  module Events
    # Heartbeat event: carries the reporting agent's identity (deployment,
    # job, index, agent id), its job state and a "vitals" hash that is
    # flattened into a list of Metric objects at construction time.
    class Heartbeat < Base

      # Job names that are classified as "core" by guess_role.
      CORE_JOBS = Set.new(%w(cloud_controller dea health_manager nats router routerv2 stager uaa vcap_redis))

      # Alternation of service name prefixes, interpolated into the regexes below.
      SERVICE_JOBS_PREFIXES = %w(mysql mongodb redis rabbit postgresql vblob).join("|")
      SERVICE_JOBS_GATEWAY_REGEX = /(#{SERVICE_JOBS_PREFIXES})_gateway$/i
      SERVICE_JOBS_NODE_REGEX = /(#{SERVICE_JOBS_PREFIXES})_node(.*)/i

      # Job names that are classified as "service" without matching any pattern.
      SERVICE_AUXILIARY_JOBS = Set.new(%w(serialization_data_server backup_manager))

      attr_reader :agent_id, :deployment, :job, :index, :metrics

      # @param attributes [Hash] raw heartbeat payload with string keys
      #   ("id", "timestamp", "deployment", "agent_id", "job", "index",
      #   "job_state", "vitals").
      def initialize(attributes = {})
        # NOTE(review): relies on Base#initialize to set @attributes; Base is
        # not visible here - confirm.
        super
        @kind = :heartbeat
        @metrics = []

        @id = @attributes["id"]
        # Keep the raw value when it cannot be converted to a Time, so that
        # #validate can report it as invalid instead of the constructor raising.
        @timestamp = Time.at(@attributes["timestamp"]) rescue @attributes["timestamp"]

        @deployment = @attributes["deployment"]
        @agent_id = @attributes["agent_id"]
        @job = @attributes["job"]
        # to_s means a missing index becomes "" (never nil).
        @index = @attributes["index"].to_s
        @job_state = @attributes["job_state"]

        @tags = {}
        @tags["job"] = @job if @job
        @tags["index"] = @index if @index
        @tags["role"] = guess_role

        # Every vitals section defaults to an empty container so that
        # populate_metrics can index into it unconditionally.
        @vitals = @attributes["vitals"] || {}
        @load = @vitals["load"] || []
        @cpu = @vitals["cpu"] || {}
        @mem = @vitals["mem"] || {}
        @swap = @vitals["swap"] || {}
        @disk = @vitals["disk"] || {}
        @system_disk = @disk["system"] || {}
        @ephemeral_disk = @disk["ephemeral"] || {}
        @persistent_disk = @disk["persistent"] || {}

        populate_metrics
      end

      # Records an error for a missing id, a missing timestamp, or a
      # timestamp that is present but is not a Time.
      def validate
        add_error("id is missing") if @id.nil?
        add_error("timestamp is missing") if @timestamp.nil?

        if @timestamp && !@timestamp.kind_of?(Time)
          add_error("timestamp is invalid")
        end
      end

      # Appends a Metric stamped with this heartbeat's timestamp and tags.
      # Nothing is added when value is nil or false.
      def add_metric(name, value)
        @metrics << Metric.new(name, value, @timestamp.to_i, @tags) if value
      end

      # @return [String] one-line human readable summary of the heartbeat.
      def short_description
        # The constructor keeps timestamps it could not convert (including
        # nil), so only call #utc when the value actually supports it;
        # otherwise describing an invalid heartbeat would raise NoMethodError.
        stamp = @timestamp.respond_to?(:utc) ? @timestamp.utc : @timestamp
        "Heartbeat from #{@job}/#{@index} (#{@agent_id}) @ #{stamp}"
      end

      def to_s
        self.short_description
      end

      # @return [Hash] symbol-keyed representation; timestamp is an Integer.
      def to_hash
        {
          :kind => "heartbeat",
          :id => @id,
          :timestamp => @timestamp.to_i,
          :deployment => @deployment,
          :agent_id => @agent_id,
          :job => @job,
          :index => @index,
          :job_state => @job_state,
          :vitals => @vitals
        }
      end

      # @return [String] to_hash encoded with Yajl.
      def to_json
        Yajl::Encoder.encode(self.to_hash)
      end

      def to_plain_text
        self.short_description
      end

      private

      # Translates the vitals sections into metrics. Entries whose value is
      # missing are skipped by add_metric; "system.healthy" is always emitted
      # because its value is either 1 or 0.
      def populate_metrics
        add_metric("system.load.1m", @load[0]) if @load.kind_of?(Array)
        add_metric("system.cpu.user", @cpu["user"])
        add_metric("system.cpu.sys", @cpu["sys"])
        add_metric("system.cpu.wait", @cpu["wait"])
        add_metric("system.mem.percent", @mem["percent"])
        add_metric("system.mem.kb", @mem["kb"])
        add_metric("system.swap.percent", @swap["percent"])
        add_metric("system.swap.kb", @swap["kb"])
        add_metric("system.disk.system.percent", @system_disk["percent"])
        add_metric("system.disk.system.inode_percent", @system_disk["inode_percent"])
        add_metric("system.disk.ephemeral.percent", @ephemeral_disk["percent"])
        add_metric("system.disk.ephemeral.inode_percent", @ephemeral_disk["inode_percent"])
        add_metric("system.disk.persistent.percent", @persistent_disk["percent"])
        add_metric("system.disk.persistent.inode_percent", @persistent_disk["inode_percent"])
        add_metric("system.healthy", @job_state == "running" ? 1 : 0)
      end

      # Classifies the job as "core", "service" or "unknown".
      #
      # Dashboard might want to partition jobs into several buckets,
      # so let's help it by applying a couple of heuristics.
      def guess_role
        job_name = @job.to_s.downcase

        return "core" if CORE_JOBS.include?(job_name)

        return "service" if SERVICE_AUXILIARY_JOBS.include?(job_name)

        # job name prefixed by "service"
        return "service" if job_name =~ /^service/i

        # job name suffixed by "_gateway"
        return "service" if job_name =~ SERVICE_JOBS_GATEWAY_REGEX

        # job name contains "_node"
        return "service" if job_name =~ SERVICE_JOBS_NODE_REGEX

        "unknown"
      end

    end
  end
end
@@ -0,0 +1,16 @@
module Bosh::Monitor
  # Plain value holder for one measurement: a name, its value, the
  # timestamp it was taken at and the tags attached to it.
  # All four fields are readable and writable.
  class Metric

    attr_accessor :name, :value, :timestamp, :tags

    # Stores the four arguments as given; no conversion or validation.
    def initialize(name, value, timestamp, tags)
      @name, @value, @timestamp, @tags = name, value, timestamp, tags
    end
  end
end
@@ -0,0 +1,27 @@
module Bosh::Monitor
  module Plugins
    # Common parent of the monitor plugins: keeps a logger, a private copy
    # of the plugin options and a list of event kinds, and declares the
    # `run' / `process' hooks that concrete plugins must override.
    class Base
      attr_reader :logger, :options, :event_kinds

      # @param options [Hash, nil] plugin settings; a shallow copy is kept,
      #   and nil is treated as an empty Hash.
      def initialize(options = {})
        @logger = Bhm.logger
        @options = options ? options.dup : {}
        @event_kinds = []
      end

      # Accepts any options by default.
      def validate_options
        true
      end

      # Must be overridden; the default implementation always raises.
      def run
        raise FatalError, "`run' method is not implemented in `#{self.class}'"
      end

      # Must be overridden; the default implementation always raises.
      def process(event)
        raise FatalError, "`process' method is not implemented in `#{self.class}'"
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ require 'aws-sdk'
2
+
module Bosh::Monitor
  module Plugins
    # Publishes the metrics of every Heartbeat event to AWS CloudWatch under
    # the "BOSH/HealthMonitor" namespace. Other event types are ignored.
    class CloudWatch < Base
      # @param options [Hash] passed unchanged to AWS::CloudWatch.new.
      #
      # NOTE(review): this does not call super, so the `logger' and
      # `event_kinds' readers inherited from Base return nil on instances of
      # this plugin - confirm that is intended.
      def initialize(options={})
        @options = options
      end

      # Lazily built CloudWatch client, reused for every request.
      def aws_cloud_watch
        @aws_cloud_watch ||= AWS::CloudWatch.new(@options)
      end

      # Nothing to start.
      def run
      end

      # Sends one put_metric_data request per heartbeat.
      def process(event)
        if event.is_a? Bosh::Monitor::Events::Heartbeat
          aws_cloud_watch.put_metric_data(heartbeat_to_cloudwatch_metric(event))
        end
      end

      private

      # Builds the put_metric_data payload for a heartbeat.
      def heartbeat_to_cloudwatch_metric(heartbeat)
        # Built once per heartbeat and shared by all of its metrics.
        heartbeat_dimensions = dimensions(heartbeat)

        {
          namespace: "BOSH/HealthMonitor",
          metric_data: heartbeat.metrics.collect do |metric|
            build_metric(metric, heartbeat_dimensions)
          end
        }
      end

      # Dimensions identifying the instance that sent the heartbeat.
      #
      # Deliberately NOT memoized on the plugin: the plugin instance handles
      # heartbeats from many agents, and caching the first result would label
      # every later heartbeat with the first agent's job/index/deployment.
      def dimensions(heartbeat)
        [
          {name: "job", value: heartbeat.job},
          {name: "index", value: heartbeat.index},
          {name: "name", value: "#{heartbeat.job}/#{heartbeat.index}"},
          {name: "deployment", value: heartbeat.deployment},
          {name: "agent_id", value: heartbeat.agent_id}
        ]
      end

      # Converts a single Metric into a metric_data entry; the value is sent
      # as a string and the timestamp as a UTC ISO 8601 string.
      def build_metric(metric, dimensions)
        timestamp = Time.at(metric.timestamp).utc.iso8601

        {
          metric_name: metric.name.to_s,
          value: metric.value.to_s,
          timestamp: timestamp,
          dimensions: dimensions
        }
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'dogapi'
2
+
module Bosh::Monitor
  module Plugins
    # Forwards heartbeat metrics and alerts to DataDog. Each event is handed
    # to EM.defer, so the API calls run outside the caller of #process.
    class DataDog < Base

      # Alert severities reported with "normal" priority; all others are "low".
      NORMAL_PRIORITY = [:alert, :critical, :error]

      # @return [Boolean] true when both API keys are present in the options.
      def validate_options
        return false unless options.kind_of?(Hash)

        !!(options["api_key"] && options["application_key"])
      end

      # Reads the credentials (and the optional PagerDuty service name)
      # from the plugin options.
      def run
        @api_key, @application_key, @pagerduty_service_name =
          options["api_key"], options["application_key"], options["pagerduty_service_name"]

        logger.info("DataDog plugin is running...")
      end

      # Lazily built client; wrapped in PagingDatadogClient when a PagerDuty
      # service name was configured.
      def dog_client
        @dog_client ||= begin
          plain_client = Dogapi::Client.new(@api_key, @application_key)
          if @pagerduty_service_name
            PagingDatadogClient.new(@pagerduty_service_name, plain_client)
          else
            plain_client
          end
        end
      end

      # Dispatches heartbeats and alerts; any other event is ignored.
      def process(event)
        if Bosh::Monitor::Events::Heartbeat === event
          EM.defer { process_heartbeat(event) }
        elsif Bosh::Monitor::Events::Alert === event
          EM.defer { process_alert(event) }
        end
      end

      private

      # Emits one point per metric, tagged with the heartbeat's origin.
      def process_heartbeat(heartbeat)
        origin_tags = [
          "job:#{heartbeat.job}",
          "index:#{heartbeat.index}",
          "deployment:#{heartbeat.deployment}",
          "agent:#{heartbeat.agent_id}"
        ]

        heartbeat.metrics.each do |metric|
          sample = [Time.at(metric.timestamp), metric.value]
          dog_client.emit_points("bosh.healthmonitor.#{metric.name}", [sample], tags: origin_tags)
        end
      end

      # Emits the alert as a DataDog event.
      def process_alert(alert)
        details = alert.to_hash

        # DataDog only supports "low" and "normal" priority
        priority = normal_priority?(alert.severity) ? "normal" : "low"

        dd_event = Dogapi::Event.new(
          details[:summary],
          msg_title: details[:title],
          date_happened: details[:created_at],
          tags: ["source:#{details[:source]}"],
          priority: priority
        )
        dog_client.emit_event(dd_event)
      end

      def normal_priority?(severity)
        NORMAL_PRIORITY.include?(severity)
      end
    end
  end
end
@@ -0,0 +1,20 @@
module Bosh::Monitor
  module Plugins
    # Plugin that only logs each event and remembers it in memory.
    class Dummy < Base
      def run
        logger.info("Dummy delivery agent is running...")
      end

      # Logs the event and appends it to the in-memory list, which is
      # created on first use.
      def process(event)
        logger.info("Processing event!")
        logger.info(event)
        (@events ||= []) << event
      end

      # Events seen so far; nil until the first event has been processed.
      def events
        @events
      end
    end
  end
end
@@ -0,0 +1,135 @@
module Bosh::Monitor
  module Plugins
    # Batches events per kind and, on a periodic EventMachine timer, sends
    # one plain-text email per non-empty kind to the configured recipients.
    class Email < Base
      # Delivery interval used when the options carry no "interval" key.
      DEFAULT_INTERVAL = 10

      # @param options [Hash, nil] plugin settings; "interval" overrides the
      #   delivery interval, "recipients" and "smtp" are read at send time.
      def initialize(options = {})
        @queues = {}
        @lock = Mutex.new

        # Base#initialize accepts nil options, and validate_options is what
        # reports non-Hash options, so don't blow up here on either of them.
        if options.kind_of?(Hash) && options.has_key?("interval")
          @delivery_interval = options["interval"].to_f
        else
          @delivery_interval = DEFAULT_INTERVAL
        end

        @started = false
        super
      end

      # @return [Integer] number of queued events of the given kind.
      def queue_size(kind)
        return 0 if @queues[kind].nil?
        @queues[kind].size
      end

      # Installs the periodic delivery timer (once).
      #
      # @return [Boolean] false when the EventMachine reactor is not
      #   running, true otherwise.
      def run
        unless EM.reactor_running?
          logger.error("Email plugin can only be started when event loop is running")
          return false
        end

        return true if @started
        logger.info("Email plugin is running...")

        EM.add_periodic_timer(@delivery_interval) do
          begin
            process_queues
          rescue => e
            logger.error("Problem processing email queues: #{e}")
          end
        end
        @started = true
      end

      # @return [Boolean] true when recipients and the SMTP host, port and
      #   sender are all configured. The double negation guarantees a real
      #   Boolean even when one of the SMTP settings is nil.
      def validate_options
        !!(options.kind_of?(Hash) &&
           options["recipients"].kind_of?(Array) &&
           options["smtp"].kind_of?(Hash) &&
           options["smtp"]["host"] &&
           options["smtp"]["port"] &&
           options["smtp"]["from"])
      end

      def recipients
        options["recipients"]
      end

      def smtp_options
        options["smtp"]
      end

      # Queues the event under its kind; delivery happens in process_queues.
      def process(event)
        @lock.synchronize do
          @queues[event.kind] ||= []
          @queues[event.kind] << event
        end
      end

      # Drains every non-empty queue and sends one email per event kind.
      def process_queues
        # Snapshot the kinds under the lock so that #process adding a new
        # kind cannot interfere with the iteration.
        kinds = @lock.synchronize { @queues.keys }

        kinds.each do |kind|
          # Take the whole batch atomically; the count used in the subject
          # is then guaranteed to match the events listed in the body.
          batch = @lock.synchronize do
            queue = @queues[kind]
            queue.shift(queue.size)
          end
          next if batch.empty?

          email_subject = "%s from BOSH Health Monitor" % [ pluralize(batch.size, kind) ]
          email_body = ""
          batch.each do |event|
            email_body << event.to_plain_text << "\n"
          end

          send_email_async(email_subject, email_body)
        end
      end

      # Sends the email through EM's SMTP client. The outcome is only
      # logged; errors raised while preparing the request are logged too and
      # not propagated.
      def send_email_async(subject, body, date = Time.now)
        started = Time.now
        logger.debug("Sending email...")

        headers = {
          "From" => smtp_options["from"],
          "To" => recipients.join(", "),
          "Subject" => subject,
          "Date" => date,
          "Content-Type" => "text/plain; charset=\"iso-8859-1\""
        }

        smtp_client_options = {
          :domain => smtp_options["domain"],
          :host => smtp_options["host"],
          :port => smtp_options["port"],
          :from => smtp_options["from"],
          :to => recipients,
          :header => headers,
          :body => body
        }

        if smtp_options["tls"]
          smtp_client_options[:starttls] = true
        end

        if smtp_options["auth"]
          smtp_client_options[:auth] = {
            # FIXME: EM SMTP client will only work with plain auth
            :type => smtp_options["auth"].to_sym,
            :username => smtp_options["user"],
            :password => smtp_options["password"]
          }
        end

        email = EM::Protocols::SmtpClient.send(smtp_client_options)

        email.callback do
          logger.debug("Email sent (took #{Time.now - started} seconds)")
        end

        email.errback do |e|
          logger.error("Failed to send email: #{e}")
        end

      rescue => e
        logger.error("Error sending email: #{e}")
      end
    end
  end
end