bosh-monitor 1.5.0.pre.1113
Sign up to get free protection for your applications and to get access to all the features.
- data/README +80 -0
- data/bin/bosh-monitor +30 -0
- data/bin/bosh-monitor-console +51 -0
- data/bin/listener +58 -0
- data/lib/bosh/monitor.rb +72 -0
- data/lib/bosh/monitor/agent.rb +51 -0
- data/lib/bosh/monitor/agent_manager.rb +295 -0
- data/lib/bosh/monitor/api_controller.rb +18 -0
- data/lib/bosh/monitor/config.rb +71 -0
- data/lib/bosh/monitor/core_ext.rb +8 -0
- data/lib/bosh/monitor/director.rb +76 -0
- data/lib/bosh/monitor/director_monitor.rb +33 -0
- data/lib/bosh/monitor/errors.rb +19 -0
- data/lib/bosh/monitor/event_processor.rb +109 -0
- data/lib/bosh/monitor/events/alert.rb +92 -0
- data/lib/bosh/monitor/events/base.rb +70 -0
- data/lib/bosh/monitor/events/heartbeat.rb +139 -0
- data/lib/bosh/monitor/metric.rb +16 -0
- data/lib/bosh/monitor/plugins/base.rb +27 -0
- data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
- data/lib/bosh/monitor/plugins/datadog.rb +78 -0
- data/lib/bosh/monitor/plugins/dummy.rb +20 -0
- data/lib/bosh/monitor/plugins/email.rb +135 -0
- data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
- data/lib/bosh/monitor/plugins/logger.rb +13 -0
- data/lib/bosh/monitor/plugins/nats.rb +43 -0
- data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
- data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
- data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
- data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
- data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
- data/lib/bosh/monitor/plugins/varz.rb +17 -0
- data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
- data/lib/bosh/monitor/runner.rb +162 -0
- data/lib/bosh/monitor/version.rb +5 -0
- data/lib/bosh/monitor/yaml_helper.rb +18 -0
- metadata +246 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
module Bosh::Monitor
  module Events
    # Periodic heartbeat event emitted by BOSH agents. Carries the job's
    # identity (deployment/job/index/agent_id) plus VM vitals, which are
    # flattened into a list of Metric objects via #populate_metrics.
    class Heartbeat < Base
      CORE_JOBS = Set.new(%w(cloud_controller dea health_manager nats router routerv2 stager uaa vcap_redis))

      SERVICE_JOBS_PREFIXES = %w(mysql mongodb redis rabbit postgresql vblob).join("|")
      SERVICE_JOBS_GATEWAY_REGEX = /(#{SERVICE_JOBS_PREFIXES})_gateway$/i
      SERVICE_JOBS_NODE_REGEX = /(#{SERVICE_JOBS_PREFIXES})_node(.*)/i

      SERVICE_AUXILIARY_JOBS = Set.new(%w(serialization_data_server backup_manager))

      attr_reader :agent_id, :deployment, :job, :index, :metrics

      def initialize(attributes = {})
        super
        @kind = :heartbeat
        @metrics = []

        @id = @attributes["id"]
        # Fall back to the raw attribute when it cannot be coerced into a
        # Time; #validate then flags the non-Time value as invalid.
        @timestamp = Time.at(@attributes["timestamp"]) rescue @attributes["timestamp"]

        @deployment = @attributes["deployment"]
        @agent_id = @attributes["agent_id"]
        @job = @attributes["job"]
        @index = @attributes["index"].to_s
        @job_state = @attributes["job_state"]

        @tags = {}
        @tags["job"] = @job if @job
        @tags["index"] = @index if @index
        @tags["role"] = guess_role

        @vitals = @attributes["vitals"] || {}
        @load = @vitals["load"] || []
        @cpu = @vitals["cpu"] || {}
        @mem = @vitals["mem"] || {}
        @swap = @vitals["swap"] || {}
        @disk = @vitals["disk"] || {}
        @system_disk = @disk["system"] || {}
        @ephemeral_disk = @disk["ephemeral"] || {}
        @persistent_disk = @disk["persistent"] || {}

        populate_metrics
      end

      # Records an error for each missing/invalid required attribute.
      def validate
        add_error("id is missing") if @id.nil?
        add_error("timestamp is missing") if @timestamp.nil?

        if @timestamp && !@timestamp.kind_of?(Time)
          add_error("timestamp is invalid")
        end
      end

      # Appends a tagged metric, silently skipping nil values.
      def add_metric(name, value)
        @metrics << Metric.new(name, value, @timestamp.to_i, @tags) if value
      end

      def short_description
        "Heartbeat from #{@job}/#{@index} (#{@agent_id}) @ #{@timestamp.utc}"
      end

      def to_s
        short_description
      end

      def to_hash
        {
          :kind => "heartbeat",
          :id => @id,
          :timestamp => @timestamp.to_i,
          :deployment => @deployment,
          :agent_id => @agent_id,
          :job => @job,
          :index => @index,
          :job_state => @job_state,
          :vitals => @vitals
        }
      end

      def to_json
        Yajl::Encoder.encode(to_hash)
      end

      def to_plain_text
        short_description
      end

      private

      # Flattens the vitals hash into individual named metrics.
      def populate_metrics
        add_metric("system.load.1m", @load[0]) if @load.kind_of?(Array)

        {
          "system.cpu.user" => @cpu["user"],
          "system.cpu.sys" => @cpu["sys"],
          "system.cpu.wait" => @cpu["wait"],
          "system.mem.percent" => @mem["percent"],
          "system.mem.kb" => @mem["kb"],
          "system.swap.percent" => @swap["percent"],
          "system.swap.kb" => @swap["kb"],
          "system.disk.system.percent" => @system_disk["percent"],
          "system.disk.system.inode_percent" => @system_disk["inode_percent"],
          "system.disk.ephemeral.percent" => @ephemeral_disk["percent"],
          "system.disk.ephemeral.inode_percent" => @ephemeral_disk["inode_percent"],
          "system.disk.persistent.percent" => @persistent_disk["percent"],
          "system.disk.persistent.inode_percent" => @persistent_disk["inode_percent"]
        }.each { |metric_name, metric_value| add_metric(metric_name, metric_value) }

        add_metric("system.healthy", @job_state == "running" ? 1 : 0)
      end

      # Dashboard might want to partition jobs into several buckets,
      # so let's help it by applying a couple of heuristics.
      def guess_role
        job_name = @job.to_s.downcase

        return "core" if CORE_JOBS.include?(job_name)
        return "service" if SERVICE_AUXILIARY_JOBS.include?(job_name)

        # job name prefixed by "service"
        return "service" if job_name =~ /^service/i

        # job name suffixed by "_gateway"
        return "service" if job_name =~ SERVICE_JOBS_GATEWAY_REGEX

        # job name contains "_node"
        return "service" if job_name =~ SERVICE_JOBS_NODE_REGEX

        "unknown"
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
class Metric
|
3
|
+
|
4
|
+
attr_accessor :name
|
5
|
+
attr_accessor :value
|
6
|
+
attr_accessor :timestamp
|
7
|
+
attr_accessor :tags
|
8
|
+
|
9
|
+
def initialize(name, value, timestamp, tags)
|
10
|
+
@name = name
|
11
|
+
@value = value
|
12
|
+
@timestamp = timestamp
|
13
|
+
@tags = tags
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Plugins
|
3
|
+
class Base
|
4
|
+
attr_reader :logger
|
5
|
+
attr_reader :options
|
6
|
+
attr_reader :event_kinds
|
7
|
+
|
8
|
+
def initialize(options = {})
|
9
|
+
@logger = Bhm.logger
|
10
|
+
@options = (options || {}).dup
|
11
|
+
@event_kinds = []
|
12
|
+
end
|
13
|
+
|
14
|
+
def validate_options
|
15
|
+
true
|
16
|
+
end
|
17
|
+
|
18
|
+
def run
|
19
|
+
raise FatalError, "`run' method is not implemented in `#{self.class}'"
|
20
|
+
end
|
21
|
+
|
22
|
+
def process(event)
|
23
|
+
raise FatalError, "`process' method is not implemented in `#{self.class}'"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'aws-sdk'

module Bosh::Monitor
  module Plugins
    # Forwards heartbeat metrics to AWS CloudWatch under the
    # "BOSH/HealthMonitor" namespace. Non-heartbeat events are ignored.
    class CloudWatch < Base
      def initialize(options={})
        # NOTE(review): intentionally does not call super, so @logger and
        # @event_kinds stay unset; options go straight to the AWS SDK.
        @options = options
      end

      # Lazily constructed CloudWatch client (credentials from @options).
      def aws_cloud_watch
        @aws_cloud_watch ||= AWS::CloudWatch.new(@options)
      end

      # No background work needed for this plugin.
      def run
      end

      # Only heartbeats carry metrics; everything else is dropped.
      def process(event)
        if event.is_a? Bosh::Monitor::Events::Heartbeat
          aws_cloud_watch.put_metric_data(heartbeat_to_cloudwatch_metric(event))
        end
      end

      private

      # Builds the put_metric_data request payload for one heartbeat.
      def heartbeat_to_cloudwatch_metric(heartbeat)
        dims = dimensions(heartbeat)
        {
          namespace: "BOSH/HealthMonitor",
          metric_data: heartbeat.metrics.collect do |metric|
            build_metric(metric, dims)
          end
        }
      end

      # Identifying dimensions attached to every datapoint.
      # BUG FIX: previously this was memoized with `@dimensions ||=`, which
      # pinned the dimensions of the FIRST heartbeat ever processed and
      # mislabeled metrics from every other job/index/deployment afterwards.
      # Dimensions are now built fresh for each heartbeat.
      def dimensions(heartbeat)
        [
          {name: "job", value: heartbeat.job},
          {name: "index", value: heartbeat.index},
          {name: "name", value: "#{heartbeat.job}/#{heartbeat.index}"},
          {name: "deployment", value: heartbeat.deployment},
          {name: "agent_id", value: heartbeat.agent_id}
        ]
      end

      # One CloudWatch datapoint; values are stringified and the timestamp
      # is converted to UTC ISO-8601 as the API expects.
      def build_metric(metric, dimensions)
        timestamp = Time.at(metric.timestamp).utc.iso8601

        {
          metric_name: metric.name.to_s,
          value: metric.value.to_s,
          timestamp: timestamp,
          dimensions: dimensions
        }
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'dogapi'

module Bosh::Monitor
  module Plugins
    # Ships heartbeats to DataDog as metrics and alerts as DataDog events,
    # optionally routing alerts through a PagerDuty-aware client.
    class DataDog < Base
      # Severities that map to DataDog "normal" priority; all others → "low".
      NORMAL_PRIORITY = [:alert, :critical, :error]

      def validate_options
        !!(options.kind_of?(Hash) && options["api_key"] && options["application_key"])
      end

      def run
        @api_key = options["api_key"]
        @application_key = options["application_key"]
        @pagerduty_service_name = options["pagerduty_service_name"]

        logger.info("DataDog plugin is running...")
      end

      # Memoized DataDog client; wrapped in a paging client when a
      # PagerDuty service name is configured.
      def dog_client
        @dog_client ||= begin
          client = Dogapi::Client.new(@api_key, @application_key)
          @pagerduty_service_name ? PagingDatadogClient.new(@pagerduty_service_name, client) : client
        end
      end

      # Dispatches work off the reactor thread; unknown event kinds ignored.
      def process(event)
        case event
        when Bosh::Monitor::Events::Heartbeat
          EM.defer { process_heartbeat(event) }
        when Bosh::Monitor::Events::Alert
          EM.defer { process_alert(event) }
        else
          #ignore
        end
      end

      private

      # Emits one datapoint per heartbeat metric, tagged with job identity.
      def process_heartbeat(heartbeat)
        tags = [
          "job:#{heartbeat.job}",
          "index:#{heartbeat.index}",
          "deployment:#{heartbeat.deployment}",
          "agent:#{heartbeat.agent_id}"
        ]

        heartbeat.metrics.each do |metric|
          point = [Time.at(metric.timestamp), metric.value]
          dog_client.emit_points("bosh.healthmonitor.#{metric.name}", [point], tags: tags)
        end
      end

      # Converts a BOSH alert into a DataDog event.
      def process_alert(alert)
        details = alert.to_hash
        msg = details[:summary]
        title = details[:title]
        source = details[:source]
        timestamp = details[:created_at]

        # DataDog only supports "low" and "normal" priority
        priority = normal_priority?(alert.severity) ? "normal" : "low"
        dd_event = Dogapi::Event.new(msg,
                                     msg_title: title,
                                     date_happened: timestamp,
                                     tags: ["source:#{source}"],
                                     priority: priority)
        dog_client.emit_event(dd_event)
      end

      def normal_priority?(severity)
        NORMAL_PRIORITY.include?(severity)
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Test/debug plugin: logs each event and keeps them all in memory
    # so they can be inspected via #events.
    class Dummy < Base
      def run
        logger.info("Dummy delivery agent is running...")
      end

      def process(event)
        logger.info("Processing event!")
        logger.info(event)
        (@events ||= []) << event
      end

      # Every event processed so far (nil until the first one arrives).
      def events
        @events
      end
    end
  end
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
module Bosh::Monitor
  module Plugins
    # Batches incoming events per kind and periodically emails plain-text
    # digests via EventMachine's SMTP client.
    class Email < Base
      # Seconds between delivery attempts when no "interval" is configured.
      DEFAULT_INTERVAL = 10

      def initialize(options = {})
        @queues = {}
        @lock = Mutex.new

        @delivery_interval =
          if options.has_key?("interval")
            options["interval"].to_f
          else
            DEFAULT_INTERVAL
          end

        @started = false
        super
      end

      # Number of events currently queued for the given event kind.
      def queue_size(kind)
        queue = @queues[kind]
        queue.nil? ? 0 : queue.size
      end

      # Starts the periodic delivery timer. Requires a running EM reactor;
      # safe to call more than once.
      def run
        unless EM.reactor_running?
          logger.error("Email plugin can only be started when event loop is running")
          return false
        end

        return true if @started
        logger.info("Email plugin is running...")

        EM.add_periodic_timer(@delivery_interval) do
          begin
            process_queues
          rescue => e
            logger.error("Problem processing email queues: #{e}")
          end
        end
        @started = true
      end

      def validate_options
        options.kind_of?(Hash) &&
          options["recipients"].kind_of?(Array) &&
          options["smtp"].kind_of?(Hash) &&
          options["smtp"]["host"] &&
          options["smtp"]["port"] &&
          options["smtp"]["from"] &&
          true # force the whole method to return Boolean
      end

      def recipients
        options["recipients"]
      end

      def smtp_options
        options["smtp"]
      end

      # Enqueues an event under its kind; delivery happens on the timer.
      def process(event)
        @lock.synchronize do
          (@queues[event.kind] ||= []) << event
        end
      end

      # Drains every non-empty queue into a digest email and sends it.
      def process_queues
        @queues.each_pair do |kind, queue|
          next if queue.empty?
          email_subject = "%s from BOSH Health Monitor" % [ pluralize(queue_size(kind), kind) ]
          email_body = ""

          @lock.synchronize do
            while (event = queue.shift)
              email_body << event.to_plain_text << "\n"
            end
          end

          send_email_async(email_subject, email_body)
        end
      end

      # Fire-and-forget SMTP delivery; failures are logged, never raised.
      def send_email_async(subject, body, date = Time.now)
        started = Time.now
        logger.debug("Sending email...")

        headers = {
          "From" => smtp_options["from"],
          "To" => recipients.join(", "),
          "Subject" => subject,
          "Date" => date,
          "Content-Type" => "text/plain; charset=\"iso-8859-1\""
        }

        client_options = {
          :domain => smtp_options["domain"],
          :host => smtp_options["host"],
          :port => smtp_options["port"],
          :from => smtp_options["from"],
          :to => recipients,
          :header => headers,
          :body => body
        }

        client_options[:starttls] = true if smtp_options["tls"]

        if smtp_options["auth"]
          client_options[:auth] = {
            # FIXME: EM SMTP client will only work with plain auth
            :type => smtp_options["auth"].to_sym,
            :username => smtp_options["user"],
            :password => smtp_options["password"]
          }
        end

        deferrable = EM::Protocols::SmtpClient.send(client_options)

        deferrable.callback do
          logger.debug("Email sent (took #{Time.now - started} seconds)")
        end

        deferrable.errback do |e|
          logger.error("Failed to send email: #{e}")
        end

      rescue => e
        logger.error("Error sending email: #{e}")
      end
    end
  end
end
|