bosh-monitor 1.5.0.pre.1113
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +80 -0
- data/bin/bosh-monitor +30 -0
- data/bin/bosh-monitor-console +51 -0
- data/bin/listener +58 -0
- data/lib/bosh/monitor.rb +72 -0
- data/lib/bosh/monitor/agent.rb +51 -0
- data/lib/bosh/monitor/agent_manager.rb +295 -0
- data/lib/bosh/monitor/api_controller.rb +18 -0
- data/lib/bosh/monitor/config.rb +71 -0
- data/lib/bosh/monitor/core_ext.rb +8 -0
- data/lib/bosh/monitor/director.rb +76 -0
- data/lib/bosh/monitor/director_monitor.rb +33 -0
- data/lib/bosh/monitor/errors.rb +19 -0
- data/lib/bosh/monitor/event_processor.rb +109 -0
- data/lib/bosh/monitor/events/alert.rb +92 -0
- data/lib/bosh/monitor/events/base.rb +70 -0
- data/lib/bosh/monitor/events/heartbeat.rb +139 -0
- data/lib/bosh/monitor/metric.rb +16 -0
- data/lib/bosh/monitor/plugins/base.rb +27 -0
- data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
- data/lib/bosh/monitor/plugins/datadog.rb +78 -0
- data/lib/bosh/monitor/plugins/dummy.rb +20 -0
- data/lib/bosh/monitor/plugins/email.rb +135 -0
- data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
- data/lib/bosh/monitor/plugins/logger.rb +13 -0
- data/lib/bosh/monitor/plugins/nats.rb +43 -0
- data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
- data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
- data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
- data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
- data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
- data/lib/bosh/monitor/plugins/varz.rb +17 -0
- data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
- data/lib/bosh/monitor/runner.rb +162 -0
- data/lib/bosh/monitor/version.rb +5 -0
- data/lib/bosh/monitor/yaml_helper.rb +18 -0
- metadata +246 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Events
|
3
|
+
# Event describing one heartbeat reported for an agent.
#
# On construction the raw attribute hash is unpacked into identity fields
# (deployment, job, index, agent id), a tag hash and a list of Metric
# objects built from the reported vitals.
class Heartbeat < Base

  # Job names tagged with the "core" role.
  CORE_JOBS = Set.new(%w(cloud_controller dea health_manager nats router routerv2 stager uaa vcap_redis))

  # Service name prefixes used to recognise gateway and node jobs.
  SERVICE_JOBS_PREFIXES = %w(mysql mongodb redis rabbit postgresql vblob).join("|")
  SERVICE_JOBS_GATEWAY_REGEX = /(#{SERVICE_JOBS_PREFIXES})_gateway$/i
  SERVICE_JOBS_NODE_REGEX = /(#{SERVICE_JOBS_PREFIXES})_node(.*)/i

  # Additional job names tagged with the "service" role.
  SERVICE_AUXILIARY_JOBS = Set.new(%w(serialization_data_server backup_manager))

  attr_reader :agent_id, :deployment, :job, :index, :metrics

  # @param attributes [Hash] raw heartbeat payload with string keys
  #   ("id", "timestamp", "deployment", "agent_id", "job", "index",
  #   "job_state", "vitals").
  #   NOTE(review): @attributes is assumed to be set by Base#initialize,
  #   which is not visible here - confirm.
  def initialize(attributes = {})
    super
    @kind = :heartbeat
    @metrics = []

    @id = @attributes["id"]

    # Keep the raw value when it cannot be converted, so that #validate can
    # tell "missing" (nil) apart from "invalid" (present but not a Time).
    raw_timestamp = @attributes["timestamp"]
    @timestamp = begin
      Time.at(raw_timestamp)
    rescue StandardError
      raw_timestamp
    end

    @deployment = @attributes["deployment"]
    @agent_id = @attributes["agent_id"]
    @job = @attributes["job"]
    @index = @attributes["index"].to_s
    @job_state = @attributes["job_state"]

    @tags = {}
    @tags["job"] = @job if @job
    @tags["index"] = @index if @index
    @tags["role"] = guess_role

    @vitals = @attributes["vitals"] || {}
    @load = @vitals["load"] || []
    @cpu = @vitals["cpu"] || {}
    @mem = @vitals["mem"] || {}
    @swap = @vitals["swap"] || {}
    @disk = @vitals["disk"] || {}
    @system_disk = @disk["system"] || {}
    @ephemeral_disk = @disk["ephemeral"] || {}
    @persistent_disk = @disk["persistent"] || {}

    populate_metrics
  end

  # Records an error for a missing id, a missing timestamp, or a timestamp
  # that could not be turned into a Time.
  def validate
    add_error("id is missing") if @id.nil?
    add_error("timestamp is missing") if @timestamp.nil?

    if @timestamp && !@timestamp.kind_of?(Time)
      add_error("timestamp is invalid")
    end
  end

  # Appends a Metric stamped with this heartbeat's timestamp and tags.
  # Nothing is added when value is nil or false.
  #
  # @param name [String] metric name
  # @param value [Object] metric value
  def add_metric(name, value)
    @metrics << Metric.new(name, value, @timestamp.to_i, @tags) if value
  end

  # One-line summary of the heartbeat.
  #
  # The timestamp may be nil or a raw, unconvertible value (see #validate),
  # so it is only converted to UTC when it supports that; otherwise it is
  # printed as-is instead of raising NoMethodError.
  #
  # @return [String]
  def short_description
    reported_at = @timestamp.respond_to?(:utc) ? @timestamp.utc : @timestamp
    "Heartbeat from #{@job}/#{@index} (#{@agent_id}) @ #{reported_at}"
  end

  # @return [String] same text as #short_description
  def to_s
    short_description
  end

  # @return [Hash] symbol-keyed representation of the heartbeat
  def to_hash
    {
      :kind => "heartbeat",
      :id => @id,
      :timestamp => @timestamp.to_i,
      :deployment => @deployment,
      :agent_id => @agent_id,
      :job => @job,
      :index => @index,
      :job_state => @job_state,
      :vitals => @vitals
    }
  end

  # Serialises #to_hash with Yajl. Extra arguments are accepted and ignored
  # so the method can also be called by generators that pass a state or
  # options argument to #to_json.
  #
  # @return [String] JSON document
  def to_json(*_args)
    Yajl::Encoder.encode(to_hash)
  end

  # @return [String] same text as #short_description
  def to_plain_text
    short_description
  end

  private

  # Converts the reported vitals into metrics; absent values are skipped by
  # #add_metric. "system.healthy" is 1 when job_state is "running", else 0.
  def populate_metrics
    add_metric("system.load.1m", @load[0]) if @load.kind_of?(Array)
    add_metric("system.cpu.user", @cpu["user"])
    add_metric("system.cpu.sys", @cpu["sys"])
    add_metric("system.cpu.wait", @cpu["wait"])
    add_metric("system.mem.percent", @mem["percent"])
    add_metric("system.mem.kb", @mem["kb"])
    add_metric("system.swap.percent", @swap["percent"])
    add_metric("system.swap.kb", @swap["kb"])
    add_metric("system.disk.system.percent", @system_disk["percent"])
    add_metric("system.disk.system.inode_percent", @system_disk["inode_percent"])
    add_metric("system.disk.ephemeral.percent", @ephemeral_disk["percent"])
    add_metric("system.disk.ephemeral.inode_percent", @ephemeral_disk["inode_percent"])
    add_metric("system.disk.persistent.percent", @persistent_disk["percent"])
    add_metric("system.disk.persistent.inode_percent", @persistent_disk["inode_percent"])
    add_metric("system.healthy", @job_state == "running" ? 1 : 0)
  end

  # Dashboard might want to partition jobs into several buckets, so a few
  # heuristics on the job name are applied to help it.
  #
  # @return [String] "core", "service" or "unknown"
  def guess_role
    job_name = @job.to_s.downcase

    return "core" if CORE_JOBS.include?(job_name)
    return "service" if SERVICE_AUXILIARY_JOBS.include?(job_name)

    # job name prefixed by "service"
    return "service" if job_name =~ /^service/i

    # job name suffixed by "_gateway"
    return "service" if job_name =~ SERVICE_JOBS_GATEWAY_REGEX

    # job name contains "_node"
    return "service" if job_name =~ SERVICE_JOBS_NODE_REGEX

    "unknown"
  end

end
|
138
|
+
end
|
139
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
# Single named measurement with the time it was taken and its tags.
class Metric

  attr_accessor :name, :value, :timestamp, :tags

  # @param name [Object] metric name
  # @param value [Object] measured value
  # @param timestamp [Object] time the value was taken
  # @param tags [Object] tags attached to the metric
  def initialize(name, value, timestamp, tags)
    @name, @value, @timestamp, @tags = name, value, timestamp, tags
  end
end
|
16
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Plugins
|
3
|
+
# Common superclass for plugins: stores the logger, a private copy of the
# options and an (initially empty) list of event kinds. Subclasses are
# expected to override #run and #process.
class Base
  attr_reader :logger, :options, :event_kinds

  # @param options [Hash, nil] plugin configuration; a shallow copy is kept,
  #   and a missing value becomes an empty hash
  def initialize(options = {})
    @logger = Bhm.logger
    @options = options ? options.dup : {}
    @event_kinds = []
  end

  # Default validation accepts any options.
  #
  # @return [true]
  def validate_options
    true
  end

  # @raise [FatalError] always, unless overridden by the subclass
  def run
    message = "`run' method is not implemented in `#{self.class}'"
    raise FatalError, message
  end

  # @param event [Object] event handed to the plugin
  # @raise [FatalError] always, unless overridden by the subclass
  def process(event)
    message = "`process' method is not implemented in `#{self.class}'"
    raise FatalError, message
  end
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'aws-sdk'
|
2
|
+
|
3
|
+
module Bosh::Monitor
|
4
|
+
module Plugins
|
5
|
+
# Plugin that publishes the metrics of every Heartbeat event to CloudWatch
# under the "BOSH/HealthMonitor" namespace. Other event types are ignored.
class CloudWatch < Base
  # @param options [Hash] passed unchanged to AWS::CloudWatch.new
  def initialize(options={})
    # Let Base set up the logger and event kinds, like the other plugins do.
    super
    # Keep the caller's own options object for the AWS client.
    @options = options
  end

  # @return [AWS::CloudWatch] client, created on first use and then reused
  def aws_cloud_watch
    @aws_cloud_watch ||= AWS::CloudWatch.new(@options)
  end

  # No start-up work is performed.
  def run
  end

  # Sends the heartbeat's metrics to CloudWatch; non-heartbeat events are
  # ignored.
  #
  # @param event [Object] event handed to the plugin
  def process(event)
    if event.is_a? Bosh::Monitor::Events::Heartbeat
      aws_cloud_watch.put_metric_data(heartbeat_to_cloudwatch_metric(event))
    end
  end

  private

  # Builds the put_metric_data payload for one heartbeat.
  #
  # @param heartbeat [Bosh::Monitor::Events::Heartbeat]
  # @return [Hash] namespace plus one metric entry per heartbeat metric
  def heartbeat_to_cloudwatch_metric(heartbeat)
    heartbeat_dimensions = dimensions(heartbeat)

    {
      namespace: "BOSH/HealthMonitor",
      metric_data: heartbeat.metrics.map do |metric|
        build_metric(metric, heartbeat_dimensions)
      end
    }
  end

  # Dimensions identifying the source of a heartbeat.
  #
  # Computed per heartbeat on purpose: caching the result on the plugin
  # would attach the first heartbeat's job/index/deployment/agent_id to
  # every later heartbeat.
  #
  # @param heartbeat [Bosh::Monitor::Events::Heartbeat]
  # @return [Array<Hash>]
  def dimensions(heartbeat)
    [
      {name: "job", value: heartbeat.job},
      {name: "index", value: heartbeat.index},
      {name: "name", value: "#{heartbeat.job}/#{heartbeat.index}"},
      {name: "deployment", value: heartbeat.deployment},
      {name: "agent_id", value: heartbeat.agent_id}
    ]
  end

  # Converts one Metric into a CloudWatch metric entry.
  #
  # @param metric [Bosh::Monitor::Metric]
  # @param dimensions [Array<Hash>] dimensions shared by the heartbeat
  # @return [Hash]
  def build_metric(metric, dimensions)
    # NOTE(review): Time#iso8601 comes from the 'time' stdlib extension,
    # which is presumably loaded by a dependency - confirm.
    timestamp = Time.at(metric.timestamp).utc.iso8601

    {
      metric_name: metric.name.to_s,
      value: metric.value.to_s,
      timestamp: timestamp,
      dimensions: dimensions
    }
  end
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'dogapi'
|
2
|
+
|
3
|
+
module Bosh::Monitor
|
4
|
+
module Plugins
|
5
|
+
# Plugin that forwards heartbeat metrics and alerts to DataDog. Each event
# is handled inside EM.defer.
class DataDog < Base

  # Alert severities sent to DataDog with "normal" priority; every other
  # severity is sent as "low".
  NORMAL_PRIORITY = [:alert, :critical, :error]

  # @return [Boolean] true when options is a Hash containing both
  #   "api_key" and "application_key"
  def validate_options
    return false unless options.kind_of?(Hash)

    !!(options["api_key"] && options["application_key"])
  end

  # Reads the credentials and the optional PagerDuty service name from the
  # options and logs that the plugin has started.
  def run
    @api_key = options["api_key"]
    @application_key = options["application_key"]
    @pagerduty_service_name = options["pagerduty_service_name"]

    logger.info("DataDog plugin is running...")
  end

  # Client used to talk to DataDog, built on first use. It is wrapped in a
  # PagingDatadogClient when a PagerDuty service name is configured.
  def dog_client
    @dog_client ||= begin
      plain_client = Dogapi::Client.new(@api_key, @application_key)

      if @pagerduty_service_name
        PagingDatadogClient.new(@pagerduty_service_name, plain_client)
      else
        plain_client
      end
    end
  end

  # Dispatches heartbeats and alerts to their handlers via EM.defer; any
  # other event is ignored.
  #
  # @param event [Object] event handed to the plugin
  def process(event)
    case event
    when Bosh::Monitor::Events::Heartbeat
      EM.defer { process_heartbeat(event) }
    when Bosh::Monitor::Events::Alert
      EM.defer { process_alert(event) }
    end
  end

  private

  # Emits one DataDog point per heartbeat metric, tagged with the
  # heartbeat's job, index, deployment and agent id.
  def process_heartbeat(heartbeat)
    tags = [
      "job:#{heartbeat.job}",
      "index:#{heartbeat.index}",
      "deployment:#{heartbeat.deployment}",
      "agent:#{heartbeat.agent_id}"
    ]

    heartbeat.metrics.each do |metric|
      metric_name = "bosh.healthmonitor.#{metric.name}"
      point = [Time.at(metric.timestamp), metric.value]
      dog_client.emit_points(metric_name, [point], tags: tags)
    end
  end

  # Emits the alert as a DataDog event.
  def process_alert(alert)
    details = alert.to_hash
    msg = details[:summary]
    title = details[:title]
    source = details[:source]
    timestamp = details[:created_at]

    # DataDog only supports "low" and "normal" priority
    priority = normal_priority?(alert.severity) ? "normal" : "low"

    datadog_event = Dogapi::Event.new(msg,
                                      msg_title: title,
                                      date_happened: timestamp,
                                      tags: ["source:#{source}"],
                                      priority: priority)
    dog_client.emit_event(datadog_event)
  end

  # @return [Boolean] whether the severity is listed in NORMAL_PRIORITY
  def normal_priority?(severity)
    NORMAL_PRIORITY.include?(severity)
  end
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Plugins
|
3
|
+
# Plugin that only logs each event and remembers it in memory.
class Dummy < Base
  # Events received so far, in arrival order; nil until the first event has
  # been processed.
  attr_reader :events

  # Logs that the plugin has started.
  def run
    logger.info("Dummy delivery agent is running...")
  end

  # Logs the event and appends it to the in-memory list.
  #
  # @param event [Object] event handed to the plugin
  # @return [Array] all events received so far
  def process(event)
    logger.info("Processing event!")
    logger.info(event)
    (@events ||= []) << event
  end
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,135 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Plugins
|
3
|
+
# Plugin that batches events per kind and periodically mails each batch
# through EventMachine's SMTP client.
class Email < Base
  # Delivery interval used when the "interval" option is absent.
  DEFAULT_INTERVAL = 10

  # @param options [Hash] plugin configuration; "interval" overrides the
  #   delivery interval, "recipients" and "smtp" are read when sending
  def initialize(options = {})
    @queues = {}
    @lock = Mutex.new

    # Only inspect the options when they are a Hash, so that a malformed
    # configuration is reported by #validate_options instead of raising here.
    if options.kind_of?(Hash) && options.has_key?("interval")
      @delivery_interval = options["interval"].to_f
    else
      @delivery_interval = DEFAULT_INTERVAL
    end

    @started = false
    super
  end

  # @param kind [Object] event kind
  # @return [Integer] number of queued events of that kind (0 if none)
  def queue_size(kind)
    return 0 if @queues[kind].nil?
    @queues[kind].size
  end

  # Starts the periodic delivery timer. Requires a running EventMachine
  # reactor; calling it again after a successful start is a no-op.
  #
  # @return [Boolean] false when the reactor is not running, true otherwise
  def run
    unless EM.reactor_running?
      logger.error("Email plugin can only be started when event loop is running")
      return false
    end

    return true if @started
    logger.info("Email plugin is running...")

    EM.add_periodic_timer(@delivery_interval) do
      begin
        process_queues
      rescue => e
        # Log and carry on so that one failed run does not propagate out of
        # the timer callback.
        logger.error("Problem processing email queues: #{e}")
      end
    end
    @started = true
  end

  # @return [Boolean] true when options is a Hash with an Array of
  #   "recipients" and an "smtp" Hash providing "host", "port" and "from"
  def validate_options
    # !! so that a missing key yields false rather than nil
    !!(options.kind_of?(Hash) &&
       options["recipients"].kind_of?(Array) &&
       options["smtp"].kind_of?(Hash) &&
       options["smtp"]["host"] &&
       options["smtp"]["port"] &&
       options["smtp"]["from"])
  end

  # @return [Array] configured recipients
  def recipients
    options["recipients"]
  end

  # @return [Hash] configured SMTP settings
  def smtp_options
    options["smtp"]
  end

  # Queues the event under its kind until the next delivery run.
  #
  # @param event [Object] event responding to #kind
  def process(event)
    @lock.synchronize do
      @queues[event.kind] ||= []
      @queues[event.kind] << event
    end
  end

  # Sends one email per event kind that has queued events.
  #
  # All queues are drained in a single critical section and the emails are
  # built from the drained batches afterwards. The count in the subject
  # therefore always matches the events in the body, and @queues is never
  # iterated while #process may be adding a new kind.
  def process_queues
    batches = @lock.synchronize do
      @queues.each_with_object({}) do |(kind, queue), drained|
        drained[kind] = queue.shift(queue.size) unless queue.empty?
      end
    end

    batches.each_pair do |kind, events|
      # pluralize is not defined in this class; presumably provided
      # elsewhere in the project - confirm.
      email_subject = "%s from BOSH Health Monitor" % [ pluralize(events.size, kind) ]
      email_body = events.map { |event| "#{event.to_plain_text}\n" }.join

      send_email_async(email_subject, email_body)
    end
  end

  # Hands a plain-text email to EM::Protocols::SmtpClient. The outcome is
  # only logged: success and failure callbacks write to the logger, and any
  # error raised while setting up the delivery is logged and swallowed.
  #
  # @param subject [String] subject line
  # @param body [String] message body
  # @param date [Time] value of the Date header
  def send_email_async(subject, body, date = Time.now)
    started = Time.now
    logger.debug("Sending email...")

    headers = {
      "From" => smtp_options["from"],
      "To" => recipients.join(", "),
      "Subject" => subject,
      "Date" => date,
      "Content-Type" => "text/plain; charset=\"iso-8859-1\""
    }

    smtp_client_options = {
      :domain => smtp_options["domain"],
      :host => smtp_options["host"],
      :port => smtp_options["port"],
      :from => smtp_options["from"],
      :to => recipients,
      :header => headers,
      :body => body
    }

    if smtp_options["tls"]
      smtp_client_options[:starttls] = true
    end

    if smtp_options["auth"]
      smtp_client_options[:auth] = {
        # FIXME: EM SMTP client will only work with plain auth
        :type => smtp_options["auth"].to_sym,
        :username => smtp_options["user"],
        :password => smtp_options["password"]
      }
    end

    email = EM::Protocols::SmtpClient.send(smtp_client_options)

    email.callback do
      logger.debug("Email sent (took #{Time.now - started} seconds)")
    end

    email.errback do |e|
      logger.error("Failed to send email: #{e}")
    end

  rescue => e
    logger.error("Error sending email: #{e}")
  end
end
|
134
|
+
end
|
135
|
+
end
|