bosh-monitor 1.5.0.pre.1113
Sign up to get free protection for your applications and to get access to all the features.
- data/README +80 -0
- data/bin/bosh-monitor +30 -0
- data/bin/bosh-monitor-console +51 -0
- data/bin/listener +58 -0
- data/lib/bosh/monitor.rb +72 -0
- data/lib/bosh/monitor/agent.rb +51 -0
- data/lib/bosh/monitor/agent_manager.rb +295 -0
- data/lib/bosh/monitor/api_controller.rb +18 -0
- data/lib/bosh/monitor/config.rb +71 -0
- data/lib/bosh/monitor/core_ext.rb +8 -0
- data/lib/bosh/monitor/director.rb +76 -0
- data/lib/bosh/monitor/director_monitor.rb +33 -0
- data/lib/bosh/monitor/errors.rb +19 -0
- data/lib/bosh/monitor/event_processor.rb +109 -0
- data/lib/bosh/monitor/events/alert.rb +92 -0
- data/lib/bosh/monitor/events/base.rb +70 -0
- data/lib/bosh/monitor/events/heartbeat.rb +139 -0
- data/lib/bosh/monitor/metric.rb +16 -0
- data/lib/bosh/monitor/plugins/base.rb +27 -0
- data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
- data/lib/bosh/monitor/plugins/datadog.rb +78 -0
- data/lib/bosh/monitor/plugins/dummy.rb +20 -0
- data/lib/bosh/monitor/plugins/email.rb +135 -0
- data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
- data/lib/bosh/monitor/plugins/logger.rb +13 -0
- data/lib/bosh/monitor/plugins/nats.rb +43 -0
- data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
- data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
- data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
- data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
- data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
- data/lib/bosh/monitor/plugins/varz.rb +17 -0
- data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
- data/lib/bosh/monitor/runner.rb +162 -0
- data/lib/bosh/monitor/version.rb +5 -0
- data/lib/bosh/monitor/yaml_helper.rb +18 -0
- metadata +246 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
module Bosh::Monitor

  # Small Sinatra application that serves the monitor's varz data over HTTP.
  class ApiController < Sinatra::Base

    # Turn off Sinatra's exception display, re-raising and error dumping.
    configure do
      disable :show_exceptions, :raise_errors, :dump_errors
    end

    # GET /varz
    # Responds with Bhm.varz encoded as JSON, terminated by a newline.
    get "/varz" do
      content_type :json
      Yajl::Encoder.encode(Bhm.varz, :terminator => "\n")
    end

  end

end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Bosh::Monitor

  # Module-level configuration state for the monitor. All values live in
  # instance variables of the Bosh::Monitor module object itself.
  class << self

    attr_accessor :logger, :director, :intervals, :mbus, :event_mbus,
                  :agent_manager, :event_processor

    attr_accessor :http_port, :http_user, :http_password
    attr_accessor :plugins, :varz

    attr_accessor :nats

    # Builds the monitor's runtime state from a parsed configuration Hash.
    #
    # Reads the "logfile", "intervals", "director", "mbus", "http",
    # "event_mbus", "loglevel" and "plugins" keys.
    #
    # @param config [Hash] parsed configuration
    # @raise [ConfigError] if config is not a Hash
    def config=(config)
      validate_config(config)

      # The logger is assigned first; the objects created below are built
      # after it is available.
      @logger    = Logging.logger(config["logfile"] || STDOUT)
      @intervals = OpenStruct.new(config["intervals"])
      @director  = Director.new(config["director"])
      @mbus      = OpenStruct.new(config["mbus"])

      @event_processor = EventProcessor.new
      @agent_manager   = AgentManager.new(event_processor)

      @varz = {}

      # Interval defaults, applied only where the config left a value unset
      {
        :prune_events      => 30,
        :poll_director     => 60,
        :poll_grace_period => 30,
        :log_stats         => 60,
        :analyze_agents    => 60,
        :agent_timeout     => 60,
        :rogue_agent_alert => 120
      }.each do |name, default_value|
        @intervals.send("#{name}=", default_value) unless @intervals.send(name)
      end

      http_settings = config["http"]
      if http_settings.is_a?(Hash)
        @http_port, @http_user, @http_password =
          http_settings.values_at("port", "user", "password")
      end

      @event_mbus = OpenStruct.new(config["event_mbus"]) if config["event_mbus"]

      level = config["loglevel"]
      @logger.level = level.to_sym if level.is_a?(String)

      @plugins = config["plugins"] if config["plugins"].is_a?(Enumerable)
    end

    # Stores a single varz entry, creating the varz Hash on first use.
    #
    # @param key varz entry name
    # @param value value to store
    # @return the stored value
    def set_varz(key, value)
      (@varz ||= {})[key] = value
    end

    # Ensures the configuration is a Hash.
    #
    # @param config [Object] candidate configuration
    # @raise [ConfigError] if config is not a Hash
    def validate_config(config)
      return if config.is_a?(Hash)

      raise ConfigError, "Invalid config format, Hash expected, #{config.class} given"
    end

  end

end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Bosh::Monitor
  # HTTP client for the director endpoints the monitor polls
  # (the deployment list and the VMs of a single deployment).
  class Director

    # @param options [Hash] director settings; the "endpoint", "user" and
    #   "password" keys are read and coerced to strings
    def initialize(options)
      @endpoint = options["endpoint"].to_s
      @user     = options["user"].to_s
      @password = options["password"].to_s
    end

    # Fetches the list of deployments from the director.
    #
    # @return [Array] parsed JSON response
    # @raise [DirectorError] on a non-200 status, unparseable JSON or a
    #   response that is not a JSON array
    def get_deployments
      fetch_json_array("/deployments", "deployments")
    end

    # Fetches the VM list of the named deployment.
    #
    # @param name [String] deployment name (interpolated into the URI as-is)
    # @return [Array] parsed JSON response
    # @raise [DirectorError] on a non-200 status, unparseable JSON or a
    #   response that is not a JSON array
    def get_deployment_vms(name)
      fetch_json_array("/deployments/#{name}/vms", "deployment `#{name}'")
    end

    private

    # Performs a GET on the given director path and returns the parsed array.
    # `subject` is only used to build the error message.
    def fetch_json_array(path, subject)
      http = perform_request(:get, path)

      body   = http.response
      status = http.response_header.http_status

      # Compare as a string so the check is the same whether the HTTP client
      # reports the status as "200" or as 200.
      # NOTE(review): which type is returned depends on the em-http-request
      # version in use - confirm against the gemspec.
      if status.to_s != "200"
        raise DirectorError, "Cannot get #{subject} from director at #{http.uri}: #{status} #{body}"
      end

      parse_json(body, Array)
    end

    # Parses a JSON document, optionally checking the type of the result.
    #
    # @param json [String] raw response body
    # @param expected_type [Class, nil] required class of the parsed value
    # @raise [DirectorError] if parsing fails or the type does not match
    def parse_json(json, expected_type = nil)
      result = Yajl::Parser.parse(json)

      if expected_type && !result.kind_of?(expected_type)
        raise DirectorError, "Invalid JSON response format, expected #{expected_type}, got #{result.class}"
      end

      result

    rescue Yajl::ParseError => e
      raise DirectorError, "Cannot parse director response: #{e.message}"
    end

    # JMS and GO: This effectively turns async requests into synchronous requests.
    # This is a very bad thing to do on eventmachine because it will block the single
    # event loop. This code should be removed and all requests converted
    # to "the eventmachine way".
    #
    # Issues the request and suspends the current Fiber until the request's
    # callback or errback resumes it; the request object is returned in both
    # cases, so callers must inspect the status themselves.
    # NOTE(review): presumably always invoked from inside a non-root Fiber,
    # since Fiber.yield is used - confirm against callers.
    def perform_request(method, uri)
      f = Fiber.current

      target_uri = @endpoint + uri

      headers = {
        "authorization" => [@user, @password]
      }

      http = EM::HttpRequest.new(target_uri).send(method.to_sym, :head => headers)

      http.callback { f.resume(http) }
      http.errback  { f.resume(http) }

      Fiber.yield

    rescue URI::Error
      raise DirectorError, "Invalid URI: #{target_uri}"
    end

  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Bosh::Monitor
  # Listens for director alerts on the message bus and hands well-formed
  # ones to the event processor.
  class DirectorMonitor
    # @param config object responding to #nats, #logger and #event_processor
    def initialize(config)
      @nats = config.nats
      @logger = config.logger
      @event_processor = config.event_processor
    end

    # Subscribes to 'hm.director.alert'. Each message is parsed as JSON and,
    # when it carries all required keys, processed as an :alert event.
    # Messages that are not valid JSON are logged and dropped instead of
    # raising out of the subscription callback.
    def subscribe
      @nats.subscribe('hm.director.alert') do |msg|
        begin
          alert = Yajl::Parser.parse(msg)
        rescue Yajl::ParseError => e
          @logger.error("Invalid payload from director: cannot parse JSON (#{e.message}). #{msg.inspect}")
          next
        end

        if valid_payload?(alert)
          @event_processor.process(:alert, alert)
        end
      end
    end

    private

    # Checks that the parsed payload is a Hash containing every required key.
    # Logs the reason and returns false otherwise.
    #
    # @param payload [Object] parsed JSON message
    # @return [Boolean]
    def valid_payload?(payload)
      # JSON such as `null`, `[]` or a bare scalar parses to a non-Hash value;
      # reject it here rather than failing on payload.keys.
      unless payload.is_a?(Hash)
        @logger.error("Invalid payload from director: Hash expected, #{payload.class} given. #{payload.inspect}")
        return false
      end

      missing_keys = %w(id severity title summary created_at) - payload.keys
      valid = missing_keys.empty?

      unless valid
        first_missing_key = missing_keys.first
        @logger.error("Invalid payload from director: the key '#{first_missing_key}' was missing. #{payload.inspect}")
      end

      valid
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Bosh::Monitor

  # Base class of all monitor errors. Subclasses declare their numeric code
  # with the `code` class macro, which defines an instance-level #code reader.
  class Error < StandardError
    class << self
      # Defines an instance method #code on the receiving class that returns
      # the given value.
      def code(code = nil)
        define_method(:code) { code }
      end
    end
  end

  class FatalError < Error
    code 42
  end

  class ConfigError < Error
    code 101
  end

  class DirectorError < Error
    code 201
  end

  class ConnectionError < Error
    code 202
  end

  class EventProcessingError < Error
    code 301
  end

  class InvalidEvent < Error
    code 302
  end

  class PluginError < Error
    code 401
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module Bosh::Monitor
  # Receives raw events, drops duplicates (by kind and event id) and forwards
  # the rest to the plugins registered for that event kind.
  class EventProcessor
    attr_reader :plugins

    def initialize
      @events = {}
      @plugins = {}

      @lock = Mutex.new
      @logger = Bhm.logger
    end

    # Registers a plugin for the given event kinds and starts it.
    #
    # @param plugin plugin object; #validate_options is consulted when defined
    # @param event_kinds [Array] kinds (converted with #to_sym) to subscribe to
    # @raise [FatalError] if the plugin reports invalid options
    def add_plugin(plugin, event_kinds = [])
      options_ok = !plugin.respond_to?(:validate_options) || plugin.validate_options
      raise FatalError, "Invalid plugin options for `#{plugin.class}'" unless options_ok

      @lock.synchronize do
        event_kinds.each do |event_kind|
          (@plugins[event_kind.to_sym] ||= Set.new) << plugin
        end
        plugin.run
      end
    end

    # Builds an event from the data, records its id and dispatches it to the
    # interested plugins. Duplicates and events nobody listens for are only
    # logged.
    #
    # @param kind event kind (converted with #to_sym)
    # @param data event attributes passed to Bhm::Events::Base.create!
    # @return [true]
    def process(kind, data)
      kind = kind.to_sym
      event = Bhm::Events::Base.create!(kind, data)

      first_sighting = @lock.synchronize do
        seen = (@events[kind] ||= {})

        if seen.has_key?(event.id)
          @logger.debug("Ignoring duplicate #{event.kind} `#{event.id}'")
          false
        else
          # We don't really need to store event itself for the moment,
          # as we only use its id to dedup new events.
          seen[event.id] = { :received_at => Time.now.to_i }
          true
        end
      end
      return true unless first_sighting

      subscribers = @plugins[kind]
      if subscribers.nil? || subscribers.empty?
        @logger.debug("No plugins are interested in `#{event.kind}' event")
        return true
      end

      subscribers.each { |plugin| plugin_process(plugin, event) }

      true
    end

    # @return [Integer] number of remembered event ids across all kinds
    def events_count
      @lock.synchronize do
        @events.values.inject(0) { |sum, seen| sum + seen.size }
      end
    end

    # Starts (once) a background thread that prunes remembered events every
    # `interval` seconds, using the same value as the event lifetime.
    #
    # @param interval [Numeric] seconds between pruning runs
    # @return [Thread] the pruning thread
    def enable_pruning(interval)
      return @reaper if @reaper

      @reaper = Thread.new do
        loop do
          # Some events might be in the system up to 2 * interval
          # seconds this way, but it seems to be a reasonable trade-off
          prune_events(interval)
          sleep(interval)
        end
      end
    end

    # Forgets every event received `lifetime` or more seconds ago and logs
    # how many were pruned and how many were examined. Errors are logged,
    # not raised.
    #
    # @param lifetime [Numeric] maximum age in seconds
    def prune_events(lifetime)
      @lock.synchronize do
        total_count = 0
        pruned_count = 0

        @events.each_value do |seen|
          size_before = seen.size
          total_count += size_before

          seen.delete_if do |_id, data|
            data[:received_at] <= Time.now.to_i - lifetime
          end

          pruned_count += size_before - seen.size
        end

        @logger.debug("Pruned %s" % [ pluralize(pruned_count, "old event") ])
        @logger.debug("Total %s" % [ pluralize(total_count, "event") ])
      end
    rescue => error
      @logger.error("Error pruning events: #{error}")
      @logger.error(error.backtrace.join("\n"))
    end

    private

    # Hands the event to one plugin; a Bhm::PluginError is logged rather
    # than propagated, so the remaining plugins still get the event.
    def plugin_process(plugin, event)
      plugin.process(event)
    rescue Bhm::PluginError => error
      @logger.error("Plugin #{plugin.class} failed to process #{event.kind}: #{error}")
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Bosh::Monitor
  module Events
    # Alert event built from a Hash of attributes ("id", "severity", "title",
    # "summary", "source", "created_at").
    class Alert < Base

      # Considering Bosh::Agent::Alert
      SEVERITY_MAP = {
        1 => :alert,
        2 => :critical,
        3 => :error,
        4 => :warning,
        -1 => :ignored
      }

      attr_reader :created_at, :source, :title

      # @param attributes [Hash] raw alert attributes (keys are stringified
      #   by the superclass)
      def initialize(attributes = {})
        super
        @kind = :alert

        @id, @severity, @title, @source =
          @attributes.values_at("id", "severity", "title", "source")
        @summary = @attributes["summary"] || @title

        # This rescue is just to preserve existing test behavior. However, this
        # seems like a pretty wacky way to handle errors - wouldn't we rather
        # have a nice exception?
        raw_timestamp = @attributes["created_at"]
        @created_at =
          begin
            Time.at(raw_timestamp)
          rescue StandardError
            raw_timestamp
          end
      end

      # Records an error for every missing or malformed attribute.
      def validate
        add_error("id is missing") if @id.nil?

        if @severity.nil?
          add_error("severity is missing")
        elsif @severity && !(@severity.kind_of?(Integer) && @severity >= 0)
          add_error("severity is invalid (non-negative integer expected)")
        end

        add_error("title is missing") if @title.nil?

        if @created_at.nil?
          add_error("timestamp is missing")
        elsif @created_at && !@created_at.kind_of?(Time)
          add_error('created_at is invalid UNIX timestamp')
        end
      end

      # @return [String] one-line summary with the raw severity value
      def short_description
        "Severity #{@severity}: #{@source} #{@title}"
      end

      # @return the symbolic severity from SEVERITY_MAP, or the raw value
      #   when it has no mapping
      def severity
        SEVERITY_MAP.fetch(@severity) { @severity }
      end

      # @return [Hash] alert fields keyed by symbol; :created_at is converted
      #   with #to_i
      def to_hash
        {
          :kind       => "alert",
          :id         => @id,
          :severity   => @severity,
          :title      => @title,
          :summary    => @summary,
          :source     => @source,
          :created_at => @created_at.to_i
        }
      end

      # @return [String] JSON encoding of #to_hash
      def to_json
        Yajl::Encoder.encode(to_hash)
      end

      def to_s
        "Alert @ #{@created_at.utc}, severity #{@severity}: #{@summary}"
      end

      # @return [String] multi-line, human readable description of the alert
      def to_plain_text
        text = ""
        unless @source.nil?
          text << "#{@source}\n"
        end
        text << (@title || "Unknown Alert") << "\n"
        text << "Severity: #{@severity}\n"
        unless @summary.nil?
          text << "Summary: #{@summary}\n"
        end
        text << "Time: #{@created_at.utc}\n"
        text
      end

      # Alerts carry no metrics.
      def metrics
        []
      end

    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Bosh::Monitor
  module Events
    # Common behaviour of monitor events: attribute storage, error
    # collection and the factory methods that pick the concrete event class.
    class Base
      attr_accessor :id

      attr_reader :logger, :kind, :attributes, :errors

      # Like .create, but rejects events that fail validation.
      #
      # @raise [InvalidEvent] with the joined validation errors
      def self.create!(kind, attributes = {})
        event = create(kind, attributes)
        raise InvalidEvent, event.error_message unless event.valid?

        event
      end

      # Instantiates the event class matching `kind` ("heartbeat" or "alert")
      # and assigns a random UUID when the event has no id.
      #
      # @param kind event kind (compared via #to_s)
      # @param attributes [Hash] event attributes
      # @raise [InvalidEvent] if attributes is not a Hash or kind is unknown
      def self.create(kind, attributes = {})
        unless attributes.kind_of?(Hash)
          raise InvalidEvent, "Cannot create event from #{attributes.class}"
        end

        klass =
          case kind.to_s
          when "heartbeat" then Bhm::Events::Heartbeat
          when "alert"     then Bhm::Events::Alert
          else
            raise InvalidEvent, "Cannot find `#{kind}' event handler"
          end

        event = klass.new(attributes)
        event.id = SecureRandom.uuid if event.id.nil?
        event
      end

      # @param attributes [Hash] raw attributes; keys are stored as strings
      def initialize(attributes = {})
        @kind = :unknown
        @logger = Bhm.logger
        @errors = Set.new

        @attributes = {}
        attributes.each_pair do |key, value|
          @attributes[key.to_s] = value
        end
      end

      # Records a validation error.
      def add_error(error)
        @errors << error
      end

      # Runs #validate and reports whether any errors have been recorded.
      #
      # @return [Boolean]
      def valid?
        validate
        @errors.empty?
      end

      # @return [String] all recorded errors, comma separated
      def error_message
        @errors.to_a.join(", ")
      end

      # Placeholders for the methods concrete event classes must provide;
      # each raises FatalError until overridden.
      %w(validate to_plain_text to_hash to_json metrics).each do |method|
        define_method(method) do
          raise FatalError, "`#{method}' is not implemented by #{self.class}"
        end
      end
    end
  end
end
|