bosh-monitor 1.5.0.pre.1113
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +80 -0
- data/bin/bosh-monitor +30 -0
- data/bin/bosh-monitor-console +51 -0
- data/bin/listener +58 -0
- data/lib/bosh/monitor.rb +72 -0
- data/lib/bosh/monitor/agent.rb +51 -0
- data/lib/bosh/monitor/agent_manager.rb +295 -0
- data/lib/bosh/monitor/api_controller.rb +18 -0
- data/lib/bosh/monitor/config.rb +71 -0
- data/lib/bosh/monitor/core_ext.rb +8 -0
- data/lib/bosh/monitor/director.rb +76 -0
- data/lib/bosh/monitor/director_monitor.rb +33 -0
- data/lib/bosh/monitor/errors.rb +19 -0
- data/lib/bosh/monitor/event_processor.rb +109 -0
- data/lib/bosh/monitor/events/alert.rb +92 -0
- data/lib/bosh/monitor/events/base.rb +70 -0
- data/lib/bosh/monitor/events/heartbeat.rb +139 -0
- data/lib/bosh/monitor/metric.rb +16 -0
- data/lib/bosh/monitor/plugins/base.rb +27 -0
- data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
- data/lib/bosh/monitor/plugins/datadog.rb +78 -0
- data/lib/bosh/monitor/plugins/dummy.rb +20 -0
- data/lib/bosh/monitor/plugins/email.rb +135 -0
- data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
- data/lib/bosh/monitor/plugins/logger.rb +13 -0
- data/lib/bosh/monitor/plugins/nats.rb +43 -0
- data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
- data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
- data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
- data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
- data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
- data/lib/bosh/monitor/plugins/varz.rb +17 -0
- data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
- data/lib/bosh/monitor/runner.rb +162 -0
- data/lib/bosh/monitor/version.rb +5 -0
- data/lib/bosh/monitor/yaml_helper.rb +18 -0
- metadata +246 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
|
3
|
+
# Sinatra application that exposes the monitor's "/varz" endpoint.
class ApiController < Sinatra::Base
  configure do
    # Switch off each of Sinatra's exception-related settings.
    [:show_exceptions, :raise_errors, :dump_errors].each do |option|
      set(option, false)
    end
  end

  # Responds with Bhm.varz encoded as JSON, terminated by a newline.
  get "/varz" do
    content_type(:json)
    Yajl::Encoder.encode(Bhm.varz, :terminator => "\n")
  end
end
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
|
3
|
+
# Module-level state of the monitor plus the setter that populates it
# from a configuration hash.
class << self
  attr_accessor :logger, :director, :intervals, :mbus, :event_mbus
  attr_accessor :agent_manager, :event_processor
  attr_accessor :http_port, :http_user, :http_password
  attr_accessor :plugins, :varz
  attr_accessor :nats

  # Applies +config+ to the module-level attributes.
  #
  # Reads the "logfile", "intervals", "director", "mbus", "http",
  # "event_mbus", "loglevel" and "plugins" keys. The http, event_mbus,
  # loglevel and plugins sections are only applied when present in the
  # expected shape; otherwise the corresponding attributes are left as
  # they were.
  #
  # Raises ConfigError when +config+ is not a Hash.
  def config=(config)
    validate_config(config)

    # The logger is assigned before EventProcessor is instantiated below.
    @logger = Logging.logger(config["logfile"] || STDOUT)
    @intervals = OpenStruct.new(config["intervals"])
    @director = Director.new(config["director"])
    @mbus = OpenStruct.new(config["mbus"])

    @event_processor = EventProcessor.new
    @agent_manager = AgentManager.new(event_processor)

    @varz = {}

    # Interval defaults: only fill in values the config left unset.
    interval_defaults = [
      [:prune_events, 30],
      [:poll_director, 60],
      [:poll_grace_period, 30],
      [:log_stats, 60],
      [:analyze_agents, 60],
      [:agent_timeout, 60],
      [:rogue_agent_alert, 120]
    ]
    interval_defaults.each do |name, default|
      @intervals.send("#{name}=", default) unless @intervals.send(name)
    end

    http_options = config["http"]
    if http_options.is_a?(Hash)
      @http_port, @http_user, @http_password =
        http_options.values_at("port", "user", "password")
    end

    event_mbus_options = config["event_mbus"]
    @event_mbus = OpenStruct.new(event_mbus_options) if event_mbus_options

    loglevel = config["loglevel"]
    @logger.level = loglevel.to_sym if loglevel.is_a?(String)

    configured_plugins = config["plugins"]
    @plugins = configured_plugins if configured_plugins.is_a?(Enumerable)
  end

  # Stores +value+ under +key+ in the varz hash, creating the hash on
  # first use. Returns +value+.
  def set_varz(key, value)
    (@varz ||= {})[key] = value
  end

  # Raises ConfigError unless +config+ is a Hash.
  def validate_config(config)
    return if config.is_a?(Hash)

    raise ConfigError, "Invalid config format, Hash expected, #{config.class} given"
  end
end
|
71
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
# Client for the director HTTP endpoints used by the monitor.
class Director
  # options - Hash; the "endpoint", "user" and "password" values are read
  #           and converted with #to_s.
  def initialize(options)
    @endpoint = options["endpoint"].to_s
    @user = options["user"].to_s
    @password = options["password"].to_s
  end

  # Requests "/deployments" and returns the parsed JSON Array.
  #
  # Raises DirectorError when the status is not 200, the body is not
  # valid JSON, or the parsed body is not an Array.
  def get_deployments
    fetch_array("/deployments") do |http, status, body|
      "Cannot get deployments from director at #{http.uri}: #{status} #{body}"
    end
  end

  # Requests "/deployments/<name>/vms" and returns the parsed JSON Array.
  #
  # name - deployment name, interpolated into the request path as-is.
  #
  # Raises DirectorError under the same conditions as #get_deployments.
  def get_deployment_vms(name)
    fetch_array("/deployments/#{name}/vms") do |http, status, body|
      "Cannot get deployment `#{name}' from director at #{http.uri}: #{status} #{body}"
    end
  end

  private

  # Performs a GET on +path+ and parses the body as a JSON Array.
  # The block builds the error message and is only called on a non-200
  # status; it receives the request object, the status and the body.
  def fetch_array(path)
    http = perform_request(:get, path)

    body = http.response
    status = http.response_header.http_status

    # Compare as strings so both "200" and an Integer 200 count as success.
    if status.to_s != "200"
      raise DirectorError, yield(http, status, body)
    end

    parse_json(body, Array)
  end

  # Parses +json+; when +expected_type+ is given, the parsed value must be
  # of that type.
  #
  # Raises DirectorError on a parse failure or a type mismatch.
  def parse_json(json, expected_type = nil)
    result = Yajl::Parser.parse(json)

    if expected_type && !result.kind_of?(expected_type)
      raise DirectorError, "Invalid JSON response format, expected #{expected_type}, got #{result.class}"
    end

    result
  rescue Yajl::ParseError => e
    raise DirectorError, "Cannot parse director response: #{e.message}"
  end

  # JMS and GO: This effectively turns async requests into synchronous requests.
  # This is a very bad thing to do on eventmachine because it will block the single
  # event loop. This code should be removed and all requests converted
  # to "the eventmachine way".
  #
  # Issues +method+ against @endpoint + +uri+, passing @user and @password
  # as the "authorization" header value. The current fiber is suspended
  # until the request's callback or errback resumes it with the request
  # object, which is then returned.
  #
  # Raises DirectorError when a URI::Error is raised.
  def perform_request(method, uri)
    f = Fiber.current

    target_uri = @endpoint + uri

    headers = {
      "authorization" => [@user, @password]
    }

    http = EM::HttpRequest.new(target_uri).send(method.to_sym, :head => headers)

    http.callback { f.resume(http) }
    http.errback { f.resume(http) }

    Fiber.yield
  rescue URI::Error
    raise DirectorError, "Invalid URI: #{target_uri}"
  end
end
|
76
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
# Subscribes to director alerts on the message bus and forwards
# well-formed ones to the event processor.
class DirectorMonitor
  # Keys every director alert payload must contain.
  REQUIRED_ALERT_KEYS = %w(id severity title summary created_at).freeze

  # config - object responding to #nats, #logger and #event_processor.
  def initialize(config)
    @nats = config.nats
    @logger = config.logger
    @event_processor = config.event_processor
  end

  # Subscribes to 'hm.director.alert'. Each message is parsed as JSON and,
  # when it passes #valid_payload?, processed as an :alert event.
  # Messages that cannot be parsed are logged and dropped so that a single
  # malformed message does not raise out of the subscription callback.
  def subscribe
    @nats.subscribe('hm.director.alert') do |msg|
      begin
        alert = Yajl::Parser.parse(msg)
      rescue Yajl::ParseError => e
        @logger.error("Invalid payload from director: cannot parse message: #{e.message}")
        next
      end

      @event_processor.process(:alert, alert) if valid_payload?(alert)
    end
  end

  private

  # Returns true when +payload+ is a Hash containing every required key.
  # Otherwise logs the problem and returns false.
  def valid_payload?(payload)
    # Valid JSON need not be an object (it may be an array, null or a scalar).
    unless payload.is_a?(Hash)
      @logger.error("Invalid payload from director: Hash expected, #{payload.class} given. #{payload.inspect}")
      return false
    end

    missing_keys = REQUIRED_ALERT_KEYS - payload.keys
    return true if missing_keys.empty?

    first_missing_key = missing_keys.first
    @logger.error("Invalid payload from director: the key '#{first_missing_key}' was missing. #{payload.inspect}")
    false
  end
end
|
33
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
|
3
|
+
# Root of the monitor's error hierarchy.
class Error < StandardError
  class << self
    # Defines an instance method #code on the receiving class that
    # returns +code+.
    def code(code = nil)
      define_method(:code) { code }
    end
  end
end

class FatalError < Error
  code(42)
end

class ConfigError < Error
  code(101)
end

class DirectorError < Error
  code(201)
end

class ConnectionError < Error
  code(202)
end

class EventProcessingError < Error
  code(301)
end

class InvalidEvent < Error
  code(302)
end

class PluginError < Error
  code(401)
end
|
19
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
# Creates events from incoming data, drops ones whose id was already
# seen for the same kind, and hands new ones to the plugins registered
# for that kind.
class EventProcessor
  attr_reader :plugins

  def initialize
    @events = {}
    @plugins = {}

    @lock = Mutex.new
    @logger = Bhm.logger
  end

  # Registers +plugin+ for every kind in +event_kinds+ and then calls
  # plugin.run (both under the lock). Returns the result of plugin.run.
  #
  # Raises FatalError when the plugin responds to #validate_options and
  # that call returns a falsy value.
  def add_plugin(plugin, event_kinds = [])
    options_rejected = plugin.respond_to?(:validate_options) && !plugin.validate_options
    raise FatalError, "Invalid plugin options for `#{plugin.class}'" if options_rejected

    @lock.synchronize do
      event_kinds.each do |kind|
        (@plugins[kind.to_sym] ||= Set.new) << plugin
      end
      plugin.run
    end
  end

  # Builds an event of +kind+ from +data+ via Bhm::Events::Base.create!
  # and dispatches it to the interested plugins. Always returns true.
  def process(kind, data)
    kind = kind.to_sym
    event = Bhm::Events::Base.create!(kind, data)

    return true unless first_sighting?(kind, event)

    interested = @plugins[kind]
    if interested.nil? || interested.empty?
      @logger.debug("No plugins are interested in `#{event.kind}' event")
      return true
    end

    interested.each { |plugin| plugin_process(plugin, event) }

    true
  end

  # Number of remembered events, summed over all event kinds.
  def events_count
    @lock.synchronize do
      @events.values.inject(0) { |total, by_id| total + by_id.size }
    end
  end

  # Starts the pruning thread unless one was already started; returns it.
  def enable_pruning(interval)
    @reaper ||= Thread.new do
      loop do
        # Some events might be in the system up to 2 * interval
        # seconds this way, but it seems to be a reasonable trade-off
        prune_events(interval)
        sleep(interval)
      end
    end
  end

  # Forgets every event received +lifetime+ or more seconds ago and logs
  # the pruned and total counts. Errors are logged, not raised.
  def prune_events(lifetime)
    @lock.synchronize do
      total_count = 0
      pruned_count = 0

      @events.each_value do |list|
        list.delete_if do |_id, data|
          total_count += 1
          expired = data[:received_at] <= Time.now.to_i - lifetime
          pruned_count += 1 if expired
          expired
        end
      end

      @logger.debug("Pruned %s" % [ pluralize(pruned_count, "old event") ])
      @logger.debug("Total %s" % [ pluralize(total_count, "event") ])
    end
  rescue => e
    @logger.error("Error pruning events: #{e}")
    @logger.error(e.backtrace.join("\n"))
  end

  private

  # Records the event id under its kind. Returns false (after logging)
  # when that id is already recorded, true otherwise.
  def first_sighting?(kind, event)
    @lock.synchronize do
      seen = (@events[kind] ||= {})

      if seen.has_key?(event.id)
        @logger.debug("Ignoring duplicate #{event.kind} `#{event.id}'")
        false
      else
        # We don't really need to store event itself for the moment,
        # as we only use its id to dedup new events.
        seen[event.id] = { :received_at => Time.now.to_i }
        true
      end
    end
  end

  # Passes +event+ to +plugin+; a Bhm::PluginError is logged instead of
  # being propagated.
  def plugin_process(plugin, event)
    plugin.process(event)
  rescue Bhm::PluginError => e
    @logger.error("Plugin #{plugin.class} failed to process #{event.kind}: #{e}")
  end
end
|
109
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Events
|
3
|
+
# Alert event built from an attributes hash.
class Alert < Base

  # Considering Bosh::Agent::Alert
  SEVERITY_MAP = {
    1 => :alert,
    2 => :critical,
    3 => :error,
    4 => :warning,
    -1 => :ignored
  }.freeze

  attr_reader :created_at, :source, :title

  # attributes - Hash; the "id", "severity", "title", "summary", "source"
  #              and "created_at" entries are read. The summary falls back
  #              to the title when absent.
  def initialize(attributes = {})
    super
    @kind = :alert

    @id = @attributes["id"]
    @severity = @attributes["severity"]
    @title = @attributes["title"]
    @summary = @attributes["summary"] || @title
    @source = @attributes["source"]
    @created_at = parse_created_at(@attributes["created_at"])
  end

  # Records an error for each missing or malformed attribute.
  def validate
    add_error("id is missing") if @id.nil?
    add_error("severity is missing") if @severity.nil?

    if @severity && (!@severity.kind_of?(Integer) || @severity < 0)
      add_error("severity is invalid (non-negative integer expected)")
    end

    add_error("title is missing") if @title.nil?
    add_error("timestamp is missing") if @created_at.nil?

    # A non-Time value here means the raw attribute could not be converted.
    if @created_at && !@created_at.kind_of?(Time)
      add_error('created_at is invalid UNIX timestamp')
    end
  end

  def short_description
    "Severity #{@severity}: #{@source} #{@title}"
  end

  # Returns the symbolic severity from SEVERITY_MAP, or the raw severity
  # value when it has no mapping.
  def severity
    SEVERITY_MAP[@severity] || @severity
  end

  # Returns the alert as a Hash; created_at is converted with #to_i.
  def to_hash
    {
      :kind => "alert",
      :id => @id,
      :severity => @severity,
      :title => @title,
      :summary => @summary,
      :source => @source,
      :created_at => @created_at.to_i
    }
  end

  # Encodes #to_hash as JSON. Any arguments are accepted and ignored so
  # that callers which pass generator state to #to_json do not fail with
  # an ArgumentError.
  def to_json(*_args)
    Yajl::Encoder.encode(self.to_hash)
  end

  def to_s
    "Alert @ #{@created_at.utc}, severity #{@severity}: #{@summary}"
  end

  # Multi-line plain text rendering; the source and summary lines are
  # omitted when those values are nil.
  def to_plain_text
    result = ""
    result << "#{@source}\n" unless @source.nil?
    result << (@title || "Unknown Alert") << "\n"
    result << "Severity: #{@severity}\n"
    result << "Summary: #{@summary}\n" unless @summary.nil?
    result << "Time: #{@created_at.utc}\n"
    result
  end

  # Alerts carry no metrics.
  def metrics
    [ ]
  end

  private

  # Converts +raw+ with Time.at. When the conversion raises, the raw value
  # is returned unchanged so that #validate can report it.
  def parse_created_at(raw)
    Time.at(raw)
  rescue StandardError
    raw
  end
end
|
91
|
+
end
|
92
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Bosh::Monitor
|
2
|
+
module Events
|
3
|
+
# Common behaviour of monitor events: attribute storage, error
# collection and the factory methods that pick the concrete event class.
class Base
  attr_accessor :id
  attr_reader :logger, :kind, :attributes, :errors

  class << self
    # Like .create, but raises InvalidEvent (with the joined error
    # messages) when the created event is not valid.
    def create!(kind, attributes = {})
      event = create(kind, attributes)
      raise InvalidEvent, event.error_message unless event.valid?

      event
    end

    # Instantiates the event class matching +kind+ ("heartbeat" or
    # "alert") and assigns a random UUID when the event has no id.
    #
    # Raises InvalidEvent when +attributes+ is not a Hash or +kind+ is
    # not one of the known kinds.
    def create(kind, attributes = {})
      unless attributes.kind_of?(Hash)
        raise InvalidEvent, "Cannot create event from #{attributes.class}"
      end

      # Only the matching branch's constant is looked up.
      klass =
        case kind.to_s
        when "heartbeat" then Bhm::Events::Heartbeat
        when "alert" then Bhm::Events::Alert
        else raise InvalidEvent, "Cannot find `#{kind}' event handler"
        end

      klass.new(attributes).tap do |event|
        event.id = SecureRandom.uuid if event.id.nil?
      end
    end
  end

  # Copies +attributes+ into @attributes with every key converted to a
  # String.
  def initialize(attributes = {})
    @attributes = {}
    @kind = :unknown

    attributes.each_pair { |key, value| @attributes[key.to_s] = value }

    @logger = Bhm.logger
    @errors = Set.new
  end

  def add_error(error)
    @errors << error
  end

  # Runs #validate and reports whether any errors are recorded.
  def valid?
    validate
    @errors.empty?
  end

  # All recorded errors joined with ", ".
  def error_message
    @errors.to_a.join(", ")
  end

  # Placeholders that raise FatalError until a subclass overrides them.
  [:validate, :to_plain_text, :to_hash, :to_json, :metrics].each do |abstract_name|
    define_method(abstract_name) do
      raise FatalError, "`#{abstract_name}' is not implemented by #{self.class}"
    end
  end
end
|
69
|
+
end
|
70
|
+
end
|