bosh-monitor 1.5.0.pre.1113

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/README +80 -0
  2. data/bin/bosh-monitor +30 -0
  3. data/bin/bosh-monitor-console +51 -0
  4. data/bin/listener +58 -0
  5. data/lib/bosh/monitor.rb +72 -0
  6. data/lib/bosh/monitor/agent.rb +51 -0
  7. data/lib/bosh/monitor/agent_manager.rb +295 -0
  8. data/lib/bosh/monitor/api_controller.rb +18 -0
  9. data/lib/bosh/monitor/config.rb +71 -0
  10. data/lib/bosh/monitor/core_ext.rb +8 -0
  11. data/lib/bosh/monitor/director.rb +76 -0
  12. data/lib/bosh/monitor/director_monitor.rb +33 -0
  13. data/lib/bosh/monitor/errors.rb +19 -0
  14. data/lib/bosh/monitor/event_processor.rb +109 -0
  15. data/lib/bosh/monitor/events/alert.rb +92 -0
  16. data/lib/bosh/monitor/events/base.rb +70 -0
  17. data/lib/bosh/monitor/events/heartbeat.rb +139 -0
  18. data/lib/bosh/monitor/metric.rb +16 -0
  19. data/lib/bosh/monitor/plugins/base.rb +27 -0
  20. data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
  21. data/lib/bosh/monitor/plugins/datadog.rb +78 -0
  22. data/lib/bosh/monitor/plugins/dummy.rb +20 -0
  23. data/lib/bosh/monitor/plugins/email.rb +135 -0
  24. data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
  25. data/lib/bosh/monitor/plugins/logger.rb +13 -0
  26. data/lib/bosh/monitor/plugins/nats.rb +43 -0
  27. data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
  28. data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
  29. data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
  30. data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
  31. data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
  32. data/lib/bosh/monitor/plugins/varz.rb +17 -0
  33. data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
  34. data/lib/bosh/monitor/runner.rb +162 -0
  35. data/lib/bosh/monitor/version.rb +5 -0
  36. data/lib/bosh/monitor/yaml_helper.rb +18 -0
  37. metadata +246 -0
@@ -0,0 +1,18 @@
1
module Bosh::Monitor

  # Sinatra application that serves the monitor's varz data over HTTP.
  class ApiController < Sinatra::Base

    # Switch off the show_exceptions, raise_errors and dump_errors settings.
    configure do
      [:show_exceptions, :raise_errors, :dump_errors].each do |setting|
        set(setting, false)
      end
    end

    # GET /varz
    # Responds with Bhm.varz encoded as JSON, terminated by a newline.
    get("/varz") do
      content_type(:json)
      Yajl::Encoder.encode(Bhm.varz, :terminator => "\n")
    end

  end

end
@@ -0,0 +1,71 @@
1
module Bosh::Monitor

  # Module-level configuration state. Everything below is exposed as
  # singleton accessors on Bosh::Monitor (e.g. Bosh::Monitor.logger).
  class << self

    attr_accessor :logger
    attr_accessor :director
    attr_accessor :intervals
    attr_accessor :mbus
    attr_accessor :event_mbus
    attr_accessor :agent_manager
    attr_accessor :event_processor

    attr_accessor :http_port, :http_user, :http_password
    attr_accessor :plugins
    attr_accessor :varz

    attr_accessor :nats

    # Applies a configuration hash and (re)builds the objects derived from it.
    #
    # Keys read from +config+: "logfile", "intervals", "director", "mbus",
    # "http" ("port", "user", "password"), "event_mbus", "loglevel" and
    # "plugins".
    #
    # @param config [Hash] parsed configuration
    # @raise [ConfigError] if +config+ or its "director" section is not a Hash
    def config=(config)
      validate_config(config)

      # The logger is created first: EventProcessor reads it in its constructor.
      @logger = Logging.logger(config["logfile"] || STDOUT)
      @intervals = OpenStruct.new(config["intervals"])
      @director = Director.new(config["director"])
      @mbus = OpenStruct.new(config["mbus"])

      @event_processor = EventProcessor.new
      @agent_manager = AgentManager.new(event_processor)

      @varz = {}

      # Interval defaults, applied only to values the config left unset
      @intervals.prune_events ||= 30
      @intervals.poll_director ||= 60
      @intervals.poll_grace_period ||= 30
      @intervals.log_stats ||= 60
      @intervals.analyze_agents ||= 60
      @intervals.agent_timeout ||= 60
      @intervals.rogue_agent_alert ||= 120

      if config["http"].is_a?(Hash)
        @http_port = config["http"]["port"]
        @http_user = config["http"]["user"]
        @http_password = config["http"]["password"]
      end

      if config["event_mbus"]
        @event_mbus = OpenStruct.new(config["event_mbus"])
      end

      if config["loglevel"].is_a?(String)
        @logger.level = config["loglevel"].to_sym
      end

      if config["plugins"].is_a?(Enumerable)
        @plugins = config["plugins"]
      end
    end

    # Stores +value+ under +key+ in the varz hash, creating the hash on
    # first use so this works before any config has been applied.
    def set_varz(key, value)
      @varz ||= {}
      @varz[key] = value
    end

    # Checks the overall shape of the configuration.
    #
    # The "director" section is checked here because Director#initialize
    # indexes into it directly; without the check a missing section surfaces
    # as a NoMethodError on nil rather than a ConfigError.
    #
    # @param config [Object] candidate configuration
    # @raise [ConfigError] if +config+ or config["director"] is not a Hash
    def validate_config(config)
      unless config.is_a?(Hash)
        raise ConfigError, "Invalid config format, Hash expected, #{config.class} given"
      end

      unless config["director"].is_a?(Hash)
        raise ConfigError, "Invalid config format, `director' section must be a Hash, #{config["director"].class} given"
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Kernel

  # Formats a count together with its noun, e.g. "1 event" or "3 events".
  #
  # number   - the count; exactly 1 selects the singular form
  # singular - noun used when number == 1
  # plural   - noun used otherwise; defaults to singular with an "s" appended
  #
  # Returns the formatted String.
  def pluralize(number, singular, plural = nil)
    return "1 #{singular}" if number == 1

    plural ||= "#{singular}s"
    "#{number} #{plural}"
  end

end
@@ -0,0 +1,76 @@
1
module Bosh::Monitor
  # HTTP client for the director's deployment endpoints.
  class Director

    # @param options [Hash] reads the "endpoint", "user" and "password" keys;
    #   each value is coerced with to_s, so a missing key becomes ""
    def initialize(options)
      @endpoint = options["endpoint"].to_s
      @user     = options["user"].to_s
      @password = options["password"].to_s
    end

    # Fetches the list of deployments (GET /deployments).
    #
    # @return [Array] parsed JSON response
    # @raise [DirectorError] on a non-200 status or an unparsable / non-Array body
    def get_deployments
      fetch_array("/deployments", "deployments")
    end

    # Fetches the VMs of one deployment (GET /deployments/<name>/vms).
    #
    # @param name [#to_s] deployment name, interpolated into the path as-is
    # @return [Array] parsed JSON response
    # @raise [DirectorError] on a non-200 status or an unparsable / non-Array body
    def get_deployment_vms(name)
      fetch_array("/deployments/#{name}/vms", "deployment `#{name}'")
    end

    private

    # Shared GET / status-check / parse sequence for the public getters.
    #
    # @param path [String] path appended to the configured endpoint
    # @param subject [String] description of the resource, used in the error message
    # @return [Array] parsed JSON response
    # @raise [DirectorError] on a non-200 status or an invalid body
    def fetch_array(path, subject)
      http = perform_request(:get, path)

      body   = http.response
      status = http.response_header.http_status

      # to_s keeps the check valid whether the status is reported as a String
      # or an Integer.
      # NOTE(review): confirm which type the HTTP client version in use returns.
      if status.to_s != "200"
        raise DirectorError, "Cannot get #{subject} from director at #{http.uri}: #{status} #{body}"
      end

      parse_json(body, Array)
    end

    # Parses a JSON document, optionally enforcing the type of the result.
    #
    # @param json [String] raw response body
    # @param expected_type [Class, nil] when given, the parsed value must be kind_of? it
    # @return [Object] parsed value
    # @raise [DirectorError] if parsing fails or the type does not match
    def parse_json(json, expected_type = nil)
      result = Yajl::Parser.parse(json)

      if expected_type && !result.kind_of?(expected_type)
        raise DirectorError, "Invalid JSON response format, expected #{expected_type}, got #{result.class}"
      end

      result

    rescue Yajl::ParseError => e
      raise DirectorError, "Cannot parse director response: #{e.message}"
    end

    # JMS and GO: This effectively turns async requests into synchronous requests.
    # This is a very bad thing to do on eventmachine because it will block the single
    # event loop. This code should be removed and all requests converted
    # to "the eventmachine way".
    #
    # Issues the request, then suspends the current fiber until the request's
    # callback or errback resumes it. Both resume with the request object, so
    # callers must inspect the response themselves to detect failure.
    #
    # @param method [Symbol, String] HTTP verb, sent to EM::HttpRequest
    # @param uri [String] path appended to the configured endpoint
    # @return [Object] the request object the fiber was resumed with
    # @raise [DirectorError] if the target URI is rejected with a URI::Error
    def perform_request(method, uri)
      f = Fiber.current

      target_uri = @endpoint + uri

      headers = {
        "authorization" => [@user, @password]
      }

      http = EM::HttpRequest.new(target_uri).send(method.to_sym, :head => headers)

      http.callback { f.resume(http) }
      http.errback  { f.resume(http) }

      Fiber.yield

    rescue URI::Error
      raise DirectorError, "Invalid URI: #{target_uri}"
    end

  end
end
@@ -0,0 +1,33 @@
1
module Bosh::Monitor
  # Listens for director alerts on the message bus and forwards well-formed
  # ones to the event processor.
  class DirectorMonitor
    # @param config [#nats, #logger, #event_processor] source of the
    #   collaborators used by this monitor
    def initialize(config)
      @nats = config.nats
      @logger = config.logger
      @event_processor = config.event_processor
    end

    # Subscribes to 'hm.director.alert'. Each message is parsed as JSON and,
    # if it passes valid_payload?, handed to the event processor as an :alert.
    # Messages that cannot be parsed are logged and dropped instead of raising
    # inside the subscription block.
    def subscribe
      @nats.subscribe('hm.director.alert') do |msg|
        begin
          alert = Yajl::Parser.parse(msg)
        rescue Yajl::ParseError => e
          @logger.error("Invalid payload from director: cannot parse JSON (#{e.message}). #{msg.inspect}")
          next
        end

        if valid_payload?(alert)
          @event_processor.process(:alert, alert)
        end
      end
    end

    private

    # Checks that the payload is a Hash carrying every required alert key,
    # logging an error when it is not.
    #
    # @param payload [Object] parsed message body
    # @return [Boolean] true when the payload can be processed
    def valid_payload?(payload)
      # Valid JSON is not necessarily an object (null, arrays, strings, ...);
      # reject those here so the key check below cannot fail on them.
      unless payload.is_a?(Hash)
        @logger.error("Invalid payload from director: Hash expected, #{payload.class} given. #{payload.inspect}")
        return false
      end

      missing_keys = %w(id severity title summary created_at) - payload.keys
      valid = missing_keys.empty?

      unless valid
        first_missing_key = missing_keys.first
        @logger.error("Invalid payload from director: the key '#{first_missing_key}' was missing. #{payload.inspect}")
      end

      valid
    end
  end
end
@@ -0,0 +1,19 @@
1
module Bosh::Monitor

  # Root of the monitor's exception hierarchy.
  class Error < StandardError
    # Class-level macro: defines an instance method #code on the receiving
    # class that returns the given value.
    def self.code(code = nil)
      define_method(:code) { code }
    end
  end

  class FatalError < Error
    code(42)
  end

  class ConfigError < Error
    code(101)
  end

  class DirectorError < Error
    code(201)
  end

  class ConnectionError < Error
    code(202)
  end

  class EventProcessingError < Error
    code(301)
  end

  class InvalidEvent < Error
    code(302)
  end

  class PluginError < Error
    code(401)
  end
end
@@ -0,0 +1,109 @@
1
module Bosh::Monitor
  # Builds events from raw data, drops ones whose id was already seen, and
  # dispatches new ones to the plugins registered for their kind.
  class EventProcessor
    attr_reader :plugins

    def initialize
      @events = {}
      @plugins = {}

      # Guards @events and @plugins.
      @lock = Mutex.new
      @logger = Bhm.logger
    end

    # Registers +plugin+ for each of +event_kinds+ and then calls plugin.run.
    #
    # @param plugin [Object] must respond to #run; #validate_options is
    #   called first when the plugin responds to it
    # @param event_kinds [Array<#to_sym>] kinds the plugin should receive
    # @raise [FatalError] if the plugin rejects its own options
    def add_plugin(plugin, event_kinds = [])
      if plugin.respond_to?(:validate_options) && !plugin.validate_options
        raise FatalError, "Invalid plugin options for `#{plugin.class}'"
      end

      @lock.synchronize do
        event_kinds.each do |kind|
          kind = kind.to_sym
          @plugins[kind] ||= Set.new
          @plugins[kind] << plugin
        end
        plugin.run
      end
    end

    # Creates an event of the given kind and passes it to interested plugins,
    # unless an event with the same id has already been recorded.
    #
    # @param kind [#to_sym] event kind
    # @param data [Hash] event attributes
    # @return [true]
    # @raise [InvalidEvent] propagated from Bhm::Events::Base.create!
    def process(kind, data)
      kind = kind.to_sym
      event = Bhm::Events::Base.create!(kind, data)
      interested = nil

      @lock.synchronize do
        @events[kind] ||= {}

        if @events[kind].has_key?(event.id)
          @logger.debug("Ignoring duplicate #{event.kind} `#{event.id}'")
          return true
        end
        # We don't really need to store event itself for the moment,
        # as we only use its id to dedup new events.
        @events[kind][event.id] = { :received_at => Time.now.to_i }

        # Copy the plugin set while holding the lock: add_plugin mutates it
        # under the same lock, so iterating the live set outside would race.
        # nil.to_a is [], which covers kinds with no registered plugin.
        interested = @plugins[kind].to_a
      end

      if interested.empty?
        @logger.debug("No plugins are interested in `#{event.kind}' event")
        return true
      end

      interested.each do |plugin|
        plugin_process(plugin, event)
      end

      true
    end

    # @return [Integer] number of recorded events, summed over all kinds
    def events_count
      @lock.synchronize do
        @events.values.inject(0) { |counter, events| counter + events.size }
      end
    end

    # Starts a background thread (once) that prunes events every +interval+
    # seconds, using +interval+ as the lifetime as well.
    #
    # @param interval [Numeric] seconds between pruning runs
    def enable_pruning(interval)
      @reaper ||= Thread.new do
        loop do
          # Some events might be in the system up to 2 * interval
          # seconds this way, but it seems to be a reasonable trade-off
          prune_events(interval)
          sleep(interval)
        end
      end
    end

    # Removes recorded events received +lifetime+ or more seconds ago and
    # logs how many were pruned. Errors are logged, not raised.
    #
    # @param lifetime [Numeric] maximum age in seconds
    def prune_events(lifetime)
      @lock.synchronize do
        pruned_count = 0
        total_count = 0
        # Computed once so every entry is judged against the same instant.
        cutoff = Time.now.to_i - lifetime

        @events.each_value do |list|
          size_before = list.size
          total_count += size_before
          list.delete_if { |_id, data| data[:received_at] <= cutoff }
          pruned_count += size_before - list.size
        end

        @logger.debug("Pruned %s" % [ pluralize(pruned_count, "old event") ])
        @logger.debug("Total %s" % [ pluralize(total_count, "event") ])
      end
    rescue => e
      @logger.error("Error pruning events: #{e}")
      @logger.error(e.backtrace.join("\n"))
    end

    private

    # Hands +event+ to +plugin+. A Bhm::PluginError is logged and swallowed
    # so the remaining plugins still run; any other exception propagates.
    def plugin_process(plugin, event)
      plugin.process(event)
    rescue Bhm::PluginError => e
      @logger.error("Plugin #{plugin.class} failed to process #{event.kind}: #{e}")
    end
  end
end
@@ -0,0 +1,92 @@
1
module Bosh::Monitor
  module Events
    # Alert event built from a string-keyed attribute hash ("id", "severity",
    # "title", "summary", "source", "created_at").
    class Alert < Base

      # Considering Bosh::Agent::Alert
      SEVERITY_MAP = {
        1 => :alert,
        2 => :critical,
        3 => :error,
        4 => :warning,
        -1 => :ignored
      }.freeze

      attr_reader :created_at, :source, :title

      # @param attributes [Hash] alert attributes; keys are stringified by Base
      def initialize(attributes = {})
        super
        @kind = :alert

        @id         = @attributes["id"]
        @severity   = @attributes["severity"]
        @title      = @attributes["title"]
        @summary    = @attributes["summary"] || @title
        @source     = @attributes["source"]

        # This rescue is just to preserve existing test behavior. However, this
        # seems like a pretty wacky way to handle errors - wouldn't we rather
        # have a nice exception?
        # When Time.at rejects the value, the raw value is kept so that
        # #validate can report it.
        @created_at =
          begin
            Time.at(@attributes["created_at"])
          rescue StandardError
            @attributes["created_at"]
          end
      end

      # Records an error (via add_error) for each missing or malformed field.
      def validate
        add_error("id is missing") if @id.nil?
        add_error("severity is missing") if @severity.nil?

        if @severity && (!@severity.kind_of?(Integer) || @severity < 0)
          add_error("severity is invalid (non-negative integer expected)")
        end

        add_error("title is missing") if @title.nil?
        add_error("timestamp is missing") if @created_at.nil?

        if @created_at && !@created_at.kind_of?(Time)
          add_error('created_at is invalid UNIX timestamp')
        end
      end

      # @return [String] one-line summary using the raw severity value
      def short_description
        "Severity #{@severity}: #{@source} #{@title}"
      end

      # @return [Symbol, Object] the mapped severity name, or the raw value
      #   when it has no entry in SEVERITY_MAP
      def severity
        SEVERITY_MAP[@severity] || @severity
      end

      # @return [Hash] symbol-keyed representation; :created_at is converted
      #   with to_i
      def to_hash
        {
          :kind       => "alert",
          :id         => @id,
          :severity   => @severity,
          :title      => @title,
          :summary    => @summary,
          :source     => @source,
          :created_at => @created_at.to_i
        }
      end

      # @return [String] JSON encoding of #to_hash
      def to_json
        Yajl::Encoder.encode(self.to_hash)
      end

      def to_s
        "Alert @ #{created_at_for_display}, severity #{@severity}: #{@summary}"
      end

      # @return [String] multi-line, human-readable rendering of the alert
      def to_plain_text
        result = ""
        result << "#{@source}\n" unless @source.nil?
        result << (@title || "Unknown Alert") << "\n"
        result << "Severity: #{@severity}\n"
        result << "Summary: #{@summary}\n" unless @summary.nil?
        result << "Time: #{created_at_for_display}\n"
        result
      end

      # @return [Array] always empty
      def metrics
        [ ]
      end

      private

      # The constructor can leave @created_at as a non-Time value (or nil)
      # when the input was not a usable timestamp. Only call #utc when it is
      # available, so the text renderings still work for such alerts.
      def created_at_for_display
        @created_at.respond_to?(:utc) ? @created_at.utc : @created_at
      end

    end
  end
end
@@ -0,0 +1,70 @@
1
module Bosh::Monitor
  module Events
    # Common behaviour for monitor events: attribute storage, error
    # collection and the factory methods used to build concrete events.
    class Base
      attr_accessor :id

      attr_reader :logger
      attr_reader :kind
      attr_reader :attributes
      attr_reader :errors

      # Like .create, but raises when the resulting event is not valid.
      #
      # Raises InvalidEvent carrying the event's joined error messages.
      def self.create!(kind, attributes = {})
        event = create(kind, attributes)
        raise InvalidEvent, event.error_message unless event.valid?
        event
      end

      # Instantiates the event class matching +kind+ ("heartbeat" or "alert")
      # and assigns a random UUID when the event has no id.
      #
      # Raises InvalidEvent if +attributes+ is not a Hash or +kind+ is unknown.
      def self.create(kind, attributes = {})
        unless attributes.kind_of?(Hash)
          raise InvalidEvent, "Cannot create event from #{attributes.class}"
        end

        klass =
          case kind.to_s
          when "heartbeat" then Bhm::Events::Heartbeat
          when "alert"     then Bhm::Events::Alert
          else
            raise InvalidEvent, "Cannot find `#{kind}' event handler"
          end

        klass.new(attributes).tap do |event|
          event.id = SecureRandom.uuid if event.id.nil?
        end
      end

      # Copies +attributes+ into @attributes with every key converted to a String.
      def initialize(attributes = {})
        @attributes = {}
        @kind = :unknown

        attributes.each_pair { |key, value| @attributes[key.to_s] = value }

        @logger = Bhm.logger
        @errors = Set.new
      end

      # Adds +error+ to the set of validation errors.
      def add_error(error)
        @errors << error
      end

      # Runs #validate, then reports whether any errors are recorded.
      def valid?
        validate
        @errors.empty?
      end

      # Returns all recorded errors joined with ", ".
      def error_message
        @errors.to_a.join(", ")
      end

      # Placeholders for the methods subclasses are expected to provide;
      # each raises FatalError until overridden.
      [:validate, :to_plain_text, :to_hash, :to_json, :metrics].each do |abstract_method|
        define_method(abstract_method) do
          raise FatalError, "`#{abstract_method}' is not implemented by #{self.class}"
        end
      end
    end
  end
end