bosh-monitor 1.5.0.pre.1113

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/README +80 -0
  2. data/bin/bosh-monitor +30 -0
  3. data/bin/bosh-monitor-console +51 -0
  4. data/bin/listener +58 -0
  5. data/lib/bosh/monitor.rb +72 -0
  6. data/lib/bosh/monitor/agent.rb +51 -0
  7. data/lib/bosh/monitor/agent_manager.rb +295 -0
  8. data/lib/bosh/monitor/api_controller.rb +18 -0
  9. data/lib/bosh/monitor/config.rb +71 -0
  10. data/lib/bosh/monitor/core_ext.rb +8 -0
  11. data/lib/bosh/monitor/director.rb +76 -0
  12. data/lib/bosh/monitor/director_monitor.rb +33 -0
  13. data/lib/bosh/monitor/errors.rb +19 -0
  14. data/lib/bosh/monitor/event_processor.rb +109 -0
  15. data/lib/bosh/monitor/events/alert.rb +92 -0
  16. data/lib/bosh/monitor/events/base.rb +70 -0
  17. data/lib/bosh/monitor/events/heartbeat.rb +139 -0
  18. data/lib/bosh/monitor/metric.rb +16 -0
  19. data/lib/bosh/monitor/plugins/base.rb +27 -0
  20. data/lib/bosh/monitor/plugins/cloud_watch.rb +56 -0
  21. data/lib/bosh/monitor/plugins/datadog.rb +78 -0
  22. data/lib/bosh/monitor/plugins/dummy.rb +20 -0
  23. data/lib/bosh/monitor/plugins/email.rb +135 -0
  24. data/lib/bosh/monitor/plugins/http_request_helper.rb +25 -0
  25. data/lib/bosh/monitor/plugins/logger.rb +13 -0
  26. data/lib/bosh/monitor/plugins/nats.rb +43 -0
  27. data/lib/bosh/monitor/plugins/pagerduty.rb +48 -0
  28. data/lib/bosh/monitor/plugins/paging_datadog_client.rb +24 -0
  29. data/lib/bosh/monitor/plugins/resurrector.rb +82 -0
  30. data/lib/bosh/monitor/plugins/resurrector_helper.rb +84 -0
  31. data/lib/bosh/monitor/plugins/tsdb.rb +43 -0
  32. data/lib/bosh/monitor/plugins/varz.rb +17 -0
  33. data/lib/bosh/monitor/protocols/tsdb.rb +68 -0
  34. data/lib/bosh/monitor/runner.rb +162 -0
  35. data/lib/bosh/monitor/version.rb +5 -0
  36. data/lib/bosh/monitor/yaml_helper.rb +18 -0
  37. metadata +246 -0
@@ -0,0 +1,18 @@
1
module Bosh::Monitor

  # Sinatra application serving the health monitor's HTTP endpoints.
  class ApiController < Sinatra::Base

    # Switch off Sinatra's show_exceptions, raise_errors and dump_errors
    # settings for this application.
    configure do
      set :show_exceptions, false
      set :raise_errors, false
      set :dump_errors, false
    end

    # GET /varz
    # Responds with the current Bhm.varz value encoded as JSON, followed by
    # a newline terminator.
    get '/varz' do
      content_type :json
      Yajl::Encoder.encode(Bhm.varz, terminator: "\n")
    end

  end

end
@@ -0,0 +1,71 @@
1
module Bosh::Monitor

  # Module-level state of the health monitor. Everything is stored in
  # instance variables of the Bosh::Monitor module itself and exposed
  # through read/write accessors.
  class << self

    attr_accessor :logger, :director, :intervals, :mbus, :event_mbus,
                  :agent_manager, :event_processor,
                  :http_port, :http_user, :http_password,
                  :plugins, :varz, :nats

    # Populates the module state from a configuration Hash.
    #
    # Keys read: "logfile", "intervals", "director", "mbus", "http",
    # "event_mbus", "loglevel" and "plugins".
    #
    # Raises ConfigError (via validate_config) when config is not a Hash.
    def config=(config)
      validate_config(config)

      @logger    = Logging.logger(config["logfile"] || STDOUT)
      @intervals = OpenStruct.new(config["intervals"])
      @director  = Director.new(config["director"])
      @mbus      = OpenStruct.new(config["mbus"])

      @event_processor = EventProcessor.new
      @agent_manager   = AgentManager.new(event_processor)

      @varz = {}

      # Interval defaults: each one is applied only when the configuration
      # left the corresponding value unset (nil or false).
      interval_defaults = {
        :prune_events      => 30,
        :poll_director     => 60,
        :poll_grace_period => 30,
        :log_stats         => 60,
        :analyze_agents    => 60,
        :agent_timeout     => 60,
        :rogue_agent_alert => 120
      }
      interval_defaults.each do |name, seconds|
        next if @intervals.public_send(name)
        @intervals.public_send("#{name}=", seconds)
      end

      http = config["http"]
      if http.is_a?(Hash)
        @http_port, @http_user, @http_password =
          http.values_at("port", "user", "password")
      end

      @event_mbus = OpenStruct.new(config["event_mbus"]) if config["event_mbus"]

      loglevel = config["loglevel"]
      @logger.level = loglevel.to_sym if loglevel.is_a?(String)

      # Plugins are accepted only when given as an Enumerable
      @plugins = config["plugins"] if config["plugins"].is_a?(Enumerable)
    end

    # Stores value under key in the varz Hash, creating the Hash on first use.
    def set_varz(key, value)
      (@varz ||= {})[key] = value
    end

    # Raises ConfigError unless config is a Hash.
    def validate_config(config)
      return if config.is_a?(Hash)

      raise ConfigError, "Invalid config format, Hash expected, #{config.class} given"
    end
  end
end
@@ -0,0 +1,8 @@
1
module Kernel

  # Builds a "<count> <noun>" phrase.
  #
  # When number equals 1 the result is "1 <singular>". Otherwise the count
  # is followed by plural, which defaults to singular with an "s" appended.
  def pluralize(number, singular, plural = nil)
    return "1 #{singular}" if number == 1

    plural ||= "#{singular}s"
    "#{number} #{plural}"
  end

end
@@ -0,0 +1,76 @@
1
module Bosh::Monitor
  # Small HTTP client for the director: fetches the deployment list and the
  # VMs of a deployment, returning the parsed JSON arrays.
  class Director

    # options is indexed with the string keys "endpoint", "user" and
    # "password"; each value is converted with to_s.
    def initialize(options)
      @endpoint, @user, @password =
        %w(endpoint user password).map { |key| options[key].to_s }
    end

    # Returns the parsed JSON array served at /deployments.
    # Raises DirectorError on a non-"200" status or an unusable body.
    def get_deployments
      fetch_array("/deployments", "deployments")
    end

    # Returns the parsed JSON array served at /deployments/<name>/vms.
    # Raises DirectorError on a non-"200" status or an unusable body.
    def get_deployment_vms(name)
      fetch_array("/deployments/#{name}/vms", "deployment `#{name}'")
    end

    private

    # Performs a GET on path and parses the response body as a JSON array.
    # subject is only used to word the error message.
    def fetch_array(path, subject)
      http = perform_request(:get, path)

      body = http.response
      status = http.response_header.http_status

      unless status == "200"
        raise DirectorError, "Cannot get #{subject} from director at #{http.uri}: #{status} #{body}"
      end

      parse_json(body, Array)
    end

    # Parses json and, when expected_type is given, checks that the result
    # is of that type. Parse failures and type mismatches are both reported
    # as DirectorError.
    def parse_json(json, expected_type = nil)
      parsed = Yajl::Parser.parse(json)
      return parsed if !expected_type || parsed.kind_of?(expected_type)

      raise DirectorError, "Invalid JSON response format, expected #{expected_type}, got #{parsed.class}"
    rescue Yajl::ParseError => e
      raise DirectorError, "Cannot parse director response: #{e.message}"
    end

    # JMS and GO: This effectively turns async requests into synchronous requests.
    # This is a very bad thing to do on eventmachine because it will block the single
    # event loop. This code should be removed and all requests converted
    # to "the eventmachine way".
    #
    # The calling fiber is suspended with Fiber.yield and resumed with the
    # request object from either the callback or the errback, so this must
    # run inside a fiber that can yield.
    def perform_request(method, uri)
      fiber = Fiber.current
      target_uri = @endpoint + uri
      headers = { "authorization" => [@user, @password] }

      request = EM::HttpRequest.new(target_uri).send(method.to_sym, :head => headers)

      resume_fiber = proc { fiber.resume(request) }
      request.callback(&resume_fiber)
      request.errback(&resume_fiber)

      Fiber.yield
    rescue URI::Error
      raise DirectorError, "Invalid URI: #{target_uri}"
    end

  end
end
@@ -0,0 +1,33 @@
1
module Bosh::Monitor
  # Subscribes to director alerts on the message bus and hands well-formed
  # ones to the event processor.
  class DirectorMonitor
    # config must respond to #nats, #logger and #event_processor.
    def initialize(config)
      @nats = config.nats
      @logger = config.logger
      @event_processor = config.event_processor
    end

    # Subscribes to 'hm.director.alert'. Each message is parsed as JSON and,
    # if it passes valid_payload?, processed as an :alert event.
    #
    # Messages that are not valid JSON are logged and dropped instead of
    # letting Yajl::ParseError escape from the subscription callback.
    def subscribe
      @nats.subscribe('hm.director.alert') do |msg|
        begin
          alert = Yajl::Parser.parse(msg)
        rescue Yajl::ParseError => e
          @logger.error("Invalid payload from director: cannot parse JSON: #{e.message}")
          next
        end

        if valid_payload?(alert)
          @event_processor.process(:alert, alert)
        end
      end
    end

    private

    # Returns true when payload is a Hash containing every required key.
    # Otherwise logs the reason and returns false.
    def valid_payload?(payload)
      # Parsed JSON may be an Array, a scalar or nil, none of which has the
      # #keys method the check below relies on.
      unless payload.kind_of?(Hash)
        @logger.error("Invalid payload from director: Hash expected, #{payload.class} given. #{payload.inspect}")
        return false
      end

      missing_keys = %w(id severity title summary created_at) - payload.keys
      valid = missing_keys.empty?

      unless valid
        first_missing_key = missing_keys.first
        @logger.error("Invalid payload from director: the key '#{first_missing_key}' was missing. #{payload.inspect}")
      end

      valid
    end
  end
end
@@ -0,0 +1,19 @@
1
module Bosh::Monitor

  # Root of the health monitor exception hierarchy.
  class Error < StandardError
    # Class-level macro: defines an instance method #code on the calling
    # class that returns the given value.
    def self.code(code = nil)
      define_method(:code) do
        code
      end
    end
  end

  class FatalError < Error
    code 42
  end

  class ConfigError < Error
    code 101
  end

  class DirectorError < Error
    code 201
  end

  class ConnectionError < Error
    code 202
  end

  class EventProcessingError < Error
    code 301
  end

  class InvalidEvent < Error
    code 302
  end

  class PluginError < Error
    code 401
  end
end
@@ -0,0 +1,109 @@
1
module Bosh::Monitor
  # Receives events, drops the ones whose id was already seen for the same
  # kind, and passes the rest to the plugins registered for that kind.
  class EventProcessor
    attr_reader :plugins

    def initialize
      @events = {}   # kind => { event id => { :received_at => seconds } }
      @plugins = {}  # kind => Set of plugins

      @lock = Mutex.new
      @logger = Bhm.logger
    end

    # Registers plugin for every kind in event_kinds and then calls
    # plugin.run, all while holding the lock.
    #
    # Raises FatalError when the plugin responds to validate_options and
    # that call returns a falsy value.
    def add_plugin(plugin, event_kinds = [])
      if plugin.respond_to?(:validate_options) && !plugin.validate_options
        raise FatalError, "Invalid plugin options for `#{plugin.class}'"
      end

      @lock.synchronize do
        event_kinds.each do |kind|
          (@plugins[kind.to_sym] ||= Set.new) << plugin
        end
        plugin.run
      end
    end

    # Builds an event of the given kind from data (Events::Base.create!
    # raises for invalid input) and dispatches it to interested plugins.
    # Duplicate event ids and kinds without plugins are skipped.
    # Always returns true.
    def process(kind, data)
      kind = kind.to_sym
      event = Bhm::Events::Base.create!(kind, data)

      @lock.synchronize do
        seen = (@events[kind] ||= {})

        if seen.key?(event.id)
          @logger.debug("Ignoring duplicate #{event.kind} `#{event.id}'")
          return true
        end

        # We don't really need to store event itself for the moment,
        # as we only use its id to dedup new events.
        seen[event.id] = { :received_at => Time.now.to_i }
      end

      interested = @plugins[kind]
      if interested.nil? || interested.empty?
        @logger.debug("No plugins are interested in `#{event.kind}' event")
        return true
      end

      interested.each { |plugin| plugin_process(plugin, event) }

      true
    end

    # Number of remembered events, summed over all event kinds.
    def events_count
      @lock.synchronize do
        @events.values.inject(0) { |total, events| total + events.size }
      end
    end

    # Starts (once) a background thread that prunes events every interval
    # seconds, using interval as the event lifetime as well.
    def enable_pruning(interval)
      @reaper ||= Thread.new do
        loop do
          # Some events might be in the system up to 2 * interval
          # seconds this way, but it seems to be a reasonable trade-off
          prune_events(interval)
          sleep(interval)
        end
      end
    end

    # Forgets every event received lifetime seconds ago or earlier and logs
    # the pruned and total counts. Errors are logged, not raised.
    def prune_events(lifetime)
      @lock.synchronize do
        pruned_count = 0
        total_count = 0

        @events.each_value do |list|
          size_before = list.size
          list.delete_if do |_id, data|
            data[:received_at] <= Time.now.to_i - lifetime
          end

          total_count += size_before
          pruned_count += size_before - list.size
        end

        @logger.debug(format("Pruned %s", pluralize(pruned_count, "old event")))
        @logger.debug(format("Total %s", pluralize(total_count, "event")))
      end
    rescue => e
      @logger.error("Error pruning events: #{e}")
      @logger.error(e.backtrace.join("\n"))
    end

    private

    # Runs a single plugin on the event; a Bhm::PluginError is logged so
    # that the remaining plugins still get the event.
    def plugin_process(plugin, event)
      plugin.process(event)
    rescue Bhm::PluginError => e
      @logger.error("Plugin #{plugin.class} failed to process #{event.kind}: #{e}")
    end
  end
end
@@ -0,0 +1,92 @@
1
module Bosh::Monitor
  module Events
    # Alert event built from a Hash of attributes ("id", "severity",
    # "title", "summary", "source", "created_at").
    class Alert < Base

      # Numeric severity => symbolic name (considering Bosh::Agent::Alert)
      SEVERITY_MAP = {
         1 => :alert,
         2 => :critical,
         3 => :error,
         4 => :warning,
        -1 => :ignored
      }

      attr_reader :created_at, :source, :title

      def initialize(attributes = {})
        super
        @kind = :alert

        @id, @severity, @title, @source =
          @attributes.values_at("id", "severity", "title", "source")
        @summary = @attributes["summary"] || @title

        # This rescue is just to preserve existing test behavior. However, this
        # seems like a pretty wacky way to handle errors - wouldn't we rather
        # have a nice exception?
        raw_created_at = @attributes["created_at"]
        @created_at =
          begin
            Time.at(raw_created_at)
          rescue StandardError
            # Keep the raw value; validate reports it as invalid
            raw_created_at
          end
      end

      # Records an error for each missing or malformed attribute.
      def validate
        add_error("id is missing") if @id.nil?
        add_error("severity is missing") if @severity.nil?

        if @severity && !(@severity.kind_of?(Integer) && @severity >= 0)
          add_error("severity is invalid (non-negative integer expected)")
        end

        add_error("title is missing") if @title.nil?
        add_error("timestamp is missing") if @created_at.nil?

        if @created_at && !@created_at.kind_of?(Time)
          add_error('created_at is invalid UNIX timestamp')
        end
      end

      def short_description
        "Severity #{@severity}: #{@source} #{@title}"
      end

      # Symbolic severity when the raw value is in SEVERITY_MAP, otherwise
      # the raw value itself.
      def severity
        SEVERITY_MAP.fetch(@severity, @severity)
      end

      # Hash form of the alert; severity is the raw value and created_at is
      # converted with to_i.
      def to_hash
        {
          kind: "alert",
          id: @id,
          severity: @severity,
          title: @title,
          summary: @summary,
          source: @source,
          created_at: @created_at.to_i
        }
      end

      def to_json
        Yajl::Encoder.encode(to_hash)
      end

      def to_s
        "Alert @ #{@created_at.utc}, severity #{@severity}: #{@summary}"
      end

      # Multi-line description: source and summary lines are left out when
      # those values are nil.
      def to_plain_text
        text = ""
        text << "#{@source}\n" unless @source.nil?
        text << (@title || "Unknown Alert") << "\n"
        text << "Severity: #{@severity}\n"
        text << "Summary: #{@summary}\n" unless @summary.nil?
        text << "Time: #{@created_at.utc}\n"
        text
      end

      # Alerts carry no metrics.
      def metrics
        []
      end

    end
  end
end
@@ -0,0 +1,70 @@
1
module Bosh::Monitor
  module Events
    # Common behaviour of health monitor events: attribute storage, error
    # collection and the factory methods that pick the concrete class.
    class Base
      attr_accessor :id
      attr_reader :logger, :kind, :attributes, :errors

      # Same as create, but raises InvalidEvent (with the joined error
      # messages) when the new event does not validate.
      def self.create!(kind, attributes = {})
        event = create(kind, attributes)
        raise InvalidEvent, event.error_message unless event.valid?

        event
      end

      # Instantiates the event class matching kind ("heartbeat" or "alert")
      # and assigns a random UUID when the event has no id.
      #
      # Raises InvalidEvent when attributes is not a Hash or kind is unknown.
      def self.create(kind, attributes = {})
        unless attributes.kind_of?(Hash)
          raise InvalidEvent, "Cannot create event from #{attributes.class}"
        end

        klass =
          case kind.to_s
          when "heartbeat" then Bhm::Events::Heartbeat
          when "alert" then Bhm::Events::Alert
          else raise InvalidEvent, "Cannot find `#{kind}' event handler"
          end

        klass.new(attributes).tap do |event|
          event.id = SecureRandom.uuid if event.id.nil?
        end
      end

      # Copies attributes into a new Hash with every key converted by to_s.
      def initialize(attributes = {})
        @attributes = {}
        @kind = :unknown

        attributes.each_pair { |key, value| @attributes[key.to_s] = value }

        @logger = Bhm.logger
        @errors = Set.new
      end

      def add_error(error)
        @errors << error
      end

      # Runs validate and reports whether no errors have been recorded.
      def valid?
        validate
        @errors.empty?
      end

      def error_message
        @errors.to_a.join(", ")
      end

      # Methods every concrete event must override; the versions defined
      # here only raise FatalError.
      [:validate, :to_plain_text, :to_hash, :to_json, :metrics].each do |name|
        define_method(name) do
          raise FatalError, "`#{name}' is not implemented by #{self.class}"
        end
      end
    end
  end
end