flapjack 0.7.14 → 0.7.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. data/CHANGELOG.md +10 -0
  2. data/etc/flapjack_config.yaml.example +1 -0
  3. data/features/events.feature +5 -0
  4. data/features/notification_rules.feature +1 -1
  5. data/features/steps/events_steps.rb +28 -13
  6. data/features/steps/notifications_steps.rb +1 -1
  7. data/lib/flapjack/coordinator.rb +3 -1
  8. data/lib/flapjack/data/contact.rb +8 -6
  9. data/lib/flapjack/data/entity_check.rb +78 -113
  10. data/lib/flapjack/data/event.rb +54 -65
  11. data/lib/flapjack/data/notification.rb +5 -1
  12. data/lib/flapjack/executive.rb +42 -38
  13. data/lib/flapjack/filters/acknowledgement.rb +5 -5
  14. data/lib/flapjack/filters/base.rb +2 -2
  15. data/lib/flapjack/filters/delays.rb +11 -11
  16. data/lib/flapjack/filters/detect_mass_client_failures.rb +8 -8
  17. data/lib/flapjack/filters/ok.rb +6 -6
  18. data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
  19. data/lib/flapjack/filters/unscheduled_maintenance.rb +3 -2
  20. data/lib/flapjack/gateways/api.rb +374 -277
  21. data/lib/flapjack/gateways/api/entity_check_presenter.rb +52 -21
  22. data/lib/flapjack/gateways/api/entity_presenter.rb +14 -9
  23. data/lib/flapjack/gateways/email.rb +7 -0
  24. data/lib/flapjack/gateways/email/alert.html.haml +13 -1
  25. data/lib/flapjack/gateways/email/alert.text.erb +5 -4
  26. data/lib/flapjack/gateways/jabber.rb +90 -34
  27. data/lib/flapjack/gateways/pagerduty.rb +6 -2
  28. data/lib/flapjack/gateways/web.rb +13 -8
  29. data/lib/flapjack/gateways/web/views/check.haml +70 -45
  30. data/lib/flapjack/gateways/web/views/checks.haml +1 -1
  31. data/lib/flapjack/gateways/web/views/entity.haml +1 -1
  32. data/lib/flapjack/patches.rb +9 -2
  33. data/lib/flapjack/pikelet.rb +14 -10
  34. data/lib/flapjack/utility.rb +10 -4
  35. data/lib/flapjack/version.rb +1 -1
  36. data/spec/lib/flapjack/coordinator_spec.rb +19 -5
  37. data/spec/lib/flapjack/data/entity_check_spec.rb +3 -30
  38. data/spec/lib/flapjack/data/event_spec.rb +96 -1
  39. data/spec/lib/flapjack/executive_spec.rb +5 -11
  40. data/spec/lib/flapjack/gateways/api/entity_check_presenter_spec.rb +22 -3
  41. data/spec/lib/flapjack/gateways/api/entity_presenter_spec.rb +30 -15
  42. data/spec/lib/flapjack/gateways/api_spec.rb +552 -186
  43. data/spec/lib/flapjack/gateways/email_spec.rb +2 -0
  44. data/spec/lib/flapjack/gateways/jabber_spec.rb +5 -4
  45. data/spec/lib/flapjack/gateways/pagerduty_spec.rb +3 -2
  46. data/spec/lib/flapjack/gateways/web_spec.rb +17 -12
  47. data/spec/lib/flapjack/pikelet_spec.rb +5 -2
  48. metadata +4 -5
  49. data/config.ru +0 -11
@@ -8,6 +8,8 @@ module Flapjack
8
8
 
9
9
  attr_accessor :previous_state, :previous_state_duration
10
10
 
11
+ attr_reader :check, :summary, :details, :acknowledgement_id
12
+
11
13
  # Helper method for getting the next event.
12
14
  #
13
15
  # Has a blocking and non-blocking method signature.
@@ -25,9 +27,6 @@ module Flapjack
25
27
  :archive_events => false,
26
28
  :events_archive_maxage => (3 * 60 * 60) }
27
29
  options = defaults.merge(opts)
28
- if options[:logger]
29
- logger = options[:logger]
30
- end
31
30
 
32
31
  if options[:archive_events]
33
32
  dest = "events_archive:#{Time.now.utc.strftime "%Y%m%d%H"}"
@@ -49,10 +48,12 @@ module Flapjack
49
48
  begin
50
49
  parsed = ::JSON.parse( raw )
51
50
  rescue => e
52
- logger.warn("Error deserialising event json: #{e}, raw json: #{raw.inspect}")
51
+ if options[:logger]
52
+ options[:logger].warn("Error deserialising event json: #{e}, raw json: #{raw.inspect}")
53
+ end
53
54
  return nil
54
55
  end
55
- return self.new( parsed )
56
+ self.new( parsed )
56
57
  end
57
58
 
58
59
  # creates, or modifies, an event object and adds it to the events list in redis
@@ -74,38 +75,55 @@ module Flapjack
74
75
  redis.llen('events')
75
76
  end
76
77
 
77
- def initialize(attrs={})
78
- @attrs = attrs
79
- @attrs['time'] = Time.now.to_i unless @attrs.has_key?('time')
78
+ def self.create_acknowledgement(entity_name, check, opts = {})
79
+ data = { 'type' => 'action',
80
+ 'state' => 'acknowledgement',
81
+ 'entity' => entity_name,
82
+ 'check' => check,
83
+ 'summary' => opts[:summary],
84
+ 'duration' => opts[:duration],
85
+ 'acknowledgement_id' => opts[:acknowledgement_id]
86
+ }
87
+ add(data, :redis => opts[:redis])
88
+ end
89
+
90
+ def self.test_notifications(entity_name, check, opts = {})
91
+ data = { 'type' => 'action',
92
+ 'state' => 'test_notifications',
93
+ 'entity' => entity_name,
94
+ 'check' => check,
95
+ 'summary' => opts[:summary],
96
+ 'details' => opts[:details]
97
+ }
98
+ add(data, :redis => opts[:redis])
99
+ end
100
+
101
+ def initialize(attrs = {})
102
+ ['type', 'state', 'entity', 'check', 'time', 'summary', 'details',
103
+ 'acknowledgement_id', 'duration'].each do |key|
104
+ instance_variable_set("@#{key}", attrs[key])
105
+ end
80
106
  end
81
107
 
82
108
  def state
83
- return unless @attrs['state']
84
- @attrs['state'].downcase
109
+ return unless @state
110
+ @state.downcase
85
111
  end
86
112
 
87
113
  def entity
88
- return unless @attrs['entity']
89
- @attrs['entity'].downcase
90
- end
91
-
92
- def check
93
- @attrs['check']
94
- end
95
-
96
-
97
- # FIXME some values are only set for certain event types --
98
- # this may not be the best way to do this
99
- def acknowledgement_id
100
- @attrs['acknowledgement_id']
114
+ return unless @entity
115
+ @entity.downcase
101
116
  end
102
117
 
103
118
  def duration
104
- return unless @attrs['duration']
105
- @attrs['duration'].to_i
119
+ return unless @duration
120
+ @duration.to_i
106
121
  end
107
- # end FIXME
108
122
 
123
+ def time
124
+ return unless @time
125
+ @time.to_i
126
+ end
109
127
 
110
128
  def id
111
129
  (entity || '-') + ':' + (check || '-')
@@ -118,25 +136,8 @@ module Flapjack
118
136
  end
119
137
 
120
138
  def type
121
- return unless @attrs['type']
122
- @attrs['type'].downcase
123
- end
124
-
125
- def summary
126
- @attrs['summary']
127
- end
128
-
129
- def details
130
- @attrs['details']
131
- end
132
-
133
- def time
134
- return unless @attrs['time']
135
- @attrs['time'].to_i
136
- end
137
-
138
- def action?
139
- type == 'action'
139
+ return unless @type
140
+ @type.downcase
140
141
  end
141
142
 
142
143
  def service?
@@ -144,37 +145,25 @@ module Flapjack
144
145
  end
145
146
 
146
147
  def acknowledgement?
147
- action? and state == 'acknowledgement'
148
+ (type == 'action') && (state == 'acknowledgement')
148
149
  end
149
150
 
150
151
  def test_notifications?
151
- action? and state == 'test_notifications'
152
+ (type == 'action') && (state == 'test_notifications')
152
153
  end
153
154
 
154
155
  def ok?
155
- (state == 'ok') or (state == 'up')
156
- end
157
-
158
- def unknown?
159
- state == 'unknown'
160
- end
161
-
162
- def unreachable?
163
- state == 'unreachable'
164
- end
165
-
166
- def warning?
167
- state == 'warning'
168
- end
169
-
170
- def critical?
171
- state == 'critical'
156
+ (state == 'ok') || (state == 'up')
172
157
  end
173
158
 
174
159
  def failure?
175
- warning? or critical? or unknown?
160
+ ['critical', 'warning', 'unknown'].include?(state)
176
161
  end
177
162
 
163
+ # # Not used anywhere
164
+ # def unreachable?
165
+ # state == 'unreachable'
166
+ # end
178
167
  end
179
168
  end
180
169
  end
@@ -9,7 +9,7 @@ module Flapjack
9
9
  class Notification
10
10
 
11
11
  attr_reader :event, :type, :max_notified_severity, :contacts,
12
- :default_timezone
12
+ :default_timezone, :last_state
13
13
 
14
14
  def self.for_event(event, opts = {})
15
15
  self.new(:event => event,
@@ -17,6 +17,7 @@ module Flapjack
17
17
  :max_notified_severity => opts[:max_notified_severity],
18
18
  :contacts => opts[:contacts],
19
19
  :default_timezone => opts[:default_timezone],
20
+ :last_state => opts[:last_state],
20
21
  :logger => opts[:logger])
21
22
  end
22
23
 
@@ -37,6 +38,8 @@ module Flapjack
37
38
  contents = {'event_id' => event_id,
38
39
  'state' => event_state,
39
40
  'summary' => event.summary,
41
+ 'last_state' => @last_state ? @last_state[:state] : nil,
42
+ 'last_summary' => @last_state ? @last_state[:summary] : nil,
40
43
  'details' => event.details,
41
44
  'time' => event.time,
42
45
  'duration' => event.duration || nil,
@@ -122,6 +125,7 @@ module Flapjack
122
125
  @max_notified_severity = opts[:max_notified_severity]
123
126
  @contacts = opts[:contacts]
124
127
  @default_timezone = opts[:default_timezone]
128
+ @last_state = opts[:last_state]
125
129
  @logger = opts[:logger]
126
130
  end
127
131
 
@@ -5,6 +5,9 @@ require 'log4r/outputter/fileoutputter'
5
5
  require 'tzinfo'
6
6
  require 'active_support/time'
7
7
 
8
+ require 'chronic'
9
+ require 'chronic_duration'
10
+
8
11
  require 'flapjack/filters/acknowledgement'
9
12
  require 'flapjack/filters/ok'
10
13
  require 'flapjack/filters/scheduled_maintenance'
@@ -60,9 +63,12 @@ module Flapjack
60
63
  @archive_events = @config['archive_events'] || false
61
64
  @events_archive_maxage = @config['events_archive_maxage']
62
65
 
66
+ ncsm_duration_conf = @config['new_check_scheduled_maintenance_duration'] || '100 years'
67
+ @ncsm_duration = ChronicDuration.parse(ncsm_duration_conf)
68
+
63
69
  # FIXME: Put loading filters into separate method
64
70
  # FIXME: should we make the filters more configurable by the end user?
65
- options = { :log => opts[:logger], :persistence => @redis }
71
+ options = { :logger => opts[:logger], :redis => @redis }
66
72
  @filters = []
67
73
  @filters << Flapjack::Filters::Ok.new(options)
68
74
  @filters << Flapjack::Filters::ScheduledMaintenance.new(options)
@@ -71,7 +77,7 @@ module Flapjack
71
77
  @filters << Flapjack::Filters::Delays.new(options)
72
78
  @filters << Flapjack::Filters::Acknowledgement.new(options)
73
79
 
74
- @boot_time = Time.now
80
+ @boot_time = opts[:boot_time]
75
81
  @fqdn = `/bin/hostname -f`.chomp
76
82
  @pid = Process.pid
77
83
  @instance_id = "#{@fqdn}:#{@pid}"
@@ -142,41 +148,35 @@ module Flapjack
142
148
  pending = Flapjack::Data::Event.pending_count(:redis => @redis)
143
149
  @logger.debug("#{pending} events waiting on the queue")
144
150
  @logger.debug("Raw event received: #{event.inspect}")
145
- time_at = event.time
146
- time_at_str = time_at ? ", #{Time.at(time_at).to_s}" : ''
147
- @logger.debug("Processing Event: #{event.id}, #{event.type}, #{event.state}, #{event.summary}#{time_at_str}")
151
+ return if ('shutdown' == event.type)
148
152
 
149
- entity_check = ('shutdown' == event.type) ? nil :
150
- Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
153
+ event_str = "#{event.id}, #{event.type}, #{event.state}, #{event.summary}"
154
+ event_str << ", #{Time.at(event.time).to_s}" if event.time
155
+ @logger.debug("Processing Event: #{event_str}")
151
156
 
152
- result = update_keys(event, entity_check)
153
- return if result[:shutdown]
157
+ entity_check = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
158
+ timestamp = Time.now.to_i
154
159
 
155
- blocker = nil
160
+ should_notify = update_keys(event, entity_check, timestamp)
156
161
 
157
- if result[:skip_filters]
162
+ if !should_notify
158
163
  @logger.debug("Not generating notifications for event #{event.id} because filtering was skipped")
159
164
  return
160
- else
161
- blocker = @filters.find {|filter| filter.block?(event) }
162
- end
163
-
164
- if blocker
165
+ elsif blocker = @filters.find {|filter| filter.block?(event) }
165
166
  @logger.debug("Not generating notifications for event #{event.id} because this filter blocked: #{blocker.name}")
166
167
  return
167
168
  end
168
169
 
169
- @logger.info("Generating notifications for event #{event.id}, #{event.type}, #{event.state}, #{event.summary}#{time_at_str}")
170
- generate_notification_messages(event, entity_check)
170
+ @logger.info("Generating notifications for event #{event_str}")
171
+ generate_notification_messages(event, entity_check, timestamp)
171
172
  end
172
173
 
173
- def update_keys(event, entity_check)
174
-
174
+ def update_keys(event, entity_check, timestamp)
175
175
  # TODO: run touch_keys from a separate EM timer for efficiency
176
176
  touch_keys
177
177
 
178
- result = { :skip_filters => false }
179
- timestamp = Time.now.to_i
178
+ result = true
179
+
180
180
  @event_count = @redis.hincrby('event_counters', 'all', 1)
181
181
  @redis.hincrby("event_counters:#{@instance_id}", 'all', 1)
182
182
 
@@ -201,23 +201,30 @@ module Flapjack
201
201
  end
202
202
 
203
203
  event.previous_state = entity_check.state
204
- event.previous_state_duration = Time.now.to_i - entity_check.last_change.to_i
205
- @logger.info("No previous state for event #{event.id}") if event.previous_state.nil?
206
-
207
- # If there is a state change, update record with: the time, the new state
208
- if event.state != event.previous_state
209
- entity_check.update_state(event.state, :timestamp => timestamp,
210
- :summary => event.summary, :client => event.client,
211
- :count => @event_count, :details => event.details)
204
+
205
+ if event.previous_state.nil?
206
+ @logger.info("No previous state for event #{event.id}")
207
+
208
+ if @ncsm_duration >= 0
209
+ @logger.info("Setting scheduled maintenance for #{time_period_in_words(@ncsm_duration)}")
210
+ entity_check.create_scheduled_maintenance(:start_time => timestamp,
211
+ :duration => @ncsm_duration, :summary => 'Automatically created for new check')
212
+ end
213
+ else
214
+ event.previous_state_duration = timestamp - entity_check.last_change.to_i
212
215
  end
213
216
 
217
+ entity_check.update_state(event.state, :timestamp => timestamp,
218
+ :summary => event.summary, :client => event.client,
219
+ :count => @event_count, :details => event.details)
220
+
214
221
  # No state change, and event is ok, so no need to run through filters
215
222
  # OR
216
223
  # If the service event's state is ok and there was no previous state, don't alert.
217
224
  # This stops new checks from alerting as "recovery" after they have been added.
218
225
  if !event.previous_state && event.ok?
219
226
  @logger.debug("setting skip_filters to true because there was no previous state and event is ok")
220
- result[:skip_filters] = true
227
+ result = false
221
228
  end
222
229
 
223
230
  entity_check.update_current_scheduled_maintenance
@@ -232,9 +239,6 @@ module Flapjack
232
239
  if event.acknowledgement? && event.acknowledgement_id
233
240
  @redis.hdel('unacknowledged_failures', event.acknowledgement_id)
234
241
  end
235
- when 'shutdown'
236
- # should this be logged as an action instead? being minimally invasive for now
237
- result[:shutdown] = true
238
242
  end
239
243
 
240
244
  result
@@ -242,8 +246,7 @@ module Flapjack
242
246
 
243
247
  # takes an event for which a notification needs to be generated, works out the type of
244
248
  # notification, updates the notification history in redis, generates the notifications
245
- def generate_notification_messages(event, entity_check)
246
- timestamp = Time.now.to_i
249
+ def generate_notification_messages(event, entity_check, timestamp)
247
250
  notification_type = 'unknown'
248
251
  case event.type
249
252
  when 'service'
@@ -274,7 +277,7 @@ module Flapjack
274
277
 
275
278
  if contacts.empty?
276
279
  @logger.debug("No contacts for #{event.id}")
277
- @notifylog.info("#{Time.now.to_s} | #{event.id} | #{notification_type} | NO CONTACTS")
280
+ @notifylog.info("#{Time.at(timestamp).to_s} | #{event.id} | #{notification_type} | NO CONTACTS")
278
281
  return
279
282
  end
280
283
 
@@ -283,6 +286,7 @@ module Flapjack
283
286
  :max_notified_severity => max_notified_severity,
284
287
  :contacts => contacts,
285
288
  :default_timezone => @default_contact_timezone,
289
+ :last_state => entity_check.historical_state_before(timestamp),
286
290
  :logger => @logger)
287
291
 
288
292
  notification.messages.each do |message|
@@ -291,7 +295,7 @@ module Flapjack
291
295
  address = message.address
292
296
  event_id = event.id
293
297
 
294
- @notifylog.info("#{Time.now.to_s} | #{event_id} | " +
298
+ @notifylog.info("#{Time.at(timestamp).to_s} | #{event_id} | " +
295
299
  "#{notification_type} | #{message.contact.id} | #{media_type} | #{address}")
296
300
 
297
301
  unless @queues[media_type.to_sym]
@@ -16,10 +16,10 @@ module Flapjack
16
16
  result = false
17
17
  if event.type == 'action'
18
18
  if event.acknowledgement?
19
- if @persistence.zscore("failed_checks", event.id)
20
- ec = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @persistence)
19
+ if @redis.zscore("failed_checks", event.id)
20
+ ec = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
21
21
  if ec.nil?
22
- @log.error "Filter: Acknowledgement: unknown entity for event '#{event.id}'"
22
+ @logger.error "Filter: Acknowledgement: unknown entity for event '#{event.id}'"
23
23
  else
24
24
  ec.create_unscheduled_maintenance(:start_time => timestamp,
25
25
  :duration => (event.duration || (4 * 60 * 60)),
@@ -28,14 +28,14 @@ module Flapjack
28
28
  end
29
29
  else
30
30
  result = true
31
- @log.debug("Filter: Acknowledgement: blocking because zscore of failed_checks for #{event.id} is false") unless @persistence.zscore("failed_checks", event.id)
31
+ @logger.debug("Filter: Acknowledgement: blocking because zscore of failed_checks for #{event.id} is false") unless @redis.zscore("failed_checks", event.id)
32
32
  end
33
33
  else
34
34
  message = "no action taken"
35
35
  result = false
36
36
  end
37
37
  end
38
- @log.debug("Filter: Acknowledgement: #{result ? "block" : "pass"} (#{message})")
38
+ @logger.debug("Filter: Acknowledgement: #{result ? "block" : "pass"} (#{message})")
39
39
  result
40
40
  end
41
41
  end
@@ -4,8 +4,8 @@ module Flapjack
4
4
  module Filters
5
5
  module Base
6
6
  def initialize(opts={})
7
- @log = opts[:log]
8
- @persistence = opts[:persistence]
7
+ @logger = opts[:logger]
8
+ @redis = opts[:redis]
9
9
  end
10
10
 
11
11
  def name
@@ -25,15 +25,15 @@ module Flapjack
25
25
 
26
26
  result = false
27
27
 
28
- if (event.type == 'service') and (event.failure?)
28
+ if event.service? && event.failure?
29
29
 
30
- entity_check = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @persistence)
30
+ entity_check = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
31
31
  current_time = Time.now.to_i
32
32
 
33
33
  if entity_check.failed?
34
- last_problem_alert = entity_check.last_problem_notification
35
- last_warning_alert = entity_check.last_warning_notification
36
- last_critical_alert = entity_check.last_critical_notification
34
+ last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
35
+ last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
36
+ last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
37
37
  last_change = entity_check.last_change
38
38
  last_notification = entity_check.last_notification
39
39
  last_alert_state = last_notification[:type]
@@ -41,7 +41,7 @@ module Flapjack
41
41
 
42
42
  current_state_duration = current_time - last_change
43
43
  time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
44
- @log.debug("Filter: Delays: last_problem_alert: #{last_problem_alert.to_s}, " +
44
+ @logger.debug("Filter: Delays: last_problem_alert: #{last_problem_alert.to_s}, " +
45
45
  "last_change: #{last_change.inspect}, " +
46
46
  "current_state_duration: #{current_state_duration.inspect}, " +
47
47
  "time_since_last_alert: #{time_since_last_alert.inspect}, " +
@@ -50,26 +50,26 @@ module Flapjack
50
50
  "last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
51
51
  if (current_state_duration < failure_delay)
52
52
  result = true
53
- @log.debug("Filter: Delays: blocking because duration of current failure " +
53
+ @logger.debug("Filter: Delays: blocking because duration of current failure " +
54
54
  "(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
55
55
  elsif !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
56
56
  (last_alert_state.to_s == event.state)
57
57
 
58
58
  result = true
59
- @log.debug("Filter: Delays: blocking because time since last alert for " +
59
+ @logger.debug("Filter: Delays: blocking because time since last alert for " +
60
60
  "current problem (#{time_since_last_alert}) is less than " +
61
61
  "resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
62
62
  "is equal to current event state (#{event.state})")
63
63
  else
64
- @log.debug("Filter: Delays: not blocking because neither of the time comparison " +
64
+ @logger.debug("Filter: Delays: not blocking because neither of the time comparison " +
65
65
  "conditions were met")
66
66
  end
67
67
  else
68
- @log.debug("Filter: Delays: entity_check.failed? returned false ...")
68
+ @logger.debug("Filter: Delays: entity_check.failed? returned false ...")
69
69
  end
70
70
  end
71
71
 
72
- @log.debug("Filter: Delays: #{result ? "block" : "pass"}")
72
+ @logger.debug("Filter: Delays: #{result ? "block" : "pass"}")
73
73
  result
74
74
  end
75
75
  end