flapjack 0.7.22 → 0.7.25

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43)
  1. data/CHANGELOG.md +19 -0
  2. data/bin/flapjack +3 -1
  3. data/bin/flapjack-nagios-receiver +5 -4
  4. data/bin/receive-events +2 -2
  5. data/features/events.feature +101 -95
  6. data/features/notification_rules.feature +36 -4
  7. data/features/steps/notifications_steps.rb +4 -0
  8. data/flapjack.gemspec +3 -2
  9. data/lib/flapjack/coordinator.rb +8 -6
  10. data/lib/flapjack/data/entity_check.rb +20 -13
  11. data/lib/flapjack/data/event.rb +4 -7
  12. data/lib/flapjack/data/notification.rb +63 -45
  13. data/lib/flapjack/filters/acknowledgement.rb +26 -24
  14. data/lib/flapjack/filters/delays.rb +46 -42
  15. data/lib/flapjack/filters/ok.rb +31 -34
  16. data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
  17. data/lib/flapjack/filters/unscheduled_maintenance.rb +2 -3
  18. data/lib/flapjack/gateways/email.rb +111 -114
  19. data/lib/flapjack/gateways/email/alert.html.erb +11 -11
  20. data/lib/flapjack/gateways/email/alert.text.erb +19 -6
  21. data/lib/flapjack/gateways/sms_messagenet.rb +15 -5
  22. data/lib/flapjack/gateways/web.rb +3 -4
  23. data/lib/flapjack/gateways/web/public/css/flapjack.css +0 -2
  24. data/lib/flapjack/gateways/web/public/img/flapjack-favicon-32-16.ico +0 -0
  25. data/lib/flapjack/gateways/web/public/img/flapjack-favicon-64-32-24-16.ico +0 -0
  26. data/lib/flapjack/gateways/web/public/img/flapjack-transparent-300.png +0 -0
  27. data/lib/flapjack/gateways/web/public/img/flapjack-transparent-350-400.png +0 -0
  28. data/lib/flapjack/gateways/web/views/_head.html.erb +1 -0
  29. data/lib/flapjack/gateways/web/views/index.html.erb +1 -1
  30. data/lib/flapjack/notifier.rb +2 -3
  31. data/lib/flapjack/pikelet.rb +5 -4
  32. data/lib/flapjack/processor.rb +39 -27
  33. data/lib/flapjack/version.rb +1 -1
  34. data/spec/lib/flapjack/data/entity_check_spec.rb +5 -0
  35. data/spec/lib/flapjack/data/event_spec.rb +0 -1
  36. data/spec/lib/flapjack/gateways/email_spec.rb +5 -9
  37. data/spec/lib/flapjack/gateways/sms_messagenet.spec.rb +80 -1
  38. data/spec/lib/flapjack/gateways/web_spec.rb +1 -1
  39. data/spec/lib/flapjack/pikelet_spec.rb +4 -3
  40. data/spec/lib/flapjack/processor_spec.rb +0 -1
  41. metadata +28 -11
  42. data/lib/flapjack/filters/detect_mass_client_failures.rb +0 -44
  43. data/spec/lib/flapjack/filters/detect_mass_client_failures_spec.rb +0 -6
@@ -262,23 +262,25 @@ module Flapjack
262
262
  details = options[:details]
263
263
  count = options[:count]
264
264
 
265
- if self.state != new_state
266
- client = options[:client]
265
+ old_state = self.state
266
+
267
+ @redis.multi
268
+
269
+ if old_state != new_state
267
270
 
268
271
  # Note the current state (for speedy lookups)
269
272
  @redis.hset("check:#{@key}", 'state', new_state)
270
273
 
271
274
  # FIXME: rename to last_state_change?
272
275
  @redis.hset("check:#{@key}", 'last_change', timestamp)
273
- case state
276
+
277
+ case new_state
274
278
  when STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN
275
279
  @redis.zadd('failed_checks', timestamp, @key)
276
280
  # FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
277
- @redis.zadd("failed_checks:client:#{client}", timestamp, @key) if client
278
281
  else
279
282
  @redis.zrem("failed_checks", @key)
280
283
  # FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
281
- @redis.zrem("failed_checks:client:#{client}", @key) if client
282
284
  end
283
285
 
284
286
  # Retain event data for entity:check pair
@@ -291,22 +293,27 @@ module Flapjack
291
293
  @redis.zadd("#{@key}:sorted_state_timestamps", timestamp, timestamp)
292
294
  end
293
295
 
296
+ # Track when we last saw an event for a particular entity:check pair
297
+ self.last_update = timestamp
298
+
294
299
  # Even if this isn't a state change, we need to update the current state
295
300
  # hash summary and details (as they may have changed)
296
301
  @redis.hset("check:#{@key}", 'summary', (summary || ''))
297
302
  @redis.hset("check:#{@key}", 'details', (details || ''))
298
- end
299
303
 
300
- def last_update
301
- lu = @redis.hget("check:#{@key}", 'last_update')
302
- return unless (lu && lu =~ /^\d+$/)
303
- lu.to_i
304
+ @redis.exec
304
305
  end
305
306
 
306
307
  def last_update=(timestamp)
307
308
  @redis.hset("check:#{@key}", 'last_update', timestamp)
308
309
  @redis.zadd("current_checks:#{entity.name}", timestamp, check)
309
- @redis.zadd("current_entities", timestamp, entity.name)
310
+ @redis.zadd('current_entities', timestamp, entity.name)
311
+ end
312
+
313
+ def last_update
314
+ lu = @redis.hget("check:#{@key}", 'last_update')
315
+ return unless lu && !!(lu =~ /^\d+$/)
316
+ lu.to_i
310
317
  end
311
318
 
312
319
  # disables a check (removes currency)
@@ -320,12 +327,12 @@ module Flapjack
320
327
  end
321
328
 
322
329
  def enabled?
323
- !! @redis.zscore("current_checks:#{entity.name}", check)
330
+ !!@redis.zscore("current_checks:#{entity.name}", check)
324
331
  end
325
332
 
326
333
  def last_change
327
334
  lc = @redis.hget("check:#{@key}", 'last_change')
328
- return unless (lc && lc =~ /^\d+$/)
335
+ return unless lc && !!(lc =~ /^\d+$/)
329
336
  lc.to_i
330
337
  end
331
338
 
@@ -6,7 +6,7 @@ module Flapjack
6
6
  module Data
7
7
  class Event
8
8
 
9
- attr_accessor :counter, :previous_state, :previous_state_duration, :tags
9
+ attr_accessor :counter, :tags
10
10
 
11
11
  attr_reader :check, :summary, :details, :acknowledgement_id
12
12
 
@@ -57,9 +57,12 @@ module Flapjack
57
57
  end
58
58
 
59
59
  # creates, or modifies, an event object and adds it to the events list in redis
60
+ # 'entity' => entity,
61
+ # 'check' => check,
60
62
  # 'type' => 'service',
61
63
  # 'state' => state,
62
64
  # 'summary' => check_output,
65
+ # 'details' => check_long_output,
63
66
  # 'time' => timestamp
64
67
  def self.add(evt, opts = {})
65
68
  raise "Redis connection not set" unless redis = opts[:redis]
@@ -129,12 +132,6 @@ module Flapjack
129
132
  (entity || '-') + ':' + (check || '-')
130
133
  end
131
134
 
132
- # FIXME: site specific
133
- def client
134
- return unless entity
135
- entity.split('-').first
136
- end
137
-
138
135
  def type
139
136
  return unless @type
140
137
  @type.downcase
@@ -10,7 +10,7 @@ module Flapjack
10
10
  module Data
11
11
  class Notification
12
12
 
13
- attr_reader :type, :event_id, :event_state, :event_count
13
+ attr_reader :type, :event_id, :state
14
14
 
15
15
  def self.type_for_event(event)
16
16
  case event.type
@@ -49,18 +49,21 @@ module Flapjack
49
49
  last_state = opts[:last_state] || {}
50
50
 
51
51
  tag_data = event.tags.is_a?(Set) ? event.tags.to_a : nil
52
- notif = {'event_id' => event.id,
53
- 'state' => event.state,
54
- 'summary' => event.summary,
55
- 'last_state' => last_state[:state],
56
- 'last_summary' => last_state[:summary],
57
- 'details' => event.details,
58
- 'time' => event.time,
59
- 'duration' => event.duration || nil,
60
- 'type' => opts[:type] || type_for_event(event),
61
- 'severity' => opts[:severity],
62
- 'count' => event.counter,
63
- 'tags' => tag_data }
52
+ notif = {'event_id' => event.id,
53
+ 'state' => event.state,
54
+ 'summary' => event.summary,
55
+ 'details' => event.details,
56
+ 'time' => event.time,
57
+ 'duration' => event.duration,
58
+ 'count' => event.counter,
59
+ 'last_state' => last_state[:state],
60
+ 'last_summary' => last_state[:summary],
61
+ 'state_duration' => opts[:state_duration],
62
+
63
+ 'type' => opts[:type] || type_for_event(event),
64
+ 'severity' => opts[:severity],
65
+
66
+ 'tags' => tag_data }
64
67
 
65
68
  redis.rpush(queue, Oj.dump(notif))
66
69
  end
@@ -88,17 +91,22 @@ module Flapjack
88
91
  self.new( parsed )
89
92
  end
90
93
 
94
+ def ok?
95
+ ['ok', 'up'].include?(@state)
96
+ end
97
+
91
98
  def contents
92
99
  @contents ||= {'event_id' => @event_id,
93
- 'state' => @event_state,
94
- 'summary' => @event_summary,
95
- 'last_state' => @last_event_state,
96
- 'last_summary' => @last_event_summary,
97
- 'details' => @event_details,
98
- 'time' => @event_time,
99
- 'duration' => @event_duration,
100
+ 'state' => @state,
101
+ 'summary' => @summary,
102
+ 'duration' => @duration,
103
+ 'last_state' => @last_state,
104
+ 'last_summary' => @last_summary,
105
+ 'state_duration' => @state_duration,
106
+ 'details' => @details,
107
+ 'time' => @time,
100
108
  'notification_type' => @type,
101
- 'event_count' => @event_count,
109
+ 'event_count' => @count,
102
110
  'tags' => @tags
103
111
  }
104
112
  end
@@ -115,7 +123,7 @@ module Flapjack
115
123
  media = contact.media
116
124
 
117
125
  logger.debug "Notification#messages: creating messages for contact: #{contact_id} " +
118
- "event_id: \"#{@event_id}\" state: #{@event_state} event_tags: #{@tags.to_json} media: #{media.inspect}"
126
+ "event_id: \"#{@event_id}\" state: #{@state} event_tags: #{@tags.to_json} media: #{media.inspect}"
119
127
  rlen = rules.length
120
128
  logger.debug "found #{rlen} rule#{(rlen == 1) ? '' : 's'} for contact #{contact_id}"
121
129
 
@@ -132,7 +140,7 @@ module Flapjack
132
140
 
133
141
  logger.debug "#{matchers.length} matchers remain for this contact after time, entity and tags are matched:"
134
142
  matchers.each do |matcher|
135
- logger.debug "matcher: #{matcher.to_json}"
143
+ logger.debug " - #{matcher.to_json}"
136
144
  end
137
145
 
138
146
  # delete any general matchers if there are more specific matchers left
@@ -143,35 +151,42 @@ module Flapjack
143
151
  matchers.reject! {|matcher| !matcher.is_specific? }
144
152
 
145
153
  if num_matchers != matchers.length
146
- logger.debug("notification: removal of general matchers when entity specific matchers are present: number of matchers changed from #{num_matchers} to #{matchers.length} for contact id: #{contact_id}")
154
+ logger.debug("removal of general matchers when entity specific matchers are present: number of matchers changed from #{num_matchers} to #{matchers.length} for contact id: #{contact_id}")
147
155
  matchers.each do |matcher|
148
- logger.debug "matcher: #{matcher.to_json}"
156
+ logger.debug " - #{matcher.to_json}"
149
157
  end
150
158
  end
151
159
  end
152
160
 
153
161
  # delete media based on blackholes
154
- next if matchers.any? {|matcher| matcher.blackhole?(@event_state) }
155
-
156
- logger.debug "notification: num matchers after removing blackhole matchers: #{matchers.size}"
162
+ blackhole_matchers = matchers.map {|matcher| matcher.blackhole?(@severity) ? matcher : nil }.compact
163
+ if blackhole_matchers.length > 0
164
+ logger.debug "dropping this media as #{blackhole_matchers.length} blackhole matchers are present:"
165
+ blackhole_matchers.each {|bm|
166
+ logger.debug " - #{bm.to_json}"
167
+ }
168
+ next
169
+ else
170
+ logger.debug "no blackhole matchers matched"
171
+ end
157
172
 
158
173
  rule_media = matchers.collect{|matcher|
159
174
  matcher.media_for_severity(@severity)
160
175
  }.flatten.uniq
161
176
 
162
- logger.debug "notification: collected media_for_severity(#{@severity}): #{rule_media}"
177
+ logger.debug "collected media_for_severity(#{@severity}): #{rule_media}"
163
178
  rule_media = rule_media.reject {|medium|
164
179
  contact.drop_notifications?(:media => medium,
165
180
  :check => @event_id,
166
- :state => @event_state)
181
+ :state => @state)
167
182
  }
168
183
 
169
- logger.debug "notification: media after contact_drop?: #{rule_media}"
184
+ logger.debug "media after contact_drop?: #{rule_media}"
170
185
 
171
186
  media.select {|medium, address| rule_media.include?(medium) }
172
187
  end
173
188
 
174
- logger.debug "notification: media_to_use: #{media_to_use}"
189
+ logger.debug "media_to_use: #{media_to_use}"
175
190
 
176
191
  media_to_use.each_pair.inject([]) { |ret, (k, v)|
177
192
  m = Flapjack::Data::Message.for_contact(contact,
@@ -186,19 +201,22 @@ module Flapjack
186
201
 
187
202
  # created from parsed JSON, so opts keys are in strings
188
203
  def initialize(opts = {})
189
- @event_id = opts['event_id']
190
- @event_state = opts['state']
191
- @event_summary = opts['summary']
192
- @event_details = opts['details']
193
- @event_time = opts['time']
194
- @event_duration = opts['duration']
195
- @event_count = opts['count']
196
- @last_event_state = opts['last_state']
197
- @last_event_summary = opts['last_summary']
198
- @type = opts['type']
199
- @severity = opts['severity']
200
- tags = opts['tags']
201
- @tags = tags.is_a?(Array) ? Flapjack::Data::TagSet.new(tags) : nil
204
+ @event_id = opts['event_id']
205
+ @state = opts['state']
206
+ @summary = opts['summary']
207
+ @details = opts['details']
208
+ @time = opts['time']
209
+ @count = opts['count']
210
+
211
+ @last_state = opts['last_state']
212
+ @last_summary = opts['last_summary']
213
+ @state_duration = opts['state_duration']
214
+
215
+ @type = opts['type']
216
+ @severity = opts['severity']
217
+
218
+ tags = opts['tags']
219
+ @tags = tags.is_a?(Array) ? Flapjack::Data::TagSet.new(tags) : nil
202
220
  end
203
221
 
204
222
  # # time restrictions match?
@@ -11,32 +11,34 @@ module Flapjack
11
11
  class Acknowledgement
12
12
  include Base
13
13
 
14
- def block?(event)
14
+ def block?(event, entity_check, previous_state)
15
15
  timestamp = Time.now.to_i
16
- result = false
17
- if event.type == 'action'
18
- if event.acknowledgement?
19
- if @redis.zscore("failed_checks", event.id)
20
- ec = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
21
- if ec.nil?
22
- @logger.error "Filter: Acknowledgement: unknown entity for event '#{event.id}'"
23
- else
24
- ec.create_unscheduled_maintenance(timestamp,
25
- (event.duration || (4 * 60 * 60)),
26
- :summary => event.summary)
27
- message = "unscheduled maintenance created for #{event.id}"
28
- end
29
- else
30
- result = true
31
- @logger.debug("Filter: Acknowledgement: blocking because zscore of failed_checks for #{event.id} is false") unless @redis.zscore("failed_checks", event.id)
32
- end
33
- else
34
- message = "no action taken"
35
- result = false
36
- end
16
+
17
+ label = 'Filter: Acknowledgement:'
18
+
19
+ return false unless event.type == 'action'
20
+
21
+ unless event.acknowledgement?
22
+ @logger.debug("#{label} pass (not an ack)")
23
+ return false
24
+ end
25
+
26
+ if entity_check.nil?
27
+ @logger.error "#{label} unknown entity for event '#{event.id}'"
28
+ return false
37
29
  end
38
- @logger.debug("Filter: Acknowledgement: #{result ? "block" : "pass"} (#{message})")
39
- result
30
+
31
+ unless @redis.zscore("failed_checks", event.id)
32
+ @logger.debug("#{label} blocking because zscore of failed_checks for #{event.id} is false")
33
+ return true
34
+ end
35
+
36
+ entity_check.create_unscheduled_maintenance(timestamp,
37
+ (event.duration || (4 * 60 * 60)),
38
+ :summary => event.summary)
39
+
40
+ @logger.debug("#{label} pass (unscheduled maintenance created for #{event.id})")
41
+ false
40
42
  end
41
43
  end
42
44
  end
@@ -19,58 +19,62 @@ module Flapjack
19
19
  class Delays
20
20
  include Base
21
21
 
22
- def block?(event)
22
+ def block?(event, entity_check, previous_state)
23
23
  failure_delay = 30
24
24
  resend_delay = 300
25
25
 
26
- result = false
26
+ label = 'Filter: Delays:'
27
27
 
28
- if event.service? && event.failure?
28
+ unless event.service? && event.failure?
29
+ @logger.debug("#{label} pass - not a service event in a failure state")
30
+ return false
31
+ end
32
+
33
+ unless entity_check.failed?
34
+ @logger.debug("#{label} entity_check.failed? returned false ...")
35
+ return false
36
+ end
29
37
 
30
- entity_check = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
31
- current_time = Time.now.to_i
38
+ last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
39
+ last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
40
+ last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
41
+ last_change = entity_check.last_change
42
+ last_notification = entity_check.last_notification
43
+ last_alert_state = last_notification[:type]
44
+ last_alert_timestamp = last_notification[:timestamp]
32
45
 
33
- if entity_check.failed?
34
- last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
35
- last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
36
- last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
37
- last_change = entity_check.last_change
38
- last_notification = entity_check.last_notification
39
- last_alert_state = last_notification[:type]
40
- last_alert_timestamp = last_notification[:timestamp]
46
+ current_time = Time.now.to_i
47
+ current_state_duration = current_time - last_change
48
+ time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
41
49
 
42
- current_state_duration = current_time - last_change
43
- time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
44
- @logger.debug("Filter: Delays: last_problem_alert: #{last_problem_alert.to_s}, " +
45
- "last_change: #{last_change.inspect}, " +
46
- "current_state_duration: #{current_state_duration.inspect}, " +
47
- "time_since_last_alert: #{time_since_last_alert.inspect}, " +
48
- "last_alert_state: [#{last_alert_state.inspect}], " +
49
- "event.state: [#{event.state.inspect}], " +
50
- "last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
51
- if (current_state_duration < failure_delay)
52
- result = true
53
- @logger.debug("Filter: Delays: blocking because duration of current failure " +
54
- "(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
55
- elsif !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
56
- (last_alert_state.to_s == event.state)
50
+ @logger.debug("#{label} last_problem_alert: #{last_problem_alert.to_s}, " +
51
+ "last_change: #{last_change.inspect}, " +
52
+ "current_state_duration: #{current_state_duration.inspect}, " +
53
+ "time_since_last_alert: #{time_since_last_alert.inspect}, " +
54
+ "last_alert_state: [#{last_alert_state.inspect}], " +
55
+ "event.state: [#{event.state.inspect}], " +
56
+ "last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
57
57
 
58
- result = true
59
- @logger.debug("Filter: Delays: blocking because time since last alert for " +
60
- "current problem (#{time_since_last_alert}) is less than " +
61
- "resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
62
- "is equal to current event state (#{event.state})")
63
- else
64
- @logger.debug("Filter: Delays: not blocking because neither of the time comparison " +
65
- "conditions were met")
66
- end
67
- else
68
- @logger.debug("Filter: Delays: entity_check.failed? returned false ...")
69
- end
58
+ if current_state_duration < failure_delay
59
+ @logger.debug("#{label} block - duration of current failure " +
60
+ "(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
61
+ return true
70
62
  end
71
63
 
72
- @logger.debug("Filter: Delays: #{result ? "block" : "pass"}")
73
- result
64
+ if !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
65
+ (last_alert_state.to_s == event.state)
66
+
67
+ @logger.debug("#{label} block - time since last alert for " +
68
+ "current problem (#{time_since_last_alert}) is less than " +
69
+ "resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
70
+ "is equal to current event state (#{event.state})")
71
+ return true
72
+ end
73
+
74
+ @logger.debug("#{label} pass - not blocking because neither of the time comparison " +
75
+ "conditions were met")
76
+ return false
77
+
74
78
  end
75
79
  end
76
80
  end