flapjack 0.7.22 → 0.7.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/CHANGELOG.md +19 -0
  2. data/bin/flapjack +3 -1
  3. data/bin/flapjack-nagios-receiver +5 -4
  4. data/bin/receive-events +2 -2
  5. data/features/events.feature +101 -95
  6. data/features/notification_rules.feature +36 -4
  7. data/features/steps/notifications_steps.rb +4 -0
  8. data/flapjack.gemspec +3 -2
  9. data/lib/flapjack/coordinator.rb +8 -6
  10. data/lib/flapjack/data/entity_check.rb +20 -13
  11. data/lib/flapjack/data/event.rb +4 -7
  12. data/lib/flapjack/data/notification.rb +63 -45
  13. data/lib/flapjack/filters/acknowledgement.rb +26 -24
  14. data/lib/flapjack/filters/delays.rb +46 -42
  15. data/lib/flapjack/filters/ok.rb +31 -34
  16. data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
  17. data/lib/flapjack/filters/unscheduled_maintenance.rb +2 -3
  18. data/lib/flapjack/gateways/email.rb +111 -114
  19. data/lib/flapjack/gateways/email/alert.html.erb +11 -11
  20. data/lib/flapjack/gateways/email/alert.text.erb +19 -6
  21. data/lib/flapjack/gateways/sms_messagenet.rb +15 -5
  22. data/lib/flapjack/gateways/web.rb +3 -4
  23. data/lib/flapjack/gateways/web/public/css/flapjack.css +0 -2
  24. data/lib/flapjack/gateways/web/public/img/flapjack-favicon-32-16.ico +0 -0
  25. data/lib/flapjack/gateways/web/public/img/flapjack-favicon-64-32-24-16.ico +0 -0
  26. data/lib/flapjack/gateways/web/public/img/flapjack-transparent-300.png +0 -0
  27. data/lib/flapjack/gateways/web/public/img/flapjack-transparent-350-400.png +0 -0
  28. data/lib/flapjack/gateways/web/views/_head.html.erb +1 -0
  29. data/lib/flapjack/gateways/web/views/index.html.erb +1 -1
  30. data/lib/flapjack/notifier.rb +2 -3
  31. data/lib/flapjack/pikelet.rb +5 -4
  32. data/lib/flapjack/processor.rb +39 -27
  33. data/lib/flapjack/version.rb +1 -1
  34. data/spec/lib/flapjack/data/entity_check_spec.rb +5 -0
  35. data/spec/lib/flapjack/data/event_spec.rb +0 -1
  36. data/spec/lib/flapjack/gateways/email_spec.rb +5 -9
  37. data/spec/lib/flapjack/gateways/sms_messagenet.spec.rb +80 -1
  38. data/spec/lib/flapjack/gateways/web_spec.rb +1 -1
  39. data/spec/lib/flapjack/pikelet_spec.rb +4 -3
  40. data/spec/lib/flapjack/processor_spec.rb +0 -1
  41. metadata +28 -11
  42. data/lib/flapjack/filters/detect_mass_client_failures.rb +0 -44
  43. data/spec/lib/flapjack/filters/detect_mass_client_failures_spec.rb +0 -6
@@ -262,23 +262,25 @@ module Flapjack
262
262
  details = options[:details]
263
263
  count = options[:count]
264
264
 
265
- if self.state != new_state
266
- client = options[:client]
265
+ old_state = self.state
266
+
267
+ @redis.multi
268
+
269
+ if old_state != new_state
267
270
 
268
271
  # Note the current state (for speedy lookups)
269
272
  @redis.hset("check:#{@key}", 'state', new_state)
270
273
 
271
274
  # FIXME: rename to last_state_change?
272
275
  @redis.hset("check:#{@key}", 'last_change', timestamp)
273
- case state
276
+
277
+ case new_state
274
278
  when STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN
275
279
  @redis.zadd('failed_checks', timestamp, @key)
276
280
  # FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
277
- @redis.zadd("failed_checks:client:#{client}", timestamp, @key) if client
278
281
  else
279
282
  @redis.zrem("failed_checks", @key)
280
283
  # FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
281
- @redis.zrem("failed_checks:client:#{client}", @key) if client
282
284
  end
283
285
 
284
286
  # Retain event data for entity:check pair
@@ -291,22 +293,27 @@ module Flapjack
291
293
  @redis.zadd("#{@key}:sorted_state_timestamps", timestamp, timestamp)
292
294
  end
293
295
 
296
+ # Track when we last saw an event for a particular entity:check pair
297
+ self.last_update = timestamp
298
+
294
299
  # Even if this isn't a state change, we need to update the current state
295
300
  # hash summary and details (as they may have changed)
296
301
  @redis.hset("check:#{@key}", 'summary', (summary || ''))
297
302
  @redis.hset("check:#{@key}", 'details', (details || ''))
298
- end
299
303
 
300
- def last_update
301
- lu = @redis.hget("check:#{@key}", 'last_update')
302
- return unless (lu && lu =~ /^\d+$/)
303
- lu.to_i
304
+ @redis.exec
304
305
  end
305
306
 
306
307
  def last_update=(timestamp)
307
308
  @redis.hset("check:#{@key}", 'last_update', timestamp)
308
309
  @redis.zadd("current_checks:#{entity.name}", timestamp, check)
309
- @redis.zadd("current_entities", timestamp, entity.name)
310
+ @redis.zadd('current_entities', timestamp, entity.name)
311
+ end
312
+
313
+ def last_update
314
+ lu = @redis.hget("check:#{@key}", 'last_update')
315
+ return unless lu && !!(lu =~ /^\d+$/)
316
+ lu.to_i
310
317
  end
311
318
 
312
319
  # disables a check (removes currency)
@@ -320,12 +327,12 @@ module Flapjack
320
327
  end
321
328
 
322
329
  def enabled?
323
- !! @redis.zscore("current_checks:#{entity.name}", check)
330
+ !!@redis.zscore("current_checks:#{entity.name}", check)
324
331
  end
325
332
 
326
333
  def last_change
327
334
  lc = @redis.hget("check:#{@key}", 'last_change')
328
- return unless (lc && lc =~ /^\d+$/)
335
+ return unless lc && !!(lc =~ /^\d+$/)
329
336
  lc.to_i
330
337
  end
331
338
 
@@ -6,7 +6,7 @@ module Flapjack
6
6
  module Data
7
7
  class Event
8
8
 
9
- attr_accessor :counter, :previous_state, :previous_state_duration, :tags
9
+ attr_accessor :counter, :tags
10
10
 
11
11
  attr_reader :check, :summary, :details, :acknowledgement_id
12
12
 
@@ -57,9 +57,12 @@ module Flapjack
57
57
  end
58
58
 
59
59
  # creates, or modifies, an event object and adds it to the events list in redis
60
+ # 'entity' => entity,
61
+ # 'check' => check,
60
62
  # 'type' => 'service',
61
63
  # 'state' => state,
62
64
  # 'summary' => check_output,
65
+ # 'details' => check_long_output,
63
66
  # 'time' => timestamp
64
67
  def self.add(evt, opts = {})
65
68
  raise "Redis connection not set" unless redis = opts[:redis]
@@ -129,12 +132,6 @@ module Flapjack
129
132
  (entity || '-') + ':' + (check || '-')
130
133
  end
131
134
 
132
- # FIXME: site specific
133
- def client
134
- return unless entity
135
- entity.split('-').first
136
- end
137
-
138
135
  def type
139
136
  return unless @type
140
137
  @type.downcase
@@ -10,7 +10,7 @@ module Flapjack
10
10
  module Data
11
11
  class Notification
12
12
 
13
- attr_reader :type, :event_id, :event_state, :event_count
13
+ attr_reader :type, :event_id, :state
14
14
 
15
15
  def self.type_for_event(event)
16
16
  case event.type
@@ -49,18 +49,21 @@ module Flapjack
49
49
  last_state = opts[:last_state] || {}
50
50
 
51
51
  tag_data = event.tags.is_a?(Set) ? event.tags.to_a : nil
52
- notif = {'event_id' => event.id,
53
- 'state' => event.state,
54
- 'summary' => event.summary,
55
- 'last_state' => last_state[:state],
56
- 'last_summary' => last_state[:summary],
57
- 'details' => event.details,
58
- 'time' => event.time,
59
- 'duration' => event.duration || nil,
60
- 'type' => opts[:type] || type_for_event(event),
61
- 'severity' => opts[:severity],
62
- 'count' => event.counter,
63
- 'tags' => tag_data }
52
+ notif = {'event_id' => event.id,
53
+ 'state' => event.state,
54
+ 'summary' => event.summary,
55
+ 'details' => event.details,
56
+ 'time' => event.time,
57
+ 'duration' => event.duration,
58
+ 'count' => event.counter,
59
+ 'last_state' => last_state[:state],
60
+ 'last_summary' => last_state[:summary],
61
+ 'state_duration' => opts[:state_duration],
62
+
63
+ 'type' => opts[:type] || type_for_event(event),
64
+ 'severity' => opts[:severity],
65
+
66
+ 'tags' => tag_data }
64
67
 
65
68
  redis.rpush(queue, Oj.dump(notif))
66
69
  end
@@ -88,17 +91,22 @@ module Flapjack
88
91
  self.new( parsed )
89
92
  end
90
93
 
94
+ def ok?
95
+ ['ok', 'up'].include?(@state)
96
+ end
97
+
91
98
  def contents
92
99
  @contents ||= {'event_id' => @event_id,
93
- 'state' => @event_state,
94
- 'summary' => @event_summary,
95
- 'last_state' => @last_event_state,
96
- 'last_summary' => @last_event_summary,
97
- 'details' => @event_details,
98
- 'time' => @event_time,
99
- 'duration' => @event_duration,
100
+ 'state' => @state,
101
+ 'summary' => @summary,
102
+ 'duration' => @duration,
103
+ 'last_state' => @last_state,
104
+ 'last_summary' => @last_summary,
105
+ 'state_duration' => @state_duration,
106
+ 'details' => @details,
107
+ 'time' => @time,
100
108
  'notification_type' => @type,
101
- 'event_count' => @event_count,
109
+ 'event_count' => @count,
102
110
  'tags' => @tags
103
111
  }
104
112
  end
@@ -115,7 +123,7 @@ module Flapjack
115
123
  media = contact.media
116
124
 
117
125
  logger.debug "Notification#messages: creating messages for contact: #{contact_id} " +
118
- "event_id: \"#{@event_id}\" state: #{@event_state} event_tags: #{@tags.to_json} media: #{media.inspect}"
126
+ "event_id: \"#{@event_id}\" state: #{@state} event_tags: #{@tags.to_json} media: #{media.inspect}"
119
127
  rlen = rules.length
120
128
  logger.debug "found #{rlen} rule#{(rlen == 1) ? '' : 's'} for contact #{contact_id}"
121
129
 
@@ -132,7 +140,7 @@ module Flapjack
132
140
 
133
141
  logger.debug "#{matchers.length} matchers remain for this contact after time, entity and tags are matched:"
134
142
  matchers.each do |matcher|
135
- logger.debug "matcher: #{matcher.to_json}"
143
+ logger.debug " - #{matcher.to_json}"
136
144
  end
137
145
 
138
146
  # delete any general matchers if there are more specific matchers left
@@ -143,35 +151,42 @@ module Flapjack
143
151
  matchers.reject! {|matcher| !matcher.is_specific? }
144
152
 
145
153
  if num_matchers != matchers.length
146
- logger.debug("notification: removal of general matchers when entity specific matchers are present: number of matchers changed from #{num_matchers} to #{matchers.length} for contact id: #{contact_id}")
154
+ logger.debug("removal of general matchers when entity specific matchers are present: number of matchers changed from #{num_matchers} to #{matchers.length} for contact id: #{contact_id}")
147
155
  matchers.each do |matcher|
148
- logger.debug "matcher: #{matcher.to_json}"
156
+ logger.debug " - #{matcher.to_json}"
149
157
  end
150
158
  end
151
159
  end
152
160
 
153
161
  # delete media based on blackholes
154
- next if matchers.any? {|matcher| matcher.blackhole?(@event_state) }
155
-
156
- logger.debug "notification: num matchers after removing blackhole matchers: #{matchers.size}"
162
+ blackhole_matchers = matchers.map {|matcher| matcher.blackhole?(@severity) ? matcher : nil }.compact
163
+ if blackhole_matchers.length > 0
164
+ logger.debug "dropping this media as #{blackhole_matchers.length} blackhole matchers are present:"
165
+ blackhole_matchers.each {|bm|
166
+ logger.debug " - #{bm.to_json}"
167
+ }
168
+ next
169
+ else
170
+ logger.debug "no blackhole matchers matched"
171
+ end
157
172
 
158
173
  rule_media = matchers.collect{|matcher|
159
174
  matcher.media_for_severity(@severity)
160
175
  }.flatten.uniq
161
176
 
162
- logger.debug "notification: collected media_for_severity(#{@severity}): #{rule_media}"
177
+ logger.debug "collected media_for_severity(#{@severity}): #{rule_media}"
163
178
  rule_media = rule_media.reject {|medium|
164
179
  contact.drop_notifications?(:media => medium,
165
180
  :check => @event_id,
166
- :state => @event_state)
181
+ :state => @state)
167
182
  }
168
183
 
169
- logger.debug "notification: media after contact_drop?: #{rule_media}"
184
+ logger.debug "media after contact_drop?: #{rule_media}"
170
185
 
171
186
  media.select {|medium, address| rule_media.include?(medium) }
172
187
  end
173
188
 
174
- logger.debug "notification: media_to_use: #{media_to_use}"
189
+ logger.debug "media_to_use: #{media_to_use}"
175
190
 
176
191
  media_to_use.each_pair.inject([]) { |ret, (k, v)|
177
192
  m = Flapjack::Data::Message.for_contact(contact,
@@ -186,19 +201,22 @@ module Flapjack
186
201
 
187
202
  # created from parsed JSON, so opts keys are in strings
188
203
  def initialize(opts = {})
189
- @event_id = opts['event_id']
190
- @event_state = opts['state']
191
- @event_summary = opts['summary']
192
- @event_details = opts['details']
193
- @event_time = opts['time']
194
- @event_duration = opts['duration']
195
- @event_count = opts['count']
196
- @last_event_state = opts['last_state']
197
- @last_event_summary = opts['last_summary']
198
- @type = opts['type']
199
- @severity = opts['severity']
200
- tags = opts['tags']
201
- @tags = tags.is_a?(Array) ? Flapjack::Data::TagSet.new(tags) : nil
204
+ @event_id = opts['event_id']
205
+ @state = opts['state']
206
+ @summary = opts['summary']
207
+ @details = opts['details']
208
+ @time = opts['time']
209
+ @count = opts['count']
210
+
211
+ @last_state = opts['last_state']
212
+ @last_summary = opts['last_summary']
213
+ @state_duration = opts['state_duration']
214
+
215
+ @type = opts['type']
216
+ @severity = opts['severity']
217
+
218
+ tags = opts['tags']
219
+ @tags = tags.is_a?(Array) ? Flapjack::Data::TagSet.new(tags) : nil
202
220
  end
203
221
 
204
222
  # # time restrictions match?
@@ -11,32 +11,34 @@ module Flapjack
11
11
  class Acknowledgement
12
12
  include Base
13
13
 
14
- def block?(event)
14
+ def block?(event, entity_check, previous_state)
15
15
  timestamp = Time.now.to_i
16
- result = false
17
- if event.type == 'action'
18
- if event.acknowledgement?
19
- if @redis.zscore("failed_checks", event.id)
20
- ec = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
21
- if ec.nil?
22
- @logger.error "Filter: Acknowledgement: unknown entity for event '#{event.id}'"
23
- else
24
- ec.create_unscheduled_maintenance(timestamp,
25
- (event.duration || (4 * 60 * 60)),
26
- :summary => event.summary)
27
- message = "unscheduled maintenance created for #{event.id}"
28
- end
29
- else
30
- result = true
31
- @logger.debug("Filter: Acknowledgement: blocking because zscore of failed_checks for #{event.id} is false") unless @redis.zscore("failed_checks", event.id)
32
- end
33
- else
34
- message = "no action taken"
35
- result = false
36
- end
16
+
17
+ label = 'Filter: Acknowledgement:'
18
+
19
+ return false unless event.type == 'action'
20
+
21
+ unless event.acknowledgement?
22
+ @logger.debug("#{label} pass (not an ack)")
23
+ return false
24
+ end
25
+
26
+ if entity_check.nil?
27
+ @logger.error "#{label} unknown entity for event '#{event.id}'"
28
+ return false
37
29
  end
38
- @logger.debug("Filter: Acknowledgement: #{result ? "block" : "pass"} (#{message})")
39
- result
30
+
31
+ unless @redis.zscore("failed_checks", event.id)
32
+ @logger.debug("#{label} blocking because zscore of failed_checks for #{event.id} is false")
33
+ return true
34
+ end
35
+
36
+ entity_check.create_unscheduled_maintenance(timestamp,
37
+ (event.duration || (4 * 60 * 60)),
38
+ :summary => event.summary)
39
+
40
+ @logger.debug("#{label} pass (unscheduled maintenance created for #{event.id})")
41
+ false
40
42
  end
41
43
  end
42
44
  end
@@ -19,58 +19,62 @@ module Flapjack
19
19
  class Delays
20
20
  include Base
21
21
 
22
- def block?(event)
22
+ def block?(event, entity_check, previous_state)
23
23
  failure_delay = 30
24
24
  resend_delay = 300
25
25
 
26
- result = false
26
+ label = 'Filter: Delays:'
27
27
 
28
- if event.service? && event.failure?
28
+ unless event.service? && event.failure?
29
+ @logger.debug("#{label} pass - not a service event in a failure state")
30
+ return false
31
+ end
32
+
33
+ unless entity_check.failed?
34
+ @logger.debug("#{label} entity_check.failed? returned false ...")
35
+ return false
36
+ end
29
37
 
30
- entity_check = Flapjack::Data::EntityCheck.for_event_id(event.id, :redis => @redis)
31
- current_time = Time.now.to_i
38
+ last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
39
+ last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
40
+ last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
41
+ last_change = entity_check.last_change
42
+ last_notification = entity_check.last_notification
43
+ last_alert_state = last_notification[:type]
44
+ last_alert_timestamp = last_notification[:timestamp]
32
45
 
33
- if entity_check.failed?
34
- last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
35
- last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
36
- last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
37
- last_change = entity_check.last_change
38
- last_notification = entity_check.last_notification
39
- last_alert_state = last_notification[:type]
40
- last_alert_timestamp = last_notification[:timestamp]
46
+ current_time = Time.now.to_i
47
+ current_state_duration = current_time - last_change
48
+ time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
41
49
 
42
- current_state_duration = current_time - last_change
43
- time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
44
- @logger.debug("Filter: Delays: last_problem_alert: #{last_problem_alert.to_s}, " +
45
- "last_change: #{last_change.inspect}, " +
46
- "current_state_duration: #{current_state_duration.inspect}, " +
47
- "time_since_last_alert: #{time_since_last_alert.inspect}, " +
48
- "last_alert_state: [#{last_alert_state.inspect}], " +
49
- "event.state: [#{event.state.inspect}], " +
50
- "last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
51
- if (current_state_duration < failure_delay)
52
- result = true
53
- @logger.debug("Filter: Delays: blocking because duration of current failure " +
54
- "(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
55
- elsif !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
56
- (last_alert_state.to_s == event.state)
50
+ @logger.debug("#{label} last_problem_alert: #{last_problem_alert.to_s}, " +
51
+ "last_change: #{last_change.inspect}, " +
52
+ "current_state_duration: #{current_state_duration.inspect}, " +
53
+ "time_since_last_alert: #{time_since_last_alert.inspect}, " +
54
+ "last_alert_state: [#{last_alert_state.inspect}], " +
55
+ "event.state: [#{event.state.inspect}], " +
56
+ "last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
57
57
 
58
- result = true
59
- @logger.debug("Filter: Delays: blocking because time since last alert for " +
60
- "current problem (#{time_since_last_alert}) is less than " +
61
- "resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
62
- "is equal to current event state (#{event.state})")
63
- else
64
- @logger.debug("Filter: Delays: not blocking because neither of the time comparison " +
65
- "conditions were met")
66
- end
67
- else
68
- @logger.debug("Filter: Delays: entity_check.failed? returned false ...")
69
- end
58
+ if current_state_duration < failure_delay
59
+ @logger.debug("#{label} block - duration of current failure " +
60
+ "(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
61
+ return true
70
62
  end
71
63
 
72
- @logger.debug("Filter: Delays: #{result ? "block" : "pass"}")
73
- result
64
+ if !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
65
+ (last_alert_state.to_s == event.state)
66
+
67
+ @logger.debug("#{label} block - time since last alert for " +
68
+ "current problem (#{time_since_last_alert}) is less than " +
69
+ "resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
70
+ "is equal to current event state (#{event.state})")
71
+ return true
72
+ end
73
+
74
+ @logger.debug("#{label} pass - not blocking because neither of the time comparison " +
75
+ "conditions were met")
76
+ return false
77
+
74
78
  end
75
79
  end
76
80
  end