flapjack 0.7.22 → 0.7.25
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +19 -0
- data/bin/flapjack +3 -1
- data/bin/flapjack-nagios-receiver +5 -4
- data/bin/receive-events +2 -2
- data/features/events.feature +101 -95
- data/features/notification_rules.feature +36 -4
- data/features/steps/notifications_steps.rb +4 -0
- data/flapjack.gemspec +3 -2
- data/lib/flapjack/coordinator.rb +8 -6
- data/lib/flapjack/data/entity_check.rb +20 -13
- data/lib/flapjack/data/event.rb +4 -7
- data/lib/flapjack/data/notification.rb +63 -45
- data/lib/flapjack/filters/acknowledgement.rb +26 -24
- data/lib/flapjack/filters/delays.rb +46 -42
- data/lib/flapjack/filters/ok.rb +31 -34
- data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
- data/lib/flapjack/filters/unscheduled_maintenance.rb +2 -3
- data/lib/flapjack/gateways/email.rb +111 -114
- data/lib/flapjack/gateways/email/alert.html.erb +11 -11
- data/lib/flapjack/gateways/email/alert.text.erb +19 -6
- data/lib/flapjack/gateways/sms_messagenet.rb +15 -5
- data/lib/flapjack/gateways/web.rb +3 -4
- data/lib/flapjack/gateways/web/public/css/flapjack.css +0 -2
- data/lib/flapjack/gateways/web/public/img/flapjack-favicon-32-16.ico +0 -0
- data/lib/flapjack/gateways/web/public/img/flapjack-favicon-64-32-24-16.ico +0 -0
- data/lib/flapjack/gateways/web/public/img/flapjack-transparent-300.png +0 -0
- data/lib/flapjack/gateways/web/public/img/flapjack-transparent-350-400.png +0 -0
- data/lib/flapjack/gateways/web/views/_head.html.erb +1 -0
- data/lib/flapjack/gateways/web/views/index.html.erb +1 -1
- data/lib/flapjack/notifier.rb +2 -3
- data/lib/flapjack/pikelet.rb +5 -4
- data/lib/flapjack/processor.rb +39 -27
- data/lib/flapjack/version.rb +1 -1
- data/spec/lib/flapjack/data/entity_check_spec.rb +5 -0
- data/spec/lib/flapjack/data/event_spec.rb +0 -1
- data/spec/lib/flapjack/gateways/email_spec.rb +5 -9
- data/spec/lib/flapjack/gateways/sms_messagenet.spec.rb +80 -1
- data/spec/lib/flapjack/gateways/web_spec.rb +1 -1
- data/spec/lib/flapjack/pikelet_spec.rb +4 -3
- data/spec/lib/flapjack/processor_spec.rb +0 -1
- metadata +28 -11
- data/lib/flapjack/filters/detect_mass_client_failures.rb +0 -44
- data/spec/lib/flapjack/filters/detect_mass_client_failures_spec.rb +0 -6
@@ -262,23 +262,25 @@ module Flapjack
|
|
262
262
|
details = options[:details]
|
263
263
|
count = options[:count]
|
264
264
|
|
265
|
-
|
266
|
-
|
265
|
+
old_state = self.state
|
266
|
+
|
267
|
+
@redis.multi
|
268
|
+
|
269
|
+
if old_state != new_state
|
267
270
|
|
268
271
|
# Note the current state (for speedy lookups)
|
269
272
|
@redis.hset("check:#{@key}", 'state', new_state)
|
270
273
|
|
271
274
|
# FIXME: rename to last_state_change?
|
272
275
|
@redis.hset("check:#{@key}", 'last_change', timestamp)
|
273
|
-
|
276
|
+
|
277
|
+
case new_state
|
274
278
|
when STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN
|
275
279
|
@redis.zadd('failed_checks', timestamp, @key)
|
276
280
|
# FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
|
277
|
-
@redis.zadd("failed_checks:client:#{client}", timestamp, @key) if client
|
278
281
|
else
|
279
282
|
@redis.zrem("failed_checks", @key)
|
280
283
|
# FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
|
281
|
-
@redis.zrem("failed_checks:client:#{client}", @key) if client
|
282
284
|
end
|
283
285
|
|
284
286
|
# Retain event data for entity:check pair
|
@@ -291,22 +293,27 @@ module Flapjack
|
|
291
293
|
@redis.zadd("#{@key}:sorted_state_timestamps", timestamp, timestamp)
|
292
294
|
end
|
293
295
|
|
296
|
+
# Track when we last saw an event for a particular entity:check pair
|
297
|
+
self.last_update = timestamp
|
298
|
+
|
294
299
|
# Even if this isn't a state change, we need to update the current state
|
295
300
|
# hash summary and details (as they may have changed)
|
296
301
|
@redis.hset("check:#{@key}", 'summary', (summary || ''))
|
297
302
|
@redis.hset("check:#{@key}", 'details', (details || ''))
|
298
|
-
end
|
299
303
|
|
300
|
-
|
301
|
-
lu = @redis.hget("check:#{@key}", 'last_update')
|
302
|
-
return unless (lu && lu =~ /^\d+$/)
|
303
|
-
lu.to_i
|
304
|
+
@redis.exec
|
304
305
|
end
|
305
306
|
|
306
307
|
def last_update=(timestamp)
|
307
308
|
@redis.hset("check:#{@key}", 'last_update', timestamp)
|
308
309
|
@redis.zadd("current_checks:#{entity.name}", timestamp, check)
|
309
|
-
@redis.zadd(
|
310
|
+
@redis.zadd('current_entities', timestamp, entity.name)
|
311
|
+
end
|
312
|
+
|
313
|
+
def last_update
|
314
|
+
lu = @redis.hget("check:#{@key}", 'last_update')
|
315
|
+
return unless lu && !!(lu =~ /^\d+$/)
|
316
|
+
lu.to_i
|
310
317
|
end
|
311
318
|
|
312
319
|
# disables a check (removes currency)
|
@@ -320,12 +327,12 @@ module Flapjack
|
|
320
327
|
end
|
321
328
|
|
322
329
|
def enabled?
|
323
|
-
|
330
|
+
!!@redis.zscore("current_checks:#{entity.name}", check)
|
324
331
|
end
|
325
332
|
|
326
333
|
def last_change
|
327
334
|
lc = @redis.hget("check:#{@key}", 'last_change')
|
328
|
-
return unless
|
335
|
+
return unless lc && !!(lc =~ /^\d+$/)
|
329
336
|
lc.to_i
|
330
337
|
end
|
331
338
|
|
data/lib/flapjack/data/event.rb
CHANGED
@@ -6,7 +6,7 @@ module Flapjack
|
|
6
6
|
module Data
|
7
7
|
class Event
|
8
8
|
|
9
|
-
attr_accessor :counter, :
|
9
|
+
attr_accessor :counter, :tags
|
10
10
|
|
11
11
|
attr_reader :check, :summary, :details, :acknowledgement_id
|
12
12
|
|
@@ -57,9 +57,12 @@ module Flapjack
|
|
57
57
|
end
|
58
58
|
|
59
59
|
# creates, or modifies, an event object and adds it to the events list in redis
|
60
|
+
# 'entity' => entity,
|
61
|
+
# 'check' => check,
|
60
62
|
# 'type' => 'service',
|
61
63
|
# 'state' => state,
|
62
64
|
# 'summary' => check_output,
|
65
|
+
# 'details' => check_long_output,
|
63
66
|
# 'time' => timestamp
|
64
67
|
def self.add(evt, opts = {})
|
65
68
|
raise "Redis connection not set" unless redis = opts[:redis]
|
@@ -129,12 +132,6 @@ module Flapjack
|
|
129
132
|
(entity || '-') + ':' + (check || '-')
|
130
133
|
end
|
131
134
|
|
132
|
-
# FIXME: site specific
|
133
|
-
def client
|
134
|
-
return unless entity
|
135
|
-
entity.split('-').first
|
136
|
-
end
|
137
|
-
|
138
135
|
def type
|
139
136
|
return unless @type
|
140
137
|
@type.downcase
|
@@ -10,7 +10,7 @@ module Flapjack
|
|
10
10
|
module Data
|
11
11
|
class Notification
|
12
12
|
|
13
|
-
attr_reader :type, :event_id, :
|
13
|
+
attr_reader :type, :event_id, :state
|
14
14
|
|
15
15
|
def self.type_for_event(event)
|
16
16
|
case event.type
|
@@ -49,18 +49,21 @@ module Flapjack
|
|
49
49
|
last_state = opts[:last_state] || {}
|
50
50
|
|
51
51
|
tag_data = event.tags.is_a?(Set) ? event.tags.to_a : nil
|
52
|
-
notif = {'event_id'
|
53
|
-
'state'
|
54
|
-
'summary'
|
55
|
-
'
|
56
|
-
'
|
57
|
-
'
|
58
|
-
'
|
59
|
-
'
|
60
|
-
'
|
61
|
-
'
|
62
|
-
|
63
|
-
'
|
52
|
+
notif = {'event_id' => event.id,
|
53
|
+
'state' => event.state,
|
54
|
+
'summary' => event.summary,
|
55
|
+
'details' => event.details,
|
56
|
+
'time' => event.time,
|
57
|
+
'duration' => event.duration,
|
58
|
+
'count' => event.counter,
|
59
|
+
'last_state' => last_state[:state],
|
60
|
+
'last_summary' => last_state[:summary],
|
61
|
+
'state_duration' => opts[:state_duration],
|
62
|
+
|
63
|
+
'type' => opts[:type] || type_for_event(event),
|
64
|
+
'severity' => opts[:severity],
|
65
|
+
|
66
|
+
'tags' => tag_data }
|
64
67
|
|
65
68
|
redis.rpush(queue, Oj.dump(notif))
|
66
69
|
end
|
@@ -88,17 +91,22 @@ module Flapjack
|
|
88
91
|
self.new( parsed )
|
89
92
|
end
|
90
93
|
|
94
|
+
def ok?
|
95
|
+
['ok', 'up'].include?(@state)
|
96
|
+
end
|
97
|
+
|
91
98
|
def contents
|
92
99
|
@contents ||= {'event_id' => @event_id,
|
93
|
-
'state' => @
|
94
|
-
'summary' => @
|
95
|
-
'
|
96
|
-
'
|
97
|
-
'
|
98
|
-
'
|
99
|
-
'
|
100
|
+
'state' => @state,
|
101
|
+
'summary' => @summary,
|
102
|
+
'duration' => @duration,
|
103
|
+
'last_state' => @last_state,
|
104
|
+
'last_summary' => @last_summary,
|
105
|
+
'state_duration' => @state_duration,
|
106
|
+
'details' => @details,
|
107
|
+
'time' => @time,
|
100
108
|
'notification_type' => @type,
|
101
|
-
'event_count' => @
|
109
|
+
'event_count' => @count,
|
102
110
|
'tags' => @tags
|
103
111
|
}
|
104
112
|
end
|
@@ -115,7 +123,7 @@ module Flapjack
|
|
115
123
|
media = contact.media
|
116
124
|
|
117
125
|
logger.debug "Notification#messages: creating messages for contact: #{contact_id} " +
|
118
|
-
"event_id: \"#{@event_id}\" state: #{@
|
126
|
+
"event_id: \"#{@event_id}\" state: #{@state} event_tags: #{@tags.to_json} media: #{media.inspect}"
|
119
127
|
rlen = rules.length
|
120
128
|
logger.debug "found #{rlen} rule#{(rlen == 1) ? '' : 's'} for contact #{contact_id}"
|
121
129
|
|
@@ -132,7 +140,7 @@ module Flapjack
|
|
132
140
|
|
133
141
|
logger.debug "#{matchers.length} matchers remain for this contact after time, entity and tags are matched:"
|
134
142
|
matchers.each do |matcher|
|
135
|
-
logger.debug "
|
143
|
+
logger.debug " - #{matcher.to_json}"
|
136
144
|
end
|
137
145
|
|
138
146
|
# delete any general matchers if there are more specific matchers left
|
@@ -143,35 +151,42 @@ module Flapjack
|
|
143
151
|
matchers.reject! {|matcher| !matcher.is_specific? }
|
144
152
|
|
145
153
|
if num_matchers != matchers.length
|
146
|
-
logger.debug("
|
154
|
+
logger.debug("removal of general matchers when entity specific matchers are present: number of matchers changed from #{num_matchers} to #{matchers.length} for contact id: #{contact_id}")
|
147
155
|
matchers.each do |matcher|
|
148
|
-
logger.debug "
|
156
|
+
logger.debug " - #{matcher.to_json}"
|
149
157
|
end
|
150
158
|
end
|
151
159
|
end
|
152
160
|
|
153
161
|
# delete media based on blackholes
|
154
|
-
|
155
|
-
|
156
|
-
|
162
|
+
blackhole_matchers = matchers.map {|matcher| matcher.blackhole?(@severity) ? matcher : nil }.compact
|
163
|
+
if blackhole_matchers.length > 0
|
164
|
+
logger.debug "dropping this media as #{blackhole_matchers.length} blackhole matchers are present:"
|
165
|
+
blackhole_matchers.each {|bm|
|
166
|
+
logger.debug " - #{bm.to_json}"
|
167
|
+
}
|
168
|
+
next
|
169
|
+
else
|
170
|
+
logger.debug "no blackhole matchers matched"
|
171
|
+
end
|
157
172
|
|
158
173
|
rule_media = matchers.collect{|matcher|
|
159
174
|
matcher.media_for_severity(@severity)
|
160
175
|
}.flatten.uniq
|
161
176
|
|
162
|
-
logger.debug "
|
177
|
+
logger.debug "collected media_for_severity(#{@severity}): #{rule_media}"
|
163
178
|
rule_media = rule_media.reject {|medium|
|
164
179
|
contact.drop_notifications?(:media => medium,
|
165
180
|
:check => @event_id,
|
166
|
-
:state => @
|
181
|
+
:state => @state)
|
167
182
|
}
|
168
183
|
|
169
|
-
logger.debug "
|
184
|
+
logger.debug "media after contact_drop?: #{rule_media}"
|
170
185
|
|
171
186
|
media.select {|medium, address| rule_media.include?(medium) }
|
172
187
|
end
|
173
188
|
|
174
|
-
logger.debug "
|
189
|
+
logger.debug "media_to_use: #{media_to_use}"
|
175
190
|
|
176
191
|
media_to_use.each_pair.inject([]) { |ret, (k, v)|
|
177
192
|
m = Flapjack::Data::Message.for_contact(contact,
|
@@ -186,19 +201,22 @@ module Flapjack
|
|
186
201
|
|
187
202
|
# created from parsed JSON, so opts keys are in strings
|
188
203
|
def initialize(opts = {})
|
189
|
-
@event_id
|
190
|
-
@
|
191
|
-
@
|
192
|
-
@
|
193
|
-
@
|
194
|
-
@
|
195
|
-
|
196
|
-
@
|
197
|
-
@
|
198
|
-
@
|
199
|
-
|
200
|
-
|
201
|
-
@
|
204
|
+
@event_id = opts['event_id']
|
205
|
+
@state = opts['state']
|
206
|
+
@summary = opts['summary']
|
207
|
+
@details = opts['details']
|
208
|
+
@time = opts['time']
|
209
|
+
@count = opts['count']
|
210
|
+
|
211
|
+
@last_state = opts['last_state']
|
212
|
+
@last_summary = opts['last_summary']
|
213
|
+
@state_duration = opts['state_duration']
|
214
|
+
|
215
|
+
@type = opts['type']
|
216
|
+
@severity = opts['severity']
|
217
|
+
|
218
|
+
tags = opts['tags']
|
219
|
+
@tags = tags.is_a?(Array) ? Flapjack::Data::TagSet.new(tags) : nil
|
202
220
|
end
|
203
221
|
|
204
222
|
# # time restrictions match?
|
@@ -11,32 +11,34 @@ module Flapjack
|
|
11
11
|
class Acknowledgement
|
12
12
|
include Base
|
13
13
|
|
14
|
-
def block?(event)
|
14
|
+
def block?(event, entity_check, previous_state)
|
15
15
|
timestamp = Time.now.to_i
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
else
|
30
|
-
result = true
|
31
|
-
@logger.debug("Filter: Acknowledgement: blocking because zscore of failed_checks for #{event.id} is false") unless @redis.zscore("failed_checks", event.id)
|
32
|
-
end
|
33
|
-
else
|
34
|
-
message = "no action taken"
|
35
|
-
result = false
|
36
|
-
end
|
16
|
+
|
17
|
+
label = 'Filter: Acknowledgement:'
|
18
|
+
|
19
|
+
return false unless event.type == 'action'
|
20
|
+
|
21
|
+
unless event.acknowledgement?
|
22
|
+
@logger.debug("#{label} pass (not an ack)")
|
23
|
+
return false
|
24
|
+
end
|
25
|
+
|
26
|
+
if entity_check.nil?
|
27
|
+
@logger.error "#{label} unknown entity for event '#{event.id}'"
|
28
|
+
return false
|
37
29
|
end
|
38
|
-
|
39
|
-
|
30
|
+
|
31
|
+
unless @redis.zscore("failed_checks", event.id)
|
32
|
+
@logger.debug("#{label} blocking because zscore of failed_checks for #{event.id} is false")
|
33
|
+
return true
|
34
|
+
end
|
35
|
+
|
36
|
+
entity_check.create_unscheduled_maintenance(timestamp,
|
37
|
+
(event.duration || (4 * 60 * 60)),
|
38
|
+
:summary => event.summary)
|
39
|
+
|
40
|
+
@logger.debug("#{label} pass (unscheduled maintenance created for #{event.id})")
|
41
|
+
false
|
40
42
|
end
|
41
43
|
end
|
42
44
|
end
|
@@ -19,58 +19,62 @@ module Flapjack
|
|
19
19
|
class Delays
|
20
20
|
include Base
|
21
21
|
|
22
|
-
def block?(event)
|
22
|
+
def block?(event, entity_check, previous_state)
|
23
23
|
failure_delay = 30
|
24
24
|
resend_delay = 300
|
25
25
|
|
26
|
-
|
26
|
+
label = 'Filter: Delays:'
|
27
27
|
|
28
|
-
|
28
|
+
unless event.service? && event.failure?
|
29
|
+
@logger.debug("#{label} pass - not a service event in a failure state")
|
30
|
+
return false
|
31
|
+
end
|
32
|
+
|
33
|
+
unless entity_check.failed?
|
34
|
+
@logger.debug("#{label} entity_check.failed? returned false ...")
|
35
|
+
return false
|
36
|
+
end
|
29
37
|
|
30
|
-
|
31
|
-
|
38
|
+
last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
|
39
|
+
last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
|
40
|
+
last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
|
41
|
+
last_change = entity_check.last_change
|
42
|
+
last_notification = entity_check.last_notification
|
43
|
+
last_alert_state = last_notification[:type]
|
44
|
+
last_alert_timestamp = last_notification[:timestamp]
|
32
45
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
|
37
|
-
last_change = entity_check.last_change
|
38
|
-
last_notification = entity_check.last_notification
|
39
|
-
last_alert_state = last_notification[:type]
|
40
|
-
last_alert_timestamp = last_notification[:timestamp]
|
46
|
+
current_time = Time.now.to_i
|
47
|
+
current_state_duration = current_time - last_change
|
48
|
+
time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
|
41
49
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
"event.state: [#{event.state.inspect}], " +
|
50
|
-
"last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
|
51
|
-
if (current_state_duration < failure_delay)
|
52
|
-
result = true
|
53
|
-
@logger.debug("Filter: Delays: blocking because duration of current failure " +
|
54
|
-
"(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
|
55
|
-
elsif !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
|
56
|
-
(last_alert_state.to_s == event.state)
|
50
|
+
@logger.debug("#{label} last_problem_alert: #{last_problem_alert.to_s}, " +
|
51
|
+
"last_change: #{last_change.inspect}, " +
|
52
|
+
"current_state_duration: #{current_state_duration.inspect}, " +
|
53
|
+
"time_since_last_alert: #{time_since_last_alert.inspect}, " +
|
54
|
+
"last_alert_state: [#{last_alert_state.inspect}], " +
|
55
|
+
"event.state: [#{event.state.inspect}], " +
|
56
|
+
"last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
|
57
57
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
"is equal to current event state (#{event.state})")
|
63
|
-
else
|
64
|
-
@logger.debug("Filter: Delays: not blocking because neither of the time comparison " +
|
65
|
-
"conditions were met")
|
66
|
-
end
|
67
|
-
else
|
68
|
-
@logger.debug("Filter: Delays: entity_check.failed? returned false ...")
|
69
|
-
end
|
58
|
+
if current_state_duration < failure_delay
|
59
|
+
@logger.debug("#{label} block - duration of current failure " +
|
60
|
+
"(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
|
61
|
+
return true
|
70
62
|
end
|
71
63
|
|
72
|
-
|
73
|
-
|
64
|
+
if !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
|
65
|
+
(last_alert_state.to_s == event.state)
|
66
|
+
|
67
|
+
@logger.debug("#{label} block - time since last alert for " +
|
68
|
+
"current problem (#{time_since_last_alert}) is less than " +
|
69
|
+
"resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
|
70
|
+
"is equal to current event state (#{event.state})")
|
71
|
+
return true
|
72
|
+
end
|
73
|
+
|
74
|
+
@logger.debug("#{label} pass - not blocking because neither of the time comparison " +
|
75
|
+
"conditions were met")
|
76
|
+
return false
|
77
|
+
|
74
78
|
end
|
75
79
|
end
|
76
80
|
end
|