flapjack 0.7.22 → 0.7.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +19 -0
- data/bin/flapjack +3 -1
- data/bin/flapjack-nagios-receiver +5 -4
- data/bin/receive-events +2 -2
- data/features/events.feature +101 -95
- data/features/notification_rules.feature +36 -4
- data/features/steps/notifications_steps.rb +4 -0
- data/flapjack.gemspec +3 -2
- data/lib/flapjack/coordinator.rb +8 -6
- data/lib/flapjack/data/entity_check.rb +20 -13
- data/lib/flapjack/data/event.rb +4 -7
- data/lib/flapjack/data/notification.rb +63 -45
- data/lib/flapjack/filters/acknowledgement.rb +26 -24
- data/lib/flapjack/filters/delays.rb +46 -42
- data/lib/flapjack/filters/ok.rb +31 -34
- data/lib/flapjack/filters/scheduled_maintenance.rb +2 -2
- data/lib/flapjack/filters/unscheduled_maintenance.rb +2 -3
- data/lib/flapjack/gateways/email.rb +111 -114
- data/lib/flapjack/gateways/email/alert.html.erb +11 -11
- data/lib/flapjack/gateways/email/alert.text.erb +19 -6
- data/lib/flapjack/gateways/sms_messagenet.rb +15 -5
- data/lib/flapjack/gateways/web.rb +3 -4
- data/lib/flapjack/gateways/web/public/css/flapjack.css +0 -2
- data/lib/flapjack/gateways/web/public/img/flapjack-favicon-32-16.ico +0 -0
- data/lib/flapjack/gateways/web/public/img/flapjack-favicon-64-32-24-16.ico +0 -0
- data/lib/flapjack/gateways/web/public/img/flapjack-transparent-300.png +0 -0
- data/lib/flapjack/gateways/web/public/img/flapjack-transparent-350-400.png +0 -0
- data/lib/flapjack/gateways/web/views/_head.html.erb +1 -0
- data/lib/flapjack/gateways/web/views/index.html.erb +1 -1
- data/lib/flapjack/notifier.rb +2 -3
- data/lib/flapjack/pikelet.rb +5 -4
- data/lib/flapjack/processor.rb +39 -27
- data/lib/flapjack/version.rb +1 -1
- data/spec/lib/flapjack/data/entity_check_spec.rb +5 -0
- data/spec/lib/flapjack/data/event_spec.rb +0 -1
- data/spec/lib/flapjack/gateways/email_spec.rb +5 -9
- data/spec/lib/flapjack/gateways/sms_messagenet.spec.rb +80 -1
- data/spec/lib/flapjack/gateways/web_spec.rb +1 -1
- data/spec/lib/flapjack/pikelet_spec.rb +4 -3
- data/spec/lib/flapjack/processor_spec.rb +0 -1
- metadata +28 -11
- data/lib/flapjack/filters/detect_mass_client_failures.rb +0 -44
- data/spec/lib/flapjack/filters/detect_mass_client_failures_spec.rb +0 -6
@@ -262,23 +262,25 @@ module Flapjack
|
|
262
262
|
details = options[:details]
|
263
263
|
count = options[:count]
|
264
264
|
|
265
|
-
|
266
|
-
|
265
|
+
old_state = self.state
|
266
|
+
|
267
|
+
@redis.multi
|
268
|
+
|
269
|
+
if old_state != new_state
|
267
270
|
|
268
271
|
# Note the current state (for speedy lookups)
|
269
272
|
@redis.hset("check:#{@key}", 'state', new_state)
|
270
273
|
|
271
274
|
# FIXME: rename to last_state_change?
|
272
275
|
@redis.hset("check:#{@key}", 'last_change', timestamp)
|
273
|
-
|
276
|
+
|
277
|
+
case new_state
|
274
278
|
when STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN
|
275
279
|
@redis.zadd('failed_checks', timestamp, @key)
|
276
280
|
# FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
|
277
|
-
@redis.zadd("failed_checks:client:#{client}", timestamp, @key) if client
|
278
281
|
else
|
279
282
|
@redis.zrem("failed_checks", @key)
|
280
283
|
# FIXME: Iterate through a list of tags associated with an entity:check pair, and update counters
|
281
|
-
@redis.zrem("failed_checks:client:#{client}", @key) if client
|
282
284
|
end
|
283
285
|
|
284
286
|
# Retain event data for entity:check pair
|
@@ -291,22 +293,27 @@ module Flapjack
|
|
291
293
|
@redis.zadd("#{@key}:sorted_state_timestamps", timestamp, timestamp)
|
292
294
|
end
|
293
295
|
|
296
|
+
# Track when we last saw an event for a particular entity:check pair
|
297
|
+
self.last_update = timestamp
|
298
|
+
|
294
299
|
# Even if this isn't a state change, we need to update the current state
|
295
300
|
# hash summary and details (as they may have changed)
|
296
301
|
@redis.hset("check:#{@key}", 'summary', (summary || ''))
|
297
302
|
@redis.hset("check:#{@key}", 'details', (details || ''))
|
298
|
-
end
|
299
303
|
|
300
|
-
|
301
|
-
lu = @redis.hget("check:#{@key}", 'last_update')
|
302
|
-
return unless (lu && lu =~ /^\d+$/)
|
303
|
-
lu.to_i
|
304
|
+
@redis.exec
|
304
305
|
end
|
305
306
|
|
306
307
|
def last_update=(timestamp)
|
307
308
|
@redis.hset("check:#{@key}", 'last_update', timestamp)
|
308
309
|
@redis.zadd("current_checks:#{entity.name}", timestamp, check)
|
309
|
-
@redis.zadd(
|
310
|
+
@redis.zadd('current_entities', timestamp, entity.name)
|
311
|
+
end
|
312
|
+
|
313
|
+
def last_update
|
314
|
+
lu = @redis.hget("check:#{@key}", 'last_update')
|
315
|
+
return unless lu && !!(lu =~ /^\d+$/)
|
316
|
+
lu.to_i
|
310
317
|
end
|
311
318
|
|
312
319
|
# disables a check (removes currency)
|
@@ -320,12 +327,12 @@ module Flapjack
|
|
320
327
|
end
|
321
328
|
|
322
329
|
def enabled?
|
323
|
-
|
330
|
+
!!@redis.zscore("current_checks:#{entity.name}", check)
|
324
331
|
end
|
325
332
|
|
326
333
|
def last_change
|
327
334
|
lc = @redis.hget("check:#{@key}", 'last_change')
|
328
|
-
return unless
|
335
|
+
return unless lc && !!(lc =~ /^\d+$/)
|
329
336
|
lc.to_i
|
330
337
|
end
|
331
338
|
|
data/lib/flapjack/data/event.rb
CHANGED
@@ -6,7 +6,7 @@ module Flapjack
|
|
6
6
|
module Data
|
7
7
|
class Event
|
8
8
|
|
9
|
-
attr_accessor :counter, :
|
9
|
+
attr_accessor :counter, :tags
|
10
10
|
|
11
11
|
attr_reader :check, :summary, :details, :acknowledgement_id
|
12
12
|
|
@@ -57,9 +57,12 @@ module Flapjack
|
|
57
57
|
end
|
58
58
|
|
59
59
|
# creates, or modifies, an event object and adds it to the events list in redis
|
60
|
+
# 'entity' => entity,
|
61
|
+
# 'check' => check,
|
60
62
|
# 'type' => 'service',
|
61
63
|
# 'state' => state,
|
62
64
|
# 'summary' => check_output,
|
65
|
+
# 'details' => check_long_output,
|
63
66
|
# 'time' => timestamp
|
64
67
|
def self.add(evt, opts = {})
|
65
68
|
raise "Redis connection not set" unless redis = opts[:redis]
|
@@ -129,12 +132,6 @@ module Flapjack
|
|
129
132
|
(entity || '-') + ':' + (check || '-')
|
130
133
|
end
|
131
134
|
|
132
|
-
# FIXME: site specific
|
133
|
-
def client
|
134
|
-
return unless entity
|
135
|
-
entity.split('-').first
|
136
|
-
end
|
137
|
-
|
138
135
|
def type
|
139
136
|
return unless @type
|
140
137
|
@type.downcase
|
@@ -10,7 +10,7 @@ module Flapjack
|
|
10
10
|
module Data
|
11
11
|
class Notification
|
12
12
|
|
13
|
-
attr_reader :type, :event_id, :
|
13
|
+
attr_reader :type, :event_id, :state
|
14
14
|
|
15
15
|
def self.type_for_event(event)
|
16
16
|
case event.type
|
@@ -49,18 +49,21 @@ module Flapjack
|
|
49
49
|
last_state = opts[:last_state] || {}
|
50
50
|
|
51
51
|
tag_data = event.tags.is_a?(Set) ? event.tags.to_a : nil
|
52
|
-
notif = {'event_id'
|
53
|
-
'state'
|
54
|
-
'summary'
|
55
|
-
'
|
56
|
-
'
|
57
|
-
'
|
58
|
-
'
|
59
|
-
'
|
60
|
-
'
|
61
|
-
'
|
62
|
-
|
63
|
-
'
|
52
|
+
notif = {'event_id' => event.id,
|
53
|
+
'state' => event.state,
|
54
|
+
'summary' => event.summary,
|
55
|
+
'details' => event.details,
|
56
|
+
'time' => event.time,
|
57
|
+
'duration' => event.duration,
|
58
|
+
'count' => event.counter,
|
59
|
+
'last_state' => last_state[:state],
|
60
|
+
'last_summary' => last_state[:summary],
|
61
|
+
'state_duration' => opts[:state_duration],
|
62
|
+
|
63
|
+
'type' => opts[:type] || type_for_event(event),
|
64
|
+
'severity' => opts[:severity],
|
65
|
+
|
66
|
+
'tags' => tag_data }
|
64
67
|
|
65
68
|
redis.rpush(queue, Oj.dump(notif))
|
66
69
|
end
|
@@ -88,17 +91,22 @@ module Flapjack
|
|
88
91
|
self.new( parsed )
|
89
92
|
end
|
90
93
|
|
94
|
+
def ok?
|
95
|
+
['ok', 'up'].include?(@state)
|
96
|
+
end
|
97
|
+
|
91
98
|
def contents
|
92
99
|
@contents ||= {'event_id' => @event_id,
|
93
|
-
'state' => @
|
94
|
-
'summary' => @
|
95
|
-
'
|
96
|
-
'
|
97
|
-
'
|
98
|
-
'
|
99
|
-
'
|
100
|
+
'state' => @state,
|
101
|
+
'summary' => @summary,
|
102
|
+
'duration' => @duration,
|
103
|
+
'last_state' => @last_state,
|
104
|
+
'last_summary' => @last_summary,
|
105
|
+
'state_duration' => @state_duration,
|
106
|
+
'details' => @details,
|
107
|
+
'time' => @time,
|
100
108
|
'notification_type' => @type,
|
101
|
-
'event_count' => @
|
109
|
+
'event_count' => @count,
|
102
110
|
'tags' => @tags
|
103
111
|
}
|
104
112
|
end
|
@@ -115,7 +123,7 @@ module Flapjack
|
|
115
123
|
media = contact.media
|
116
124
|
|
117
125
|
logger.debug "Notification#messages: creating messages for contact: #{contact_id} " +
|
118
|
-
"event_id: \"#{@event_id}\" state: #{@
|
126
|
+
"event_id: \"#{@event_id}\" state: #{@state} event_tags: #{@tags.to_json} media: #{media.inspect}"
|
119
127
|
rlen = rules.length
|
120
128
|
logger.debug "found #{rlen} rule#{(rlen == 1) ? '' : 's'} for contact #{contact_id}"
|
121
129
|
|
@@ -132,7 +140,7 @@ module Flapjack
|
|
132
140
|
|
133
141
|
logger.debug "#{matchers.length} matchers remain for this contact after time, entity and tags are matched:"
|
134
142
|
matchers.each do |matcher|
|
135
|
-
logger.debug "
|
143
|
+
logger.debug " - #{matcher.to_json}"
|
136
144
|
end
|
137
145
|
|
138
146
|
# delete any general matchers if there are more specific matchers left
|
@@ -143,35 +151,42 @@ module Flapjack
|
|
143
151
|
matchers.reject! {|matcher| !matcher.is_specific? }
|
144
152
|
|
145
153
|
if num_matchers != matchers.length
|
146
|
-
logger.debug("
|
154
|
+
logger.debug("removal of general matchers when entity specific matchers are present: number of matchers changed from #{num_matchers} to #{matchers.length} for contact id: #{contact_id}")
|
147
155
|
matchers.each do |matcher|
|
148
|
-
logger.debug "
|
156
|
+
logger.debug " - #{matcher.to_json}"
|
149
157
|
end
|
150
158
|
end
|
151
159
|
end
|
152
160
|
|
153
161
|
# delete media based on blackholes
|
154
|
-
|
155
|
-
|
156
|
-
|
162
|
+
blackhole_matchers = matchers.map {|matcher| matcher.blackhole?(@severity) ? matcher : nil }.compact
|
163
|
+
if blackhole_matchers.length > 0
|
164
|
+
logger.debug "dropping this media as #{blackhole_matchers.length} blackhole matchers are present:"
|
165
|
+
blackhole_matchers.each {|bm|
|
166
|
+
logger.debug " - #{bm.to_json}"
|
167
|
+
}
|
168
|
+
next
|
169
|
+
else
|
170
|
+
logger.debug "no blackhole matchers matched"
|
171
|
+
end
|
157
172
|
|
158
173
|
rule_media = matchers.collect{|matcher|
|
159
174
|
matcher.media_for_severity(@severity)
|
160
175
|
}.flatten.uniq
|
161
176
|
|
162
|
-
logger.debug "
|
177
|
+
logger.debug "collected media_for_severity(#{@severity}): #{rule_media}"
|
163
178
|
rule_media = rule_media.reject {|medium|
|
164
179
|
contact.drop_notifications?(:media => medium,
|
165
180
|
:check => @event_id,
|
166
|
-
:state => @
|
181
|
+
:state => @state)
|
167
182
|
}
|
168
183
|
|
169
|
-
logger.debug "
|
184
|
+
logger.debug "media after contact_drop?: #{rule_media}"
|
170
185
|
|
171
186
|
media.select {|medium, address| rule_media.include?(medium) }
|
172
187
|
end
|
173
188
|
|
174
|
-
logger.debug "
|
189
|
+
logger.debug "media_to_use: #{media_to_use}"
|
175
190
|
|
176
191
|
media_to_use.each_pair.inject([]) { |ret, (k, v)|
|
177
192
|
m = Flapjack::Data::Message.for_contact(contact,
|
@@ -186,19 +201,22 @@ module Flapjack
|
|
186
201
|
|
187
202
|
# created from parsed JSON, so opts keys are in strings
|
188
203
|
def initialize(opts = {})
|
189
|
-
@event_id
|
190
|
-
@
|
191
|
-
@
|
192
|
-
@
|
193
|
-
@
|
194
|
-
@
|
195
|
-
|
196
|
-
@
|
197
|
-
@
|
198
|
-
@
|
199
|
-
|
200
|
-
|
201
|
-
@
|
204
|
+
@event_id = opts['event_id']
|
205
|
+
@state = opts['state']
|
206
|
+
@summary = opts['summary']
|
207
|
+
@details = opts['details']
|
208
|
+
@time = opts['time']
|
209
|
+
@count = opts['count']
|
210
|
+
|
211
|
+
@last_state = opts['last_state']
|
212
|
+
@last_summary = opts['last_summary']
|
213
|
+
@state_duration = opts['state_duration']
|
214
|
+
|
215
|
+
@type = opts['type']
|
216
|
+
@severity = opts['severity']
|
217
|
+
|
218
|
+
tags = opts['tags']
|
219
|
+
@tags = tags.is_a?(Array) ? Flapjack::Data::TagSet.new(tags) : nil
|
202
220
|
end
|
203
221
|
|
204
222
|
# # time restrictions match?
|
@@ -11,32 +11,34 @@ module Flapjack
|
|
11
11
|
class Acknowledgement
|
12
12
|
include Base
|
13
13
|
|
14
|
-
def block?(event)
|
14
|
+
def block?(event, entity_check, previous_state)
|
15
15
|
timestamp = Time.now.to_i
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
else
|
30
|
-
result = true
|
31
|
-
@logger.debug("Filter: Acknowledgement: blocking because zscore of failed_checks for #{event.id} is false") unless @redis.zscore("failed_checks", event.id)
|
32
|
-
end
|
33
|
-
else
|
34
|
-
message = "no action taken"
|
35
|
-
result = false
|
36
|
-
end
|
16
|
+
|
17
|
+
label = 'Filter: Acknowledgement:'
|
18
|
+
|
19
|
+
return false unless event.type == 'action'
|
20
|
+
|
21
|
+
unless event.acknowledgement?
|
22
|
+
@logger.debug("#{label} pass (not an ack)")
|
23
|
+
return false
|
24
|
+
end
|
25
|
+
|
26
|
+
if entity_check.nil?
|
27
|
+
@logger.error "#{label} unknown entity for event '#{event.id}'"
|
28
|
+
return false
|
37
29
|
end
|
38
|
-
|
39
|
-
|
30
|
+
|
31
|
+
unless @redis.zscore("failed_checks", event.id)
|
32
|
+
@logger.debug("#{label} blocking because zscore of failed_checks for #{event.id} is false")
|
33
|
+
return true
|
34
|
+
end
|
35
|
+
|
36
|
+
entity_check.create_unscheduled_maintenance(timestamp,
|
37
|
+
(event.duration || (4 * 60 * 60)),
|
38
|
+
:summary => event.summary)
|
39
|
+
|
40
|
+
@logger.debug("#{label} pass (unscheduled maintenance created for #{event.id})")
|
41
|
+
false
|
40
42
|
end
|
41
43
|
end
|
42
44
|
end
|
@@ -19,58 +19,62 @@ module Flapjack
|
|
19
19
|
class Delays
|
20
20
|
include Base
|
21
21
|
|
22
|
-
def block?(event)
|
22
|
+
def block?(event, entity_check, previous_state)
|
23
23
|
failure_delay = 30
|
24
24
|
resend_delay = 300
|
25
25
|
|
26
|
-
|
26
|
+
label = 'Filter: Delays:'
|
27
27
|
|
28
|
-
|
28
|
+
unless event.service? && event.failure?
|
29
|
+
@logger.debug("#{label} pass - not a service event in a failure state")
|
30
|
+
return false
|
31
|
+
end
|
32
|
+
|
33
|
+
unless entity_check.failed?
|
34
|
+
@logger.debug("#{label} entity_check.failed? returned false ...")
|
35
|
+
return false
|
36
|
+
end
|
29
37
|
|
30
|
-
|
31
|
-
|
38
|
+
last_problem_alert = entity_check.last_notification_for_state(:problem)[:timestamp]
|
39
|
+
last_warning_alert = entity_check.last_notification_for_state(:warning)[:timestamp]
|
40
|
+
last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
|
41
|
+
last_change = entity_check.last_change
|
42
|
+
last_notification = entity_check.last_notification
|
43
|
+
last_alert_state = last_notification[:type]
|
44
|
+
last_alert_timestamp = last_notification[:timestamp]
|
32
45
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
last_critical_alert = entity_check.last_notification_for_state(:critical)[:timestamp]
|
37
|
-
last_change = entity_check.last_change
|
38
|
-
last_notification = entity_check.last_notification
|
39
|
-
last_alert_state = last_notification[:type]
|
40
|
-
last_alert_timestamp = last_notification[:timestamp]
|
46
|
+
current_time = Time.now.to_i
|
47
|
+
current_state_duration = current_time - last_change
|
48
|
+
time_since_last_alert = current_time - last_problem_alert unless last_problem_alert.nil?
|
41
49
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
"event.state: [#{event.state.inspect}], " +
|
50
|
-
"last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
|
51
|
-
if (current_state_duration < failure_delay)
|
52
|
-
result = true
|
53
|
-
@logger.debug("Filter: Delays: blocking because duration of current failure " +
|
54
|
-
"(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
|
55
|
-
elsif !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
|
56
|
-
(last_alert_state.to_s == event.state)
|
50
|
+
@logger.debug("#{label} last_problem_alert: #{last_problem_alert.to_s}, " +
|
51
|
+
"last_change: #{last_change.inspect}, " +
|
52
|
+
"current_state_duration: #{current_state_duration.inspect}, " +
|
53
|
+
"time_since_last_alert: #{time_since_last_alert.inspect}, " +
|
54
|
+
"last_alert_state: [#{last_alert_state.inspect}], " +
|
55
|
+
"event.state: [#{event.state.inspect}], " +
|
56
|
+
"last_alert_state == event.state ? #{last_alert_state.to_s == event.state}")
|
57
57
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
"is equal to current event state (#{event.state})")
|
63
|
-
else
|
64
|
-
@logger.debug("Filter: Delays: not blocking because neither of the time comparison " +
|
65
|
-
"conditions were met")
|
66
|
-
end
|
67
|
-
else
|
68
|
-
@logger.debug("Filter: Delays: entity_check.failed? returned false ...")
|
69
|
-
end
|
58
|
+
if current_state_duration < failure_delay
|
59
|
+
@logger.debug("#{label} block - duration of current failure " +
|
60
|
+
"(#{current_state_duration}) is less than failure_delay (#{failure_delay})")
|
61
|
+
return true
|
70
62
|
end
|
71
63
|
|
72
|
-
|
73
|
-
|
64
|
+
if !last_problem_alert.nil? && (time_since_last_alert < resend_delay) &&
|
65
|
+
(last_alert_state.to_s == event.state)
|
66
|
+
|
67
|
+
@logger.debug("#{label} block - time since last alert for " +
|
68
|
+
"current problem (#{time_since_last_alert}) is less than " +
|
69
|
+
"resend_delay (#{resend_delay}) and last alert state (#{last_alert_state}) " +
|
70
|
+
"is equal to current event state (#{event.state})")
|
71
|
+
return true
|
72
|
+
end
|
73
|
+
|
74
|
+
@logger.debug("#{label} pass - not blocking because neither of the time comparison " +
|
75
|
+
"conditions were met")
|
76
|
+
return false
|
77
|
+
|
74
78
|
end
|
75
79
|
end
|
76
80
|
end
|