notifu 1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/notifu +15 -0
- data/lib/notifu.rb +12 -0
- data/lib/notifu/actors/gammu_sms_bridge.rb +41 -0
- data/lib/notifu/actors/pagerduty.rb +0 -0
- data/lib/notifu/actors/slack_chan.rb +29 -0
- data/lib/notifu/actors/slack_msg.rb +29 -0
- data/lib/notifu/actors/smtp.rb +73 -0
- data/lib/notifu/actors/stdout.rb +16 -0
- data/lib/notifu/actors/twilio_call.rb +27 -0
- data/lib/notifu/cli.rb +13 -0
- data/lib/notifu/cli/object.rb +37 -0
- data/lib/notifu/cli/service.rb +53 -0
- data/lib/notifu/config.rb +120 -0
- data/lib/notifu/logger.rb +83 -0
- data/lib/notifu/mixins.rb +49 -0
- data/lib/notifu/model.rb +5 -0
- data/lib/notifu/model/contact.rb +15 -0
- data/lib/notifu/model/event.rb +52 -0
- data/lib/notifu/model/group.rb +14 -0
- data/lib/notifu/model/issue.rb +51 -0
- data/lib/notifu/model/sla.rb +20 -0
- data/lib/notifu/sensu/handler.rb +105 -0
- data/lib/notifu/util.rb +9 -0
- data/lib/notifu/workers/actor.rb +60 -0
- data/lib/notifu/workers/processor.rb +444 -0
- data/lib/notifu/workers/sidekiq_init.rb +24 -0
- metadata +90 -0
@@ -0,0 +1,444 @@
|
|
1
|
+
require_relative "sidekiq_init"
|
2
|
+
require 'excon'
|
3
|
+
|
4
|
+
# Global logger used by the workers defined in this file.
$logger = Notifu::Logger.new('processor')

# Server-side Sidekiq setup: connect to the queue Redis and route Sidekiq's own
# log output through Log4r, to syslog when enabled in the configuration and to
# stdout otherwise.
Sidekiq.configure_server do |config|
  config.redis = { url: Notifu::CONFIG[:redis_queues] }

  Sidekiq::Logging.logger = Log4r::Logger.new('sidekiq')
  Sidekiq::Logging.logger.outputters =
    if Notifu::CONFIG[:logging][:syslog][:enabled]
      Log4r::SyslogOutputter.new('sidekiq', ident: 'notifu-processor')
    else
      Log4r::Outputter.stdout
    end
  # Sidekiq::Logging.logger.formatter = Notifu::LogFormatter.new
  Sidekiq::Logging.logger.level = Log4r::INFO
end

# Client-side Sidekiq setup: jobs are pushed to the same queue Redis.
Sidekiq.configure_client do |config|
  config.redis = { url: Notifu::CONFIG[:redis_queues] }
end
|
21
|
+
|
22
|
+
###################################################################
|
23
|
+
###################################################################
|
24
|
+
####### PROCESSOR WORKER ##########################################
|
25
|
+
###################################################################
|
26
|
+
###################################################################
|
27
|
+
|
28
|
+
|
29
|
+
module Notifu
|
30
|
+
class Processor
|
31
|
+
include Sidekiq::Worker
|
32
|
+
include Notifu::Util
|
33
|
+
|
34
|
+
sidekiq_options :retry => true
|
35
|
+
sidekiq_options :queue => "processor"
|
36
|
+
|
37
|
+
attr_accessor :issue
|
38
|
+
attr_accessor :event
|
39
|
+
attr_accessor :now
|
40
|
+
attr_accessor :processing_result
|
41
|
+
|
42
|
+
###################################################################
|
43
|
+
####### SIDEKIQ GLUE METHOD #######################################
|
44
|
+
###################################################################
|
45
|
+
|
46
|
+
# Sidekiq entry point: builds the event from the job arguments, finds or
# creates the matching issue, runs process! and logs how long the task took.
#
# @param args [Array] job arguments, passed as one array to Notifu::Model::Event.new
def perform(*args)
  # Monotonic clock: wall-clock adjustments cannot distort the reported duration.
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
  log "info", "Task start"

  # read event
  self.event = Notifu::Model::Event.new(args)
  self.now = Time.now
  log "info", "Processing event NID #{self.event.notifu_id}"

  # reuse the issue already stored for this notifu_id, otherwise save the
  # event data as a new issue
  self.issue = Notifu::Model::Issue.with(:notifu_id, self.event.notifu_id) ||
               Notifu::Model::Issue.create(self.event.data)

  # let the magic happen
  process!

  elapsed_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - started_ms
  log "info", "Task finish (in #{elapsed_ms}ms)"
end
|
66
|
+
|
67
|
+
###################################################################
|
68
|
+
####### MAIN PROCESSING LOGIC #####################################
|
69
|
+
###################################################################
|
70
|
+
|
71
|
+
# Main processing logic for the current event.
#
# For every group/SLA pair in event.group_sla this decides whether a
# notification must be sent, stores the outcome on the event through
# update_process_result! and writes one action-log entry. Afterwards the issue
# is brought in line with the event, saved, and cleanup! is invoked.
#
# Pairs whose group or SLA cannot be loaded (the lookup raised or returned nil)
# are logged and skipped, so one broken pair does not abort the whole job.
def process!
  self.event.group_sla.each do |gs|
    group, sla = load_group_and_sla(gs)
    if group.nil? || sla.nil?
      log "info", "#{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}]: Object init failed. Is Notifu API running?"
      next
    end

    steps, notified = decide_notification(sla, group)
    self.event.update_process_result!(notified)
    action_log build_action_log_entry(steps, notified, sla, group)
  end

  sync_issue_with_event!

  # delayed cleanup job
  cleanup!
end

# Looks up the group and SLA named in one group_sla entry.
# Returns [group, sla]; both are nil when the lookup raises.
def load_group_and_sla(gs)
  [Notifu::Model::Group.with(:name, gs[:group]), Notifu::Model::Sla.with(:name, gs[:sla])]
rescue
  [nil, nil]
end

# Walks the decision tree for one group/SLA pair.
# Returns [steps, notified]: the decision trail (ending in "ACTION" or "IDLE"
# for most paths) and the hash describing who was notified.
def decide_notification(sla, group)
  steps = []
  send_notification = false

  if enough_occurrences? && self.event.action.to_s == "create"
    steps << "enough occurrences have passed"
    send_notification = active_event_needs_notification?(sla, group, steps)
  elsif self.event.action.to_s == "resolve" && self.issue.occurrences_count.to_i >= self.event.occurrences_trigger.to_i
    if !silenced?
      steps << "recovery of an event"
      send_notification = true
    elsif self.event.unsilence
      steps << "recovery of an event (with unsilence)"
      unsilence!
    else
      # without this entry the trail would be empty and the action log would
      # carry no result or reason at all
      steps << "recovery of a silenced event" << "IDLE"
    end
  else
    steps << "not enough occurrences of this event" << "IDLE"
  end

  # placeholder reported when nobody is notified
  notified = {
    sla: String.new,
    group: String.new,
    actors: [],
    contacts: [],
    escalation_level: "none"
  }

  if send_notification
    notified = notify!(sla, group)
    steps << "ACTION"
  end

  [steps, notified]
end

# Decision path for a "create" event that has reached its occurrence trigger.
# Appends every decision to steps and returns true when a notification is due.
def active_event_needs_notification?(sla, group, steps)
  if silenced?
    steps << "issue is silenced" << "IDLE"
    return false
  end
  steps << "issue is not silenced"

  unless duty_time?(sla.timerange_values(self.now))
    steps << "duty is not active at this time" << "IDLE"
    return false
  end
  steps << "duty is active"

  if status_changed?
    steps << "issue state has changed"
    return true
  end
  steps << "issue state hasn't changed"

  case self.event.code
  when 0
    steps << "issue is in OK state" << "IDLE"
    false
  when 1
    steps << "issue is in WARNING state"
    if first_notification?(sla, group)
      steps << "issue is new"
      true
    else
      steps << "already notified" << "IDLE"
      false
    end
  when 2
    steps << "issue is not a warning"
    if renotify?(sla, group)
      steps << "it's time to renotify"
      true
    else
      steps << "not yet time to renotify or escalate" << "IDLE"
      false
    end
  else
    steps << "unknown state (#{self.event.code})" << "IDLE"
    false
  end
end

# Builds the structured action-log entry for one processed group/SLA pair.
def build_action_log_entry(steps, notified, sla, group)
  {
    logic: steps.join(' -> '),
    result: steps[-1],
    reason: steps[-2],
    group: group.name,
    sla: sla.name,
    host: self.event.host,
    service: self.event.service,
    message: self.event.message,
    state: self.event.code.to_state,
    contacts: notified[:contacts].to_json,
    actors: notified[:actors].to_json,
    occurrences_trigger: self.event.occurrences_trigger.to_i,
    occurrences_count: self.event.occurrences_count.to_i,
    check_duration: self.event.duration,
    escalation_level: notified[:escalation_level].to_s,
    sidekiq_jid: self.jid,
    notifu_id: self.event.notifu_id,
    :"@timestamp" => self.now.iso8601,
  }
end

# Copies the event state onto the issue and persists it with a single save.
def sync_issue_with_event!
  if self.event.process_result.length > 0
    self.issue.message = self.event.message
    self.issue.action = self.event.action
    self.issue.process_result = self.event.process_result
  end

  # code and creation time only move when the state actually changed
  if status_changed?
    self.issue.code = self.event.code
    self.issue.time_created = self.event.time_created
  end

  self.issue.occurrences_trigger = self.event.occurrences_trigger
  self.issue.occurrences_count = self.event.occurrences_count
  self.issue.time_last_event = self.event.time_last_event
  self.issue.sgs = self.event.sgs
  self.issue.aspiring_code = self.event.code
  self.issue.api_endpoint = self.event.api_endpoint
  self.issue.duration = self.event.duration

  self.issue.save
end
|
202
|
+
|
203
|
+
###################################################################
|
204
|
+
####### NOTIFICATION METHOD (method for :process! ) ###############
|
205
|
+
###################################################################
|
206
|
+
|
207
|
+
# Notification method used by process!.
#
# Collects contacts and actors for the primary tier and, when escalate_to?
# says so, for the secondary and tertiary tiers. Pushes one Sidekiq job per
# actor onto the queue "actor-<name>", records the notification time on the
# issue and returns a summary.
#
# @param sla   SLA object (responds to name, actors, refresh)
# @param group group object (responds to name, primary, secondary, tertiary)
# @return [Hash] keys :sla, :group, :actors, :contacts, :escalation_level
def notify!(sla, group)
  # SECURITY NOTE(review): sla.actors is evaluated as Ruby code. That is only
  # acceptable while SLA records are written by trusted operators; consider
  # storing the actor map in a data format and parsing it instead.
  sla_actors = eval(sla.actors)

  contacts = group.primary.map(&:name)
  # Array() tolerates an SLA without a :primary entry, the same way the
  # secondary and tertiary entries are optional
  actors = [] + Array(sla_actors[:primary])
  escalation_level = "primary"

  # secondary escalation
  if escalate_to?(1, sla)
    contacts += group.secondary.map(&:name)
    actors += Array(sla_actors[:secondary])
    escalation_level = "secondary"
  end

  # tertiary escalation
  if escalate_to?(2, sla)
    contacts += group.tertiary.map(&:name)
    actors += Array(sla_actors[:tertiary])
    escalation_level = "tertiary"
  end

  actors.each do |actor|
    Sidekiq::Client.push('class' => "Notifu::Actors::#{actor.camelize}",
                         'args'  => [self.event.notifu_id, contacts],
                         'queue' => "actor-#{actor}")
  end

  log "info", "Taking action (#{group.name}:#{sla.name}) NID #{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}] actor: #{actors.join(', ')}; contacts: #{contacts.join(', ')}; escalation_level: #{escalation_level}"

  self.issue.time_last_notified!(group.name, sla.name, Time.now.to_i)

  { sla: sla.name, group: group.name, actors: actors, contacts: contacts, escalation_level: escalation_level }
end
|
248
|
+
|
249
|
+
|
250
|
+
###################################################################
|
251
|
+
####### LOGIC BLOCK ###############################################
|
252
|
+
###################################################################
|
253
|
+
|
254
|
+
# True once the event's occurrence count has reached its trigger threshold.
# Both values are coerced with to_i so numeric strings compare numerically
# rather than lexicographically.
def enough_occurrences?
  self.event.occurrences_count.to_i >= self.event.occurrences_trigger.to_i
end
|
257
|
+
|
258
|
+
# Tells whether the issue should be escalated to the given level.
#
# Escalation is due when the event is critical and at least
# level * sla.refresh seconds have passed since the issue was created.
#
# @param level [Integer] escalation level (notify! uses 1 for secondary, 2 for tertiary)
# @param sla   SLA object whose refresh value is the escalation interval
# @return [Boolean]
def escalate_to?(level, sla)
  escalation_period = level.to_i * sla.refresh.to_i
  escalation_due_at = self.issue.time_created.to_i + escalation_period

  escalation_due_at <= self.now.to_i && is_critical?
end
|
276
|
+
|
277
|
+
# True when a stash exists for this event: "silence/<host>" for keepalive
# events, "silence/<host>/<service>" for everything else.
def silenced?
  path =
    if self.event.service == "keepalive"
      "silence/#{self.event.host}"
    else
      "silence/#{self.event.host}/#{self.event.service}"
    end

  get_stashes.any? { |stash| stash["path"] == path }
end
|
291
|
+
|
292
|
+
# True when the current event carries status code 0 (the OK state).
def is_ok?
  self.event.code == 0
end
|
295
|
+
|
296
|
+
# True when the current event carries status code 1 (the WARNING state).
def is_warning?
  self.event.code == 1
end
|
299
|
+
|
300
|
+
# True when the current event carries status code 2 (critical).
def is_critical?
  self.event.code == 2
end
|
303
|
+
|
304
|
+
# True when the issue has no recorded notification time for this group/SLA pair.
#
# @param sla   SLA object (responds to name)
# @param group group object (responds to name)
def first_notification?(sla, group)
  self.issue.time_last_notified?(group.name, sla.name).nil?
end
|
307
|
+
|
308
|
+
# True when the event's status code differs from the code stored on the issue
# (both compared as integers).
def status_changed?
  self.event.code.to_i != self.issue.code.to_i
end
|
311
|
+
|
312
|
+
# True when at least sla.refresh seconds have passed since the issue was last
# notified for this group/SLA pair. A missing notification time counts as 0.
#
# @param sla   SLA object (responds to name and refresh)
# @param group group object (responds to name)
def renotify?(sla, group)
  last_notified_at = self.issue.time_last_notified?(group.name, sla.name)

  last_notified_at.to_i + sla.refresh.to_i <= self.now.to_i
end
|
324
|
+
|
325
|
+
# Tells whether the current time (self.now) falls inside the duty time range.
#
# @param timerange "ALWAYS", "NEVER", or a string containing "HH:MM-HH:MM";
#   any other value is treated as always on duty
# @return [Boolean]
#
# The HH:MM bounds are taken on today's date in local time and are inclusive.
# NOTE(review): a range whose end is earlier than its start (e.g. overnight
# "22:00-06:00") never matches — confirm whether such ranges are expected.
def duty_time?(timerange)
  case timerange
  when "ALWAYS" then true
  when "NEVER" then false
  when /([0-9]{2}):([0-9]{2})-([0-9]{2}):([0-9]{2})/
    # use the regex captures; indexing the string itself would yield single characters
    from_hour, from_min, until_hour, until_min = Regexp.last_match.captures.map(&:to_i)
    today = [self.now.year, self.now.month, self.now.day]
    duty_from = Time.local(*today, from_hour, from_min).to_i
    duty_until = Time.local(*today, until_hour, until_min).to_i

    self.now.to_i.between?(duty_from, duty_until)
  else
    true
  end
end
|
343
|
+
|
344
|
+
###################################################################
|
345
|
+
####### HELPER BLOCK (methods for :process! ) #####################
|
346
|
+
###################################################################
|
347
|
+
|
348
|
+
|
349
|
+
##
|
350
|
+
# cleanup method
|
351
|
+
#
|
352
|
+
# Queues a Notifu::Cleaner job for the issue, but only when the event is in
# the OK state and the issue's action is "resolve".
def cleanup!
  return unless is_ok? && self.issue.action == "resolve"

  Notifu::Cleaner.perform_async(self.issue.notifu_id)
end
|
357
|
+
|
358
|
+
##
|
359
|
+
# get stashes from Sensu API
|
360
|
+
#
|
361
|
+
# Fetches the stash list from "<event.api_endpoint>/stashes" and memoizes it
# in @stashes.
#
# @return the parsed JSON body, or an empty array when the request or the
#   parsing fails (the failure is logged and the empty result is memoized too)
def get_stashes
  return @stashes if @stashes

  @stashes =
    begin
      JSON.parse(Excon.get("#{self.event.api_endpoint}/stashes").body)
    rescue
      log "error", "Failed to get stashes #{self.event.api_endpoint}/stashes"
      # the fallback must be the value of this branch: callers iterate the result
      []
    end
end
|
371
|
+
|
372
|
+
|
373
|
+
##
|
374
|
+
# unsilence method
|
375
|
+
#
|
376
|
+
# Removes the "silence/<host>/<service>" stash of the current event through
# the Sensu API.
#
# The stash is only deleted when it has no expiry (negative "expire") and the
# event has unsilence set; every other outcome is just logged.
def unsilence!
  path = "silence/#{self.event.host}/#{self.event.service}"

  get_stashes.each do |stash|
    next unless stash["path"] == path

    unless stash["expire"] < 0
      log "info", "#{self.event.host}/#{self.event.service} left stashed (auto-unstash doesn't work on checks with defined expiry)"
      next
    end

    unless self.event.unsilence
      log "info", "#{self.event.host}/#{self.event.service} left stashed (auto-unstash disabled)"
      next
    end

    begin
      Excon.delete "#{self.event.api_endpoint}/stashes/#{path}"
      log "info", "Unstashed #{self.event.host}/#{self.event.service} after recovery"
    rescue
      # the failing call is the DELETE, so say so
      log "warning", "Failed to delete stash via Sensu API: #{self.event.api_endpoint}/stashes/#{path}"
    end
  end
end
|
397
|
+
|
398
|
+
##
|
399
|
+
# logging method
|
400
|
+
#
|
401
|
+
# Logging method: sends msg to the global logger at the given priority,
# prefixed with this job's Sidekiq JID.
#
# @param prio priority name passed through to $logger.log (e.g. "info")
# @param msg  message; converted to a string
def log(prio, msg)
  $logger.log prio, "JID-#{self.jid}: #{msg}"
end
|
404
|
+
|
405
|
+
##
|
406
|
+
# action logging method
|
407
|
+
#
|
408
|
+
# Action logging method: hands the entry to the global logger's action log
# under the "processor" label.
def action_log(event)
  $logger.action_log("processor", event)
end
|
411
|
+
|
412
|
+
end
|
413
|
+
end
|
414
|
+
|
415
|
+
###################################################################
|
416
|
+
###################################################################
|
417
|
+
####### CLEANER WORKER ############################################
|
418
|
+
###################################################################
|
419
|
+
###################################################################
|
420
|
+
|
421
|
+
module Notifu
  # Sidekiq worker on the "processor" queue that deletes the stored issue for
  # a notifu_id after a delay.
  class Cleaner
    include Sidekiq::Worker
    include Notifu::Util

    sidekiq_options retry: true
    sidekiq_options queue: "processor"

    # Waits, then deletes the issue with the given notifu_id if it still exists.
    #
    # @param notifu_id identifier of the issue to delete
    # @param delay [Numeric] seconds to sleep before deleting (default 15)
    def perform(notifu_id, delay = 15)
      # NOTE(review): sleep occupies this worker for the whole delay; scheduling
      # the job with a delay at the call site would avoid that — confirm before
      # changing.
      sleep delay

      issue = Notifu::Model::Issue.with(:notifu_id, notifu_id)
      # explicit nil check instead of rescuing NoMethodError, so a genuine
      # NoMethodError raised while deleting is no longer reported as "not found"
      if issue.nil?
        log "info", "Cleanup NID #{notifu_id} - not found"
        return
      end

      issue.delete
      log "info", "Cleanup NID #{notifu_id} - success"
    end

    # Sends msg to the global logger, prefixed with this job's Sidekiq JID.
    def log(prio, msg)
      $logger.log prio, "JID-#{self.jid}: #{msg}"
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
##
|
2
|
+
# Require block
|
3
|
+
#
|
4
|
+
require 'rubygems'
|
5
|
+
require 'bundler/setup'
|
6
|
+
require "ohm"
|
7
|
+
require "elasticsearch"
|
8
|
+
require "log4r/outputter/syslogoutputter"
|
9
|
+
require "log4r/configurator"
|
10
|
+
require "log4r"
|
11
|
+
require "syslog"
|
12
|
+
require "sidekiq"
|
13
|
+
require "sidekiq/logging"
|
14
|
+
require "notifu"
|
15
|
+
|
16
|
+
##
|
17
|
+
# Config block
|
18
|
+
#
|
19
|
+
# Load the Notifu configuration once and expose it as Notifu::CONFIG.
Notifu::CONFIG = Notifu::Config.new.get

##
# Ohm init
#
# Ohm talks to the Redis instance configured under :redis_data.
Ohm.redis = Redic.new(Notifu::CONFIG[:redis_data])
|