notifu 1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/notifu +15 -0
- data/lib/notifu.rb +12 -0
- data/lib/notifu/actors/gammu_sms_bridge.rb +41 -0
- data/lib/notifu/actors/pagerduty.rb +0 -0
- data/lib/notifu/actors/slack_chan.rb +29 -0
- data/lib/notifu/actors/slack_msg.rb +29 -0
- data/lib/notifu/actors/smtp.rb +73 -0
- data/lib/notifu/actors/stdout.rb +16 -0
- data/lib/notifu/actors/twilio_call.rb +27 -0
- data/lib/notifu/cli.rb +13 -0
- data/lib/notifu/cli/object.rb +37 -0
- data/lib/notifu/cli/service.rb +53 -0
- data/lib/notifu/config.rb +120 -0
- data/lib/notifu/logger.rb +83 -0
- data/lib/notifu/mixins.rb +49 -0
- data/lib/notifu/model.rb +5 -0
- data/lib/notifu/model/contact.rb +15 -0
- data/lib/notifu/model/event.rb +52 -0
- data/lib/notifu/model/group.rb +14 -0
- data/lib/notifu/model/issue.rb +51 -0
- data/lib/notifu/model/sla.rb +20 -0
- data/lib/notifu/sensu/handler.rb +105 -0
- data/lib/notifu/util.rb +9 -0
- data/lib/notifu/workers/actor.rb +60 -0
- data/lib/notifu/workers/processor.rb +444 -0
- data/lib/notifu/workers/sidekiq_init.rb +24 -0
- metadata +90 -0
@@ -0,0 +1,444 @@
|
|
1
|
+
require_relative "sidekiq_init"
|
2
|
+
require 'excon'
|
3
|
+
|
4
|
+
# Global logger used by the log/action_log helpers of the workers in this file.
$logger = Notifu::Logger.new('processor')

# Server side: point Sidekiq at the queue Redis and send its logs through Log4r,
# to syslog when enabled in the configuration, otherwise to stdout.
Sidekiq.configure_server do |server|
  server.redis = { url: Notifu::CONFIG[:redis_queues] }

  Sidekiq::Logging.logger = Log4r::Logger.new('sidekiq')
  use_syslog = Notifu::CONFIG[:logging][:syslog][:enabled]
  Sidekiq::Logging.logger.outputters =
    if use_syslog
      Log4r::SyslogOutputter.new('sidekiq', ident: 'notifu-processor')
    else
      Log4r::Outputter.stdout
    end
  # Sidekiq::Logging.logger.formatter = Notifu::LogFormatter.new
  Sidekiq::Logging.logger.level = Log4r::INFO
end

# Client side only needs the Redis connection used for pushing jobs.
Sidekiq.configure_client { |client| client.redis = { url: Notifu::CONFIG[:redis_queues] } }
|
21
|
+
|
22
|
+
###################################################################
|
23
|
+
###################################################################
|
24
|
+
####### PROCESSOR WORKER ##########################################
|
25
|
+
###################################################################
|
26
|
+
###################################################################
|
27
|
+
|
28
|
+
|
29
|
+
module Notifu
|
30
|
+
class Processor
|
31
|
+
include Sidekiq::Worker
|
32
|
+
include Notifu::Util
|
33
|
+
|
34
|
+
sidekiq_options :retry => true
|
35
|
+
sidekiq_options :queue => "processor"
|
36
|
+
|
37
|
+
attr_accessor :issue
|
38
|
+
attr_accessor :event
|
39
|
+
attr_accessor :now
|
40
|
+
attr_accessor :processing_result
|
41
|
+
|
42
|
+
###################################################################
|
43
|
+
####### SIDEKIQ GLUE METHOD #######################################
|
44
|
+
###################################################################
|
45
|
+
|
46
|
+
##
# Sidekiq entry point. Builds the event from the job arguments, looks up the
# issue with the same notifu_id (creating it from the event data when none
# exists), runs process! and logs how long the whole task took.
#
def perform(*args)
  started_ms = Time.now.to_f * 1000.0
  log "info", "Task start"

  # read event
  self.event = Notifu::Model::Event.new(args)
  self.now = Time.now
  log "info", "Processing event NID #{event.notifu_id}"

  # reuse the known issue for this NID, otherwise store a new one
  known_issue = Notifu::Model::Issue.with(:notifu_id, event.notifu_id)
  self.issue = known_issue || Notifu::Model::Issue.create(event.data)

  # let the magic happen
  process!

  finished_ms = Time.now.to_f * 1000.0
  log "info", "Task finish (in #{finished_ms - started_ms}ms)"
end
|
66
|
+
|
67
|
+
###################################################################
|
68
|
+
####### MAIN PROCESSING LOGIC #####################################
|
69
|
+
###################################################################
|
70
|
+
|
71
|
+
##
# Main processing logic.
#
# For every group/SLA pair attached to the event, decides whether a
# notification has to be sent (see decide_notification), records the outcome
# on the event and writes one action-log entry. Afterwards the issue is
# synchronised with the event, saved, and cleanup! is invoked.
#
def process!
  self.event.group_sla.each do |gs|

    # group related objects
    begin
      group = Notifu::Model::Group.with(:name, gs[:group])
      sla = Notifu::Model::Sla.with(:name, gs[:sla])
    rescue
      log "info", "#{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}]: Object init failed. Is Notifu API running?"
      next
    end

    # A lookup that matches nothing can yield nil instead of raising (perform
    # relies on that for Issue.with). Skip the pair rather than failing later
    # with NoMethodError on group.name / sla.name.
    if group.nil? || sla.nil?
      log "error", "#{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}]: group '#{gs[:group]}' or SLA '#{gs[:sla]}' not found, skipping"
      next
    end

    # result collects the decision trail; its last entry is the outcome
    # ("ACTION" / "IDLE") and the one before it is the reason.
    result = []
    notified = decide_notification(sla, group, result) || {
      sla: String.new,
      group: String.new,
      actors: Array.new,
      contacts: Array.new,
      escalation_level: "none"
    }

    self.event.update_process_result!(notified)

    action_log_message = {
      logic: result.join(' -> '),
      result: result[-1],
      reason: result[-2],
      group: group.name,
      sla: sla.name,
      host: self.event.host,
      service: self.event.service,
      message: self.event.message,
      state: self.event.code.to_state,
      contacts: notified[:contacts].to_json,
      actors: notified[:actors].to_json,
      occurrences_trigger: self.event.occurrences_trigger.to_i,
      occurrences_count: self.event.occurrences_count.to_i,
      check_duration: self.event.duration,
      escalation_level: notified[:escalation_level].to_s,
      sidekiq_jid: self.jid,
      notifu_id: self.event.notifu_id,
      :"@timestamp" => self.now.iso8601,
    }

    action_log action_log_message

  end

  if self.event.process_result.length > 0
    self.issue.message = self.event.message
    self.issue.action = self.event.action
    self.issue.process_result = self.event.process_result
    @issue.save
  end

  # code and creation time only move when the state actually changed
  if status_changed?
    self.issue.code = self.event.code
    self.issue.time_created = self.event.time_created
  end

  self.issue.occurrences_trigger = self.event.occurrences_trigger
  self.issue.occurrences_count = self.event.occurrences_count
  self.issue.time_last_event = self.event.time_last_event
  self.issue.sgs = self.event.sgs
  self.issue.aspiring_code = self.event.code
  self.issue.api_endpoint = self.event.api_endpoint
  self.issue.duration = self.event.duration

  @issue.save

  # delayed cleanup job
  cleanup!
end

# Decides what to do for one group/SLA pair. Appends the decision trail to
# +result+ and returns the hash produced by notify! when a notification was
# sent, nil otherwise.
def decide_notification(sla, group, result)
  # to_s on both branches so a non-String action is handled consistently
  action = self.event.action.to_s

  if enough_occurrences? && action == "create"
    decide_for_create(sla, group, result)
  elsif action == "resolve" && self.issue.occurrences_count.to_i >= self.event.occurrences_trigger.to_i
    if !silenced?
      result << "recovery of an event"
      notify!(sla, group).tap { result << "ACTION" }
    elsif self.event.unsilence
      result << "recovery of an event (with unsilence)"
      unsilence!
      nil
    else
      # previously this path left the trail empty and logged nil result/reason
      result << "issue is silenced" << "IDLE"
      nil
    end
  else
    result << "not enough occurrences of this event" << "IDLE"
    nil
  end
end

# Decision tree for a "create" event that has reached its occurrence trigger.
# Same contract as decide_notification.
def decide_for_create(sla, group, result)
  result << "enough occurrences have passed"

  if silenced?
    result << "issue is silenced" << "IDLE"
    return nil
  end
  result << "issue is not silenced"

  unless duty_time? sla.timerange_values(self.now)
    result << "duty is not active at this time" << "IDLE"
    return nil
  end
  result << "duty is active"

  if status_changed?
    result << "issue state has changed"
    return notify!(sla, group).tap { result << "ACTION" }
  end
  result << "issue state hasn't changed"

  case self.event.code
  when 0
    result << "issue is in OK state" << "IDLE"
    nil
  when 1
    result << "issue is in WARNING state"
    if first_notification?(sla, group)
      result << "issue is new"
      notify!(sla, group).tap { result << "ACTION" }
    else
      result << "already notified" << "IDLE"
      nil
    end
  when 2
    result << "issue is not a warning"
    if renotify?(sla, group)
      result << "it's time to renotify"
      notify!(sla, group).tap { result << "ACTION" }
    else
      result << "not yet time to renotify or escalate" << "IDLE"
      nil
    end
  else
    result << "unknown state (#{self.event.code})" << "IDLE"
    nil
  end
end

private :decide_notification, :decide_for_create
|
202
|
+
|
203
|
+
###################################################################
|
204
|
+
####### NOTIFICATION METHOD (method for :process! ) ###############
|
205
|
+
###################################################################
|
206
|
+
|
207
|
+
##
# Pushes one actor job per configured actor for the given SLA/group pair and
# records the notification time on the issue.
#
# Primary contacts and actors are always included; secondary and tertiary ones
# are added when escalate_to? says the respective level has been reached.
#
# Returns a hash with :sla, :group, :actors, :contacts and :escalation_level.
#
def notify!(sla, group)
  # SECURITY NOTE(review): sla.actors is executed as Ruby code. Anyone able to
  # write that attribute can run arbitrary code inside the worker. Kept as is
  # because the stored format is not visible from here; consider a data-only
  # format (e.g. JSON) instead of eval.
  sla_actors = eval(sla.actors)

  contacts = []
  escalation_level = "primary"

  group.primary.each { |contact| contacts << contact.name }
  # Array() keeps a missing :primary list from raising, matching the nil
  # guards already applied to the secondary and tertiary lists
  actors = Array(sla_actors[:primary])

  # secondary escalation
  if escalate_to?(1, sla)
    group.secondary.each { |contact| contacts << contact.name }
    actors += Array(sla_actors[:secondary])
    escalation_level = "secondary"
  end

  # tertiary escalation
  if escalate_to?(2, sla)
    group.tertiary.each { |contact| contacts << contact.name }
    actors += Array(sla_actors[:tertiary])
    escalation_level = "tertiary"
  end

  actors.each do |actor|
    Sidekiq::Client.push('class' => "Notifu::Actors::#{actor.camelize}",
                         'args' => [self.event.notifu_id, contacts],
                         'queue' => "actor-#{actor}")
  end

  log "info", "Taking action (#{group.name}:#{sla.name}) NID #{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}] actor: #{actors.join(', ')}; contacts: #{contacts.join(', ')}; escalation_level: #{escalation_level}"

  self.issue.time_last_notified!(group.name, sla.name, Time.now.to_i)

  { sla: sla.name, group: group.name, actors: actors, contacts: contacts, escalation_level: escalation_level }
end
|
248
|
+
|
249
|
+
|
250
|
+
###################################################################
|
251
|
+
####### LOGIC BLOCK ###############################################
|
252
|
+
###################################################################
|
253
|
+
|
254
|
+
# True when the event has occurred at least as many times as its trigger
# threshold. Both values are coerced with to_i, as the "resolve" branch of
# process! already does, so string counts are compared numerically rather than
# lexicographically ("10" >= "9" is false as strings).
def enough_occurrences?
  self.event.occurrences_count.to_i >= self.event.occurrences_trigger.to_i
end
|
257
|
+
|
258
|
+
# Whether the given escalation level is due: the issue must have existed for
# at least +level+ refresh intervals of the SLA and the event must be critical.
def escalate_to?(level, sla)
  period = level.to_i * sla.refresh.to_i
  due_at = self.issue.time_created.to_i + period.to_i

  due_at <= self.now.to_i && is_critical?
end
|
276
|
+
|
277
|
+
# True when a stash exists whose path silences this event: "silence/<host>"
# for keepalive events, "silence/<host>/<service>" for everything else.
def silenced?
  host = self.event.host
  service = self.event.service
  wanted_path = service == "keepalive" ? "silence/#{host}" : "silence/#{host}/#{service}"

  get_stashes.any? { |stash| stash["path"] == wanted_path }
end
|
291
|
+
|
292
|
+
# True when the event code is 0 (reported as "OK state" by process!).
def is_ok?
  event.code == 0
end
|
295
|
+
|
296
|
+
# True when the event code is 1 (reported as "WARNING state" by process!).
def is_warning?
  event.code == 1
end
|
299
|
+
|
300
|
+
# True when the event code is 2; escalate_to? only escalates in that state.
def is_critical?
  event.code == 2
end
|
303
|
+
|
304
|
+
# True when the issue has no recorded notification time for this group/SLA pair.
def first_notification?(sla, group)
  last_notified = self.issue.time_last_notified?(group.name, sla.name)
  last_notified.nil?
end
|
307
|
+
|
308
|
+
# True when the event code differs from the code stored on the issue
# (both compared as integers).
def status_changed?
  incoming = self.event.code.to_i
  stored = self.issue.code.to_i
  incoming != stored
end
|
311
|
+
|
312
|
+
# True when at least one SLA refresh interval has passed since the last
# notification recorded for this group/SLA pair.
def renotify?(sla, group)
  interval = sla.refresh
  last_notified = self.issue.time_last_notified?(group.name, sla.name)
  next_due = last_notified.to_i + interval.to_i

  next_due <= self.now.to_i
end
|
324
|
+
|
325
|
+
##
# Tells whether notifications are on duty for the given timerange value.
#
# timerange - "ALWAYS", "NEVER", or a string containing "HH:MM-HH:MM".
#
# Returns true for "ALWAYS", false for "NEVER". For an "HH:MM-HH:MM" range it
# returns whether self.now lies between those two times of the current day
# (both bounds inclusive). Any other value is treated as always on duty.
#
# NOTE(review): a range whose end is earlier than its start (e.g. overnight
# "22:00-06:00") never matches, because both bounds are placed on today's date.
#
def duty_time?(timerange)
  case timerange
  when "ALWAYS"
    true
  when "NEVER"
    false
  when /([0-9]{2}):([0-9]{2})-([0-9]{2}):([0-9]{2})/
    # use the regexp captures; indexing the string itself would only return
    # its first characters
    from_hour, from_min, until_hour, until_min = Regexp.last_match.captures.map(&:to_i)
    today = [self.now.year, self.now.month, self.now.day]
    t_duty_from = Time.local(*today, from_hour, from_min).to_i
    t_duty_until = Time.local(*today, until_hour, until_min).to_i

    # Ruby has no chained comparison (a <= b <= c raises), hence between?
    self.now.to_i.between?(t_duty_from, t_duty_until)
  else
    true
  end
end
|
343
|
+
|
344
|
+
###################################################################
|
345
|
+
####### HELPER BLOCK (methods for :process! ) #####################
|
346
|
+
###################################################################
|
347
|
+
|
348
|
+
|
349
|
+
##
|
350
|
+
# cleanup method
|
351
|
+
#
|
352
|
+
# Enqueues the Cleaner worker for this issue, but only when the event is OK
# and the issue's action is "resolve"; otherwise does nothing.
def cleanup!
  return unless is_ok? && self.issue.action == "resolve"

  Notifu::Cleaner.perform_async(self.issue.notifu_id)
end
|
357
|
+
|
358
|
+
##
|
359
|
+
# get stashes from Sensu API
|
360
|
+
#
|
361
|
+
# Fetches "<api_endpoint>/stashes" and returns the parsed JSON body. The
# result is memoized in @stashes, so the request is made at most once per
# worker instance.
#
# On any error the failure is logged and an empty array is returned (and
# memoized), so callers can always iterate over the result.
def get_stashes
  return @stashes if @stashes

  begin
    sensu_api = Excon.get "#{self.event.api_endpoint}/stashes"
    @stashes = JSON.parse sensu_api.body
  rescue StandardError
    @stashes = []
    log "error", "Failed to get stashes #{self.event.api_endpoint}/stashes"
  end

  # explicit return value: the rescue branch would otherwise hand back
  # whatever log returns instead of the (empty) stash list
  @stashes
end
|
371
|
+
|
372
|
+
|
373
|
+
##
|
374
|
+
# unsilence method
|
375
|
+
#
|
376
|
+
# Removes the "silence/<host>/<service>" stash of the current event through
# the Sensu API, provided the stash has no expiry (negative "expire") and the
# event asks for unsilencing. Every other case is only logged.
#
# NOTE(review): silenced? looks for "silence/<host>" on keepalive events,
# whereas this method always looks for "silence/<host>/<service>", so a
# host-level silence is never removed here. Confirm whether that is intended.
def unsilence!
  target = "#{self.event.host}/#{self.event.service}"
  path = "silence/#{target}"

  get_stashes.each do |stash|
    next unless stash["path"] == path

    unless stash["expire"] < 0
      log "info", "#{target} left stashed (auto-unstash doesn't work on checks with defined expiry)"
      next
    end

    unless self.event.unsilence
      log "info", "#{target} left stashed (auto-unstash disabled)"
      next
    end

    begin
      Excon.delete "#{self.event.api_endpoint}/stashes/#{path}"
      log "info", "Unstashed #{target} after recovery"
    rescue StandardError
      # the failing call is the DELETE above, not a fetch of the stash list
      log "warning", "Failed to delete stash from Sensu API: #{self.event.api_endpoint}/stashes/#{path}"
    end
  end
end
|
397
|
+
|
398
|
+
##
|
399
|
+
# logging method
|
400
|
+
#
|
401
|
+
# Sends +msg+ to the global logger at priority +prio+, prefixed with the
# job id of this worker.
def log(prio, msg)
  line = "JID-#{jid}: #{msg}"
  $logger.log(prio, line)
end
|
404
|
+
|
405
|
+
##
|
406
|
+
# action logging method
|
407
|
+
#
|
408
|
+
# Forwards an action-log entry to the global logger under the "processor" type.
def action_log(event)
  $logger.action_log("processor", event)
end
|
411
|
+
|
412
|
+
end
|
413
|
+
end
|
414
|
+
|
415
|
+
###################################################################
|
416
|
+
###################################################################
|
417
|
+
####### CLEANER WORKER ############################################
|
418
|
+
###################################################################
|
419
|
+
###################################################################
|
420
|
+
|
421
|
+
module Notifu
  ##
  # Sidekiq worker that deletes the stored issue with the given notifu_id
  # after a delay. Runs on the "processor" queue with retries enabled.
  #
  class Cleaner
    include Sidekiq::Worker
    include Notifu::Util

    sidekiq_options :retry => true
    sidekiq_options :queue => "processor"

    # notifu_id - id of the issue to remove.
    # delay     - seconds to wait before the lookup (default 15).
    #
    # NOTE(review): sleep keeps this worker busy for the whole delay.
    def perform(notifu_id, delay = 15)
      sleep delay

      # explicit nil check instead of rescuing NoMethodError, so that a
      # NoMethodError raised inside delete is no longer reported as "not found"
      issue = Notifu::Model::Issue.with(:notifu_id, notifu_id)
      if issue
        issue.delete
        log "info", "Cleanup NID #{notifu_id} - success"
      else
        log "info", "Cleanup NID #{notifu_id} - not found"
      end
    end

    # Sends +msg+ to the global logger, prefixed with the job id.
    def log(prio, msg)
      $logger.log prio, "JID-#{self.jid}: " + msg.to_s
    end

  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
##
|
2
|
+
# Require block
|
3
|
+
#
|
4
|
+
require 'rubygems'
|
5
|
+
require 'bundler/setup'
|
6
|
+
require "ohm"
|
7
|
+
require "elasticsearch"
|
8
|
+
require "log4r/outputter/syslogoutputter"
|
9
|
+
require "log4r/configurator"
|
10
|
+
require "log4r"
|
11
|
+
require "syslog"
|
12
|
+
require "sidekiq"
|
13
|
+
require "sidekiq/logging"
|
14
|
+
require "notifu"
|
15
|
+
|
16
|
+
##
# Config block: read the configuration once and expose it as Notifu::CONFIG
#
Notifu::CONFIG = Notifu::Config.new.get

##
# Ohm init: use the Redis instance configured under :redis_data
#
Ohm.redis = Redic.new(Notifu::CONFIG[:redis_data])
|