notifu 1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,444 @@
1
+ require_relative "sidekiq_init"
2
+ require 'excon'
3
+
4
# Process-wide logger used by the workers defined in this file.
$logger = Notifu::Logger.new('processor')

# Server side: connect to the queue Redis and send Sidekiq's own log output
# through Log4r, either to syslog or to stdout depending on configuration.
Sidekiq.configure_server do |config|
  config.redis = { url: Notifu::CONFIG[:redis_queues] }

  Sidekiq::Logging.logger = Log4r::Logger.new('sidekiq')
  sidekiq_log = Sidekiq::Logging.logger

  syslog_enabled = Notifu::CONFIG[:logging][:syslog][:enabled]
  sidekiq_log.outputters =
    if syslog_enabled
      Log4r::SyslogOutputter.new('sidekiq', ident: 'notifu-processor')
    else
      Log4r::Outputter.stdout
    end

  # Sidekiq::Logging.logger.formatter = Notifu::LogFormatter.new
  sidekiq_log.level = Log4r::INFO
end

# Client side: only the queue Redis connection is needed to push jobs.
Sidekiq.configure_client do |config|
  config.redis = { url: Notifu::CONFIG[:redis_queues] }
end
21
+
22
+ ###################################################################
23
+ ###################################################################
24
+ ####### PROCESSOR WORKER ##########################################
25
+ ###################################################################
26
+ ###################################################################
27
+
28
+
29
+ module Notifu
30
+ class Processor
31
+ include Sidekiq::Worker
32
+ include Notifu::Util
33
+
34
+ sidekiq_options :retry => true
35
+ sidekiq_options :queue => "processor"
36
+
37
+ attr_accessor :issue
38
+ attr_accessor :event
39
+ attr_accessor :now
40
+ attr_accessor :processing_result
41
+
42
+ ###################################################################
43
+ ####### SIDEKIQ GLUE METHOD #######################################
44
+ ###################################################################
45
+
46
##
# Sidekiq entry point.
#
# Builds the event from the job arguments, loads the matching issue (creating
# it from the event data when no issue with that notifu_id is stored yet),
# runs #process! and logs how long the job took.
#
# @param args [Array] raw job arguments, handed to Notifu::Model::Event.new
#   as a single array
#
def perform(*args)
  # The monotonic clock is used for the duration so that wall-clock
  # adjustments cannot produce negative or inflated timings.
  started_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond)
  log "info", "Task start"

  # read event
  self.event = Notifu::Model::Event.new args
  self.now = Time.now
  log "info", "Processing event NID #{self.event.notifu_id}"

  # reuse the known issue, otherwise save the event as a new one
  self.issue = Notifu::Model::Issue.with(:notifu_id, self.event.notifu_id)
  self.issue ||= Notifu::Model::Issue.create(self.event.data)

  # let the magic happen
  process!

  elapsed_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - started_ms
  log "info", "Task finish (in #{elapsed_ms}ms)"
end
66
+
67
+ ###################################################################
68
+ ####### MAIN PROCESSING LOGIC #####################################
69
+ ###################################################################
70
+
71
##
# Main processing logic.
#
# For every group/SLA pair attached to the event this decides whether a
# notification goes out (#notify!), records the decision path in +result+ and
# writes one action-log entry. Afterwards the event state is copied onto the
# stored issue, the issue is saved and #cleanup! is called.
#
# +result+ is a list of human readable reasons; its last element is the
# outcome ("ACTION" or "IDLE") and the element before it is the deciding
# reason. Both are written to the action log.
#
def process!
  self.event.group_sla.each do |gs|

    # group related objects
    begin
      group = Notifu::Model::Group.with(:name, gs[:group])
      sla = Notifu::Model::Sla.with(:name, gs[:sla])
    rescue
      log "info", "#{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}]: Object init failed. Is Notifu API running?"
      next
    end

    # A lookup that finds nothing may come back as nil instead of raising
    # (#perform handles Issue.with that way). Skip the pair rather than
    # failing later with NoMethodError on nil.
    unless group && sla
      log "info", "#{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}]: Unknown group/SLA pair (#{gs[:group]}/#{gs[:sla]}), skipping."
      next
    end

    # what gets reported when nobody is notified
    notified = {
      sla: String.new,
      group: String.new,
      actors: Array.new,
      contacts: Array.new,
      escalation_level: "none"
    }

    result = []

    # logic
    if enough_occurrences? && self.event.action.to_s == "create"
      result << "enough occurrences have passed"
      if ! silenced?
        result << "issue is not silenced"
        if duty_time? sla.timerange_values(self.now)
          result << "duty is active"
          if status_changed?
            result << "issue state has changed"
            notified = notify!(sla, group)
            result << "ACTION"
          else
            result << "issue state hasn't changed"
            case self.event.code
            when 0
              result << "issue is in OK state" << "IDLE"
            when 1
              result << "issue is in WARNING state"
              if first_notification?(sla, group)
                result << "issue is new"
                notified = notify!(sla, group)
                result << "ACTION"
              else
                result << "already notified" << "IDLE"
              end
            when 2
              result << "issue is not a warning"
              if renotify?(sla, group)
                result << "it's time to renotify"
                notified = notify!(sla, group)
                result << "ACTION"
              else
                result << "not yet time to renotify or escalate" << "IDLE"
              end
            else
              result << "unknown state (#{self.event.code})" << "IDLE"
            end
          end
        else
          result << "duty is not active at this time" << "IDLE"
        end
      else
        result << "issue is silenced" << "IDLE"
      end
    # .to_s mirrors the "create" comparison above, so a Symbol action matches too
    elsif self.event.action.to_s == "resolve" && self.issue.occurrences_count.to_i >= self.event.occurrences_trigger.to_i
      if ! silenced?
        result << "recovery of an event"
        notified = notify!(sla, group)
        result << "ACTION"
      elsif self.event.unsilence
        result << "recovery of an event (with unsilence)"
        unsilence!
        # nobody was notified, so close the decision path like every other branch
        result << "IDLE"
      else
        result << "issue is silenced" << "IDLE"
      end
    else
      result << "not enough occurrences of this event" << "IDLE"
    end

    self.event.update_process_result!(notified)

    action_log_message = {
      logic: result.join(' -> '),
      result: result[-1],
      reason: result[-2],
      group: group.name,
      sla: sla.name,
      host: self.event.host,
      service: self.event.service,
      message: self.event.message,
      state: self.event.code.to_state,
      contacts: notified[:contacts].to_json,
      actors: notified[:actors].to_json,
      occurrences_trigger: self.event.occurrences_trigger.to_i,
      occurrences_count: self.event.occurrences_count.to_i,
      check_duration: self.event.duration,
      escalation_level: notified[:escalation_level].to_s,
      sidekiq_jid: self.jid,
      notifu_id: self.event.notifu_id,
      :"@timestamp" => self.now.iso8601,
    }

    action_log action_log_message

  end

  # persist the outcome only when at least one pair was processed
  if self.event.process_result.length > 0
    self.issue.message = self.event.message
    self.issue.action = self.event.action
    self.issue.process_result = self.event.process_result
    self.issue.save
  end

  # the stored state and creation time only move when the state flips
  if status_changed?
    self.issue.code = self.event.code
    self.issue.time_created = self.event.time_created
  end

  self.issue.occurrences_trigger = self.event.occurrences_trigger
  self.issue.occurrences_count = self.event.occurrences_count
  self.issue.time_last_event = self.event.time_last_event
  self.issue.sgs = self.event.sgs
  self.issue.aspiring_code = self.event.code
  self.issue.api_endpoint = self.event.api_endpoint
  self.issue.duration = self.event.duration

  self.issue.save

  # delayed cleanup job
  cleanup!
end
202
+
203
+ ###################################################################
204
+ ####### NOTIFICATION METHOD (method for :process! ) ###############
205
+ ###################################################################
206
+
207
##
# Pushes one Sidekiq job per actor for the current event and records the
# notification time on the issue.
#
# Primary contacts and actors are always included; secondary and tertiary
# ones are added when #escalate_to? approves level 1 and level 2.
#
# @param sla   object responding to #name, #actors and whatever #escalate_to? reads
# @param group object responding to #name, #primary, #secondary, #tertiary
# @return [Hash] :sla, :group, :actors, :contacts, :escalation_level
#
def notify!(sla, group)
  # NOTE(review): eval executes whatever Ruby source is stored in sla.actors.
  # Kept for compatibility; confirm only trusted operators can write SLAs, or
  # move the field to a data-only format.
  sla_actors = eval(sla.actors)

  contacts = []
  group.primary.each { |contact| contacts << contact.name }
  # A definition without :primary yields no primary actors instead of
  # raising TypeError on `Array + nil`.
  actors = Array(sla_actors[:primary]).dup
  escalation_level = "primary"

  # level 1 => secondary escalation, level 2 => tertiary escalation
  { 1 => :secondary, 2 => :tertiary }.each do |level, tier|
    next unless escalate_to?(level, sla)

    group.public_send(tier).each { |contact| contacts << contact.name }
    actors += sla_actors[tier] if sla_actors[tier]
    escalation_level = tier.to_s
  end

  actors.each do |actor|
    Sidekiq::Client.push('class' => "Notifu::Actors::#{actor.camelize}",
                         'args'  => [self.event.notifu_id, contacts],
                         'queue' => "actor-#{actor}")
  end

  log "info", "Taking action (#{group.name}:#{sla.name}) NID #{self.event.notifu_id} [#{self.event.host}/#{self.event.service}/#{self.event.code.to_state}] actor: #{actors.join(', ')}; contacts: #{contacts.join(', ')}; escalation_level: #{escalation_level}"

  self.issue.time_last_notified!(group.name, sla.name, Time.now.to_i)

  { sla: sla.name, group: group.name, actors: actors, contacts: contacts, escalation_level: escalation_level }
end
248
+
249
+
250
+ ###################################################################
251
+ ####### LOGIC BLOCK ###############################################
252
+ ###################################################################
253
+
254
##
# True once the event has occurred at least as often as its trigger demands.
#
# Both counters are coerced with #to_i, as the rest of this file does when it
# compares or logs them, so string values compare numerically rather than
# lexicographically ("10" >= "5").
#
def enough_occurrences?
  self.event.occurrences_count.to_i >= self.event.occurrences_trigger.to_i
end
257
+
258
##
# Decides whether the issue has been open long enough to reach the given
# escalation level. Each level adds one +sla.refresh+ interval to the
# issue's creation time, and only events that pass #is_critical? escalate.
#
# @param level [#to_i] escalation level
# @param sla   object responding to #refresh
# @return [Boolean]
#
def escalate_to?(level, sla)
  wait_seconds = level.to_i * sla.refresh.to_i
  due_at = self.issue.time_created.to_i + wait_seconds

  return false if due_at > self.now.to_i

  is_critical? ? true : false
end
276
+
277
##
# True when the stashes returned by #get_stashes contain a silence entry for
# this event. Keepalive events are matched against the host-wide path, every
# other event against the host/service path.
#
def silenced?
  host = self.event.host
  service = self.event.service
  wanted = service == "keepalive" ? "silence/#{host}" : "silence/#{host}/#{service}"

  get_stashes.map { |stash| stash["path"] }.include?(wanted)
end
291
+
292
# True when the event code is 0 (handled as the OK state in #process!).
def is_ok?
  self.event.code == 0
end
295
+
296
# True when the event code is 1 (handled as the WARNING state in #process!).
def is_warning?
  self.event.code == 1
end
299
+
300
# True when the event code is 2; #escalate_to? only escalates such events.
def is_critical?
  self.event.code == 2
end
303
+
304
# True when the issue has no recorded notification time for this group/SLA pair.
def first_notification?(sla, group)
  self.issue.time_last_notified?(group.name, sla.name).nil?
end
307
+
308
# True when the event's code differs from the code stored on the issue
# (both compared as integers).
def status_changed?
  event_code = self.event.code.to_i
  issue_code = self.issue.code.to_i
  event_code != issue_code
end
311
+
312
##
# True when at least +sla.refresh+ seconds have passed since the issue was
# last notified for this group/SLA pair. A missing timestamp counts as 0.
#
def renotify?(sla, group)
  interval = sla.refresh
  last_sent = self.issue.time_last_notified?(group.name, sla.name)

  last_sent.to_i + interval.to_i <= self.now.to_i
end
324
+
325
##
# Tells whether +self.now+ falls inside the SLA duty window.
#
# @param timerange [String] "ALWAYS", "NEVER" or "HH:MM-HH:MM" (local time)
# @return [Boolean] true inside the window; any other value counts as always
#   on duty
#
def duty_time?(timerange)
  case timerange
  when "ALWAYS"
    true
  when "NEVER"
    false
  when /([0-9]{2}):([0-9]{2})-([0-9]{2}):([0-9]{2})/
    # Read the regexp captures: indexing the string itself (timerange[0])
    # would only yield single characters.
    from_hour, from_min, until_hour, until_min = Regexp.last_match.captures.map(&:to_i)
    today = [self.now.year, self.now.month, self.now.day]
    duty_from = Time.local(*today, from_hour, from_min).to_i
    duty_until = Time.local(*today, until_hour, until_min).to_i
    current = self.now.to_i

    if duty_from <= duty_until
      current.between?(duty_from, duty_until)
    else
      # window wraps over midnight, e.g. "22:00-06:00"
      current >= duty_from || current <= duty_until
    end
  else
    true
  end
end
343
+
344
+ ###################################################################
345
+ ####### HELPER BLOCK (methods for :process! ) #####################
346
+ ###################################################################
347
+
348
+
349
##
# Queues a Notifu::Cleaner job for the issue once the event is OK and the
# issue's stored action is "resolve"; does nothing otherwise.
#
def cleanup!
  return unless is_ok? && self.issue.action == "resolve"

  Notifu::Cleaner.perform_async(self.issue.notifu_id)
end
357
+
358
##
# Fetches the stashes from the Sensu API at the event's api_endpoint,
# memoised per worker instance.
#
# @return [Object] the parsed JSON body of GET <api_endpoint>/stashes
#   (callers iterate it as a list of stash hashes), or an empty array when
#   the request or the parsing fails
#
def get_stashes
  return @stashes if @stashes

  url = "#{self.event.api_endpoint}/stashes"
  @stashes =
    begin
      JSON.parse(Excon.get(url).body)
    rescue StandardError
      log "error", "Failed to get stashes #{url}"
      # Return the empty list explicitly: falling out of the rescue with the
      # value of `log` made the first failed call unusable for callers.
      []
    end
end
371
+
372
+
373
##
# Removes the host/service silence stash after a recovery.
#
# Only stashes without an expiry (expire < 0) are removed, and only when the
# event asks for it through +unsilence+. Every outcome is logged.
#
# NOTE(review): #silenced? matches keepalive events against "silence/<host>",
# while this method always uses the host/service path, so a host-wide silence
# is never removed here. Confirm that this is intended.
#
def unsilence!
  target = "#{self.event.host}/#{self.event.service}"
  path = "silence/#{target}"

  get_stashes.each do |stash|
    next unless stash["path"] == path

    unless stash["expire"] < 0
      log "info", "#{target} left stashed (auto-unstash doesn't work on checks with defined expiry)"
      next
    end

    unless self.event.unsilence
      log "info", "#{target} left stashed (auto-unstash disabled)"
      next
    end

    stash_url = "#{self.event.api_endpoint}/stashes/#{path}"
    begin
      Excon.delete stash_url
      log "info", "Unstashed #{target} after recovery"
    rescue StandardError
      # The failing call is the DELETE, so the message names it instead of
      # blaming the stash fetch.
      log "warning", "Failed to delete stash via Sensu API: #{stash_url}"
    end
  end
end
397
+
398
##
# Writes a message to the global logger, prefixed with the Sidekiq job id.
#
# @param prio priority handed to $logger.log unchanged
# @param msg  message; converted with #to_s
#
def log(prio, msg)
  prefix = "JID-#{self.jid}: "
  $logger.log(prio, prefix + msg.to_s)
end
404
+
405
##
# Hands an action-log entry to the global logger under the "processor" type.
#
# @param event the entry to record (built as a Hash in #process!)
#
def action_log(event)
  $logger.action_log("processor", event)
end
411
+
412
+ end
413
+ end
414
+
415
+ ###################################################################
416
+ ###################################################################
417
+ ####### CLEANER WORKER ############################################
418
+ ###################################################################
419
+ ###################################################################
420
+
421
+ module Notifu
422
+ class Cleaner
423
+ include Sidekiq::Worker
424
+ include Notifu::Util
425
+
426
+ sidekiq_options :retry => true
427
+ sidekiq_options :queue => "processor"
428
+
429
##
# Deletes the stored issue with the given notifu_id after a grace period.
#
# NOTE(review): the sleep keeps the worker busy for the whole delay;
# scheduling the job with a delay on the caller's side would avoid that.
#
# @param notifu_id id of the issue to remove
# @param delay [Numeric] seconds to wait before deleting (default 15)
#
def perform(notifu_id, delay = 15)
  sleep delay

  # Check for a missing issue explicitly. Rescuing NoMethodError for this
  # also hid genuine errors raised inside #delete and reported them as
  # "not found".
  issue = Notifu::Model::Issue.with(:notifu_id, notifu_id)
  if issue
    issue.delete
    log "info", "Cleanup NID #{notifu_id} - success"
  else
    log "info", "Cleanup NID #{notifu_id} - not found"
  end
end
438
+
439
# Writes a message to the global logger, prefixed with the Sidekiq job id.
def log(prio, msg)
  prefix = "JID-#{self.jid}: "
  $logger.log(prio, prefix + msg.to_s)
end
442
+
443
+ end
444
+ end
@@ -0,0 +1,24 @@
1
+ ##
2
+ # Require block
3
+ #
4
+ require 'rubygems'
5
+ require 'bundler/setup'
6
+ require "ohm"
7
+ require "elasticsearch"
8
+ require "log4r/outputter/syslogoutputter"
9
+ require "log4r/configurator"
10
+ require "log4r"
11
+ require "syslog"
12
+ require "sidekiq"
13
+ require "sidekiq/logging"
14
+ require "notifu"
15
+
16
##
# Configuration: read once at load time and published as Notifu::CONFIG.
#
Notifu::CONFIG = Notifu::Config.new.get

##
# Ohm: connect the models to the Redis instance configured as :redis_data.
#
Ohm.redis = Redic.new(Notifu::CONFIG[:redis_data])