interferon 0.0.12 → 0.0.19

data/lib/interferon.rb CHANGED
@@ -9,6 +9,7 @@ require 'interferon/alert_dsl'
  #require 'pry' #uncomment if you're debugging
  require 'erb'
  require 'ostruct'
+ require 'parallel'
  require 'set'
  require 'yaml'
 
@@ -23,11 +24,14 @@ module Interferon
  # groups_sources is a hash from type => options for each group source
  # host_sources is a hash from type => options for each host source
  # destinations is a similar hash from type => options for each alerter
- def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
+ def initialize(alerts_repo_path, groups_sources, host_sources, destinations,
+ dry_run=false, processes=nil)
  @alerts_repo_path = alerts_repo_path
  @groups_sources = groups_sources
  @host_sources = host_sources
  @destinations = destinations
+ @dry_run = dry_run
+ @processes = processes
  @request_shutdown = false
  end
 
@@ -36,7 +40,8 @@ module Interferon
  log.info "SIGTERM received. shutting down gracefully..."
  @request_shutdown = true
  end
- run_desc = dry_run ? 'dry run' : 'run'
+ @dry_run = dry_run
+ run_desc = @dry_run ? 'dry run' : 'run'
  log.info "beginning alerts #{run_desc}"
 
  alerts = read_alerts
@@ -45,9 +50,12 @@ module Interferon
 
  @destinations.each do |dest|
  dest['options'] ||= {}
+ if @dry_run
+ dest['options']['dry_run'] = true
+ end
  end
 
- update_alerts(@destinations, hosts, alerts, groups, dry_run)
+ update_alerts(@destinations, hosts, alerts, groups)
 
  if @request_shutdown
  log.info "interferon #{run_desc} shut down by SIGTERM"
@@ -133,23 +141,23 @@ module Interferon
  return hosts
  end
 
- def update_alerts(destinations, hosts, alerts, groups, dry_run)
+ def update_alerts(destinations, hosts, alerts, groups)
  loader = DestinationsLoader.new([@alerts_repo_path])
  loader.get_all(destinations).each do |dest|
  break if @request_shutdown
  log.info "updating alerts on #{dest.class.name}"
- update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
+ update_alerts_on_destination(dest, hosts, alerts, groups)
  end
  end
 
- def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
+ def update_alerts_on_destination(dest, hosts, alerts, groups)
  # track some counters/stats per destination
  start_time = Time.new.to_f
 
  # get already-defined alerts
- existing_alerts = dest.existing_alerts.dup
+ existing_alerts = dest.existing_alerts
 
- if dry_run
+ if @dry_run
  do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
  else
  do_regular_update(dest, hosts, alerts, existing_alerts, groups)
@@ -159,7 +167,7 @@ module Interferon
  # run time summary
  run_time = Time.new.to_f - start_time
  statsd.histogram(
- dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
+ @dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
  run_time,
  :tags => ["destination:#{dest.class.name}"])
  log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
@@ -168,46 +176,103 @@ module Interferon
  dest.report_stats
  end
 
- if dry_run && !dest.api_errors.empty?
+ if @dry_run && !dest.api_errors.empty?
  raise dest.api_errors.to_s
  end
  end
 
  def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
- to_remove = existing_alerts.reject{|key, a| !key.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)}
+ # Track these to clean up dry-run alerts from previous runs
+ existing_dry_run_alerts = []
+ existing_alerts.each do |name, alert|
+ if name.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)
+ existing_dry_run_alerts << [alert['name'], [alert['id']]]
+ existing_alerts.delete(name)
+ end
+ end
+
  alerts_queue = build_alerts_queue(hosts, alerts, groups)
- alerts_queue.reject!{|name, pair| !Interferon::need_dry_run(pair[0], existing_alerts)}
- alerts_queue.each do |name, pair|
- alert = pair[0]
- alert.change_name(DRY_RUN_ALERTS_NAME_PREFIX + alert['name'])
+ updates_queue = alerts_queue.reject do |name, alert_people_pair|
+ !Interferon::need_update(dest, alert_people_pair, existing_alerts)
+ end
+
+ # Add dry-run prefix to alerts and delete id to avoid impacting real alerts
+ existing_alerts.keys.each do |name|
+ existing_alert = existing_alerts[name]
+ dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + name
+ existing_alert['name'] = dry_run_alert_name
+ existing_alert['id'] = [nil]
+ existing_alerts[dry_run_alert_name] = existing_alerts.delete(name)
+ end
+
+ # Build new queue with dry-run prefixes and ensure they are silenced
+ alerts_queue.each do |name, alert_people_pair|
+ alert = alert_people_pair[0]
+ dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + alert['name']
+ alert.change_name(dry_run_alert_name)
+ alert.silence
+ end
+
+ # Create alerts in destination
+ created_alerts = create_alerts(dest, updates_queue)
+
+ # Existing alerts are pruned until all that remains are alerts that aren't being generated anymore
+ to_remove = existing_alerts.dup
+ alerts_queue.each do |name, alert_people_pair|
+ alert = alert_people_pair[0]
+ old_alerts = to_remove[alert['name']]
+
+ if !old_alerts.nil?
+ if old_alerts['id'].length == 1
+ to_remove.delete(alert['name'])
+ else
+ old_alerts['id'] = old_alerts['id'].drop(1)
+ end
+ end
  end
 
- # flush queue
- created_alerts_key_ids = create_alerts(dest, alerts_queue)
- created_alerts_ids = created_alerts_key_ids.map{|a| a[1]}
- to_remove_ids = to_remove.empty? ? [] : to_remove.map{|a| a['id']}
- # remove existing alerts that shouldn't exist
- (created_alerts_ids + to_remove_ids).each do |id|
+ # Clean up alerts no longer being generated
+ to_remove.each do |name, alert|
  break if @request_shutdown
- dest.remove_alert_by_id(id) unless id.nil?
+ dest.remove_alert(alert)
+ end
+
+ # Clean up dry-run created alerts
+ (created_alerts + existing_dry_run_alerts).each do |alert_id_pair|
+ alert_ids = alert_id_pair[1]
+ alert_ids.each do |alert_id|
+ dest.remove_alert_by_id(alert_id)
+ end
  end
+
  end
 
  def do_regular_update(dest, hosts, alerts, existing_alerts, groups)
- existing_alerts.each{ |key, existing_alert| existing_alert['still_exists'] = false }
-
  alerts_queue = build_alerts_queue(hosts, alerts, groups)
+ updates_queue = alerts_queue.reject do |name, alert_people_pair|
+ !Interferon::need_update(dest, alert_people_pair, existing_alerts)
+ end
 
- # flush queue
- created_alerts_keys = create_alerts(dest, alerts_queue).map{|a| a[0]}
- created_alerts_keys.each do |alert_key|
- # don't delete alerts we still have defined
- existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
+ # Create alerts in destination
+ create_alerts(dest, updates_queue)
+
+ # Existing alerts are pruned until all that remains are alerts that aren't being generated anymore
+ to_remove = existing_alerts.dup
+ alerts_queue.each do |name, alert_people_pair|
+ alert = alert_people_pair[0]
+ old_alerts = to_remove[alert['name']]
+
+ if !old_alerts.nil?
+ if old_alerts['id'].length == 1
+ to_remove.delete(alert['name'])
+ else
+ old_alerts['id'] = old_alerts['id'].drop(1)
+ end
+ end
  end
 
- # remove existing alerts that shouldn't exist
- to_delete = existing_alerts.reject{ |key, existing_alert| existing_alert['still_exists'] }
- to_delete.each do |key, alert|
+ # Clean up alerts no longer being generated
+ to_remove.each do |name, alert|
  break if @request_shutdown
  dest.remove_alert(alert)
  end
@@ -237,10 +302,11 @@ module Interferon
  end
 
  def build_alerts_queue(hosts, alerts, groups)
+ alerts_queue = {}
  # create or update alerts; mark when we've done that
- alerts_queue = Hash.new
- alerts.each do |alert|
+ result = Parallel.map(alerts, in_processes: @processes) do |alert|
  break if @request_shutdown
+ alerts_generated = {}
  counters = {
  :errors => 0,
  :evals => 0,
@@ -268,7 +334,7 @@ module Interferon
 
  counters[:applies] += 1
  # don't define alerts twice
- next if alerts_queue.key?(alert[:name])
+ next if alerts_generated.key?(alert[:name])
 
  # figure out who to notify
  people = Set.new(alert[:notify][:people])
@@ -277,7 +343,7 @@ module Interferon
  end
 
  # queue the alert up for creation; we clone the alert to save the current state
- alerts_queue[alert[:name]] ||= [alert.clone, people]
+ alerts_generated[alert[:name]] = [alert.clone, people]
  end
 
  # log some of the counters
@@ -289,7 +355,7 @@ module Interferon
  end
 
  # did the alert fail to evaluate on all hosts?
- if counters[:errors] == counters[:hosts]
+ if counters[:errors] == counters[:hosts] && !last_eval_error.nil?
  log.error "alert #{alert} failed to evaluate in the context of all hosts!"
  log.error "last error on alert #{alert}: #{last_eval_error}"
 
@@ -306,25 +372,48 @@ module Interferon
  else
  statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
  end
+ alerts_generated
+ end
+
+ result.each do |alerts_generated|
+ alerts_queue.merge! alerts_generated
  end
  alerts_queue
  end
 
- def self.need_dry_run(alert, existing_alerts_from_api)
+ def self.need_update(dest, alert_people_pair, existing_alerts_from_api)
+ alert = alert_people_pair[0]
  existing = existing_alerts_from_api[alert['name']]
  if existing.nil?
  true
  else
- !same_alerts_for_dry_run_purpose(alert, existing)
+ !same_alerts(dest, alert_people_pair, existing)
  end
  end
 
- def self.same_alerts_for_dry_run_purpose(alert, alert_api_json)
- query1 = alert['metric']['datadog_query']
- query2 = alert_api_json['query']
- query1.strip!
- query2.strip!
- query1 == query2
+ def self.same_alerts(dest, alert_people_pair, alert_api_json)
+ alert, people = alert_people_pair
+
+ prev_alert = {
+ :query => alert_api_json['query'].strip,
+ :message => alert_api_json['message'].strip,
+ :notify_no_data => alert_api_json['notify_no_data'],
+ :silenced => alert_api_json['silenced'],
+ :timeout => alert_api_json['timeout_h'],
+ :no_data_timeframe => alert_api_json['no_data_timeframe']
+ }
+
+ new_alert = {
+ :query => alert['metric']['datadog_query'].strip,
+ :message => dest.generate_message(alert['message'], people).strip,
+ :notify_no_data => alert['notify_no_data'],
+ :silenced => alert['silenced'] || alert['silenced_until'] > Time.now,
+ :timeout => alert['timeout'] ? [1, alert['timeout'].to_i / 3600].max : nil,
+ :no_data_timeframe => alert['no_data_timeframe'] || nil
+ }
+
+ prev_alert == new_alert
  end
+
  end
  end
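For orientation, here is a minimal sketch of how the widened constructor might be driven, based only on the signatures visible in this diff. The class name Interferon::Interferon, the placeholder source/destination hashes, and the config plumbing that normally supplies them are assumptions rather than code shown above.

require 'interferon'

# Hypothetical wiring -- the hashes below are placeholders; real values come
# from interferon's config file, which this diff does not show.
groups_sources = { 'filesystem' => { 'paths' => ['/path/to/groups'] } }
host_sources   = { 'optica'     => { 'host' => 'localhost' } }
destinations   = [{ 'type' => 'datadog', 'options' => {} }]

interferon = Interferon::Interferon.new(
  '/path/to/alerts_repo',  # alerts_repo_path
  groups_sources,
  host_sources,
  destinations,
  true,                    # dry_run: alerts get the dry-run prefix, are silenced, then removed
  4                        # processes: forwarded to Parallel.map in build_alerts_queue
)

# run still accepts a dry_run flag and re-assigns @dry_run, as the hunk above shows.
interferon.run(true)

With processes left as nil, Parallel.map falls back to its default worker count; passing an integer caps how many alert-evaluation processes are forked.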
@@ -6,7 +6,8 @@ module Interferon
  def get_or_set(field, val, block, default)
  @hash ||= Hash.new
  if val.nil?
- return @hash[field]
+ f = @hash[field]
+ f.nil? ? default : f
  else
  @hash[field] = val
  end
@@ -27,6 +28,14 @@ module Interferon
  def id(v = nil, &block)
  get_or_set(:@id, v, block, '')
  end
+
+ def silenced(v = nil, &block)
+ get_or_set(:@silenced, v, block, false)
+ end
+
+ def silenced_until(v = nil, &block)
+ get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
+ end
  end
 
  class MockNotifyDSL < NotifyDSL
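These two mock fields mirror the silenced handling used by same_alerts above: an alert counts as silenced when its silenced flag is set or its silenced_until timestamp (parsed with Time.parse, defaulting to Time.at(0)) is still in the future. A small self-contained sketch of that check, as an illustration rather than code from the gem:

require 'time'

# Hypothetical helper reproducing the comparison from same_alerts:
#   alert['silenced'] || alert['silenced_until'] > Time.now
def silenced_now?(silenced, silenced_until)
  silenced || silenced_until > Time.now
end

silenced_now?(false, Time.at(0))                        # => false (the DSL defaults)
silenced_now?(true,  Time.at(0))                        # => true  (explicitly silenced)
silenced_now?(false, Time.parse('2099-01-01 00:00:00')) # => true  (silenced until 2099)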
@@ -1,6 +1,7 @@
  module Interferon
  class MockAlert < Alert
  def initialize(dsl)
+ @filename = 'MOCKALERT'
  @dsl = dsl
  end
 
@@ -0,0 +1,57 @@
+ require 'spec_helper'
+ require 'interferon/group_sources/filesystem'
+
+ describe Interferon::GroupSources::Filesystem do
+ let (:fs_loader) { Interferon::GroupSources::Filesystem.new({'paths' => ['/tmp']}) }
+
+ describe 'list_groups' do
+ context "with basic groups" do
+ before do
+ group_a = double()
+ expect(File).to receive(:read).with('group_a.yaml').and_return('group_a_text')
+ expect(Psych).to receive(:parse).and_return(group_a)
+ expect(group_a).to receive(:to_ruby).and_return({'name' => 'group_a',
+ 'people' => ['Alice', 'Bob']})
+
+ group_b = double()
+ expect(File).to receive(:read).with('group_b.yaml').and_return('group_b_text')
+ expect(Psych).to receive(:parse).and_return(group_b)
+ expect(group_b).to receive(:to_ruby).and_return({'name' => 'group_b',
+ 'people' => ['Carol', 'Dave']})
+ end
+
+ it 'loads groups defined by YAML' do
+ expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml'].each)
+
+ groups = fs_loader.list_groups()
+ expect(groups).to eq({'group_a' => ['Alice', 'Bob'], 'group_b' => ['Carol', 'Dave']})
+ end
+
+ it 'allows groups to be aliased in YAML' do
+ expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml', 'group_c.yaml'].each)
+ group_c = double()
+ expect(File).to receive(:read).with('group_c.yaml').and_return('group_c_text')
+ expect(Psych).to receive(:parse).and_return(group_c)
+ expect(group_c).to receive(:to_ruby).and_return({'name' => 'group_c', 'alias_for' => 'group_b'})
+
+ groups = fs_loader.list_groups()
+ expect(groups).to eq({'group_a' => ['Alice', 'Bob'],
+ 'group_b' => ['Carol', 'Dave'],
+ 'group_c' => ['Carol', 'Dave']})
+ end
+
+ it 'skips bad aliases in YAML' do
+ expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml', 'group_c.yaml'].each)
+ group_c = double()
+ expect(File).to receive(:read).with('group_c.yaml').and_return('group_c_text')
+ expect(Psych).to receive(:parse).and_return(group_c)
+ expect(group_c).to receive(:to_ruby).and_return({'name' => 'group_c', 'alias_for' => 'group_d'})
+
+ groups = fs_loader.list_groups()
+ expect(groups).to eq({'group_a' => ['Alice', 'Bob'],
+ 'group_b' => ['Carol', 'Dave']})
+ end
+ end
+ end
+
+ end