interferon 0.0.12 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/interferon.rb CHANGED
@@ -9,6 +9,7 @@ require 'interferon/alert_dsl'
9
9
  #require 'pry' #uncomment if you're debugging
10
10
  require 'erb'
11
11
  require 'ostruct'
12
+ require 'parallel'
12
13
  require 'set'
13
14
  require 'yaml'
14
15
 
@@ -23,11 +24,14 @@ module Interferon
23
24
  # groups_sources is a hash from type => options for each group source
24
25
  # host_sources is a hash from type => options for each host source
25
26
  # destinations is a similar hash from type => options for each alerter
26
- def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
27
+ def initialize(alerts_repo_path, groups_sources, host_sources, destinations,
28
+ dry_run=false, processes=nil)
27
29
  @alerts_repo_path = alerts_repo_path
28
30
  @groups_sources = groups_sources
29
31
  @host_sources = host_sources
30
32
  @destinations = destinations
33
+ @dry_run = dry_run
34
+ @processes = processes
31
35
  @request_shutdown = false
32
36
  end
33
37
 
@@ -36,7 +40,8 @@ module Interferon
36
40
  log.info "SIGTERM received. shutting down gracefully..."
37
41
  @request_shutdown = true
38
42
  end
39
- run_desc = dry_run ? 'dry run' : 'run'
43
+ @dry_run = dry_run
44
+ run_desc = @dry_run ? 'dry run' : 'run'
40
45
  log.info "beginning alerts #{run_desc}"
41
46
 
42
47
  alerts = read_alerts
@@ -45,9 +50,12 @@ module Interferon
45
50
 
46
51
  @destinations.each do |dest|
47
52
  dest['options'] ||= {}
53
+ if @dry_run
54
+ dest['options']['dry_run'] = true
55
+ end
48
56
  end
49
57
 
50
- update_alerts(@destinations, hosts, alerts, groups, dry_run)
58
+ update_alerts(@destinations, hosts, alerts, groups)
51
59
 
52
60
  if @request_shutdown
53
61
  log.info "interferon #{run_desc} shut down by SIGTERM"
@@ -133,23 +141,23 @@ module Interferon
133
141
  return hosts
134
142
  end
135
143
 
136
- def update_alerts(destinations, hosts, alerts, groups, dry_run)
144
+ def update_alerts(destinations, hosts, alerts, groups)
137
145
  loader = DestinationsLoader.new([@alerts_repo_path])
138
146
  loader.get_all(destinations).each do |dest|
139
147
  break if @request_shutdown
140
148
  log.info "updating alerts on #{dest.class.name}"
141
- update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
149
+ update_alerts_on_destination(dest, hosts, alerts, groups)
142
150
  end
143
151
  end
144
152
 
145
- def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
153
+ def update_alerts_on_destination(dest, hosts, alerts, groups)
146
154
  # track some counters/stats per destination
147
155
  start_time = Time.new.to_f
148
156
 
149
157
  # get already-defined alerts
150
- existing_alerts = dest.existing_alerts.dup
158
+ existing_alerts = dest.existing_alerts
151
159
 
152
- if dry_run
160
+ if @dry_run
153
161
  do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
154
162
  else
155
163
  do_regular_update(dest, hosts, alerts, existing_alerts, groups)
@@ -159,7 +167,7 @@ module Interferon
159
167
  # run time summary
160
168
  run_time = Time.new.to_f - start_time
161
169
  statsd.histogram(
162
- dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
170
+ @dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
163
171
  run_time,
164
172
  :tags => ["destination:#{dest.class.name}"])
165
173
  log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
@@ -168,46 +176,103 @@ module Interferon
168
176
  dest.report_stats
169
177
  end
170
178
 
171
- if dry_run && !dest.api_errors.empty?
179
+ if @dry_run && !dest.api_errors.empty?
172
180
  raise dest.api_errors.to_s
173
181
  end
174
182
  end
175
183
 
176
184
  def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
177
- to_remove = existing_alerts.reject{|key, a| !key.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)}
185
+ # Track these to clean up dry-run alerts from previous runs
186
+ existing_dry_run_alerts = []
187
+ existing_alerts.each do |name, alert|
188
+ if name.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)
189
+ existing_dry_run_alerts << [alert['name'], [alert['id']]]
190
+ existing_alerts.delete(name)
191
+ end
192
+ end
193
+
178
194
  alerts_queue = build_alerts_queue(hosts, alerts, groups)
179
- alerts_queue.reject!{|name, pair| !Interferon::need_dry_run(pair[0], existing_alerts)}
180
- alerts_queue.each do |name, pair|
181
- alert = pair[0]
182
- alert.change_name(DRY_RUN_ALERTS_NAME_PREFIX + alert['name'])
195
+ updates_queue = alerts_queue.reject do |name, alert_people_pair|
196
+ !Interferon::need_update(dest, alert_people_pair, existing_alerts)
197
+ end
198
+
199
+ # Add dry-run prefix to alerts and delete id to avoid impacting real alerts
200
+ existing_alerts.keys.each do |name|
201
+ existing_alert = existing_alerts[name]
202
+ dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + name
203
+ existing_alert['name'] = dry_run_alert_name
204
+ existing_alert['id'] = [nil]
205
+ existing_alerts[dry_run_alert_name] = existing_alerts.delete(name)
206
+ end
207
+
208
+ # Build new queue with dry-run prefixes and ensure they are silenced
209
+ alerts_queue.each do |name, alert_people_pair|
210
+ alert = alert_people_pair[0]
211
+ dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + alert['name']
212
+ alert.change_name(dry_run_alert_name)
213
+ alert.silence
214
+ end
215
+
216
+ # Create alerts in destination
217
+ created_alerts = create_alerts(dest, updates_queue)
218
+
219
+ # Existing alerts are pruned until all that remain are alerts that aren't being generated anymore
220
+ to_remove = existing_alerts.dup
221
+ alerts_queue.each do |name, alert_people_pair|
222
+ alert = alert_people_pair[0]
223
+ old_alerts = to_remove[alert['name']]
224
+
225
+ if !old_alerts.nil?
226
+ if old_alerts['id'].length == 1
227
+ to_remove.delete(alert['name'])
228
+ else
229
+ old_alerts['id'] = old_alerts['id'].drop(1)
230
+ end
231
+ end
183
232
  end
184
233
 
185
- # flush queue
186
- created_alerts_key_ids = create_alerts(dest, alerts_queue)
187
- created_alerts_ids = created_alerts_key_ids.map{|a| a[1]}
188
- to_remove_ids = to_remove.empty? ? [] : to_remove.map{|a| a['id']}
189
- # remove existing alerts that shouldn't exist
190
- (created_alerts_ids + to_remove_ids).each do |id|
234
+ # Clean up alerts no longer being generated
235
+ to_remove.each do |name, alert|
191
236
  break if @request_shutdown
192
- dest.remove_alert_by_id(id) unless id.nil?
237
+ dest.remove_alert(alert)
238
+ end
239
+
240
+ # Clean up dry-run created alerts
241
+ (created_alerts + existing_dry_run_alerts).each do |alert_id_pair|
242
+ alert_ids = alert_id_pair[1]
243
+ alert_ids.each do |alert_id|
244
+ dest.remove_alert_by_id(alert_id)
245
+ end
193
246
  end
247
+
194
248
  end
195
249
 
196
250
  def do_regular_update(dest, hosts, alerts, existing_alerts, groups)
197
- existing_alerts.each{ |key, existing_alert| existing_alert['still_exists'] = false }
198
-
199
251
  alerts_queue = build_alerts_queue(hosts, alerts, groups)
252
+ updates_queue = alerts_queue.reject do |name, alert_people_pair|
253
+ !Interferon::need_update(dest, alert_people_pair, existing_alerts)
254
+ end
200
255
 
201
- # flush queue
202
- created_alerts_keys = create_alerts(dest, alerts_queue).map{|a| a[0]}
203
- created_alerts_keys.each do |alert_key|
204
- # don't delete alerts we still have defined
205
- existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
256
+ # Create alerts in destination
257
+ create_alerts(dest, updates_queue)
258
+
259
+ # Existing alerts are pruned until all that remain are alerts that aren't being generated anymore
260
+ to_remove = existing_alerts.dup
261
+ alerts_queue.each do |name, alert_people_pair|
262
+ alert = alert_people_pair[0]
263
+ old_alerts = to_remove[alert['name']]
264
+
265
+ if !old_alerts.nil?
266
+ if old_alerts['id'].length == 1
267
+ to_remove.delete(alert['name'])
268
+ else
269
+ old_alerts['id'] = old_alerts['id'].drop(1)
270
+ end
271
+ end
206
272
  end
207
273
 
208
- # remove existing alerts that shouldn't exist
209
- to_delete = existing_alerts.reject{ |key, existing_alert| existing_alert['still_exists'] }
210
- to_delete.each do |key, alert|
274
+ # Clean up alerts no longer being generated
275
+ to_remove.each do |name, alert|
211
276
  break if @request_shutdown
212
277
  dest.remove_alert(alert)
213
278
  end
@@ -237,10 +302,11 @@ module Interferon
237
302
  end
238
303
 
239
304
  def build_alerts_queue(hosts, alerts, groups)
305
+ alerts_queue = {}
240
306
  # create or update alerts; mark when we've done that
241
- alerts_queue = Hash.new
242
- alerts.each do |alert|
307
+ result = Parallel.map(alerts, in_processes: @processes) do |alert|
243
308
  break if @request_shutdown
309
+ alerts_generated = {}
244
310
  counters = {
245
311
  :errors => 0,
246
312
  :evals => 0,
@@ -268,7 +334,7 @@ module Interferon
268
334
 
269
335
  counters[:applies] += 1
270
336
  # don't define alerts twice
271
- next if alerts_queue.key?(alert[:name])
337
+ next if alerts_generated.key?(alert[:name])
272
338
 
273
339
  # figure out who to notify
274
340
  people = Set.new(alert[:notify][:people])
@@ -277,7 +343,7 @@ module Interferon
277
343
  end
278
344
 
279
345
  # queue the alert up for creation; we clone the alert to save the current state
280
- alerts_queue[alert[:name]] ||= [alert.clone, people]
346
+ alerts_generated[alert[:name]] = [alert.clone, people]
281
347
  end
282
348
 
283
349
  # log some of the counters
@@ -289,7 +355,7 @@ module Interferon
289
355
  end
290
356
 
291
357
  # did the alert fail to evaluate on all hosts?
292
- if counters[:errors] == counters[:hosts]
358
+ if counters[:errors] == counters[:hosts] && !last_eval_error.nil?
293
359
  log.error "alert #{alert} failed to evaluate in the context of all hosts!"
294
360
  log.error "last error on alert #{alert}: #{last_eval_error}"
295
361
 
@@ -306,25 +372,48 @@ module Interferon
306
372
  else
307
373
  statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
308
374
  end
375
+ alerts_generated
376
+ end
377
+
378
+ result.each do |alerts_generated|
379
+ alerts_queue.merge! alerts_generated
309
380
  end
310
381
  alerts_queue
311
382
  end
312
383
 
313
- def self.need_dry_run(alert, existing_alerts_from_api)
384
+ def self.need_update(dest, alert_people_pair, existing_alerts_from_api)
385
+ alert = alert_people_pair[0]
314
386
  existing = existing_alerts_from_api[alert['name']]
315
387
  if existing.nil?
316
388
  true
317
389
  else
318
- !same_alerts_for_dry_run_purpose(alert, existing)
390
+ !same_alerts(dest, alert_people_pair, existing)
319
391
  end
320
392
  end
321
393
 
322
- def self.same_alerts_for_dry_run_purpose(alert, alert_api_json)
323
- query1 = alert['metric']['datadog_query']
324
- query2 = alert_api_json['query']
325
- query1.strip!
326
- query2.strip!
327
- query1 == query2
394
+ def self.same_alerts(dest, alert_people_pair, alert_api_json)
395
+ alert, people = alert_people_pair
396
+
397
+ prev_alert = {
398
+ :query => alert_api_json['query'].strip,
399
+ :message => alert_api_json['message'].strip,
400
+ :notify_no_data => alert_api_json['notify_no_data'],
401
+ :silenced => alert_api_json['silenced'],
402
+ :timeout => alert_api_json['timeout_h'],
403
+ :no_data_timeframe => alert_api_json['no_data_timeframe']
404
+ }
405
+
406
+ new_alert = {
407
+ :query => alert['metric']['datadog_query'].strip,
408
+ :message => dest.generate_message(alert['message'], people).strip,
409
+ :notify_no_data => alert['notify_no_data'],
410
+ :silenced => alert['silenced'] || alert['silenced_until'] > Time.now,
411
+ :timeout => alert['timeout'] ? [1, alert['timeout'].to_i / 3600].max : nil,
412
+ :no_data_timeframe => alert['no_data_timeframe'] || nil
413
+ }
414
+
415
+ prev_alert == new_alert
328
416
  end
417
+
329
418
  end
330
419
  end
@@ -6,7 +6,8 @@ module Interferon
6
6
  def get_or_set(field, val, block, default)
7
7
  @hash ||= Hash.new
8
8
  if val.nil?
9
- return @hash[field]
9
+ f = @hash[field]
10
+ f.nil? ? default : f
10
11
  else
11
12
  @hash[field] = val
12
13
  end
@@ -27,6 +28,14 @@ module Interferon
27
28
  def id(v = nil, &block)
28
29
  get_or_set(:@id, v, block, '')
29
30
  end
31
+
32
+ def silenced(v = nil, &block)
33
+ get_or_set(:@silenced, v, block, false)
34
+ end
35
+
36
+ def silenced_until(v = nil, &block)
37
+ get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
38
+ end
30
39
  end
31
40
 
32
41
  class MockNotifyDSL < NotifyDSL
@@ -1,6 +1,7 @@
1
1
  module Interferon
2
2
  class MockAlert < Alert
3
3
  def initialize(dsl)
4
+ @filename = 'MOCKALERT'
4
5
  @dsl = dsl
5
6
  end
6
7
 
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+ require 'interferon/group_sources/filesystem'
3
+
4
+ describe Interferon::GroupSources::Filesystem do
5
+ let (:fs_loader) { Interferon::GroupSources::Filesystem.new({'paths' => ['/tmp']}) }
6
+
7
+ describe 'list_groups' do
8
+ context "with basic groups" do
9
+ before do
10
+ group_a = double()
11
+ expect(File).to receive(:read).with('group_a.yaml').and_return('group_a_text')
12
+ expect(Psych).to receive(:parse).and_return(group_a)
13
+ expect(group_a).to receive(:to_ruby).and_return({'name' => 'group_a',
14
+ 'people' => ['Alice', 'Bob']})
15
+
16
+ group_b = double()
17
+ expect(File).to receive(:read).with('group_b.yaml').and_return('group_b_text')
18
+ expect(Psych).to receive(:parse).and_return(group_b)
19
+ expect(group_b).to receive(:to_ruby).and_return({'name' => 'group_b',
20
+ 'people' => ['Carol', 'Dave']})
21
+ end
22
+
23
+ it 'loads groups defined by YAML' do
24
+ expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml'].each)
25
+
26
+ groups = fs_loader.list_groups()
27
+ expect(groups).to eq({'group_a' => ['Alice', 'Bob'], 'group_b' => ['Carol', 'Dave']})
28
+ end
29
+
30
+ it 'allows groups to be aliased in YAML' do
31
+ expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml', 'group_c.yaml'].each)
32
+ group_c = double()
33
+ expect(File).to receive(:read).with('group_c.yaml').and_return('group_c_text')
34
+ expect(Psych).to receive(:parse).and_return(group_c)
35
+ expect(group_c).to receive(:to_ruby).and_return({'name' => 'group_c', 'alias_for' => 'group_b'})
36
+
37
+ groups = fs_loader.list_groups()
38
+ expect(groups).to eq({'group_a' => ['Alice', 'Bob'],
39
+ 'group_b' => ['Carol', 'Dave'],
40
+ 'group_c' => ['Carol', 'Dave']})
41
+ end
42
+
43
+ it 'skips bad aliases in YAML' do
44
+ expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml', 'group_c.yaml'].each)
45
+ group_c = double()
46
+ expect(File).to receive(:read).with('group_c.yaml').and_return('group_c_text')
47
+ expect(Psych).to receive(:parse).and_return(group_c)
48
+ expect(group_c).to receive(:to_ruby).and_return({'name' => 'group_c', 'alias_for' => 'group_d'})
49
+
50
+ groups = fs_loader.list_groups()
51
+ expect(groups).to eq({'group_a' => ['Alice', 'Bob'],
52
+ 'group_b' => ['Carol', 'Dave']})
53
+ end
54
+ end
55
+ end
56
+
57
+ end