interferon 0.0.12 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +13 -5
- data/interferon.gemspec +6 -2
- data/lib/interferon/alert.rb +8 -0
- data/lib/interferon/alert_dsl.rb +10 -0
- data/lib/interferon/destinations/datadog.rb +127 -55
- data/lib/interferon/group_sources/filesystem.rb +18 -2
- data/lib/interferon/version.rb +1 -1
- data/lib/interferon/work_hours_helper.rb +21 -0
- data/lib/interferon.rb +134 -45
- data/spec/helpers/dsl_helper.rb +10 -1
- data/spec/helpers/mock_alert.rb +1 -0
- data/spec/lib/interferon/group_sources/filesystem_spec.rb +57 -0
- data/spec/lib/interferon_spec.rb +190 -43
- data/spec/lib/work_hours_helper_spec.rb +41 -0
- metadata +106 -25
data/lib/interferon.rb
CHANGED
@@ -9,6 +9,7 @@ require 'interferon/alert_dsl'
|
|
9
9
|
#require 'pry' #uncomment if you're debugging
|
10
10
|
require 'erb'
|
11
11
|
require 'ostruct'
|
12
|
+
require 'parallel'
|
12
13
|
require 'set'
|
13
14
|
require 'yaml'
|
14
15
|
|
@@ -23,11 +24,14 @@ module Interferon
|
|
23
24
|
# groups_sources is a hash from type => options for each group source
|
24
25
|
# host_sources is a hash from type => options for each host source
|
25
26
|
# destinations is a similar hash from type => options for each alerter
|
26
|
-
def initialize(alerts_repo_path, groups_sources, host_sources, destinations
|
27
|
+
def initialize(alerts_repo_path, groups_sources, host_sources, destinations,
|
28
|
+
dry_run=false, processes=nil)
|
27
29
|
@alerts_repo_path = alerts_repo_path
|
28
30
|
@groups_sources = groups_sources
|
29
31
|
@host_sources = host_sources
|
30
32
|
@destinations = destinations
|
33
|
+
@dry_run = dry_run
|
34
|
+
@processes = processes
|
31
35
|
@request_shutdown = false
|
32
36
|
end
|
33
37
|
|
@@ -36,7 +40,8 @@ module Interferon
|
|
36
40
|
log.info "SIGTERM received. shutting down gracefully..."
|
37
41
|
@request_shutdown = true
|
38
42
|
end
|
39
|
-
|
43
|
+
@dry_run = dry_run
|
44
|
+
run_desc = @dry_run ? 'dry run' : 'run'
|
40
45
|
log.info "beginning alerts #{run_desc}"
|
41
46
|
|
42
47
|
alerts = read_alerts
|
@@ -45,9 +50,12 @@ module Interferon
|
|
45
50
|
|
46
51
|
@destinations.each do |dest|
|
47
52
|
dest['options'] ||= {}
|
53
|
+
if @dry_run
|
54
|
+
dest['options']['dry_run'] = true
|
55
|
+
end
|
48
56
|
end
|
49
57
|
|
50
|
-
update_alerts(@destinations, hosts, alerts, groups
|
58
|
+
update_alerts(@destinations, hosts, alerts, groups)
|
51
59
|
|
52
60
|
if @request_shutdown
|
53
61
|
log.info "interferon #{run_desc} shut down by SIGTERM"
|
@@ -133,23 +141,23 @@ module Interferon
|
|
133
141
|
return hosts
|
134
142
|
end
|
135
143
|
|
136
|
-
def update_alerts(destinations, hosts, alerts, groups
|
144
|
+
def update_alerts(destinations, hosts, alerts, groups)
|
137
145
|
loader = DestinationsLoader.new([@alerts_repo_path])
|
138
146
|
loader.get_all(destinations).each do |dest|
|
139
147
|
break if @request_shutdown
|
140
148
|
log.info "updating alerts on #{dest.class.name}"
|
141
|
-
update_alerts_on_destination(dest, hosts, alerts, groups
|
149
|
+
update_alerts_on_destination(dest, hosts, alerts, groups)
|
142
150
|
end
|
143
151
|
end
|
144
152
|
|
145
|
-
def update_alerts_on_destination(dest, hosts, alerts, groups
|
153
|
+
def update_alerts_on_destination(dest, hosts, alerts, groups)
|
146
154
|
# track some counters/stats per destination
|
147
155
|
start_time = Time.new.to_f
|
148
156
|
|
149
157
|
# get already-defined alerts
|
150
|
-
existing_alerts = dest.existing_alerts
|
158
|
+
existing_alerts = dest.existing_alerts
|
151
159
|
|
152
|
-
if dry_run
|
160
|
+
if @dry_run
|
153
161
|
do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
|
154
162
|
else
|
155
163
|
do_regular_update(dest, hosts, alerts, existing_alerts, groups)
|
@@ -159,7 +167,7 @@ module Interferon
|
|
159
167
|
# run time summary
|
160
168
|
run_time = Time.new.to_f - start_time
|
161
169
|
statsd.histogram(
|
162
|
-
dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
|
170
|
+
@dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
|
163
171
|
run_time,
|
164
172
|
:tags => ["destination:#{dest.class.name}"])
|
165
173
|
log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
|
@@ -168,46 +176,103 @@ module Interferon
|
|
168
176
|
dest.report_stats
|
169
177
|
end
|
170
178
|
|
171
|
-
if dry_run && !dest.api_errors.empty?
|
179
|
+
if @dry_run && !dest.api_errors.empty?
|
172
180
|
raise dest.api_errors.to_s
|
173
181
|
end
|
174
182
|
end
|
175
183
|
|
176
184
|
def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
|
177
|
-
|
185
|
+
# Track these to clean up dry-run alerts from previous runs
|
186
|
+
existing_dry_run_alerts = []
|
187
|
+
existing_alerts.each do |name, alert|
|
188
|
+
if name.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)
|
189
|
+
existing_dry_run_alerts << [alert['name'], [alert['id']]]
|
190
|
+
existing_alerts.delete(name)
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
178
194
|
alerts_queue = build_alerts_queue(hosts, alerts, groups)
|
179
|
-
alerts_queue.reject
|
180
|
-
|
181
|
-
|
182
|
-
|
195
|
+
updates_queue = alerts_queue.reject do |name, alert_people_pair|
|
196
|
+
!Interferon::need_update(dest, alert_people_pair, existing_alerts)
|
197
|
+
end
|
198
|
+
|
199
|
+
# Add dry-run prefix to alerts and delete id to avoid impacting real alerts
|
200
|
+
existing_alerts.keys.each do |name|
|
201
|
+
existing_alert = existing_alerts[name]
|
202
|
+
dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + name
|
203
|
+
existing_alert['name'] = dry_run_alert_name
|
204
|
+
existing_alert['id'] = [nil]
|
205
|
+
existing_alerts[dry_run_alert_name] = existing_alerts.delete(name)
|
206
|
+
end
|
207
|
+
|
208
|
+
# Build new queue with dry-run prefixes and ensure they are silenced
|
209
|
+
alerts_queue.each do |name, alert_people_pair|
|
210
|
+
alert = alert_people_pair[0]
|
211
|
+
dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + alert['name']
|
212
|
+
alert.change_name(dry_run_alert_name)
|
213
|
+
alert.silence
|
214
|
+
end
|
215
|
+
|
216
|
+
# Create alerts in destination
|
217
|
+
created_alerts = create_alerts(dest, updates_queue)
|
218
|
+
|
219
|
+
# Existing alerts are pruned until all that remains are alerts that aren't being generated anymore
|
220
|
+
to_remove = existing_alerts.dup
|
221
|
+
alerts_queue.each do |name, alert_people_pair|
|
222
|
+
alert = alert_people_pair[0]
|
223
|
+
old_alerts = to_remove[alert['name']]
|
224
|
+
|
225
|
+
if !old_alerts.nil?
|
226
|
+
if old_alerts['id'].length == 1
|
227
|
+
to_remove.delete(alert['name'])
|
228
|
+
else
|
229
|
+
old_alerts['id'] = old_alerts['id'].drop(1)
|
230
|
+
end
|
231
|
+
end
|
183
232
|
end
|
184
233
|
|
185
|
-
#
|
186
|
-
|
187
|
-
created_alerts_ids = created_alerts_key_ids.map{|a| a[1]}
|
188
|
-
to_remove_ids = to_remove.empty? ? [] : to_remove.map{|a| a['id']}
|
189
|
-
# remove existing alerts that shouldn't exist
|
190
|
-
(created_alerts_ids + to_remove_ids).each do |id|
|
234
|
+
# Clean up alerts no longer being generated
|
235
|
+
to_remove.each do |name, alert|
|
191
236
|
break if @request_shutdown
|
192
|
-
dest.
|
237
|
+
dest.remove_alert(alert)
|
238
|
+
end
|
239
|
+
|
240
|
+
# Clean up dry-run created alerts
|
241
|
+
(created_alerts + existing_dry_run_alerts).each do |alert_id_pair|
|
242
|
+
alert_ids = alert_id_pair[1]
|
243
|
+
alert_ids.each do |alert_id|
|
244
|
+
dest.remove_alert_by_id(alert_id)
|
245
|
+
end
|
193
246
|
end
|
247
|
+
|
194
248
|
end
|
195
249
|
|
196
250
|
def do_regular_update(dest, hosts, alerts, existing_alerts, groups)
|
197
|
-
existing_alerts.each{ |key, existing_alert| existing_alert['still_exists'] = false }
|
198
|
-
|
199
251
|
alerts_queue = build_alerts_queue(hosts, alerts, groups)
|
252
|
+
updates_queue = alerts_queue.reject do |name, alert_people_pair|
|
253
|
+
!Interferon::need_update(dest, alert_people_pair, existing_alerts)
|
254
|
+
end
|
200
255
|
|
201
|
-
#
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
256
|
+
# Create alerts in destination
|
257
|
+
create_alerts(dest, updates_queue)
|
258
|
+
|
259
|
+
# Existing alerts are pruned until all that remains are alerts that aren't being generated anymore
|
260
|
+
to_remove = existing_alerts.dup
|
261
|
+
alerts_queue.each do |name, alert_people_pair|
|
262
|
+
alert = alert_people_pair[0]
|
263
|
+
old_alerts = to_remove[alert['name']]
|
264
|
+
|
265
|
+
if !old_alerts.nil?
|
266
|
+
if old_alerts['id'].length == 1
|
267
|
+
to_remove.delete(alert['name'])
|
268
|
+
else
|
269
|
+
old_alerts['id'] = old_alerts['id'].drop(1)
|
270
|
+
end
|
271
|
+
end
|
206
272
|
end
|
207
273
|
|
208
|
-
#
|
209
|
-
|
210
|
-
to_delete.each do |key, alert|
|
274
|
+
# Clean up alerts no longer being generated
|
275
|
+
to_remove.each do |name, alert|
|
211
276
|
break if @request_shutdown
|
212
277
|
dest.remove_alert(alert)
|
213
278
|
end
|
@@ -237,10 +302,11 @@ module Interferon
|
|
237
302
|
end
|
238
303
|
|
239
304
|
def build_alerts_queue(hosts, alerts, groups)
|
305
|
+
alerts_queue = {}
|
240
306
|
# create or update alerts; mark when we've done that
|
241
|
-
|
242
|
-
alerts.each do |alert|
|
307
|
+
result = Parallel.map(alerts, in_processes: @processes) do |alert|
|
243
308
|
break if @request_shutdown
|
309
|
+
alerts_generated = {}
|
244
310
|
counters = {
|
245
311
|
:errors => 0,
|
246
312
|
:evals => 0,
|
@@ -268,7 +334,7 @@ module Interferon
|
|
268
334
|
|
269
335
|
counters[:applies] += 1
|
270
336
|
# don't define alerts twice
|
271
|
-
next if
|
337
|
+
next if alerts_generated.key?(alert[:name])
|
272
338
|
|
273
339
|
# figure out who to notify
|
274
340
|
people = Set.new(alert[:notify][:people])
|
@@ -277,7 +343,7 @@ module Interferon
|
|
277
343
|
end
|
278
344
|
|
279
345
|
# queue the alert up for creation; we clone the alert to save the current state
|
280
|
-
|
346
|
+
alerts_generated[alert[:name]] = [alert.clone, people]
|
281
347
|
end
|
282
348
|
|
283
349
|
# log some of the counters
|
@@ -289,7 +355,7 @@ module Interferon
|
|
289
355
|
end
|
290
356
|
|
291
357
|
# did the alert fail to evaluate on all hosts?
|
292
|
-
if counters[:errors] == counters[:hosts]
|
358
|
+
if counters[:errors] == counters[:hosts] && !last_eval_error.nil?
|
293
359
|
log.error "alert #{alert} failed to evaluate in the context of all hosts!"
|
294
360
|
log.error "last error on alert #{alert}: #{last_eval_error}"
|
295
361
|
|
@@ -306,25 +372,48 @@ module Interferon
|
|
306
372
|
else
|
307
373
|
statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
|
308
374
|
end
|
375
|
+
alerts_generated
|
376
|
+
end
|
377
|
+
|
378
|
+
result.each do |alerts_generated|
|
379
|
+
alerts_queue.merge! alerts_generated
|
309
380
|
end
|
310
381
|
alerts_queue
|
311
382
|
end
|
312
383
|
|
313
|
-
def self.
|
384
|
+
def self.need_update(dest, alert_people_pair, existing_alerts_from_api)
|
385
|
+
alert = alert_people_pair[0]
|
314
386
|
existing = existing_alerts_from_api[alert['name']]
|
315
387
|
if existing.nil?
|
316
388
|
true
|
317
389
|
else
|
318
|
-
!
|
390
|
+
!same_alerts(dest, alert_people_pair, existing)
|
319
391
|
end
|
320
392
|
end
|
321
393
|
|
322
|
-
def self.
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
394
|
+
def self.same_alerts(dest, alert_people_pair, alert_api_json)
|
395
|
+
alert, people = alert_people_pair
|
396
|
+
|
397
|
+
prev_alert = {
|
398
|
+
:query => alert_api_json['query'].strip,
|
399
|
+
:message => alert_api_json['message'].strip,
|
400
|
+
:notify_no_data => alert_api_json['notify_no_data'],
|
401
|
+
:silenced => alert_api_json['silenced'],
|
402
|
+
:timeout => alert_api_json['timeout_h'],
|
403
|
+
:no_data_timeframe => alert_api_json['no_data_timeframe']
|
404
|
+
}
|
405
|
+
|
406
|
+
new_alert = {
|
407
|
+
:query => alert['metric']['datadog_query'].strip,
|
408
|
+
:message => dest.generate_message(alert['message'], people).strip,
|
409
|
+
:notify_no_data => alert['notify_no_data'],
|
410
|
+
:silenced => alert['silenced'] || alert['silenced_until'] > Time.now,
|
411
|
+
:timeout => alert['timeout'] ? [1, alert['timeout'].to_i / 3600].max : nil,
|
412
|
+
:no_data_timeframe => alert['no_data_timeframe'] || nil
|
413
|
+
}
|
414
|
+
|
415
|
+
prev_alert == new_alert
|
328
416
|
end
|
417
|
+
|
329
418
|
end
|
330
419
|
end
|
data/spec/helpers/dsl_helper.rb
CHANGED
@@ -6,7 +6,8 @@ module Interferon
|
|
6
6
|
def get_or_set(field, val, block, default)
|
7
7
|
@hash ||= Hash.new
|
8
8
|
if val.nil?
|
9
|
-
|
9
|
+
f = @hash[field]
|
10
|
+
f.nil? ? default : f
|
10
11
|
else
|
11
12
|
@hash[field] = val
|
12
13
|
end
|
@@ -27,6 +28,14 @@ module Interferon
|
|
27
28
|
def id(v = nil, &block)
|
28
29
|
get_or_set(:@id, v, block, '')
|
29
30
|
end
|
31
|
+
|
32
|
+
def silenced(v = nil, &block)
|
33
|
+
get_or_set(:@silenced, v, block, false)
|
34
|
+
end
|
35
|
+
|
36
|
+
def silenced_until(v = nil, &block)
|
37
|
+
get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
|
38
|
+
end
|
30
39
|
end
|
31
40
|
|
32
41
|
class MockNotifyDSL < NotifyDSL
|
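data/spec/helpers/mock_alert.rb
CHANGED
(the +1 -0 hunk for this file is missing from this capture; the hunk below belongs to the next file)
data/spec/lib/interferon/group_sources/filesystem_spec.rb
ADDED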
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'interferon/group_sources/filesystem'
|
3
|
+
|
4
|
+
describe Interferon::GroupSources::Filesystem do
|
5
|
+
let (:fs_loader) { Interferon::GroupSources::Filesystem.new({'paths' => ['/tmp']}) }
|
6
|
+
|
7
|
+
describe 'list_groups' do
|
8
|
+
context "with basic groups" do
|
9
|
+
before do
|
10
|
+
group_a = double()
|
11
|
+
expect(File).to receive(:read).with('group_a.yaml').and_return('group_a_text')
|
12
|
+
expect(Psych).to receive(:parse).and_return(group_a)
|
13
|
+
expect(group_a).to receive(:to_ruby).and_return({'name' => 'group_a',
|
14
|
+
'people' => ['Alice', 'Bob']})
|
15
|
+
|
16
|
+
group_b = double()
|
17
|
+
expect(File).to receive(:read).with('group_b.yaml').and_return('group_b_text')
|
18
|
+
expect(Psych).to receive(:parse).and_return(group_b)
|
19
|
+
expect(group_b).to receive(:to_ruby).and_return({'name' => 'group_b',
|
20
|
+
'people' => ['Carol', 'Dave']})
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'loads groups defined by YAML' do
|
24
|
+
expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml'].each)
|
25
|
+
|
26
|
+
groups = fs_loader.list_groups()
|
27
|
+
expect(groups).to eq({'group_a' => ['Alice', 'Bob'], 'group_b' => ['Carol', 'Dave']})
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'allows groups to be aliased in YAML' do
|
31
|
+
expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml', 'group_c.yaml'].each)
|
32
|
+
group_c = double()
|
33
|
+
expect(File).to receive(:read).with('group_c.yaml').and_return('group_c_text')
|
34
|
+
expect(Psych).to receive(:parse).and_return(group_c)
|
35
|
+
expect(group_c).to receive(:to_ruby).and_return({'name' => 'group_c', 'alias_for' => 'group_b'})
|
36
|
+
|
37
|
+
groups = fs_loader.list_groups()
|
38
|
+
expect(groups).to eq({'group_a' => ['Alice', 'Bob'],
|
39
|
+
'group_b' => ['Carol', 'Dave'],
|
40
|
+
'group_c' => ['Carol', 'Dave']})
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'skips bad aliases in YAML' do
|
44
|
+
expect(Dir).to receive(:glob).and_return(['group_a.yaml', 'group_b.yaml', 'group_c.yaml'].each)
|
45
|
+
group_c = double()
|
46
|
+
expect(File).to receive(:read).with('group_c.yaml').and_return('group_c_text')
|
47
|
+
expect(Psych).to receive(:parse).and_return(group_c)
|
48
|
+
expect(group_c).to receive(:to_ruby).and_return({'name' => 'group_c', 'alias_for' => 'group_d'})
|
49
|
+
|
50
|
+
groups = fs_loader.list_groups()
|
51
|
+
expect(groups).to eq({'group_a' => ['Alice', 'Bob'],
|
52
|
+
'group_b' => ['Carol', 'Dave']})
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|