interferon 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +4 -0
- data/.rubocop_todo.yml +83 -0
- data/.travis.yml +4 -1
- data/bin/interferon +10 -9
- data/interferon.gemspec +18 -17
- data/lib/interferon/alert.rb +4 -10
- data/lib/interferon/alert_dsl.rb +12 -7
- data/lib/interferon/destinations/datadog.rb +103 -103
- data/lib/interferon/group_sources/filesystem.rb +5 -5
- data/lib/interferon/host_sources/aws_dynamo.rb +17 -19
- data/lib/interferon/host_sources/aws_elasticache.rb +20 -22
- data/lib/interferon/host_sources/aws_rds.rb +33 -33
- data/lib/interferon/host_sources/optica.rb +12 -10
- data/lib/interferon/host_sources/optica_services.rb +17 -15
- data/lib/interferon/host_sources/test_host_source.rb +1 -1
- data/lib/interferon/loaders.rb +4 -5
- data/lib/interferon/logging.rb +2 -3
- data/lib/interferon/version.rb +1 -1
- data/lib/interferon/work_hours_helper.rb +5 -5
- data/lib/interferon.rb +79 -80
- data/script/pre-commit +15 -20
- data/spec/fixtures/loaders/host_sources/test_host_source.rb +1 -1
- data/spec/fixtures/loaders/test_sources/order_test_source.rb +1 -1
- data/spec/fixtures/loaders/test_sources/test_source.rb +1 -1
- data/spec/fixtures/loaders2/test_sources/order_test_source.rb +1 -1
- data/spec/fixtures/loaders2/test_sources/secondary_source.rb +1 -1
- data/spec/fixtures/loaders2/test_sources/test_source.rb +1 -2
- data/spec/helpers/logging_helper.rb +2 -2
- data/spec/helpers/mock_alert.rb +1 -1
- data/spec/helpers/optica_helper.rb +70 -70
- data/spec/lib/interferon/destinations/datadog_spec.rb +58 -59
- data/spec/lib/interferon/group_sources/filesystem_spec.rb +29 -24
- data/spec/lib/interferon/host_sources/optica_services_spec.rb +11 -9
- data/spec/lib/interferon/host_sources/optica_spec.rb +6 -3
- data/spec/lib/interferon/loaders_spec.rb +19 -15
- data/spec/lib/interferon_spec.rb +61 -59
- data/spec/lib/work_hours_helper_spec.rb +15 -15
- data/spec/spec_helper.rb +1 -1
- metadata +61 -65
data/lib/interferon.rb
CHANGED
@@ -6,7 +6,7 @@ require 'interferon/loaders'
|
|
6
6
|
require 'interferon/alert'
|
7
7
|
require 'interferon/alert_dsl'
|
8
8
|
|
9
|
-
#require 'pry' #uncomment if you're debugging
|
9
|
+
# require 'pry' #uncomment if you're debugging
|
10
10
|
require 'erb'
|
11
11
|
require 'ostruct'
|
12
12
|
require 'parallel'
|
@@ -15,17 +15,16 @@ require 'yaml'
|
|
15
15
|
|
16
16
|
module Interferon
|
17
17
|
class Interferon
|
18
|
-
|
19
18
|
include Logging
|
20
19
|
attr_accessor :host_sources, :destinations, :host_info
|
21
20
|
|
22
|
-
DRY_RUN_ALERTS_NAME_PREFIX = '[-dry-run-]'
|
21
|
+
DRY_RUN_ALERTS_NAME_PREFIX = '[-dry-run-]'.freeze
|
23
22
|
|
24
23
|
# groups_sources is a hash from type => options for each group source
|
25
24
|
# host_sources is a hash from type => options for each host source
|
26
25
|
# destinations is a similar hash from type => options for each alerter
|
27
26
|
def initialize(alerts_repo_path, groups_sources, host_sources, destinations,
|
28
|
-
dry_run=false, processes=nil)
|
27
|
+
dry_run = false, processes = nil)
|
29
28
|
@alerts_repo_path = alerts_repo_path
|
30
29
|
@groups_sources = groups_sources
|
31
30
|
@host_sources = host_sources
|
@@ -36,8 +35,8 @@ module Interferon
|
|
36
35
|
end
|
37
36
|
|
38
37
|
def run(dry_run = false)
|
39
|
-
Signal.trap(
|
40
|
-
log.info
|
38
|
+
Signal.trap('TERM') do
|
39
|
+
log.info 'SIGTERM received. shutting down gracefully...'
|
41
40
|
@request_shutdown = true
|
42
41
|
end
|
43
42
|
@dry_run = dry_run
|
@@ -50,9 +49,7 @@ module Interferon
|
|
50
49
|
|
51
50
|
@destinations.each do |dest|
|
52
51
|
dest['options'] ||= {}
|
53
|
-
if @dry_run
|
54
|
-
dest['options']['dry_run'] = true
|
55
|
-
end
|
52
|
+
dest['options']['dry_run'] = true if @dry_run
|
56
53
|
end
|
57
54
|
|
58
55
|
update_alerts(@destinations, hosts, alerts, groups)
|
@@ -71,7 +68,7 @@ module Interferon
|
|
71
68
|
# validate that alerts path exists
|
72
69
|
path = File.expand_path(File.join(@alerts_repo_path, 'alerts'))
|
73
70
|
abort("no such directory #{path} for reading alert files") \
|
74
|
-
unless Dir.
|
71
|
+
unless Dir.exist?(path)
|
75
72
|
|
76
73
|
Dir.glob(File.join(path, '*.rb')) do |alert_file|
|
77
74
|
break if @request_shutdown
|
@@ -91,7 +88,7 @@ module Interferon
|
|
91
88
|
statsd.gauge('alerts.read.failed', failed)
|
92
89
|
|
93
90
|
abort("failed to read #{failed} alerts") if failed > 0
|
94
|
-
|
91
|
+
alerts
|
95
92
|
end
|
96
93
|
|
97
94
|
def read_groups(sources)
|
@@ -99,7 +96,7 @@ module Interferon
|
|
99
96
|
loader = GroupSourcesLoader.new([@alerts_repo_path])
|
100
97
|
loader.get_all(sources).each do |source|
|
101
98
|
break if @request_shutdown
|
102
|
-
source_groups = source.list_groups
|
99
|
+
source_groups = source.list_groups { groups }
|
103
100
|
|
104
101
|
# add all people to groups
|
105
102
|
people_count = 0
|
@@ -109,16 +106,18 @@ module Interferon
|
|
109
106
|
people_count += people.count
|
110
107
|
end
|
111
108
|
|
112
|
-
log.info "read #{people_count} people in #{source_groups.count} groups
|
109
|
+
log.info "read #{people_count} people in #{source_groups.count} groups" \
|
110
|
+
"from source #{source.class.name}"
|
113
111
|
end
|
114
112
|
|
115
|
-
log.info "total of #{groups.values.flatten.count} people in #{groups.count} groups
|
113
|
+
log.info "total of #{groups.values.flatten.count} people in #{groups.count} groups" \
|
114
|
+
"from #{sources.count} sources"
|
116
115
|
|
117
116
|
statsd.gauge('groups.sources', sources.count)
|
118
117
|
statsd.gauge('groups.count', groups.count)
|
119
118
|
statsd.gauge('groups.people', groups.values.flatten.count)
|
120
119
|
|
121
|
-
|
120
|
+
groups
|
122
121
|
end
|
123
122
|
|
124
123
|
def read_hosts(sources)
|
@@ -131,14 +130,14 @@ module Interferon
|
|
131
130
|
source_hosts = source.list_hosts
|
132
131
|
hosts << source_hosts
|
133
132
|
|
134
|
-
statsd.gauge('hosts.count', source_hosts.count, :
|
133
|
+
statsd.gauge('hosts.count', source_hosts.count, tags: ["source:#{source.class.name}"])
|
135
134
|
log.info "read #{source_hosts.count} hosts from source #{source.class.name}"
|
136
135
|
end
|
137
136
|
|
138
137
|
hosts.flatten!
|
139
138
|
log.info "total of #{hosts.count} entities from #{sources.count} sources"
|
140
139
|
|
141
|
-
|
140
|
+
hosts
|
142
141
|
end
|
143
142
|
|
144
143
|
def update_alerts(destinations, hosts, alerts, groups)
|
@@ -169,16 +168,15 @@ module Interferon
|
|
169
168
|
statsd.histogram(
|
170
169
|
@dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
|
171
170
|
run_time,
|
172
|
-
:
|
173
|
-
|
171
|
+
tags: ["destination:#{dest.class.name}"]
|
172
|
+
)
|
173
|
+
log.info "#{dest.class.name} : run completed in %.2f seconds" % run_time
|
174
174
|
|
175
175
|
# report destination stats
|
176
176
|
dest.report_stats
|
177
177
|
end
|
178
178
|
|
179
|
-
if @dry_run && !dest.api_errors.empty?
|
180
|
-
raise dest.api_errors.to_s
|
181
|
-
end
|
179
|
+
raise dest.api_errors.to_s if @dry_run && !dest.api_errors.empty?
|
182
180
|
end
|
183
181
|
|
184
182
|
def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
|
@@ -192,8 +190,8 @@ module Interferon
|
|
192
190
|
end
|
193
191
|
|
194
192
|
alerts_queue = build_alerts_queue(hosts, alerts, groups)
|
195
|
-
updates_queue = alerts_queue.reject do |
|
196
|
-
!Interferon
|
193
|
+
updates_queue = alerts_queue.reject do |_name, alert_people_pair|
|
194
|
+
!Interferon.need_update(dest, alert_people_pair, existing_alerts)
|
197
195
|
end
|
198
196
|
|
199
197
|
# Add dry-run prefix to alerts and delete id to avoid impacting real alerts
|
@@ -206,7 +204,7 @@ module Interferon
|
|
206
204
|
end
|
207
205
|
|
208
206
|
# Build new queue with dry-run prefixes and ensure they are silenced
|
209
|
-
alerts_queue.each do |
|
207
|
+
alerts_queue.each do |_name, alert_people_pair|
|
210
208
|
alert = alert_people_pair[0]
|
211
209
|
dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + alert['name']
|
212
210
|
alert.change_name(dry_run_alert_name)
|
@@ -216,23 +214,23 @@ module Interferon
|
|
216
214
|
# Create alerts in destination
|
217
215
|
created_alerts = create_alerts(dest, updates_queue)
|
218
216
|
|
219
|
-
# Existing alerts are pruned until all that remains are
|
217
|
+
# Existing alerts are pruned until all that remains are
|
218
|
+
# alerts that aren't being generated anymore
|
220
219
|
to_remove = existing_alerts.dup
|
221
|
-
alerts_queue.each do |
|
220
|
+
alerts_queue.each do |_name, alert_people_pair|
|
222
221
|
alert = alert_people_pair[0]
|
223
222
|
old_alerts = to_remove[alert['name']]
|
224
223
|
|
225
|
-
if
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
end
|
224
|
+
next if old_alerts.nil?
|
225
|
+
if old_alerts['id'].length == 1
|
226
|
+
to_remove.delete(alert['name'])
|
227
|
+
else
|
228
|
+
old_alerts['id'] = old_alerts['id'].drop(1)
|
231
229
|
end
|
232
230
|
end
|
233
231
|
|
234
232
|
# Clean up alerts no longer being generated
|
235
|
-
to_remove.each do |
|
233
|
+
to_remove.each do |_name, alert|
|
236
234
|
break if @request_shutdown
|
237
235
|
dest.remove_alert(alert)
|
238
236
|
end
|
@@ -244,35 +242,34 @@ module Interferon
|
|
244
242
|
dest.remove_alert_by_id(alert_id)
|
245
243
|
end
|
246
244
|
end
|
247
|
-
|
248
245
|
end
|
249
246
|
|
250
247
|
def do_regular_update(dest, hosts, alerts, existing_alerts, groups)
|
251
248
|
alerts_queue = build_alerts_queue(hosts, alerts, groups)
|
252
|
-
updates_queue = alerts_queue.reject do |
|
253
|
-
!Interferon
|
249
|
+
updates_queue = alerts_queue.reject do |_name, alert_people_pair|
|
250
|
+
!Interferon.need_update(dest, alert_people_pair, existing_alerts)
|
254
251
|
end
|
255
252
|
|
256
253
|
# Create alerts in destination
|
257
254
|
create_alerts(dest, updates_queue)
|
258
255
|
|
259
|
-
# Existing alerts are pruned until all that remains are
|
256
|
+
# Existing alerts are pruned until all that remains are
|
257
|
+
# alerts that aren't being generated anymore
|
260
258
|
to_remove = existing_alerts.dup
|
261
|
-
alerts_queue.each do |
|
259
|
+
alerts_queue.each do |_name, alert_people_pair|
|
262
260
|
alert = alert_people_pair[0]
|
263
261
|
old_alerts = to_remove[alert['name']]
|
264
262
|
|
265
|
-
if
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
end
|
263
|
+
next if old_alerts.nil?
|
264
|
+
if old_alerts['id'].length == 1
|
265
|
+
to_remove.delete(alert['name'])
|
266
|
+
else
|
267
|
+
old_alerts['id'] = old_alerts['id'].drop(1)
|
271
268
|
end
|
272
269
|
end
|
273
270
|
|
274
271
|
# Clean up alerts no longer being generated
|
275
|
-
to_remove.each do |
|
272
|
+
to_remove.each do |_name, alert|
|
276
273
|
break if @request_shutdown
|
277
274
|
dest.remove_alert(alert)
|
278
275
|
end
|
@@ -283,10 +280,10 @@ module Interferon
|
|
283
280
|
alerts_to_create = alerts_queue.keys
|
284
281
|
concurrency = dest.concurrency || 10
|
285
282
|
unless @request_shutdown
|
286
|
-
threads = concurrency
|
283
|
+
threads = Array.new(concurrency) do |i|
|
287
284
|
log.info "thread #{i} created"
|
288
285
|
t = Thread.new do
|
289
|
-
while name = alerts_to_create.shift
|
286
|
+
while (name = alerts_to_create.shift)
|
290
287
|
break if @request_shutdown
|
291
288
|
cur_alert, people = alerts_queue[name]
|
292
289
|
log.debug "creating alert for #{cur_alert[:name]}"
|
@@ -308,10 +305,10 @@ module Interferon
|
|
308
305
|
break if @request_shutdown
|
309
306
|
alerts_generated = {}
|
310
307
|
counters = {
|
311
|
-
:
|
312
|
-
:
|
313
|
-
:
|
314
|
-
:
|
308
|
+
errors: 0,
|
309
|
+
evals: 0,
|
310
|
+
applies: 0,
|
311
|
+
hosts: hosts.length,
|
315
312
|
}
|
316
313
|
last_eval_error = nil
|
317
314
|
|
@@ -347,8 +344,8 @@ module Interferon
|
|
347
344
|
end
|
348
345
|
|
349
346
|
# log some of the counters
|
350
|
-
statsd.gauge('alerts.evaluate.errors', counters[:errors], :
|
351
|
-
statsd.gauge('alerts.evaluate.applies', counters[:applies], :
|
347
|
+
statsd.gauge('alerts.evaluate.errors', counters[:errors], tags: ["alert:#{alert}"])
|
348
|
+
statsd.gauge('alerts.evaluate.applies', counters[:applies], tags: ["alert:#{alert}"])
|
352
349
|
|
353
350
|
if counters[:applies] > 0
|
354
351
|
log.info "alert #{alert} applies to #{counters[:applies]} of #{counters[:hosts]} hosts"
|
@@ -359,18 +356,19 @@ module Interferon
|
|
359
356
|
log.error "alert #{alert} failed to evaluate in the context of all hosts!"
|
360
357
|
log.error "last error on alert #{alert}: #{last_eval_error}"
|
361
358
|
|
362
|
-
statsd.gauge('alerts.evaluate.failed_on_all', 1, :
|
363
|
-
log.debug "alert #{alert}:
|
359
|
+
statsd.gauge('alerts.evaluate.failed_on_all', 1, tags: ["alert:#{alert}"])
|
360
|
+
log.debug "alert #{alert}: " \
|
361
|
+
"error #{last_eval_error}\n#{last_eval_error.backtrace.join("\n")}"
|
364
362
|
else
|
365
|
-
statsd.gauge('alerts.evaluate.failed_on_all', 0, :
|
363
|
+
statsd.gauge('alerts.evaluate.failed_on_all', 0, tags: ["alert:#{alert}"])
|
366
364
|
end
|
367
365
|
|
368
366
|
# did the alert apply to any hosts?
|
369
367
|
if counters[:applies] == 0
|
370
|
-
statsd.gauge('alerts.evaluate.never_applies', 1, :
|
368
|
+
statsd.gauge('alerts.evaluate.never_applies', 1, tags: ["alert:#{alert}"])
|
371
369
|
log.warn "alert #{alert} did not apply to any hosts"
|
372
370
|
else
|
373
|
-
statsd.gauge('alerts.evaluate.never_applies', 0, :
|
371
|
+
statsd.gauge('alerts.evaluate.never_applies', 0, tags: ["alert:#{alert}"])
|
374
372
|
end
|
375
373
|
alerts_generated
|
376
374
|
end
|
@@ -403,36 +401,37 @@ module Interferon
|
|
403
401
|
alert, people = alert_people_pair
|
404
402
|
|
405
403
|
prev_alert = {
|
406
|
-
:
|
407
|
-
:
|
408
|
-
:
|
409
|
-
:
|
410
|
-
:
|
411
|
-
:
|
412
|
-
:
|
413
|
-
:
|
414
|
-
:
|
404
|
+
monitor_type: normalize_monitor_type(alert_api_json['type']),
|
405
|
+
query: alert_api_json['query'].strip,
|
406
|
+
message: alert_api_json['message'].strip,
|
407
|
+
evaluation_delay: alert_api_json['options']['evaluation_delay'],
|
408
|
+
notify_no_data: alert_api_json['options']['notify_no_data'],
|
409
|
+
notify_audit: alert_api_json['options']['notify_audit'],
|
410
|
+
no_data_timeframe: alert_api_json['options']['no_data_timeframe'],
|
411
|
+
silenced: alert_api_json['options']['silenced'],
|
412
|
+
thresholds: alert_api_json['options']['thresholds'],
|
413
|
+
timeout_h: alert_api_json['options']['timeout_h'],
|
415
414
|
}
|
416
415
|
|
417
416
|
new_alert = {
|
418
|
-
:
|
419
|
-
:
|
420
|
-
:
|
421
|
-
:
|
422
|
-
:
|
423
|
-
:
|
424
|
-
:
|
425
|
-
:
|
426
|
-
:
|
417
|
+
monitor_type: normalize_monitor_type(alert['monitor_type']),
|
418
|
+
query: alert['metric']['datadog_query'],
|
419
|
+
message: dest.generate_message(alert['message'], people).strip,
|
420
|
+
evaluation_delay: alert['evaluation_delay'],
|
421
|
+
notify_no_data: alert['notify_no_data'],
|
422
|
+
notify_audit: alert['notify']['audit'],
|
423
|
+
no_data_timeframe: alert['no_data_timeframe'],
|
424
|
+
silenced: alert['silenced'],
|
425
|
+
thresholds: alert['thresholds'],
|
426
|
+
timeout_h: alert['timeout_h'],
|
427
427
|
}
|
428
428
|
|
429
|
-
|
430
|
-
|
429
|
+
unless alert['require_full_window'].nil?
|
430
|
+
prev_alert[:require_full_window] = alert_api_json['options']['require_full_window']
|
431
431
|
new_alert[:require_full_window] = alert['require_full_window']
|
432
432
|
end
|
433
433
|
|
434
434
|
prev_alert == new_alert
|
435
435
|
end
|
436
|
-
|
437
436
|
end
|
438
437
|
end
|
data/script/pre-commit
CHANGED
@@ -11,22 +11,20 @@ reasons = []
|
|
11
11
|
diff = `git diff-index --name-status --cached HEAD`
|
12
12
|
files = diff.split("\n").map(&:split)
|
13
13
|
|
14
|
-
added_files = files
|
15
|
-
|
16
|
-
|
17
|
-
|
14
|
+
added_files = files
|
15
|
+
.select { |(status, _name)| status == 'A' }
|
16
|
+
.map { |(_status, name)| name }
|
17
|
+
.compact
|
18
18
|
|
19
|
-
modified_files = files
|
20
|
-
|
21
|
-
|
22
|
-
|
19
|
+
modified_files = files
|
20
|
+
.select { |(status, _name)| status != 'D' } # ignore deleted files
|
21
|
+
.map { |(_status, name)| name }
|
22
|
+
.compact
|
23
23
|
|
24
24
|
# check for large files
|
25
25
|
added_files.each do |file|
|
26
26
|
size_in_kb = `du -k #{file}`.strip.split.first.to_i
|
27
|
-
if size_in_kb > 1024
|
28
|
-
reasons << "#{file} is greater than 1 MB in size"
|
29
|
-
end
|
27
|
+
reasons << "#{file} is greater than 1 MB in size" if size_in_kb > 1024
|
30
28
|
end
|
31
29
|
|
32
30
|
# Make sure Gemfile.lock was updated if Gemfile changed.
|
@@ -35,25 +33,22 @@ if modified_files.include?('Gemfile') && !modified_files.include?('Gemfile.lock'
|
|
35
33
|
end
|
36
34
|
|
37
35
|
# Check Ruby syntax
|
38
|
-
modified_files.select {|f| f.match
|
39
|
-
|
40
|
-
|
41
|
-
if $? != 0
|
42
|
-
reasons << "ruby file #{file} failed syntax check"
|
43
|
-
end
|
36
|
+
modified_files.select { |f| f.match(/\.((rb)|(rake))$/) }.each do |file|
|
37
|
+
`ruby -c #{file} 2>&1`
|
38
|
+
reasons << "ruby file #{file} failed syntax check" if $CHILD_STATUS != 0
|
44
39
|
end
|
45
40
|
|
46
41
|
# Check JSON syntax
|
47
|
-
modified_files.select {|f| f.match
|
42
|
+
modified_files.select { |f| f.match(/(\.json)$/) }.each do |file|
|
48
43
|
begin
|
49
44
|
JSON.parse(File.read(file))
|
50
45
|
rescue StandardError => e
|
51
|
-
reasons << "JSON file #{file} contains invalid JSON"
|
46
|
+
reasons << "JSON file #{file} contains invalid JSON: #{e}"
|
52
47
|
end
|
53
48
|
end
|
54
49
|
|
55
50
|
# Check YAML syntax
|
56
|
-
modified_files.select {|f| f.match
|
51
|
+
modified_files.select { |f| f.match(/(\.yaml)$/) }.each do |file|
|
57
52
|
begin
|
58
53
|
YAML.parse(File.read(file))
|
59
54
|
rescue YAML::SyntaxError => e
|