interferon 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +11 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +52 -0
- data/LICENSE +21 -0
- data/README.md +96 -0
- data/bin/interferon +66 -0
- data/config.example.yaml +37 -0
- data/groups/data.yaml +11 -0
- data/groups/dataeng.yaml +4 -0
- data/groups/datainfra.yaml +10 -0
- data/groups/devhap.yaml +6 -0
- data/groups/discover.yaml +13 -0
- data/groups/growth.yaml +17 -0
- data/groups/host.yaml +12 -0
- data/groups/internalproducts.yml +13 -0
- data/groups/logstash.yaml +4 -0
- data/groups/mobile.yaml +17 -0
- data/groups/pagerduty_sysops.yaml +5 -0
- data/groups/panda.yaml +10 -0
- data/groups/payments.yaml +16 -0
- data/groups/payments_finance.yaml +8 -0
- data/groups/prodinfra.yaml +15 -0
- data/groups/search.yaml +10 -0
- data/groups/security.yaml +8 -0
- data/groups/sre.yaml +16 -0
- data/groups/teamx.yaml +8 -0
- data/groups/tns.yaml +14 -0
- data/groups/tools.yml +11 -0
- data/interferon.gemspec +26 -0
- data/lib/interferon.rb +241 -0
- data/lib/interferon/alert.rb +33 -0
- data/lib/interferon/alert_dsl.rb +94 -0
- data/lib/interferon/destinations/datadog.rb +169 -0
- data/lib/interferon/group_sources/filesystem.rb +38 -0
- data/lib/interferon/host_sources/aws_dynamo.rb +51 -0
- data/lib/interferon/host_sources/aws_elasticache.rb +69 -0
- data/lib/interferon/host_sources/aws_rds.rb +92 -0
- data/lib/interferon/host_sources/optica.rb +35 -0
- data/lib/interferon/host_sources/optica_services.rb +68 -0
- data/lib/interferon/loaders.rb +123 -0
- data/lib/interferon/logging.rb +26 -0
- data/lib/interferon/version.rb +3 -0
- data/script/convert.rb +29 -0
- data/script/pre-commit +73 -0
- data/spec/spec_helper.rb +62 -0
- metadata +179 -0
data/groups/panda.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
---
|
2
|
+
name: payments
|
3
|
+
people:
|
4
|
+
- allen.kerr@airbnb.com
|
5
|
+
- ian@airbnb.com
|
6
|
+
- john.terenzio@airbnb.com
|
7
|
+
- joseph.sofaer@airbnb.com
|
8
|
+
- josh.lee@airbnb.com
|
9
|
+
- juliusz.gonera@airbnb.com
|
10
|
+
- karen.kim@airbnb.com
|
11
|
+
- kevin.sun@airbnb.com
|
12
|
+
- lou.kosak@airbnb.com
|
13
|
+
- michel.weksler@airbnb.com
|
14
|
+
- mike.lewis@airbnb.com
|
15
|
+
- tao.cui@airbnb.com
|
16
|
+
- varun.pai@airbnb.com
|
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
name: prodinfra
|
3
|
+
people:
|
4
|
+
- alexis.midon@airbnb.com
|
5
|
+
- ben.hughes@airbnb.com
|
6
|
+
- jon.tai@airbnb.com
|
7
|
+
- joseph.sofaer@airbnb.com
|
8
|
+
- kai.liu@airbnb.com
|
9
|
+
- kevin.rice@airbnb.com
|
10
|
+
- nija.mashruwala@airbnb.com
|
11
|
+
- philip.snowberger@airbnb.com
|
12
|
+
- rahul.iyer@airbnb.com
|
13
|
+
- sonic.wang@airbnb.com
|
14
|
+
- willie.yao@airbnb.com
|
15
|
+
|
data/groups/search.yaml
ADDED
data/groups/sre.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
---
|
2
|
+
# deprecated; use prodinfra
|
3
|
+
name: sre
|
4
|
+
people:
|
5
|
+
- alexis.midon@airbnb.com
|
6
|
+
- ben.hughes@airbnb.com
|
7
|
+
- jon.tai@airbnb.com
|
8
|
+
- joseph.sofaer@airbnb.com
|
9
|
+
- kai.liu@airbnb.com
|
10
|
+
- kevin.rice@airbnb.com
|
11
|
+
- nija.mashruwala@airbnb.com
|
12
|
+
- philip.snowberger@airbnb.com
|
13
|
+
- rahul.iyer@airbnb.com
|
14
|
+
- sonic.wang@airbnb.com
|
15
|
+
- willie.yao@airbnb.com
|
16
|
+
|
data/groups/teamx.yaml
ADDED
data/groups/tns.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
---
|
2
|
+
name: tns
|
3
|
+
people:
|
4
|
+
- alok.gupta@airbnb.com
|
5
|
+
- anish.muppalaneni@airbnb.com
|
6
|
+
- dmitry.alexeenko@airbnb.com
|
7
|
+
- eric.levine@airbnb.com
|
8
|
+
- freddy.chen@airbnb.com
|
9
|
+
- jerry.luan@airbnb.com
|
10
|
+
- justin.chen@airbnb.com
|
11
|
+
- sean.albito@airbnb.com
|
12
|
+
- siddharth.kar@airbnb.com
|
13
|
+
- tony.wen@airbnb.com
|
14
|
+
- lee.zhang@airbnb.com
|
data/groups/tools.yml
ADDED
data/interferon.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for interferon.
#
# The version is read from lib/interferon/version.rb, so lib/ is put on the
# load path before that file is required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'interferon/version'

Gem::Specification.new do |gem|
  gem.name          = "interferon"
  gem.version       = Interferon::VERSION
  gem.authors       = ["Igor Serebryany"]
  gem.email         = ["igor.serebryany@airbnb.com"]
  # The original text began with a stray ": " (apparently a leftover from a
  # template placeholder); it is dropped so the published text reads cleanly.
  gem.description   = %q{Store metrics alerts in code!}
  gem.summary       = %q{Store metrics alerts in code!}
  gem.homepage      = "https://www.github.com/airbnb/interferon"
  gem.licenses      = ['MIT']

  # The packaged file list is whatever git tracks at build time; executables
  # and test files are derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})

  gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
  gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
  gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"

  gem.add_development_dependency "rspec",  "~> 3.2"
  gem.add_development_dependency "pry",  "~> 0.10"
end
|
data/lib/interferon.rb
ADDED
@@ -0,0 +1,241 @@
|
|
1
|
+
require 'interferon/version'
|
2
|
+
require 'interferon/logging'
|
3
|
+
|
4
|
+
require 'interferon/loaders'
|
5
|
+
|
6
|
+
require 'interferon/alert'
|
7
|
+
require 'interferon/alert_dsl'
|
8
|
+
|
9
|
+
#require 'pry' #uncomment if you're debugging
|
10
|
+
require 'erb'
|
11
|
+
require 'ostruct'
|
12
|
+
require 'set'
|
13
|
+
require 'yaml'
|
14
|
+
|
15
|
+
module Interferon
|
16
|
+
class Interferon
|
17
|
+
|
18
|
+
include Logging
|
19
|
+
attr_accessor :host_sources, :destinations, :host_info
|
20
|
+
|
21
|
+
# groups_sources is a hash from type => options for each group source
|
22
|
+
# host_sources is a hash from type => options for each host source
|
23
|
+
# destinations is a similar hash from type => options for each alerter
|
24
|
+
# Records the configuration for later use by #run; nothing is loaded here.
#
# alerts_repo_path - base path; the 'alerts' directory is looked up beneath
#                    it and it is handed to each loader
# groups_sources   - group source configuration, passed to read_groups
# host_sources     - host source configuration, passed to read_hosts
# destinations     - destination configuration, passed to update_alerts
def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
  @alerts_repo_path, @groups_sources, @host_sources =
    alerts_repo_path, groups_sources, host_sources
  @destinations = destinations
end
|
30
|
+
|
31
|
+
# Performs one full pass: reads alerts, groups and hosts, flags every
# destination config with the dry-run setting, then pushes the alerts out.
#
# dry_run - stored under dest['options']['dry_run'] for each destination
def run(dry_run = false)
  run_desc = if dry_run then 'dry run' else 'run' end
  log.info "beginning alerts #{run_desc}"

  alerts = read_alerts
  groups = read_groups(@groups_sources)
  hosts  = read_hosts(@host_sources)

  # every destination needs to know whether this is a dry run
  @destinations.each do |dest|
    options = (dest['options'] ||= {})
    options['dry_run'] = dry_run
  end

  update_alerts(@destinations, hosts, alerts, groups)

  log.info "interferon #{run_desc} complete"
end
|
49
|
+
|
50
|
+
# Loads every '*.rb' file directly under <alerts_repo_path>/alerts as an Alert.
#
# Reports the number of loaded and failed files through statsd gauges.
# Aborts the process if the directory does not exist, or (after all files
# were attempted) if any file raised a StandardError while loading.
#
# Returns the Array of Alert objects that loaded successfully.
def read_alerts
  # validate that alerts path exists
  path = File.expand_path(File.join(@alerts_repo_path, 'alerts'))
  # Dir.exist? instead of Dir.exists?: the latter was deprecated and no
  # longer exists as of Ruby 3.2.
  abort("no such directory #{path} for reading alert files") unless Dir.exist?(path)

  alerts = []
  failed = 0

  Dir.glob(File.join(path, '*.rb')) do |alert_file|
    begin
      alert = Alert.new(alert_file)
    rescue StandardError => e
      # keep going so that every broken file is reported in a single run
      log.warn "error reading alert file #{alert_file}: #{e}"
      failed += 1
    else
      alerts << alert
    end
  end

  log.info "read #{alerts.count} alerts files from #{path}"

  statsd.gauge('alerts.read.count', alerts.count)
  statsd.gauge('alerts.read.failed', failed)

  abort("failed to read #{failed} alerts") if failed > 0
  alerts
end
|
78
|
+
|
79
|
+
# Collects group membership from every configured group source.
#
# Each source's list_groups result is iterated as (name, people) pairs.
# People for the same group name coming from several sources are
# concatenated in source order; duplicates are not removed.
#
# Returns a Hash of group name => Array of people.
def read_groups(sources)
  loader = GroupSourcesLoader.new([@alerts_repo_path])
  groups = {}

  loader.get_all(sources).each do |source|
    source_groups = source.list_groups

    # merge this source's people into the combined groups, counting as we go
    people_count = source_groups.inject(0) do |running_total, (name, people)|
      (groups[name] ||= []).concat(people)
      running_total + people.count
    end

    log.info "read #{people_count} people in #{source_groups.count} groups from source #{source.class.name}"
  end

  log.info "total of #{groups.values.flatten.count} people in #{groups.count} groups from #{sources.count} sources"

  statsd.gauge('groups.sources', sources.count)
  statsd.gauge('groups.count', groups.count)
  statsd.gauge('groups.people', groups.values.flatten.count)

  groups
end
|
104
|
+
|
105
|
+
# Gathers hosts from every configured host source.
#
# Emits a per-source 'hosts.count' gauge tagged with the source class name.
# The per-source results are flattened into one list.
#
# Returns the combined Array of host entries.
def read_hosts(sources)
  statsd.gauge('hosts.sources', sources.count)

  loader = HostSourcesLoader.new([@alerts_repo_path])
  collected = []

  loader.get_all(sources).each do |source|
    found = source.list_hosts
    collected.push(found)

    statsd.gauge('hosts.count', found.count, :tags => ["source:#{source.class.name}"])
    log.info "read #{found.count} hosts from source #{source.class.name}"
  end

  # each source contributed its own list; collapse them into one
  collected.flatten!
  log.info "total of #{collected.count} entities from #{sources.count} sources"

  collected
end
|
123
|
+
|
124
|
+
def update_alerts(destinations, hosts, alerts, groups)
|
125
|
+
loader = DestinationsLoader.new([@alerts_repo_path])
|
126
|
+
loader.get_all(destinations).each do |dest|
|
127
|
+
log.info "updating alerts on #{dest.class.name}"
|
128
|
+
|
129
|
+
# track some counters/stats per destination
|
130
|
+
start_time = Time.new.to_f
|
131
|
+
|
132
|
+
# get already-defined alerts
|
133
|
+
existing_alerts = dest.existing_alerts.dup
|
134
|
+
existing_alerts.each{ |key, existing_alert| existing_alert['still_exists'] = false }
|
135
|
+
|
136
|
+
# create or update alerts; mark when we've done that
|
137
|
+
alerts_queue = Hash.new
|
138
|
+
alerts.each do |alert|
|
139
|
+
counters = {
|
140
|
+
:errors => 0,
|
141
|
+
:evals => 0,
|
142
|
+
:applies => 0,
|
143
|
+
:hosts => hosts.length
|
144
|
+
}
|
145
|
+
last_eval_error = nil
|
146
|
+
|
147
|
+
hosts.each do |hostinfo|
|
148
|
+
begin
|
149
|
+
alert.evaluate(hostinfo)
|
150
|
+
counters[:evals] += 1
|
151
|
+
rescue StandardError => e
|
152
|
+
log.debug "Evaluation of alert #{alert} failed in the context of host #{hostinfo}"
|
153
|
+
counters[:errors] += 1
|
154
|
+
last_eval_error = e
|
155
|
+
next
|
156
|
+
end
|
157
|
+
|
158
|
+
# don't define an alert that doesn't apply to this hostinfo
|
159
|
+
unless alert[:applies]
|
160
|
+
log.debug "alert #{alert[:name]} doesn't apply to #{hostinfo.inspect}"
|
161
|
+
next
|
162
|
+
end
|
163
|
+
|
164
|
+
counters[:applies] += 1
|
165
|
+
|
166
|
+
# don't define alerts twice
|
167
|
+
next if alerts_queue.key?(alert[:name])
|
168
|
+
|
169
|
+
# figure out who to notify
|
170
|
+
people = Set.new(alert[:notify][:people])
|
171
|
+
alert[:notify][:groups].each do |g|
|
172
|
+
people += (groups[g] || [])
|
173
|
+
end
|
174
|
+
|
175
|
+
# queue the alert up for creation; we clone the alert to save the current state
|
176
|
+
alerts_queue[alert[:name]] ||= [alert.clone, people]
|
177
|
+
end
|
178
|
+
|
179
|
+
# log some of the counters
|
180
|
+
statsd.gauge('alerts.evaluate.errors', counters[:errors], :tags => ["alert:#{alert}"])
|
181
|
+
statsd.gauge('alerts.evaluate.applies', counters[:applies], :tags => ["alert:#{alert}"])
|
182
|
+
|
183
|
+
if counters[:applies] > 0
|
184
|
+
log.info "alert #{alert} applies to #{counters[:applies]} of #{counters[:hosts]} hosts"
|
185
|
+
end
|
186
|
+
|
187
|
+
# did the alert fail to evaluate on all hosts?
|
188
|
+
if counters[:errors] == counters[:hosts]
|
189
|
+
log.error "alert #{alert} failed to evaluate in the context of all hosts!"
|
190
|
+
log.error "last error on alert #{alert}: #{last_eval_error}"
|
191
|
+
|
192
|
+
statsd.gauge('alerts.evaluate.failed_on_all', 1, :tags => ["alert:#{alert}"])
|
193
|
+
log.debug "alert #{alert}: error #{last_eval_error}\n#{last_eval_error.backtrace.join("\n")}"
|
194
|
+
else
|
195
|
+
statsd.gauge('alerts.evaluate.failed_on_all', 0, :tags => ["alert:#{alert}"])
|
196
|
+
end
|
197
|
+
|
198
|
+
# did the alert apply to any hosts?
|
199
|
+
if counters[:applies] == 0
|
200
|
+
statsd.gauge('alerts.evaluate.never_applies', 1, :tags => ["alert:#{alert}"])
|
201
|
+
log.warn "alert #{alert} did not apply to any hosts"
|
202
|
+
else
|
203
|
+
statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
# flush queue
|
208
|
+
alerts_to_create = alerts_queue.keys
|
209
|
+
concurrency = dest.concurrency || 10
|
210
|
+
threads = concurrency.times.map do
|
211
|
+
t = Thread.new do
|
212
|
+
while name = alerts_to_create.shift
|
213
|
+
cur_alert, people = alerts_queue[name]
|
214
|
+
|
215
|
+
log.debug "creating alert for #{cur_alert[:name]}"
|
216
|
+
alert_key = dest.create_alert(cur_alert, people)
|
217
|
+
|
218
|
+
# don't delete alerts we still have defined
|
219
|
+
existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
|
220
|
+
end
|
221
|
+
end
|
222
|
+
t.abort_on_exception = true
|
223
|
+
t
|
224
|
+
end
|
225
|
+
threads.map(&:join)
|
226
|
+
|
227
|
+
# remove existing alerts that shouldn't exist
|
228
|
+
to_delete = existing_alerts.reject{ |key, existing_alert| existing_alert['still_exists'] }
|
229
|
+
to_delete.each{ |key, alert| dest.remove_alert(alert) }
|
230
|
+
|
231
|
+
# run time summary
|
232
|
+
run_time = Time.new.to_f - start_time
|
233
|
+
statsd.histogram('destinations.run_time', run_time, :tags => ["destination:#{dest.class.name}"])
|
234
|
+
log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
|
235
|
+
|
236
|
+
# report destination stats
|
237
|
+
dest.report_stats
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Interferon
  # One alert definition file.
  #
  # The file's text is read once at construction. Each call to #evaluate
  # runs that text against a fresh AlertDSL for the given host, and the
  # result of the most recent successful evaluation is readable through #[].
  class Alert
    # path - path of the alert file; read immediately, so a missing or
    #        unreadable file raises from File.read here
    def initialize(path)
      @path = path
      @filename = File.basename(path)

      @text = File.read(@path)

      @dsl = nil
    end

    # The alert is identified by its file name in log output.
    def to_s
      @filename
    end

    # Runs the alert file's text inside a new AlertDSL built for hostinfo.
    #
    # NOTE(review): the file content is executed as Ruby via instance_eval,
    # so alert files must come from a trusted location.
    #
    # @dsl is only replaced after the evaluation finished without raising;
    # if it raises, the previous evaluation (if any) stays in place.
    #
    # Returns self, so the DSL object itself is not handed out.
    def evaluate(hostinfo)
      dsl = AlertDSL.new(hostinfo)
      dsl.instance_eval(@text, @filename, 1)
      @dsl = dsl

      # return the alert and not the DSL object, which is private
      self
    end

    # Reads a field (e.g. :name, :applies, :notify) from the last evaluation.
    #
    # Raises RuntimeError if the alert has not been evaluated yet.
    def [](attr)
      unless @dsl
        raise "This alert has not yet been evaluated"
      end

      # public_send rather than send: field lookup should only reach the
      # DSL's public fields, never its private helpers or Kernel methods.
      @dsl.public_send(attr)
    end
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
|
2
|
+
# Time.parse (used by AlertDSL#silenced_until) lives in the 'time' stdlib
# extension, not in core Time.
require 'time'

module Interferon
  # Shared behaviour for the small DSL objects below.
  #
  # Every field is a method that acts as a getter when called with neither
  # a value nor a block, and as a setter when given exactly one of them.
  module DSLMixin
    # hostinfo - the host entry this DSL instance is being evaluated for;
    #            stored in @hostinfo and passed on to nested DSL objects
    def initialize(hostinfo)
      @hostinfo = hostinfo
    end

    # Any call to an undefined field is reported as an ArgumentError naming
    # the field, rather than the default NoMethodError.
    def method_missing(meth, *args, &block)
      raise ArgumentError, "No such alerts field '#{meth}'"
    end

    # Hash-style read access to a field, e.g. dsl[:name].
    #
    # Uses public_send so that only public fields are reachable; private
    # helpers (get_or_set) and private Kernel methods fall through to
    # method_missing and raise ArgumentError like any unknown field.
    def [](arg)
      public_send(arg)
    end

    private

    # Getter/setter backing every field.
    #
    # field   - instance variable name as a Symbol, e.g. :@name
    # val     - value to store, or nil
    # block   - Proc whose result is stored, or nil
    # default - returned by the getter while the field is unset (nil)
    #
    # Raises ArgumentError when both a value and a block are given.
    def get_or_set(field, val, block, default)
      if val.nil? && block.nil?
        f = instance_variable_get(field)
        f.nil? ? default : f
      elsif val.nil?
        instance_variable_set(field, block.call)
      elsif block.nil?
        instance_variable_set(field, val)
      else
        raise ArgumentError, "You must pass either a value or a block but not both to #{field}"
      end
    end
  end

  # Top-level fields of an alert definition.
  class AlertDSL
    include DSLMixin

    def name(v = nil, &block)
      get_or_set(:@name, v, block, '')
    end

    def message(v = nil, &block)
      get_or_set(:@message, v, block, '')
    end

    def silenced(v = nil, &block)
      get_or_set(:@silenced, v, block, false)
    end

    # A value given directly is parsed with Time.parse before being stored;
    # a block's result is stored as-is. Defaults to the epoch.
    def silenced_until(v = nil, &block)
      get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
    end

    def notify_no_data(v = nil, &block)
      get_or_set(:@notify_no_data, v, block, false)
    end

    def no_data_timeframe(v = nil, &block)
      get_or_set(:@no_data_timeframe, v, block, false)
    end

    def timeout(v = nil, &block)
      get_or_set(:@timeout, v, block, false)
    end

    def applies(v = nil, &block)
      get_or_set(:@applies, v, block, false)
    end

    # Nested DSL for notification targets, created on first use.
    # The argument is accepted but not used.
    def notify(v = nil)
      @notify ||= NotifyDSL.new(@hostinfo)
    end

    # Nested DSL for the metric definition, created on first use.
    # The argument is accepted but not used.
    def metric(v = nil)
      @metric ||= MetricDSL.new(@hostinfo)
    end
  end

  # Who gets notified: individual people and named groups.
  class NotifyDSL
    include DSLMixin

    def people(v = nil, &block)
      get_or_set(:@people, v, block, [])
    end

    def groups(v = nil, &block)
      get_or_set(:@groups, v, block, [])
    end
  end

  # The metric query behind the alert.
  class MetricDSL
    include DSLMixin

    def datadog_query(v = nil, &block)
      get_or_set(:@datadog_query, v, block, '')
    end
  end
end
|