interferon 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +11 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +52 -0
- data/LICENSE +21 -0
- data/README.md +96 -0
- data/bin/interferon +66 -0
- data/config.example.yaml +37 -0
- data/groups/data.yaml +11 -0
- data/groups/dataeng.yaml +4 -0
- data/groups/datainfra.yaml +10 -0
- data/groups/devhap.yaml +6 -0
- data/groups/discover.yaml +13 -0
- data/groups/growth.yaml +17 -0
- data/groups/host.yaml +12 -0
- data/groups/internalproducts.yml +13 -0
- data/groups/logstash.yaml +4 -0
- data/groups/mobile.yaml +17 -0
- data/groups/pagerduty_sysops.yaml +5 -0
- data/groups/panda.yaml +10 -0
- data/groups/payments.yaml +16 -0
- data/groups/payments_finance.yaml +8 -0
- data/groups/prodinfra.yaml +15 -0
- data/groups/search.yaml +10 -0
- data/groups/security.yaml +8 -0
- data/groups/sre.yaml +16 -0
- data/groups/teamx.yaml +8 -0
- data/groups/tns.yaml +14 -0
- data/groups/tools.yml +11 -0
- data/interferon.gemspec +26 -0
- data/lib/interferon.rb +241 -0
- data/lib/interferon/alert.rb +33 -0
- data/lib/interferon/alert_dsl.rb +94 -0
- data/lib/interferon/destinations/datadog.rb +169 -0
- data/lib/interferon/group_sources/filesystem.rb +38 -0
- data/lib/interferon/host_sources/aws_dynamo.rb +51 -0
- data/lib/interferon/host_sources/aws_elasticache.rb +69 -0
- data/lib/interferon/host_sources/aws_rds.rb +92 -0
- data/lib/interferon/host_sources/optica.rb +35 -0
- data/lib/interferon/host_sources/optica_services.rb +68 -0
- data/lib/interferon/loaders.rb +123 -0
- data/lib/interferon/logging.rb +26 -0
- data/lib/interferon/version.rb +3 -0
- data/script/convert.rb +29 -0
- data/script/pre-commit +73 -0
- data/spec/spec_helper.rb +62 -0
- metadata +179 -0
data/groups/panda.yaml
ADDED
data/groups/payments.yaml
ADDED
@@ -0,0 +1,16 @@
+---
+name: payments
+people:
+- allen.kerr@airbnb.com
+- ian@airbnb.com
+- john.terenzio@airbnb.com
+- joseph.sofaer@airbnb.com
+- josh.lee@airbnb.com
+- juliusz.gonera@airbnb.com
+- karen.kim@airbnb.com
+- kevin.sun@airbnb.com
+- lou.kosak@airbnb.com
+- michel.weksler@airbnb.com
+- mike.lewis@airbnb.com
+- tao.cui@airbnb.com
+- varun.pai@airbnb.com
data/groups/prodinfra.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+name: prodinfra
+people:
+- alexis.midon@airbnb.com
+- ben.hughes@airbnb.com
+- jon.tai@airbnb.com
+- joseph.sofaer@airbnb.com
+- kai.liu@airbnb.com
+- kevin.rice@airbnb.com
+- nija.mashruwala@airbnb.com
+- philip.snowberger@airbnb.com
+- rahul.iyer@airbnb.com
+- sonic.wang@airbnb.com
+- willie.yao@airbnb.com
+
data/groups/search.yaml
ADDED
data/groups/sre.yaml
ADDED
@@ -0,0 +1,16 @@
+---
+# deprecated; use prodinfra
+name: sre
+people:
+- alexis.midon@airbnb.com
+- ben.hughes@airbnb.com
+- jon.tai@airbnb.com
+- joseph.sofaer@airbnb.com
+- kai.liu@airbnb.com
+- kevin.rice@airbnb.com
+- nija.mashruwala@airbnb.com
+- philip.snowberger@airbnb.com
+- rahul.iyer@airbnb.com
+- sonic.wang@airbnb.com
+- willie.yao@airbnb.com
+
data/groups/teamx.yaml
ADDED
data/groups/tns.yaml
ADDED
@@ -0,0 +1,14 @@
+---
+name: tns
+people:
+- alok.gupta@airbnb.com
+- anish.muppalaneni@airbnb.com
+- dmitry.alexeenko@airbnb.com
+- eric.levine@airbnb.com
+- freddy.chen@airbnb.com
+- jerry.luan@airbnb.com
+- justin.chen@airbnb.com
+- sean.albito@airbnb.com
+- siddharth.kar@airbnb.com
+- tony.wen@airbnb.com
+- lee.zhang@airbnb.com
data/groups/tools.yml
ADDED
data/interferon.gemspec
ADDED
@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'interferon/version'
+
+Gem::Specification.new do |gem|
+  gem.name = "interferon"
+  gem.version = Interferon::VERSION
+  gem.authors = ["Igor Serebryany"]
+  gem.email = ["igor.serebryany@airbnb.com"]
+  gem.description = %q{: Store metrics alerts in code!}
+  gem.summary = %q{: Store metrics alerts in code!}
+  gem.homepage = "https://www.github.com/airbnb/interferon"
+  gem.licenses = ['MIT']
+
+  gem.files = `git ls-files`.split($/)
+  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+
+  gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
+  gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
+  gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"
+
+  gem.add_development_dependency "rspec", "~> 3.2"
+  gem.add_development_dependency "pry", "~> 0.10"
+end
data/lib/interferon.rb
ADDED
@@ -0,0 +1,241 @@
+require 'interferon/version'
+require 'interferon/logging'
+
+require 'interferon/loaders'
+
+require 'interferon/alert'
+require 'interferon/alert_dsl'
+
+#require 'pry' #uncomment if you're debugging
+require 'erb'
+require 'ostruct'
+require 'set'
+require 'yaml'
+
+module Interferon
+  class Interferon
+
+    include Logging
+    attr_accessor :host_sources, :destinations, :host_info
+
+    # groups_sources is a hash from type => options for each group source
+    # host_sources is a hash from type => options for each host source
+    # destinations is a similiar hash from type => options for each alerter
+    def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
+      @alerts_repo_path = alerts_repo_path
+      @groups_sources = groups_sources
+      @host_sources = host_sources
+      @destinations = destinations
+    end
+
+    def run(dry_run = false)
+      run_desc = dry_run ? 'dry run' : 'run'
+      log.info "beginning alerts #{run_desc}"
+
+      alerts = read_alerts
+      groups = read_groups(@groups_sources)
+      hosts = read_hosts(@host_sources)
+
+      # make sure destinations know if it's a dry run
+      @destinations.each do |dest|
+        dest['options'] ||= {}
+        dest['options']['dry_run'] = dry_run
+      end
+
+      update_alerts(@destinations, hosts, alerts, groups)
+
+      log.info "interferon #{run_desc} complete"
+    end
+
+    def read_alerts
+      alerts = []
+      failed = 0
+
+      # validate that alerts path exists
+      path = File.expand_path(File.join(@alerts_repo_path, 'alerts'))
+      abort("no such directory #{path} for reading alert files") \
+        unless Dir.exists?(path)
+
+      Dir.glob(File.join(path, '*.rb')) do |alert_file|
+        begin
+          alert = Alert.new(alert_file)
+        rescue StandardError => e
+          log.warn "error reading alert file #{alert_file}: #{e}"
+          failed += 1
+        else
+          alerts << alert
+        end
+      end
+
+      log.info "read #{alerts.count} alerts files from #{path}"
+
+      statsd.gauge('alerts.read.count', alerts.count)
+      statsd.gauge('alerts.read.failed', failed)
+
+      abort("failed to read #{failed} alerts") if failed > 0
+      return alerts
+    end
+
+    def read_groups(sources)
+      groups = {}
+      loader = GroupSourcesLoader.new([@alerts_repo_path])
+      loader.get_all(sources).each do |source|
+        source_groups = source.list_groups
+
+        # add all people to groups
+        people_count = 0
+        source_groups.each do |name, people|
+          groups[name] ||= []
+          groups[name].concat(people)
+          people_count += people.count
+        end
+
+        log.info "read #{people_count} people in #{source_groups.count} groups from source #{source.class.name}"
+      end
+
+      log.info "total of #{groups.values.flatten.count} people in #{groups.count} groups from #{sources.count} sources"
+
+      statsd.gauge('groups.sources', sources.count)
+      statsd.gauge('groups.count', groups.count)
+      statsd.gauge('groups.people', groups.values.flatten.count)
+
+      return groups
+    end
+
+    def read_hosts(sources)
+      statsd.gauge('hosts.sources', sources.count)
+
+      hosts = []
+      loader = HostSourcesLoader.new([@alerts_repo_path])
+      loader.get_all(sources).each do |source|
+        source_hosts = source.list_hosts
+        hosts << source_hosts
+
+        statsd.gauge('hosts.count', source_hosts.count, :tags => ["source:#{source.class.name}"])
+        log.info "read #{source_hosts.count} hosts from source #{source.class.name}"
+      end
+
+      hosts.flatten!
+      log.info "total of #{hosts.count} entities from #{sources.count} sources"
+
+      return hosts
+    end
+
+    def update_alerts(destinations, hosts, alerts, groups)
+      loader = DestinationsLoader.new([@alerts_repo_path])
+      loader.get_all(destinations).each do |dest|
+        log.info "updating alerts on #{dest.class.name}"
+
+        # track some counters/stats per destination
+        start_time = Time.new.to_f
+
+        # get already-defined alerts
+        existing_alerts = dest.existing_alerts.dup
+        existing_alerts.each{ |key, existing_alert| existing_alert['still_exists'] = false }
+
+        # create or update alerts; mark when we've done that
+        alerts_queue = Hash.new
+        alerts.each do |alert|
+          counters = {
+            :errors => 0,
+            :evals => 0,
+            :applies => 0,
+            :hosts => hosts.length
+          }
+          last_eval_error = nil
+
+          hosts.each do |hostinfo|
+            begin
+              alert.evaluate(hostinfo)
+              counters[:evals] += 1
+            rescue StandardError => e
+              log.debug "Evaluation of alert #{alert} failed in the context of host #{hostinfo}"
+              counters[:errors] += 1
+              last_eval_error = e
+              next
+            end
+
+            # don't define an alert that doesn't apply to this hostinfo
+            unless alert[:applies]
+              log.debug "alert #{alert[:name]} doesn't apply to #{hostinfo.inspect}"
+              next
+            end
+
+            counters[:applies] += 1
+
+            # don't define alerts twice
+            next if alerts_queue.key?(alert[:name])
+
+            # figure out who to notify
+            people = Set.new(alert[:notify][:people])
+            alert[:notify][:groups].each do |g|
+              people += (groups[g] || [])
+            end
+
+            # queue the alert up for creation; we clone the alert to save the current state
+            alerts_queue[alert[:name]] ||= [alert.clone, people]
+          end
+
+          # log some of the counters
+          statsd.gauge('alerts.evaluate.errors', counters[:errors], :tags => ["alert:#{alert}"])
+          statsd.gauge('alerts.evaluate.applies', counters[:applies], :tags => ["alert:#{alert}"])
+
+          if counters[:applies] > 0
+            log.info "alert #{alert} applies to #{counters[:applies]} of #{counters[:hosts]} hosts"
+          end
+
+          # did the alert fail to evaluate on all hosts?
+          if counters[:errors] == counters[:hosts]
+            log.error "alert #{alert} failed to evaluate in the context of all hosts!"
+            log.error "last error on alert #{alert}: #{last_eval_error}"
+
+            statsd.gauge('alerts.evaluate.failed_on_all', 1, :tags => ["alert:#{alert}"])
+            log.debug "alert #{alert}: error #{last_eval_error}\n#{last_eval_error.backtrace.join("\n")}"
+          else
+            statsd.gauge('alerts.evaluate.failed_on_all', 0, :tags => ["alert:#{alert}"])
+          end
+
+          # did the alert apply to any hosts?
+          if counters[:applies] == 0
+            statsd.gauge('alerts.evaluate.never_applies', 1, :tags => ["alert:#{alert}"])
+            log.warn "alert #{alert} did not apply to any hosts"
+          else
+            statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
+          end
+        end
+
+        # flush queue
+        alerts_to_create = alerts_queue.keys
+        concurrency = dest.concurrency || 10
+        threads = concurrency.times.map do
+          t = Thread.new do
+            while name = alerts_to_create.shift
+              cur_alert, people = alerts_queue[name]
+
+              log.debug "creating alert for #{cur_alert[:name]}"
+              alert_key = dest.create_alert(cur_alert, people)
+
+              # don't delete alerts we still have defined
+              existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
+            end
+          end
+          t.abort_on_exception = true
+          t
+        end
+        threads.map(&:join)
+
+        # remove existing alerts that shouldn't exist
+        to_delete = existing_alerts.reject{ |key, existing_alert| existing_alert['still_exists'] }
+        to_delete.each{ |key, alert| dest.remove_alert(alert) }
+
+        # run time summary
+        run_time = Time.new.to_f - start_time
+        statsd.histogram('destinations.run_time', run_time, :tags => ["destination:#{dest.class.name}"])
+        log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
+
+        # report destination stats
+        dest.report_stats
+      end
+    end
+  end
+end
data/lib/interferon/alert.rb
ADDED
@@ -0,0 +1,33 @@
+module Interferon
+  class Alert
+    def initialize(path)
+      @path = path
+      @filename = File.basename(path)
+
+      @text = File.read(@path)
+
+      @dsl = nil
+    end
+
+    def to_s
+      @filename
+    end
+
+    def evaluate(hostinfo)
+      dsl = AlertDSL.new(hostinfo)
+      dsl.instance_eval(@text, @filename, 1)
+      @dsl = dsl
+
+      # return the alert and not the DSL object, which is private
+      self
+    end
+
+    def [](attr)
+      unless @dsl
+        raise "This alert has not yet been evaluated"
+      end
+
+      return @dsl.send(attr)
+    end
+  end
+end
data/lib/interferon/alert_dsl.rb
ADDED
@@ -0,0 +1,94 @@
+
+module Interferon
+  module DSLMixin
+    def initialize(hostinfo)
+      @hostinfo = hostinfo
+    end
+
+    def method_missing(meth, *args, &block)
+      raise ArgumentError, "No such alerts field '#{meth}'"
+    end
+
+    def [](arg)
+      self.send(arg)
+    end
+
+    private
+    def get_or_set(field, val, block, default)
+      if val.nil? && block.nil?
+        f = instance_variable_get(field)
+        f.nil? ? default : f
+      elsif val.nil?
+        instance_variable_set(field, block.call)
+      elsif block.nil?
+        instance_variable_set(field, val)
+      else
+        raise ArgumentError, "You must pass either a value or a block but not both to #{field}"
+      end
+    end
+  end
+
+  class AlertDSL
+    include DSLMixin
+
+    def name(v = nil, &block)
+      get_or_set(:@name, v, block, '')
+    end
+
+    def message(v = nil, &block)
+      get_or_set(:@message, v, block, '')
+    end
+
+    def silenced(v = nil, &block)
+      get_or_set(:@silenced, v, block, false)
+    end
+
+    def silenced_until(v = nil, &block)
+      get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
+    end
+
+    def notify_no_data(v = nil, &block)
+      get_or_set(:@notify_no_data, v, block, false)
+    end
+
+    def no_data_timeframe(v = nil, &block)
+      get_or_set(:@no_data_timeframe, v, block, false)
+    end
+
+    def timeout(v = nil, &block)
+      get_or_set(:@timeout, v, block, false)
+    end
+
+    def applies(v = nil, &block)
+      get_or_set(:@applies, v, block, false)
+    end
+
+    def notify(v = nil)
+      @notify ||= NotifyDSL.new(@hostinfo)
+    end
+
+    def metric(v = nil)
+      @metric ||= MetricDSL.new(@hostinfo)
+    end
+  end
+
+  class NotifyDSL
+    include DSLMixin
+
+    def people(v = nil, &block)
+      get_or_set(:@people, v, block, [])
+    end
+
+    def groups(v = nil, &block)
+      get_or_set(:@groups, v, block, [])
+    end
+  end
+
+  class MetricDSL
+    include DSLMixin
+
+    def datadog_query(v = nil, &block)
+      get_or_set(:@datadog_query, v, block, '')
+    end
+  end
+end
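
Read together, lib/interferon.rb, alert.rb, and alert_dsl.rb above spell out the contract for alert definitions: read_alerts wraps every file under alerts/*.rb in an Alert, Alert#evaluate runs the file's text through instance_eval against a fresh AlertDSL for each host, and update_alerts then consults alert[:applies], alert[:name], alert[:notify], and the metric query. As a rough sketch only, a hypothetical alert file written against this DSL could look like the following; the file name, the :hostname and :source keys on the host hash, the notification targets, and the Datadog query are illustrative assumptions, not content from this package:

  # alerts/cpu_idle.rb -- hypothetical example, not shipped with this gem.
  # @hostinfo is the per-host hash passed to AlertDSL.new by Alert#evaluate;
  # the :hostname and :source keys are assumed here for illustration.
  name "Low CPU idle on #{@hostinfo[:hostname]}"
  message "CPU idle has dropped below 10% on #{@hostinfo[:hostname]}."

  # only define this monitor for hosts that came from a particular host source
  applies { @hostinfo[:source] == 'optica' }

  # query string handed to the Datadog destination (illustrative threshold)
  metric.datadog_query "avg(last_5m):avg:system.cpu.idle{host:#{@hostinfo[:hostname]}} < 10"

  # who gets notified; group names are looked up in the groups hash built by read_groups
  notify.people ['someone@example.com']
  notify.groups ['sre']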