interferon 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +11 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +52 -0
  5. data/LICENSE +21 -0
  6. data/README.md +96 -0
  7. data/bin/interferon +66 -0
  8. data/config.example.yaml +37 -0
  9. data/groups/data.yaml +11 -0
  10. data/groups/dataeng.yaml +4 -0
  11. data/groups/datainfra.yaml +10 -0
  12. data/groups/devhap.yaml +6 -0
  13. data/groups/discover.yaml +13 -0
  14. data/groups/growth.yaml +17 -0
  15. data/groups/host.yaml +12 -0
  16. data/groups/internalproducts.yml +13 -0
  17. data/groups/logstash.yaml +4 -0
  18. data/groups/mobile.yaml +17 -0
  19. data/groups/pagerduty_sysops.yaml +5 -0
  20. data/groups/panda.yaml +10 -0
  21. data/groups/payments.yaml +16 -0
  22. data/groups/payments_finance.yaml +8 -0
  23. data/groups/prodinfra.yaml +15 -0
  24. data/groups/search.yaml +10 -0
  25. data/groups/security.yaml +8 -0
  26. data/groups/sre.yaml +16 -0
  27. data/groups/teamx.yaml +8 -0
  28. data/groups/tns.yaml +14 -0
  29. data/groups/tools.yml +11 -0
  30. data/interferon.gemspec +26 -0
  31. data/lib/interferon.rb +241 -0
  32. data/lib/interferon/alert.rb +33 -0
  33. data/lib/interferon/alert_dsl.rb +94 -0
  34. data/lib/interferon/destinations/datadog.rb +169 -0
  35. data/lib/interferon/group_sources/filesystem.rb +38 -0
  36. data/lib/interferon/host_sources/aws_dynamo.rb +51 -0
  37. data/lib/interferon/host_sources/aws_elasticache.rb +69 -0
  38. data/lib/interferon/host_sources/aws_rds.rb +92 -0
  39. data/lib/interferon/host_sources/optica.rb +35 -0
  40. data/lib/interferon/host_sources/optica_services.rb +68 -0
  41. data/lib/interferon/loaders.rb +123 -0
  42. data/lib/interferon/logging.rb +26 -0
  43. data/lib/interferon/version.rb +3 -0
  44. data/script/convert.rb +29 -0
  45. data/script/pre-commit +73 -0
  46. data/spec/spec_helper.rb +62 -0
  47. metadata +179 -0
data/groups/panda.yaml ADDED
@@ -0,0 +1,10 @@
1
+ # PandA is the Pricing and Availability team under marketplace.
2
+ ---
3
+ name: panda
4
+ people:
5
+ - fenglin.liao@airbnb.com
6
+ - li.zhang@airbnb.com
7
+ - spencer.demars@airbnb.com
8
+ - kim.pham@airbnb.com
9
+ - shao.liu@airbnb.com
10
+ - hector.yee@airbnb.com
@@ -0,0 +1,16 @@
1
+ ---
2
+ name: payments
3
+ people:
4
+ - allen.kerr@airbnb.com
5
+ - ian@airbnb.com
6
+ - john.terenzio@airbnb.com
7
+ - joseph.sofaer@airbnb.com
8
+ - josh.lee@airbnb.com
9
+ - juliusz.gonera@airbnb.com
10
+ - karen.kim@airbnb.com
11
+ - kevin.sun@airbnb.com
12
+ - lou.kosak@airbnb.com
13
+ - michel.weksler@airbnb.com
14
+ - mike.lewis@airbnb.com
15
+ - tao.cui@airbnb.com
16
+ - varun.pai@airbnb.com
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: payments_finance
3
+ people:
4
+ - alice.liang@airbnb.com
5
+ - allen.kerr@airbnb.com
6
+ - josh.lee@airbnb.com
7
+ - mike.lewis@airbnb.com
8
+ - tao.cui@airbnb.com
@@ -0,0 +1,15 @@
1
+ ---
2
+ name: prodinfra
3
+ people:
4
+ - alexis.midon@airbnb.com
5
+ - ben.hughes@airbnb.com
6
+ - jon.tai@airbnb.com
7
+ - joseph.sofaer@airbnb.com
8
+ - kai.liu@airbnb.com
9
+ - kevin.rice@airbnb.com
10
+ - nija.mashruwala@airbnb.com
11
+ - philip.snowberger@airbnb.com
12
+ - rahul.iyer@airbnb.com
13
+ - sonic.wang@airbnb.com
14
+ - willie.yao@airbnb.com
15
+
@@ -0,0 +1,10 @@
1
+ ---
2
+ name: search
3
+ people:
4
+ - maxim.charkov@airbnb.com
5
+ - fenglin.liao@airbnb.com
6
+ - li.zhang@airbnb.com
7
+ - mousom.gupta@airbnb.com
8
+ - spencer.demars@airbnb.com
9
+ - xiang.xiao@airbnb.com
10
+ - will.moss@airbnb.com
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: security
3
+ people:
4
+ - jeff@airbnb.com
5
+ - jennifer.rice@airbnb.com
6
+ - kevin.nguyen@airbnb.com
7
+ - philip.snowberger@airbnb.com
8
+ - samuel.zhu@airbnb.com
data/groups/sre.yaml ADDED
@@ -0,0 +1,16 @@
1
+ ---
2
+ # deprecated; use prodinfra
3
+ name: sre
4
+ people:
5
+ - alexis.midon@airbnb.com
6
+ - ben.hughes@airbnb.com
7
+ - jon.tai@airbnb.com
8
+ - joseph.sofaer@airbnb.com
9
+ - kai.liu@airbnb.com
10
+ - kevin.rice@airbnb.com
11
+ - nija.mashruwala@airbnb.com
12
+ - philip.snowberger@airbnb.com
13
+ - rahul.iyer@airbnb.com
14
+ - sonic.wang@airbnb.com
15
+ - willie.yao@airbnb.com
16
+
data/groups/teamx.yaml ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: teamx
3
+ people:
4
+ - lu.cheng@airbnb.com
5
+ - surabhi.gupta@airbnb.com
6
+ - naseem@airbnb.com
7
+ - phillippe.siclait@airbnb.com
8
+
data/groups/tns.yaml ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ name: tns
3
+ people:
4
+ - alok.gupta@airbnb.com
5
+ - anish.muppalaneni@airbnb.com
6
+ - dmitry.alexeenko@airbnb.com
7
+ - eric.levine@airbnb.com
8
+ - freddy.chen@airbnb.com
9
+ - jerry.luan@airbnb.com
10
+ - justin.chen@airbnb.com
11
+ - sean.albito@airbnb.com
12
+ - siddharth.kar@airbnb.com
13
+ - tony.wen@airbnb.com
14
+ - lee.zhang@airbnb.com
data/groups/tools.yml ADDED
@@ -0,0 +1,11 @@
1
+ ---
2
+ # deprecated; use internalproducts
3
+ name: tools
4
+ people:
5
+ - alvin.sng@airbnb.com
6
+ - emre.ozdemir@airbnb.com
7
+ - phil.busby@airbnb.com
8
+ - adrian.wisernig@airbnb.com
9
+ - bekki.jam@airbnb.com
10
+ - jujhaar.singh@airbnb.com
11
+ - jessica.toy@airbnb.com
@@ -0,0 +1,26 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for interferon.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be required, then declares metadata, packaged files and dependencies.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'interferon/version'

Gem::Specification.new do |gem|
  gem.name          = "interferon"
  gem.version       = Interferon::VERSION
  gem.authors       = ["Igor Serebryany"]
  gem.email         = ["igor.serebryany@airbnb.com"]
  # The leading ": " that used to open both strings was dropped; it carried no
  # meaning and showed up verbatim in the published gem metadata.
  gem.description   = %q{Store metrics alerts in code!}
  gem.summary       = %q{Store metrics alerts in code!}
  gem.homepage      = "https://www.github.com/airbnb/interferon"
  gem.licenses      = ['MIT']

  # Package exactly what git tracks; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})

  gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
  gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
  gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"

  gem.add_development_dependency "rspec",  "~> 3.2"
  gem.add_development_dependency "pry",  "~> 0.10"
end
data/lib/interferon.rb ADDED
@@ -0,0 +1,241 @@
1
+ require 'interferon/version'
2
+ require 'interferon/logging'
3
+
4
+ require 'interferon/loaders'
5
+
6
+ require 'interferon/alert'
7
+ require 'interferon/alert_dsl'
8
+
9
+ #require 'pry' #uncomment if you're debugging
10
+ require 'erb'
11
+ require 'ostruct'
12
+ require 'set'
13
+ require 'yaml'
14
+
15
+ module Interferon
16
+ class Interferon
17
+
18
+ include Logging
19
+ attr_accessor :host_sources, :destinations, :host_info
20
+
21
# Stores the four inputs on the instance; nothing is loaded or validated here.
#
# alerts_repo_path - kept as @alerts_repo_path
# groups_sources   - a hash from type => options for each group source
# host_sources     - a hash from type => options for each host source
# destinations     - a similar hash from type => options for each alerter
def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
  @alerts_repo_path, @groups_sources = alerts_repo_path, groups_sources
  @host_sources, @destinations = host_sources, destinations
end
30
+
31
# Performs one full pass: reads alerts, groups and hosts, marks every
# destination config with the dry-run flag, then hands everything to
# update_alerts.
#
# dry_run - written into each destination's 'options' hash under 'dry_run'
#           (default: false)
#
# Returns whatever the final log.info call returns.
def run(dry_run = false)
  label = 'run'
  label = 'dry run' if dry_run
  log.info "beginning alerts #{label}"

  loaded_alerts = read_alerts
  loaded_groups = read_groups(@groups_sources)
  loaded_hosts  = read_hosts(@host_sources)

  # make sure destinations know if it's a dry run; the configs are updated in place
  @destinations.each do |destination|
    options = (destination['options'] ||= {})
    options['dry_run'] = dry_run
  end

  update_alerts(@destinations, loaded_hosts, loaded_alerts, loaded_groups)

  log.info "interferon #{label} complete"
end
49
+
50
# Loads every '*.rb' file directly inside <@alerts_repo_path>/alerts as an Alert.
#
# Aborts the process when the alerts directory is missing, or (after the
# read counts have been reported to statsd) when any file failed to load.
#
# Returns the Array of Alert objects that loaded successfully.
def read_alerts
  alerts = []
  failed = 0

  # validate that alerts path exists
  # (Dir.exist? replaces Dir.exists?, which is deprecated and absent from newer Rubies)
  path = File.expand_path(File.join(@alerts_repo_path, 'alerts'))
  abort("no such directory #{path} for reading alert files") unless Dir.exist?(path)

  # sorted so alerts are read in the same order on every platform and Ruby version
  Dir.glob(File.join(path, '*.rb')).sort.each do |alert_file|
    begin
      alert = Alert.new(alert_file)
    rescue StandardError => e
      log.warn "error reading alert file #{alert_file}: #{e}"
      failed += 1
    else
      alerts << alert
    end
  end

  log.info "read #{alerts.count} alerts files from #{path}"

  statsd.gauge('alerts.read.count', alerts.count)
  statsd.gauge('alerts.read.failed', failed)

  # a single unreadable alert file fails the whole run
  abort("failed to read #{failed} alerts") if failed > 0
  alerts
end
78
+
79
# Merges the groups reported by every configured group source.
#
# sources - group source configuration, passed to GroupSourcesLoader#get_all
#
# People listed for the same group name by different sources are appended
# to one list (no de-duplication happens here). Per-source and overall
# totals are logged, and the overall totals are sent to statsd.
#
# Returns a Hash of group name => Array of people.
def read_groups(sources)
  merged = {}

  GroupSourcesLoader.new([@alerts_repo_path]).get_all(sources).each do |group_source|
    listed = group_source.list_groups

    # add all people to groups
    members_seen = 0
    listed.each do |group_name, members|
      (merged[group_name] ||= []).concat(members)
      members_seen += members.count
    end

    log.info "read #{members_seen} people in #{listed.count} groups from source #{group_source.class.name}"
  end

  log.info "total of #{merged.values.flatten.count} people in #{merged.count} groups from #{sources.count} sources"

  statsd.gauge('groups.sources', sources.count)
  statsd.gauge('groups.count', merged.count)
  statsd.gauge('groups.people', merged.values.flatten.count)

  merged
end
104
+
105
# Collects the hosts reported by every configured host source into one list.
#
# sources - host source configuration, passed to HostSourcesLoader#get_all
#
# The number of sources and each source's host count are sent to statsd;
# per-source and overall totals are logged.
#
# Returns a flattened Array of everything the sources listed.
def read_hosts(sources)
  statsd.gauge('hosts.sources', sources.count)

  per_source = []
  HostSourcesLoader.new([@alerts_repo_path]).get_all(sources).each do |host_source|
    found = host_source.list_hosts
    per_source.push(found)

    origin = host_source.class.name
    statsd.gauge('hosts.count', found.count, :tags => ["source:#{origin}"])
    log.info "read #{found.count} hosts from source #{origin}"
  end

  all_hosts = per_source.flatten
  log.info "total of #{all_hosts.count} entities from #{sources.count} sources"

  all_hosts
end
123
+
124
# Evaluates every alert against every host and syncs the result to each
# destination: alerts that apply to at least one host are created/updated,
# and alerts the destination already has that were not re-created are removed.
#
# destinations - destination configuration, passed to DestinationsLoader#get_all
# hosts        - list of host info objects; each is handed to alert.evaluate
# alerts       - list of alerts (must support evaluate, [], clone and to_s)
# groups       - Hash of group name => people, used to expand alert[:notify][:groups]
#
# Returns whatever the iteration over the loaded destinations returns.
def update_alerts(destinations, hosts, alerts, groups)
  loader = DestinationsLoader.new([@alerts_repo_path])
  loader.get_all(destinations).each do |dest|
    log.info "updating alerts on #{dest.class.name}"

    # track some counters/stats per destination
    start_time = Time.new.to_f

    # get already-defined alerts; the dup is shallow, so the 'still_exists'
    # marker is written onto the alert hashes the destination returned
    existing_alerts = dest.existing_alerts.dup
    existing_alerts.each{ |_key, existing_alert| existing_alert['still_exists'] = false }

    # create or update alerts; mark when we've done that
    alerts_queue = {}
    alerts.each do |alert|
      counters = {
        :errors => 0,
        :evals => 0,
        :applies => 0,
        :hosts => hosts.length
      }
      last_eval_error = nil

      hosts.each do |hostinfo|
        begin
          alert.evaluate(hostinfo)
          counters[:evals] += 1
        rescue StandardError => e
          log.debug "Evaluation of alert #{alert} failed in the context of host #{hostinfo}"
          counters[:errors] += 1
          last_eval_error = e
          next
        end

        # don't define an alert that doesn't apply to this hostinfo
        unless alert[:applies]
          log.debug "alert #{alert[:name]} doesn't apply to #{hostinfo.inspect}"
          next
        end

        counters[:applies] += 1

        # don't define alerts twice
        next if alerts_queue.key?(alert[:name])

        # figure out who to notify
        people = Set.new(alert[:notify][:people])
        alert[:notify][:groups].each do |g|
          people += (groups[g] || [])
        end

        # queue the alert up for creation; we clone the alert to save the current state
        alerts_queue[alert[:name]] ||= [alert.clone, people]
      end

      # log some of the counters
      statsd.gauge('alerts.evaluate.errors', counters[:errors], :tags => ["alert:#{alert}"])
      statsd.gauge('alerts.evaluate.applies', counters[:applies], :tags => ["alert:#{alert}"])

      if counters[:applies] > 0
        log.info "alert #{alert} applies to #{counters[:applies]} of #{counters[:hosts]} hosts"
      end

      # did the alert fail to evaluate on all hosts?
      # With an empty host list errors == hosts == 0 but no error was captured;
      # requiring a captured error keeps that case out of this branch, where it
      # used to blow up calling #backtrace on nil.
      if last_eval_error && counters[:errors] == counters[:hosts]
        log.error "alert #{alert} failed to evaluate in the context of all hosts!"
        log.error "last error on alert #{alert}: #{last_eval_error}"

        statsd.gauge('alerts.evaluate.failed_on_all', 1, :tags => ["alert:#{alert}"])
        log.debug "alert #{alert}: error #{last_eval_error}\n#{last_eval_error.backtrace.join("\n")}"
      else
        statsd.gauge('alerts.evaluate.failed_on_all', 0, :tags => ["alert:#{alert}"])
      end

      # did the alert apply to any hosts?
      if counters[:applies] == 0
        statsd.gauge('alerts.evaluate.never_applies', 1, :tags => ["alert:#{alert}"])
        log.warn "alert #{alert} did not apply to any hosts"
      else
        statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
      end
    end

    # flush queue: worker threads pull alert names off one shared list until it is empty
    alerts_to_create = alerts_queue.keys
    concurrency = dest.concurrency || 10
    threads = concurrency.times.map do
      t = Thread.new do
        while (name = alerts_to_create.shift)
          cur_alert, people = alerts_queue[name]

          log.debug "creating alert for #{cur_alert[:name]}"
          alert_key = dest.create_alert(cur_alert, people)

          # don't delete alerts we still have defined
          existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
        end
      end
      # a failure in any worker should take the whole run down rather than be lost
      t.abort_on_exception = true
      t
    end
    threads.map(&:join)

    # remove existing alerts that shouldn't exist
    # NOTE(review): when nothing was queued (e.g. an empty host list) every
    # existing alert is removed here - confirm that is the intended behaviour.
    to_delete = existing_alerts.reject{ |_key, existing_alert| existing_alert['still_exists'] }
    to_delete.each{ |_key, alert| dest.remove_alert(alert) }

    # run time summary
    run_time = Time.new.to_f - start_time
    statsd.histogram('destinations.run_time', run_time, :tags => ["destination:#{dest.class.name}"])
    log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)

    # report destination stats
    dest.report_stats
  end
end
240
+ end
241
+ end
@@ -0,0 +1,33 @@
1
module Interferon
  # One alert definition file. The file's text is read once at construction
  # and can then be evaluated repeatedly, each time against a fresh AlertDSL
  # built for a specific host.
  class Alert
    # path - location of the alert file; its contents are read immediately
    def initialize(path)
      @path = path
      @filename = File.basename(path)

      @text = File.read(@path)

      @dsl = nil
    end

    # Alerts are identified by their file's base name.
    def to_s
      @filename
    end

    # Runs the alert file's text inside a new AlertDSL for the given host.
    # The DSL is only stored once evaluation finished without raising, so a
    # failed evaluation leaves the previously stored result in place.
    #
    # NOTE(review): the file text is executed as Ruby via instance_eval, so
    # alert files must come from a trusted location.
    #
    # Returns self.
    def evaluate(hostinfo)
      dsl = AlertDSL.new(hostinfo)
      dsl.instance_eval(@text, @filename, 1)
      @dsl = dsl

      # return the alert and not the DSL object, which is private
      self
    end

    # Reads one field of the most recent evaluation.
    #
    # attr - name of a public DSL method (e.g. :name, :applies, :notify)
    #
    # Raises RuntimeError when the alert has not been evaluated yet.
    def [](attr)
      raise "This alert has not yet been evaluated" unless @dsl

      # public_send keeps field lookups away from private and Kernel methods
      @dsl.public_send(attr)
    end
  end
end
@@ -0,0 +1,94 @@
1
+
2
require 'time' # Time.parse, used by silenced_until, is defined by the 'time' library

module Interferon
  # Shared behaviour for the DSL objects an alert file is evaluated in:
  # every field is a method that reads the field when called with no
  # argument and sets it when given a value or a block.
  module DSLMixin
    def initialize(hostinfo)
      @hostinfo = hostinfo
    end

    # Any name that is not a declared field is a mistake in the alert file.
    def method_missing(meth, *args, &block)
      raise ArgumentError, "No such alerts field '#{meth}'"
    end

    # Declares that nothing is handled dynamically, so respond_to? checks and
    # Ruby's implicit conversion probes (to_ary, to_str, ...) get a plain "no"
    # instead of tripping the ArgumentError raised by method_missing.
    def respond_to_missing?(_meth, _include_private = false)
      false
    end

    # Hash-style read access to a field, e.g. dsl[:name].
    # Only public methods are reachable; anything else is reported through
    # method_missing as an unknown field.
    def [](arg)
      public_send(arg)
    end

    private

    # Reads or writes the instance variable backing a field.
    #
    # field   - instance variable name as a symbol, e.g. :@name
    # val     - value to store, or nil
    # block   - block whose result is stored, or nil
    # default - returned on reads while the field is unset (nil)
    #
    # Raises ArgumentError when both a value and a block are given.
    def get_or_set(field, val, block, default)
      if val.nil? && block.nil?
        f = instance_variable_get(field)
        f.nil? ? default : f
      elsif val.nil?
        instance_variable_set(field, block.call)
      elsif block.nil?
        instance_variable_set(field, val)
      else
        raise ArgumentError, "You must pass either a value or a block but not both to #{field}"
      end
    end
  end

  # Top-level fields available inside an alert file.
  class AlertDSL
    include DSLMixin

    def name(v = nil, &block)
      get_or_set(:@name, v, block, '')
    end

    def message(v = nil, &block)
      get_or_set(:@message, v, block, '')
    end

    def silenced(v = nil, &block)
      get_or_set(:@silenced, v, block, false)
    end

    # A value passed directly is parsed with Time.parse; a block's result is
    # stored as returned. Reads default to the epoch.
    def silenced_until(v = nil, &block)
      get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
    end

    def notify_no_data(v = nil, &block)
      get_or_set(:@notify_no_data, v, block, false)
    end

    def no_data_timeframe(v = nil, &block)
      get_or_set(:@no_data_timeframe, v, block, false)
    end

    def timeout(v = nil, &block)
      get_or_set(:@timeout, v, block, false)
    end

    def applies(v = nil, &block)
      get_or_set(:@applies, v, block, false)
    end

    # Nested DSL holding the notification targets; created on first use.
    # The argument is accepted but not used.
    def notify(v = nil)
      @notify ||= NotifyDSL.new(@hostinfo)
    end

    # Nested DSL holding the metric definition; created on first use.
    # The argument is accepted but not used.
    def metric(v = nil)
      @metric ||= MetricDSL.new(@hostinfo)
    end
  end

  # Fields under `notify`: who should hear about the alert.
  class NotifyDSL
    include DSLMixin

    def people(v = nil, &block)
      get_or_set(:@people, v, block, [])
    end

    def groups(v = nil, &block)
      get_or_set(:@groups, v, block, [])
    end
  end

  # Fields under `metric`: the query the alert is based on.
  class MetricDSL
    include DSLMixin

    def datadog_query(v = nil, &block)
      get_or_set(:@datadog_query, v, block, '')
    end
  end
end