interferon 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +11 -0
  3. data/Gemfile +2 -0
  4. data/Gemfile.lock +52 -0
  5. data/LICENSE +21 -0
  6. data/README.md +96 -0
  7. data/bin/interferon +66 -0
  8. data/config.example.yaml +37 -0
  9. data/groups/data.yaml +11 -0
  10. data/groups/dataeng.yaml +4 -0
  11. data/groups/datainfra.yaml +10 -0
  12. data/groups/devhap.yaml +6 -0
  13. data/groups/discover.yaml +13 -0
  14. data/groups/growth.yaml +17 -0
  15. data/groups/host.yaml +12 -0
  16. data/groups/internalproducts.yml +13 -0
  17. data/groups/logstash.yaml +4 -0
  18. data/groups/mobile.yaml +17 -0
  19. data/groups/pagerduty_sysops.yaml +5 -0
  20. data/groups/panda.yaml +10 -0
  21. data/groups/payments.yaml +16 -0
  22. data/groups/payments_finance.yaml +8 -0
  23. data/groups/prodinfra.yaml +15 -0
  24. data/groups/search.yaml +10 -0
  25. data/groups/security.yaml +8 -0
  26. data/groups/sre.yaml +16 -0
  27. data/groups/teamx.yaml +8 -0
  28. data/groups/tns.yaml +14 -0
  29. data/groups/tools.yml +11 -0
  30. data/interferon.gemspec +26 -0
  31. data/lib/interferon.rb +241 -0
  32. data/lib/interferon/alert.rb +33 -0
  33. data/lib/interferon/alert_dsl.rb +94 -0
  34. data/lib/interferon/destinations/datadog.rb +169 -0
  35. data/lib/interferon/group_sources/filesystem.rb +38 -0
  36. data/lib/interferon/host_sources/aws_dynamo.rb +51 -0
  37. data/lib/interferon/host_sources/aws_elasticache.rb +69 -0
  38. data/lib/interferon/host_sources/aws_rds.rb +92 -0
  39. data/lib/interferon/host_sources/optica.rb +35 -0
  40. data/lib/interferon/host_sources/optica_services.rb +68 -0
  41. data/lib/interferon/loaders.rb +123 -0
  42. data/lib/interferon/logging.rb +26 -0
  43. data/lib/interferon/version.rb +3 -0
  44. data/script/convert.rb +29 -0
  45. data/script/pre-commit +73 -0
  46. data/spec/spec_helper.rb +62 -0
  47. metadata +179 -0
data/groups/panda.yaml ADDED
@@ -0,0 +1,10 @@
1
+ # PandA is Pricing and Availability team under marketplace.
2
+ ---
3
+ name: panda
4
+ people:
5
+ - fenglin.liao@airbnb.com
6
+ - li.zhang@airbnb.com
7
+ - spencer.demars@airbnb.com
8
+ - kim.pham@airbnb.com
9
+ - shao.liu@airbnb.com
10
+ - hector.yee@airbnb.com
@@ -0,0 +1,16 @@
1
+ ---
2
+ name: payments
3
+ people:
4
+ - allen.kerr@airbnb.com
5
+ - ian@airbnb.com
6
+ - john.terenzio@airbnb.com
7
+ - joseph.sofaer@airbnb.com
8
+ - josh.lee@airbnb.com
9
+ - juliusz.gonera@airbnb.com
10
+ - karen.kim@airbnb.com
11
+ - kevin.sun@airbnb.com
12
+ - lou.kosak@airbnb.com
13
+ - michel.weksler@airbnb.com
14
+ - mike.lewis@airbnb.com
15
+ - tao.cui@airbnb.com
16
+ - varun.pai@airbnb.com
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: payments_finance
3
+ people:
4
+ - alice.liang@airbnb.com
5
+ - allen.kerr@airbnb.com
6
+ - josh.lee@airbnb.com
7
+ - mike.lewis@airbnb.com
8
+ - tao.cui@airbnb.com
@@ -0,0 +1,15 @@
1
+ ---
2
+ name: prodinfra
3
+ people:
4
+ - alexis.midon@airbnb.com
5
+ - ben.hughes@airbnb.com
6
+ - jon.tai@airbnb.com
7
+ - joseph.sofaer@airbnb.com
8
+ - kai.liu@airbnb.com
9
+ - kevin.rice@airbnb.com
10
+ - nija.mashruwala@airbnb.com
11
+ - philip.snowberger@airbnb.com
12
+ - rahul.iyer@airbnb.com
13
+ - sonic.wang@airbnb.com
14
+ - willie.yao@airbnb.com
15
+
@@ -0,0 +1,10 @@
1
+ ---
2
+ name: search
3
+ people:
4
+ - maxim.charkov@airbnb.com
5
+ - fenglin.liao@airbnb.com
6
+ - li.zhang@airbnb.com
7
+ - mousom.gupta@airbnb.com
8
+ - spencer.demars@airbnb.com
9
+ - xiang.xiao@airbnb.com
10
+ - will.moss@airbnb.com
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: security
3
+ people:
4
+ - jeff@airbnb.com
5
+ - jennifer.rice@airbnb.com
6
+ - kevin.nguyen@airbnb.com
7
+ - philip.snowberger@airbnb.com
8
+ - samuel.zhu@airbnb.com
data/groups/sre.yaml ADDED
@@ -0,0 +1,16 @@
1
+ ---
2
+ # deprecated; use prodinfra
3
+ name: sre
4
+ people:
5
+ - alexis.midon@airbnb.com
6
+ - ben.hughes@airbnb.com
7
+ - jon.tai@airbnb.com
8
+ - joseph.sofaer@airbnb.com
9
+ - kai.liu@airbnb.com
10
+ - kevin.rice@airbnb.com
11
+ - nija.mashruwala@airbnb.com
12
+ - philip.snowberger@airbnb.com
13
+ - rahul.iyer@airbnb.com
14
+ - sonic.wang@airbnb.com
15
+ - willie.yao@airbnb.com
16
+
data/groups/teamx.yaml ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ name: teamx
3
+ people:
4
+ - lu.cheng@airbnb.com
5
+ - surabhi.gupta@airbnb.com
6
+ - naseem@airbnb.com
7
+ - phillippe.siclait@airbnb.com
8
+
data/groups/tns.yaml ADDED
@@ -0,0 +1,14 @@
1
+ ---
2
+ name: tns
3
+ people:
4
+ - alok.gupta@airbnb.com
5
+ - anish.muppalaneni@airbnb.com
6
+ - dmitry.alexeenko@airbnb.com
7
+ - eric.levine@airbnb.com
8
+ - freddy.chen@airbnb.com
9
+ - jerry.luan@airbnb.com
10
+ - justin.chen@airbnb.com
11
+ - sean.albito@airbnb.com
12
+ - siddharth.kar@airbnb.com
13
+ - tony.wen@airbnb.com
14
+ - lee.zhang@airbnb.com
data/groups/tools.yml ADDED
@@ -0,0 +1,11 @@
1
+ ---
2
+ # deprecated; use internalproducts
3
+ name: tools
4
+ people:
5
+ - alvin.sng@airbnb.com
6
+ - emre.ozdemir@airbnb.com
7
+ - phil.busby@airbnb.com
8
+ - adrian.wisernig@airbnb.com
9
+ - bekki.jam@airbnb.com
10
+ - jujhaar.singh@airbnb.com
11
+ - jessica.toy@airbnb.com
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
# Put this gem's lib/ directory on the load path (once) so the version file
# can be required while the specification is being evaluated.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'interferon/version'

Gem::Specification.new do |spec|
  # identity and descriptive metadata
  spec.name        = 'interferon'
  spec.version     = Interferon::VERSION
  spec.authors     = ['Igor Serebryany']
  spec.email       = ['igor.serebryany@airbnb.com']
  spec.description = ': Store metrics alerts in code!'
  spec.summary     = ': Store metrics alerts in code!'
  spec.homepage    = 'https://www.github.com/airbnb/interferon'
  spec.licenses    = ['MIT']

  # The packaged file list is whatever git tracks; executables and test files
  # are selected from that list by path prefix.
  spec.files       = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files  = spec.files.grep(%r{^(test|spec|features)/})

  # runtime dependencies
  spec.add_runtime_dependency 'dogapi', '~> 1.11', '>= 1.11.1'
  spec.add_runtime_dependency 'aws-sdk', '~> 1.35', '>= 1.35.1'
  spec.add_runtime_dependency 'dogstatsd-ruby', '~> 1.4', '>= 1.4.1'

  # development-only dependencies
  spec.add_development_dependency 'rspec', '~> 3.2'
  spec.add_development_dependency 'pry', '~> 0.10'
end
data/lib/interferon.rb ADDED
@@ -0,0 +1,241 @@
1
+ require 'interferon/version'
2
+ require 'interferon/logging'
3
+
4
+ require 'interferon/loaders'
5
+
6
+ require 'interferon/alert'
7
+ require 'interferon/alert_dsl'
8
+
9
+ #require 'pry' #uncomment if you're debugging
10
+ require 'erb'
11
+ require 'ostruct'
12
+ require 'set'
13
+ require 'yaml'
14
+
15
+ module Interferon
16
+ class Interferon
17
+
18
+ include Logging
19
+ attr_accessor :host_sources, :destinations, :host_info
20
+
21
# groups_sources is a hash from type => options for each group source
# host_sources is a hash from type => options for each host source
# destinations is a similar hash from type => options for each alerter
#
# NOTE(review): `run` iterates destinations with a single block argument and
# indexes each element with 'options', so a list of option hashes appears to
# be expected -- confirm against the config loader.
def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
  # nothing is loaded here; the arguments are only remembered for `run`
  @alerts_repo_path, @groups_sources = alerts_repo_path, groups_sources
  @host_sources, @destinations = host_sources, destinations
end
30
+
31
# Performs one full pass: loads alerts, groups and hosts, flags every
# destination with the dry-run setting, then pushes the alerts out.
#
# dry_run - when truthy, destinations are told (via their 'options' hash)
#           that this is a dry run.
def run(dry_run = false)
  mode = dry_run ? 'dry run' : 'run'
  log.info "beginning alerts #{mode}"

  loaded_alerts = read_alerts
  loaded_groups = read_groups(@groups_sources)
  loaded_hosts  = read_hosts(@host_sources)

  # every destination gets an 'options' hash (created if absent) that
  # carries the dry-run flag
  @destinations.each do |destination|
    options = (destination['options'] ||= {})
    options['dry_run'] = dry_run
  end

  update_alerts(@destinations, loaded_hosts, loaded_alerts, loaded_groups)

  log.info "interferon #{mode} complete"
end
49
+
50
# Loads every `*.rb` file found directly under `<alerts_repo_path>/alerts`
# as an Alert.
#
# Reports the number of alerts read and the number that failed to load as
# statsd gauges.
#
# Returns the Array of Alert objects.
# Aborts the process (Kernel#abort) when the alerts directory does not exist
# or when at least one alert file could not be read.
def read_alerts
  alerts = []
  failed = 0

  # validate that alerts path exists
  # (Dir.exist? rather than Dir.exists?: the latter is deprecated and was
  # removed in Ruby 3.2)
  path = File.expand_path(File.join(@alerts_repo_path, 'alerts'))
  abort("no such directory #{path} for reading alert files") unless Dir.exist?(path)

  Dir.glob(File.join(path, '*.rb')) do |alert_file|
    begin
      alert = Alert.new(alert_file)
    rescue StandardError => e
      # keep going so that every broken file is reported, not just the first
      log.warn "error reading alert file #{alert_file}: #{e}"
      failed += 1
    else
      alerts << alert
    end
  end

  log.info "read #{alerts.count} alerts files from #{path}"

  statsd.gauge('alerts.read.count', alerts.count)
  statsd.gauge('alerts.read.failed', failed)

  abort("failed to read #{failed} alerts") if failed > 0
  alerts
end
78
+
79
# Collects group membership from every configured group source.
#
# sources - passed straight to GroupSourcesLoader#get_all.
#
# Each source's `list_groups` result is iterated as (name, people) pairs.
# People listed for the same group name by different sources are appended
# to one list (no de-duplication is done here).
#
# Returns a Hash of group name => Array of people.
def read_groups(sources)
  merged = {}

  GroupSourcesLoader.new([@alerts_repo_path]).get_all(sources).each do |src|
    listed = src.list_groups
    seen = 0

    listed.each do |group_name, members|
      (merged[group_name] ||= []).concat(members)
      seen += members.count
    end

    log.info "read #{seen} people in #{listed.count} groups from source #{src.class.name}"
  end

  total_people = merged.values.flatten.count
  log.info "total of #{total_people} people in #{merged.count} groups from #{sources.count} sources"

  statsd.gauge('groups.sources', sources.count)
  statsd.gauge('groups.count', merged.count)
  statsd.gauge('groups.people', total_people)

  merged
end
104
+
105
# Gathers host entries from every configured host source.
#
# sources - passed straight to HostSourcesLoader#get_all.
#
# Reports the number of sources, and the number of hosts per source (tagged
# with the source class name), as statsd gauges.
#
# Returns one flat Array holding the entries of all sources.
def read_hosts(sources)
  source_count = sources.count
  statsd.gauge('hosts.sources', source_count)

  per_source = []
  HostSourcesLoader.new([@alerts_repo_path]).get_all(sources).each do |src|
    listed = src.list_hosts
    per_source.push(listed)

    origin = src.class.name
    statsd.gauge('hosts.count', listed.count, :tags => ["source:#{origin}"])
    log.info "read #{listed.count} hosts from source #{origin}"
  end

  # merge the per-source lists into a single list of entities
  all_hosts = per_source.flatten
  log.info "total of #{all_hosts.count} entities from #{source_count} sources"

  all_hosts
end
123
+
124
# Synchronizes every destination with the alerts that apply to the hosts.
#
# For each destination returned by DestinationsLoader#get_all this:
#   1. marks every alert already defined on the destination as not (yet)
#      existing,
#   2. evaluates each alert against each host and queues the alerts that
#      apply (once per alert name),
#   3. creates the queued alerts on the destination, marking those that
#      already existed as still existing,
#   4. removes the previously defined alerts that were not re-created,
#   5. reports the run time and asks the destination to report its stats.
#
# destinations - destination configs, passed to DestinationsLoader#get_all
# hosts        - collection of host info objects handed to Alert#evaluate
# alerts       - collection of Alert-like objects (evaluate, [], clone, to_s)
# groups       - Hash of group name => people, used to expand notify groups
#
# NOTE(review): when `hosts` is empty no alert applies, so every alert that
# already exists on a destination is removed in step 4 -- confirm that this
# is the intended behavior for an empty host list.
def update_alerts(destinations, hosts, alerts, groups)
  loader = DestinationsLoader.new([@alerts_repo_path])
  loader.get_all(destinations).each do |dest|
    log.info "updating alerts on #{dest.class.name}"

    # track some counters/stats per destination
    start_time = Time.new.to_f

    # get already-defined alerts
    existing_alerts = dest.existing_alerts.dup
    existing_alerts.each { |_key, existing_alert| existing_alert['still_exists'] = false }

    # create or update alerts; mark when we've done that
    alerts_queue = build_alerts_queue(alerts, hosts, groups)
    flush_alerts_queue(dest, alerts_queue, existing_alerts)

    # remove existing alerts that shouldn't exist
    to_delete = existing_alerts.reject { |_key, existing_alert| existing_alert['still_exists'] }
    to_delete.each { |_key, alert| dest.remove_alert(alert) }

    # run time summary
    run_time = Time.new.to_f - start_time
    statsd.histogram('destinations.run_time', run_time, :tags => ["destination:#{dest.class.name}"])
    log.info("#{dest.class.name} : run completed in %.2f seconds" % run_time)

    # report destination stats
    dest.report_stats
  end
end

private

# Evaluates each alert against each host and returns a Hash of
# alert name => [snapshot of the evaluated alert, Set of people to notify].
def build_alerts_queue(alerts, hosts, groups)
  alerts_queue = {}

  alerts.each do |alert|
    counters = {
      :errors => 0,
      :evals => 0,
      :applies => 0,
      :hosts => hosts.length
    }
    last_eval_error = nil

    hosts.each do |hostinfo|
      begin
        alert.evaluate(hostinfo)
        counters[:evals] += 1
      rescue StandardError => e
        log.debug "Evaluation of alert #{alert} failed in the context of host #{hostinfo}"
        counters[:errors] += 1
        last_eval_error = e
        next
      end

      # don't define an alert that doesn't apply to this hostinfo
      unless alert[:applies]
        log.debug "alert #{alert[:name]} doesn't apply to #{hostinfo.inspect}"
        next
      end

      counters[:applies] += 1

      # don't define alerts twice
      next if alerts_queue.key?(alert[:name])

      # figure out who to notify
      people = Set.new(alert[:notify][:people])
      alert[:notify][:groups].each do |g|
        people += (groups[g] || [])
      end

      # queue the alert up for creation; we clone the alert to save the current state
      alerts_queue[alert[:name]] ||= [alert.clone, people]
    end

    report_evaluation_stats(alert, counters, last_eval_error)
  end

  alerts_queue
end

# Logs and gauges the outcome of evaluating one alert across all hosts.
def report_evaluation_stats(alert, counters, last_eval_error)
  # log some of the counters
  statsd.gauge('alerts.evaluate.errors', counters[:errors], :tags => ["alert:#{alert}"])
  statsd.gauge('alerts.evaluate.applies', counters[:applies], :tags => ["alert:#{alert}"])

  if counters[:applies] > 0
    log.info "alert #{alert} applies to #{counters[:applies]} of #{counters[:hosts]} hosts"
  end

  # did the alert fail to evaluate on all hosts?
  # With zero hosts there was no evaluation and therefore no error object;
  # without the hosts > 0 guard, 0 == 0 would take this branch and call
  # #backtrace on nil.
  if counters[:hosts] > 0 && counters[:errors] == counters[:hosts]
    log.error "alert #{alert} failed to evaluate in the context of all hosts!"
    log.error "last error on alert #{alert}: #{last_eval_error}"

    statsd.gauge('alerts.evaluate.failed_on_all', 1, :tags => ["alert:#{alert}"])
    log.debug "alert #{alert}: error #{last_eval_error}\n#{Array(last_eval_error.backtrace).join("\n")}"
  else
    statsd.gauge('alerts.evaluate.failed_on_all', 0, :tags => ["alert:#{alert}"])
  end

  # did the alert apply to any hosts?
  if counters[:applies] == 0
    statsd.gauge('alerts.evaluate.never_applies', 1, :tags => ["alert:#{alert}"])
    log.warn "alert #{alert} did not apply to any hosts"
  else
    statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
  end
end

# Creates every queued alert on the destination using `dest.concurrency`
# worker threads (10 when the destination reports none), and marks the
# pre-existing alerts that were re-created so they are not removed later.
def flush_alerts_queue(dest, alerts_queue, existing_alerts)
  alerts_to_create = alerts_queue.keys
  concurrency = dest.concurrency || 10

  threads = concurrency.times.map do
    t = Thread.new do
      # workers pull names off the shared list until it is drained
      while (name = alerts_to_create.shift)
        cur_alert, people = alerts_queue[name]

        log.debug "creating alert for #{cur_alert[:name]}"
        alert_key = dest.create_alert(cur_alert, people)

        # don't delete alerts we still have defined
        existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
      end
    end
    t.abort_on_exception = true
    t
  end
  threads.each(&:join)
end
240
+ end
241
+ end
@@ -0,0 +1,33 @@
1
module Interferon
  # One alert definition file. The file's text is read once at construction
  # and re-evaluated as Ruby, inside a fresh AlertDSL, for every host.
  class Alert
    # path - location of the alert file; its contents are read immediately,
    #        so an unreadable file raises here.
    def initialize(path)
      @path = path
      @filename = File.basename(path)

      @text = File.read(@path)

      @dsl = nil
    end

    # The alert is identified by its file name in logs and metric tags.
    def to_s
      @filename
    end

    # Evaluates the alert text in the context of one host.
    #
    # NOTE: the alert file is executed as Ruby code via instance_eval; only
    # load alert files from a trusted location.
    #
    # The previously evaluated state is replaced only after the text ran
    # without raising. Returns self.
    def evaluate(hostinfo)
      dsl = AlertDSL.new(hostinfo)
      dsl.instance_eval(@text, @filename, 1)
      @dsl = dsl

      # return the alert and not the DSL object, which is private
      self
    end

    # Reads a field (e.g. :name, :applies, :notify) from the last evaluation.
    #
    # Uses public_send so that a lookup can only reach the DSL's public
    # fields; with send, a key such as :exit or :system would invoke the
    # private Kernel method of that name.
    #
    # Raises RuntimeError when the alert has not been evaluated yet.
    def [](attr)
      raise "This alert has not yet been evaluated" unless @dsl

      @dsl.public_send(attr)
    end
  end
end
@@ -0,0 +1,94 @@
1
+
2
# Time.parse (used by AlertDSL#silenced_until) lives in the 'time' stdlib
# extension, not in core Time.
require 'time'

module Interferon
  # Shared behavior of the alert DSL objects: every field is a method that
  # acts as a getter when called bare and as a setter when given a value or
  # a block.
  module DSLMixin
    # hostinfo - the host description the alert is being evaluated for; alert
    #            files can reach it as @hostinfo.
    def initialize(hostinfo)
      @hostinfo = hostinfo
    end

    # Any unknown field used in an alert file is a hard error.
    def method_missing(meth, *args, &block)
      raise ArgumentError, "No such alerts field '#{meth}'"
    end

    # Reads a field by name; only public fields are reachable.
    def [](arg)
      public_send(arg)
    end

    private

    # Pairs with method_missing: tells Ruby that no dynamic methods exist, so
    # implicit conversion probes (to_ary, to_str, ...) are answered with
    # "no" instead of being routed into method_missing and raising.
    def respond_to_missing?(_meth, _include_private = false)
      false
    end

    # field   - instance variable name, e.g. :@name
    # val     - value to store, or nil
    # block   - block whose result is stored, or nil
    # default - returned by the getter while the field is unset
    #
    # Raises ArgumentError when both a value and a block are given.
    def get_or_set(field, val, block, default)
      if !val.nil? && !block.nil?
        raise ArgumentError, "You must pass either a value or a block but not both to #{field}"
      end

      return instance_variable_set(field, val) unless val.nil?
      return instance_variable_set(field, block.call) unless block.nil?

      current = instance_variable_get(field)
      current.nil? ? default : current
    end
  end

  # Top-level fields available inside an alert file.
  class AlertDSL
    include DSLMixin

    def name(v = nil, &block)
      get_or_set(:@name, v, block, '')
    end

    def message(v = nil, &block)
      get_or_set(:@message, v, block, '')
    end

    def silenced(v = nil, &block)
      get_or_set(:@silenced, v, block, false)
    end

    # Accepts a time string (parsed with Time.parse) or an existing Time.
    def silenced_until(v = nil, &block)
      parsed = v.is_a?(Time) ? v : (v && Time.parse(v))
      get_or_set(:@silenced_until, parsed, block, Time.at(0))
    end

    def notify_no_data(v = nil, &block)
      get_or_set(:@notify_no_data, v, block, false)
    end

    def no_data_timeframe(v = nil, &block)
      get_or_set(:@no_data_timeframe, v, block, false)
    end

    def timeout(v = nil, &block)
      get_or_set(:@timeout, v, block, false)
    end

    def applies(v = nil, &block)
      get_or_set(:@applies, v, block, false)
    end

    # Nested section: `notify.people [...]`, `notify.groups [...]`.
    # The argument is accepted but not used.
    def notify(v = nil)
      @notify ||= NotifyDSL.new(@hostinfo)
    end

    # Nested section: `metric.datadog_query '...'`.
    # The argument is accepted but not used.
    def metric(v = nil)
      @metric ||= MetricDSL.new(@hostinfo)
    end
  end

  # Fields of the `notify` section.
  class NotifyDSL
    include DSLMixin

    def people(v = nil, &block)
      get_or_set(:@people, v, block, [])
    end

    def groups(v = nil, &block)
      get_or_set(:@groups, v, block, [])
    end
  end

  # Fields of the `metric` section.
  class MetricDSL
    include DSLMixin

    def datadog_query(v = nil, &block)
      get_or_set(:@datadog_query, v, block, '')
    end
  end
end