interferon 0.0.12 → 0.0.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +13 -5
- data/interferon.gemspec +6 -2
- data/lib/interferon/alert.rb +8 -0
- data/lib/interferon/alert_dsl.rb +10 -0
- data/lib/interferon/destinations/datadog.rb +127 -55
- data/lib/interferon/group_sources/filesystem.rb +18 -2
- data/lib/interferon/version.rb +1 -1
- data/lib/interferon/work_hours_helper.rb +21 -0
- data/lib/interferon.rb +134 -45
- data/spec/helpers/dsl_helper.rb +10 -1
- data/spec/helpers/mock_alert.rb +1 -0
- data/spec/lib/interferon/group_sources/filesystem_spec.rb +57 -0
- data/spec/lib/interferon_spec.rb +190 -43
- data/spec/lib/work_hours_helper_spec.rb +41 -0
- metadata +106 -25
checksums.yaml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDdjYjBjNDk1OTJkMThiNjE5ZmZiNTgzNTI5NGU1ZDMzMzI5MGQ2ZQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NDIxMWM1MzQ2YTFlYjE4ZjIyOTUyMjk0ZDQ4ZjZhYTkzNWVlZDExZA==
|
5
7
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YmU0MWExYTUxNDY3ODcwNWI2OWNlZTliOWVjNWVlYjMwNzkyMzdiNWIzM2Vm
|
10
|
+
YzQ5NzY1MTQxNzkyMWIwYzJlM2I5MDc3ZTNlODFjOGVkZmU5ZmZmZjRmNGJm
|
11
|
+
YzYyZTdmODhkMjdmNjJmODUwODY5ODcwNzQ4OTliYjA5NzQ0Mzg=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YzkxYjVkNmFhNzQwNjdiZDdmYjY4ZmQ2ODU0MmZiMzVmODQ5NzJkN2RkMTNh
|
14
|
+
Nzc1YzY0ODA5NmU5ZDk3YzYxZmZjZTlhODg5YjdhMWU4OGNkNGM1Yzg2NTQx
|
15
|
+
MWE5ODhmZWU5YWJkNzcxNTlkMGM2YmUxYWVlYTZiOWYyZTNlZWY=
|
data/interferon.gemspec
CHANGED
@@ -6,8 +6,8 @@ require 'interferon/version'
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
7
|
gem.name = "interferon"
|
8
8
|
gem.version = Interferon::VERSION
|
9
|
-
gem.authors = ["Igor Serebryany"]
|
10
|
-
gem.email = ["igor.serebryany@airbnb.com"]
|
9
|
+
gem.authors = ["Igor Serebryany", "Jimmy Ngo"]
|
10
|
+
gem.email = ["igor.serebryany@airbnb.com", "jimmy.ngo@airbnb.com"]
|
11
11
|
gem.description = %q{: Store metrics alerts in code!}
|
12
12
|
gem.summary = %q{: Store metrics alerts in code!}
|
13
13
|
gem.homepage = "https://www.github.com/airbnb/interferon"
|
@@ -20,6 +20,10 @@ Gem::Specification.new do |gem|
|
|
20
20
|
gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
|
21
21
|
gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
|
22
22
|
gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"
|
23
|
+
gem.add_runtime_dependency "diffy", "~> 3.1.0", ">= 3.1.0"
|
24
|
+
gem.add_runtime_dependency "parallel", "~> 1.9", ">= 1.9.0"
|
25
|
+
gem.add_runtime_dependency "nokogiri", "< 1.7.0"
|
26
|
+
gem.add_runtime_dependency "tzinfo", "~> 1.2.2", ">= 1.2.2"
|
23
27
|
|
24
28
|
gem.add_development_dependency "rspec", "~> 3.2"
|
25
29
|
gem.add_development_dependency "pry", "~> 0.10"
|
data/lib/interferon/alert.rb
CHANGED
@@ -30,6 +30,14 @@ module Interferon
|
|
30
30
|
@dsl.name(name)
|
31
31
|
end
|
32
32
|
|
33
|
+
def silence
|
34
|
+
unless @dsl
|
35
|
+
raise "This alert has not yet been evaluated"
|
36
|
+
end
|
37
|
+
|
38
|
+
@dsl.silenced(true)
|
39
|
+
end
|
40
|
+
|
33
41
|
def [](attr)
|
34
42
|
unless @dsl
|
35
43
|
raise "This alert has not yet been evaluated"
|
data/lib/interferon/alert_dsl.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'interferon/work_hours_helper'
|
1
2
|
|
2
3
|
module Interferon
|
3
4
|
module DSLMixin
|
@@ -47,6 +48,15 @@ module Interferon
|
|
47
48
|
get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
|
48
49
|
end
|
49
50
|
|
51
|
+
def is_work_hour?(args = {})
|
52
|
+
# Args can contain
|
53
|
+
# :hours => range of work hours (0 to 23h), for example (9..16)
|
54
|
+
# :days => range of week days (0 = sunday), for example (1..5) (Monday to Friday)
|
55
|
+
# :timezone => example 'America/Los_Angeles'
|
56
|
+
# 9 to 5 Monday to Friday in PST is the default
|
57
|
+
WorkHoursHelper.is_work_hour?(Time.now.utc, args)
|
58
|
+
end
|
59
|
+
|
50
60
|
def notify_no_data(v = nil, &block)
|
51
61
|
get_or_set(:@notify_no_data, v, block, false)
|
52
62
|
end
|
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'diffy'
|
1
2
|
require 'dogapi'
|
2
3
|
require 'set'
|
3
4
|
|
5
|
+
Diffy::Diff.default_format = :text
|
6
|
+
|
4
7
|
module Interferon::Destinations
|
5
8
|
class Datadog
|
6
9
|
include ::Interferon::Logging
|
@@ -32,14 +35,18 @@ module Interferon::Destinations
|
|
32
35
|
@dog = Dogapi::Client.new(*args)
|
33
36
|
|
34
37
|
@existing_alerts = nil
|
38
|
+
@dry_run = options['dry_run']
|
35
39
|
|
36
40
|
# create datadog alerts 10 at a time
|
37
41
|
@concurrency = 10
|
38
42
|
|
39
43
|
@stats = {
|
40
44
|
:alerts_created => 0,
|
45
|
+
:alerts_to_be_created => 0,
|
41
46
|
:alerts_updated => 0,
|
47
|
+
:alerts_to_be_updated => 0,
|
42
48
|
:alerts_deleted => 0,
|
49
|
+
:alerts_to_be_deleted => 0,
|
43
50
|
:alerts_silenced => 0,
|
44
51
|
:api_successes => 0,
|
45
52
|
:api_client_errors => 0,
|
@@ -52,6 +59,10 @@ module Interferon::Destinations
|
|
52
59
|
@api_errors ||= []
|
53
60
|
end
|
54
61
|
|
62
|
+
def generate_message(message, people)
|
63
|
+
[message, ALERT_KEY, people.map{ |p| "@#{p}" }].flatten.join("\n")
|
64
|
+
end
|
65
|
+
|
55
66
|
def existing_alerts
|
56
67
|
unless @existing_alerts
|
57
68
|
resp = @dog.get_all_alerts()
|
@@ -64,7 +75,16 @@ module Interferon::Destinations
|
|
64
75
|
alerts = resp[1]['alerts']
|
65
76
|
|
66
77
|
# key alerts by name
|
67
|
-
@existing_alerts =
|
78
|
+
@existing_alerts = {}
|
79
|
+
alerts.each do |alert|
|
80
|
+
existing_alert = @existing_alerts[alert['name']]
|
81
|
+
if existing_alert.nil?
|
82
|
+
alert['id'] = [alert['id']]
|
83
|
+
@existing_alerts[alert['name']] = alert
|
84
|
+
else
|
85
|
+
existing_alert['id'] << alert['id']
|
86
|
+
end
|
87
|
+
end
|
68
88
|
|
69
89
|
# count how many are manually created
|
70
90
|
@stats[:manually_created_alerts] = \
|
@@ -81,21 +101,22 @@ module Interferon::Destinations
|
|
81
101
|
|
82
102
|
def create_alert(alert, people)
|
83
103
|
# create a message which includes the notifications
|
84
|
-
message = [
|
85
|
-
alert['message'],
|
86
|
-
ALERT_KEY,
|
87
|
-
people.map{ |p| "@#{p}" }
|
88
|
-
].flatten.join("\n")
|
104
|
+
message = generate_message(alert['message'], people)
|
89
105
|
|
90
106
|
# create the hash of options to send to datadog
|
91
107
|
alert_opts = {
|
92
108
|
:name => alert['name'],
|
93
109
|
:message => message,
|
94
|
-
:silenced =>
|
110
|
+
:silenced => false,
|
95
111
|
:notify_no_data => alert['notify_no_data'],
|
96
112
|
:timeout_h => nil,
|
97
113
|
}
|
98
114
|
|
115
|
+
# Set alert to be silenced if there is a silenced set or silenced_until set
|
116
|
+
if alert['silenced'] || alert['silenced_until'] > Time.now
|
117
|
+
alert_opts[:silenced] = true
|
118
|
+
end
|
119
|
+
|
99
120
|
# allow an optional timeframe for "no data" alerts to be specified
|
100
121
|
# (this feature is supported, even though it's not documented)
|
101
122
|
alert_opts[:no_data_timeframe] = alert['no_data_timeframe'] if alert['no_data_timeframe']
|
@@ -103,10 +124,15 @@ module Interferon::Destinations
|
|
103
124
|
# timeout is in seconds, but set it to 1 hour at least
|
104
125
|
alert_opts[:timeout_h] = [1, (alert['timeout'].to_i / 3600)].max if alert['timeout']
|
105
126
|
|
127
|
+
datadog_query = alert['metric']['datadog_query'].strip
|
128
|
+
existing_alert = existing_alerts[alert['name']]
|
129
|
+
|
106
130
|
# new alert, create it
|
107
|
-
if
|
131
|
+
if existing_alert.nil?
|
108
132
|
action = :creating
|
109
|
-
|
133
|
+
@stats[:alerts_to_be_created] += 1
|
134
|
+
new_alert_text = "Query: #{datadog_query} Message: #{message.split().join(' ')}"
|
135
|
+
log.info("creating new alert #{alert['name']}: #{new_alert_text}")
|
110
136
|
|
111
137
|
resp = @dog.alert(
|
112
138
|
alert['metric']['datadog_query'].strip,
|
@@ -116,66 +142,67 @@ module Interferon::Destinations
|
|
116
142
|
# existing alert, modify it
|
117
143
|
else
|
118
144
|
action = :updating
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
145
|
+
@stats[:alerts_to_be_updated] += 1
|
146
|
+
id = existing_alert['id'][0]
|
147
|
+
|
148
|
+
new_alert_text = "Query:\n#{datadog_query}\nMessage:\n#{message}"
|
149
|
+
existing_alert_text = "Query:\n#{existing_alert['query']}\nMessage:\n#{existing_alert['message']}\n"
|
150
|
+
diff = Diffy::Diff.new(existing_alert_text, new_alert_text, :context=>1)
|
151
|
+
log.info("updating existing alert #{id} (#{alert['name']}): #{diff}")
|
152
|
+
|
153
|
+
if @dry_run
|
154
|
+
resp = @dog.alert(
|
155
|
+
alert['metric']['datadog_query'].strip,
|
156
|
+
alert_opts,
|
157
|
+
)
|
158
|
+
else
|
159
|
+
resp = @dog.update_alert(
|
160
|
+
id,
|
161
|
+
alert['metric']['datadog_query'].strip,
|
162
|
+
alert_opts
|
163
|
+
)
|
164
|
+
# Unmute existing alerts that have been unsilenced.
|
165
|
+
# Datadog does not allow updates to silencing via the update_alert API call.
|
166
|
+
if existing_alert['silenced'] && !alert_opts[:silenced]
|
167
|
+
@dog.unmute_monitor(id)
|
168
|
+
end
|
169
|
+
end
|
127
170
|
end
|
128
171
|
|
129
172
|
# log whenever we've encountered errors
|
130
173
|
code = resp[0].to_i
|
131
|
-
|
132
|
-
api_errors << "#{code.to_s} on alert #{alert['name']}"
|
133
|
-
end
|
134
|
-
|
135
|
-
# client error
|
136
|
-
if code == 400
|
137
|
-
statsd.gauge('datadog.api.unknown_error', 0, :tags => ["alert:#{alert}"])
|
138
|
-
statsd.gauge('datadog.api.client_error', 1, :tags => ["alert:#{alert}"])
|
139
|
-
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
140
|
-
|
141
|
-
@stats[:api_client_errors] += 1
|
142
|
-
log.error("client error while #{action} alert '#{alert['name']}';" \
|
143
|
-
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
144
|
-
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
145
|
-
|
146
|
-
# unknown (prob. datadog) error:
|
147
|
-
elsif code >= 400 || code == -1
|
148
|
-
statsd.gauge('datadog.api.unknown_error', 1, :tags => ["alert:#{alert}"])
|
149
|
-
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
150
|
-
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
151
|
-
|
152
|
-
@stats[:api_unknown_errors] += 1
|
153
|
-
log.error("unknown error while #{action} alert '#{alert['name']}':" \
|
154
|
-
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
155
|
-
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
174
|
+
log_datadog_response_code(resp, code, action, alert)
|
156
175
|
|
157
176
|
# assume this was a success
|
158
|
-
|
159
|
-
|
160
|
-
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
161
|
-
statsd.gauge('datadog.api.success', 1, :tags => ["alert:#{alert}"])
|
162
|
-
|
163
|
-
@stats[:api_successes] += 1
|
177
|
+
if !(code >= 400 || code == -1)
|
178
|
+
# assume this was a success
|
164
179
|
@stats[:alerts_created] += 1 if action == :creating
|
165
180
|
@stats[:alerts_updated] += 1 if action == :updating
|
166
181
|
@stats[:alerts_silenced] += 1 if alert_opts[:silenced]
|
167
182
|
end
|
168
183
|
|
169
|
-
id = resp[1].nil? ? nil : resp[1]['id']
|
184
|
+
id = resp[1].nil? ? nil : [resp[1]['id']]
|
170
185
|
# lets key alerts by their name
|
171
186
|
return [alert['name'], id]
|
172
187
|
end
|
173
188
|
|
174
189
|
def remove_alert(alert)
|
175
190
|
if alert['message'].include?(ALERT_KEY)
|
176
|
-
|
177
|
-
|
178
|
-
|
191
|
+
@stats[:alerts_to_be_deleted] += 1
|
192
|
+
log.info("deleting alert: #{alert['name']}")
|
193
|
+
|
194
|
+
if !@dry_run
|
195
|
+
alert['id'].each do |alert_id|
|
196
|
+
resp = @dog.delete_alert(alert_id)
|
197
|
+
code = resp[0].to_i
|
198
|
+
log_datadog_response_code(resp, code, :deleting)
|
199
|
+
|
200
|
+
if !(code >= 300 || code == -1)
|
201
|
+
# assume this was a success
|
202
|
+
@stats[:alerts_deleted] += 1
|
203
|
+
end
|
204
|
+
end
|
205
|
+
end
|
179
206
|
else
|
180
207
|
log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
|
181
208
|
end
|
@@ -186,17 +213,62 @@ module Interferon::Destinations
|
|
186
213
|
statsd.gauge("datadog.#{k}", v)
|
187
214
|
end
|
188
215
|
|
189
|
-
log.info "datadog: created %d updated %d and deleted %d alerts" % [
|
216
|
+
log.info "datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts" % [
|
190
217
|
@stats[:alerts_created],
|
218
|
+
@stats[:alerts_to_be_created],
|
191
219
|
@stats[:alerts_updated],
|
220
|
+
@stats[:alerts_to_be_updated],
|
192
221
|
@stats[:alerts_deleted],
|
222
|
+
@stats[:alerts_to_be_deleted],
|
193
223
|
]
|
194
224
|
end
|
195
225
|
|
196
226
|
def remove_alert_by_id(alert_id)
|
227
|
+
# This should only be used by dry-run to clean up created dry-run alerts
|
197
228
|
log.debug("deleting alert, id: #{alert_id}")
|
198
|
-
@dog.delete_alert(alert_id)
|
199
|
-
|
229
|
+
resp = @dog.delete_alert(alert_id)
|
230
|
+
code = resp[0].to_i
|
231
|
+
log_datadog_response_code(resp, code, :deleting)
|
232
|
+
end
|
233
|
+
|
234
|
+
def log_datadog_response_code(resp, code, action, alert=nil)
|
235
|
+
# log whenever we've encountered errors
|
236
|
+
if code != 200 && !alert.nil?
|
237
|
+
api_errors << "#{code.to_s} on alert #{alert['name']}"
|
238
|
+
end
|
239
|
+
|
240
|
+
# client error
|
241
|
+
if code == 400
|
242
|
+
@stats[:api_client_errors] += 1
|
243
|
+
if !alert.nil?
|
244
|
+
statsd.gauge('datadog.api.unknown_error', 0, :tags => ["alert:#{alert}"])
|
245
|
+
statsd.gauge('datadog.api.client_error', 1, :tags => ["alert:#{alert}"])
|
246
|
+
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
247
|
+
log.error("client error while #{action} alert '#{alert['name']}';" \
|
248
|
+
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
249
|
+
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
250
|
+
end
|
251
|
+
|
252
|
+
# unknown (prob. datadog) error:
|
253
|
+
elsif code > 400 || code == -1
|
254
|
+
@stats[:api_unknown_errors] += 1
|
255
|
+
if !alert.nil?
|
256
|
+
statsd.gauge('datadog.api.unknown_error', 1, :tags => ["alert:#{alert}"])
|
257
|
+
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
258
|
+
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
259
|
+
log.error("unknown error while #{action} alert '#{alert['name']}':" \
|
260
|
+
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
261
|
+
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
262
|
+
end
|
263
|
+
else
|
264
|
+
@stats[:api_successes] += 1
|
265
|
+
if !alert.nil?
|
266
|
+
statsd.gauge('datadog.api.unknown_error', 0, :tags => ["alert:#{alert}"])
|
267
|
+
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
268
|
+
statsd.gauge('datadog.api.success', 1, :tags => ["alert:#{alert}"])
|
269
|
+
end
|
270
|
+
end
|
200
271
|
end
|
272
|
+
|
201
273
|
end
|
202
274
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
include ::Interferon::Logging
|
1
2
|
|
2
3
|
module Interferon::GroupSources
|
3
4
|
class Filesystem
|
@@ -10,6 +11,7 @@ module Interferon::GroupSources
|
|
10
11
|
|
11
12
|
def list_groups
|
12
13
|
groups = {}
|
14
|
+
aliases = {}
|
13
15
|
|
14
16
|
@paths.each do |path|
|
15
17
|
path = File.expand_path(path)
|
@@ -18,7 +20,7 @@ module Interferon::GroupSources
|
|
18
20
|
next
|
19
21
|
end
|
20
22
|
|
21
|
-
Dir.glob(File.join(path, '*.{json,yml,yaml}')) do |group_file|
|
23
|
+
Dir.glob(File.join(path, '*.{json,yml,yaml}')).each do |group_file|
|
22
24
|
begin
|
23
25
|
group = YAML::parse(File.read(group_file))
|
24
26
|
rescue YAML::SyntaxError => e
|
@@ -27,11 +29,25 @@ module Interferon::GroupSources
|
|
27
29
|
log.warn "error reading group file #{group_file}: #{e}"
|
28
30
|
else
|
29
31
|
group = group.to_ruby
|
30
|
-
|
32
|
+
if group['people']
|
33
|
+
groups[group['name']] = group['people'] || []
|
34
|
+
elsif group['alias_for']
|
35
|
+
aliases[group['name']] = {:group => group['alias_for'], :group_file => group_file}
|
36
|
+
end
|
31
37
|
end
|
32
38
|
end
|
33
39
|
end
|
34
40
|
|
41
|
+
aliases.each do |aliased_group, group_info|
|
42
|
+
group = group_info[:group]
|
43
|
+
group_file = group_info[:group_file]
|
44
|
+
if groups.include?(group)
|
45
|
+
groups[aliased_group] = groups[group]
|
46
|
+
else
|
47
|
+
log.warn "Alias not found for #{group} but used by #{aliased_group} in #{group_file}"
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
35
51
|
return groups
|
36
52
|
end
|
37
53
|
end
|
data/lib/interferon/version.rb
CHANGED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'tzinfo'
|
2
|
+
|
3
|
+
module Interferon
  # Answers the question "does this moment fall inside working hours?" for a
  # configurable set of week days, hours of the day and timezone.
  class WorkHoursHelper
    # Week days counted as work days, using Time#wday numbering (0 = Sunday).
    DEFAULT_WORK_DAYS = (1..5)
    # Hours of the day (0..23) counted as work hours; the range is inclusive,
    # so 16 covers 16:00 through 16:59.
    DEFAULT_WORK_HOURS = (9..16)
    DEFAULT_WORK_TIMEZONE = 'America/Los_Angeles'
    DEFAULT_WORK_ARGS = {
      :hours => DEFAULT_WORK_HOURS,
      :days => DEFAULT_WORK_DAYS,
      :timezone => DEFAULT_WORK_TIMEZONE,
    }.freeze

    # Tells whether +time+ is inside the configured work hours.
    #
    # time - a Time; the value is shifted by the timezone's UTC offset before
    #        its wday/hour are read, so it is expected to be a UTC time.
    # args - optional Hash overriding any of the defaults:
    #        :hours    => range of hours (0 to 23), e.g. (9..16)
    #        :days     => range of week days (0 = Sunday), e.g. (1..5)
    #        :timezone => TZInfo timezone identifier, e.g. 'America/Los_Angeles'
    #
    # Returns true when both the day and the hour (as seen in the requested
    # timezone) are included in the configured ranges, false otherwise.
    def self.is_work_hour?(time, args = {})
      # Defaults go first so that caller-supplied values win. Merging the other
      # way round replaced every option the caller passed with the default.
      # The caller's hash is left untouched; nil is treated as "no overrides".
      args = DEFAULT_WORK_ARGS.merge(args || {})
      tz = TZInfo::Timezone.get args[:timezone]
      # NOTE(review): TimezonePeriod#utc_offset looks like the base offset
      # without daylight saving; utc_total_offset may be what is wanted here.
      # Confirm against the tzinfo documentation before changing.
      time_in_tz = time + tz.period_for_utc(time).utc_offset
      return args[:days].include?(time_in_tz.wday) && args[:hours].include?(time_in_tz.hour)
    end
  end
end
|