interferon 0.0.12 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +13 -5
- data/interferon.gemspec +6 -2
- data/lib/interferon/alert.rb +8 -0
- data/lib/interferon/alert_dsl.rb +10 -0
- data/lib/interferon/destinations/datadog.rb +127 -55
- data/lib/interferon/group_sources/filesystem.rb +18 -2
- data/lib/interferon/version.rb +1 -1
- data/lib/interferon/work_hours_helper.rb +21 -0
- data/lib/interferon.rb +134 -45
- data/spec/helpers/dsl_helper.rb +10 -1
- data/spec/helpers/mock_alert.rb +1 -0
- data/spec/lib/interferon/group_sources/filesystem_spec.rb +57 -0
- data/spec/lib/interferon_spec.rb +190 -43
- data/spec/lib/work_hours_helper_spec.rb +41 -0
- metadata +106 -25
checksums.yaml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDdjYjBjNDk1OTJkMThiNjE5ZmZiNTgzNTI5NGU1ZDMzMzI5MGQ2ZQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NDIxMWM1MzQ2YTFlYjE4ZjIyOTUyMjk0ZDQ4ZjZhYTkzNWVlZDExZA==
|
5
7
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YmU0MWExYTUxNDY3ODcwNWI2OWNlZTliOWVjNWVlYjMwNzkyMzdiNWIzM2Vm
|
10
|
+
YzQ5NzY1MTQxNzkyMWIwYzJlM2I5MDc3ZTNlODFjOGVkZmU5ZmZmZjRmNGJm
|
11
|
+
YzYyZTdmODhkMjdmNjJmODUwODY5ODcwNzQ4OTliYjA5NzQ0Mzg=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YzkxYjVkNmFhNzQwNjdiZDdmYjY4ZmQ2ODU0MmZiMzVmODQ5NzJkN2RkMTNh
|
14
|
+
Nzc1YzY0ODA5NmU5ZDk3YzYxZmZjZTlhODg5YjdhMWU4OGNkNGM1Yzg2NTQx
|
15
|
+
MWE5ODhmZWU5YWJkNzcxNTlkMGM2YmUxYWVlYTZiOWYyZTNlZWY=
|
data/interferon.gemspec
CHANGED
@@ -6,8 +6,8 @@ require 'interferon/version'
|
|
6
6
|
Gem::Specification.new do |gem|
|
7
7
|
gem.name = "interferon"
|
8
8
|
gem.version = Interferon::VERSION
|
9
|
-
gem.authors = ["Igor Serebryany"]
|
10
|
-
gem.email = ["igor.serebryany@airbnb.com"]
|
9
|
+
gem.authors = ["Igor Serebryany", "Jimmy Ngo"]
|
10
|
+
gem.email = ["igor.serebryany@airbnb.com", "jimmy.ngo@airbnb.com"]
|
11
11
|
gem.description = %q{: Store metrics alerts in code!}
|
12
12
|
gem.summary = %q{: Store metrics alerts in code!}
|
13
13
|
gem.homepage = "https://www.github.com/airbnb/interferon"
|
@@ -20,6 +20,10 @@ Gem::Specification.new do |gem|
|
|
20
20
|
gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
|
21
21
|
gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
|
22
22
|
gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"
|
23
|
+
gem.add_runtime_dependency "diffy", "~> 3.1.0", ">= 3.1.0"
|
24
|
+
gem.add_runtime_dependency "parallel", "~> 1.9", ">= 1.9.0"
|
25
|
+
gem.add_runtime_dependency "nokogiri", "< 1.7.0"
|
26
|
+
gem.add_runtime_dependency "tzinfo", "~> 1.2.2", ">= 1.2.2"
|
23
27
|
|
24
28
|
gem.add_development_dependency "rspec", "~> 3.2"
|
25
29
|
gem.add_development_dependency "pry", "~> 0.10"
|
data/lib/interferon/alert.rb
CHANGED
@@ -30,6 +30,14 @@ module Interferon
|
|
30
30
|
@dsl.name(name)
|
31
31
|
end
|
32
32
|
|
33
|
+
def silence
|
34
|
+
unless @dsl
|
35
|
+
raise "This alert has not yet been evaluated"
|
36
|
+
end
|
37
|
+
|
38
|
+
@dsl.silenced(true)
|
39
|
+
end
|
40
|
+
|
33
41
|
def [](attr)
|
34
42
|
unless @dsl
|
35
43
|
raise "This alert has not yet been evaluated"
|
data/lib/interferon/alert_dsl.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'interferon/work_hours_helper'
|
1
2
|
|
2
3
|
module Interferon
|
3
4
|
module DSLMixin
|
@@ -47,6 +48,15 @@ module Interferon
|
|
47
48
|
get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
|
48
49
|
end
|
49
50
|
|
51
|
+
def is_work_hour?(args = {})
|
52
|
+
# Args can contain
|
53
|
+
# :hours => range of work hours (0 to 23h), for example (9..16)
|
54
|
+
# :days => range of week days (0 = sunday), for example (1..5) (Monday to Friday)
|
55
|
+
# :timezone => example 'America/Los_Angeles'
|
56
|
+
# 9 to 5 Monday to Friday in PST is the default
|
57
|
+
WorkHoursHelper.is_work_hour?(Time.now.utc, args)
|
58
|
+
end
|
59
|
+
|
50
60
|
def notify_no_data(v = nil, &block)
|
51
61
|
get_or_set(:@notify_no_data, v, block, false)
|
52
62
|
end
|
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'diffy'
|
1
2
|
require 'dogapi'
|
2
3
|
require 'set'
|
3
4
|
|
5
|
+
Diffy::Diff.default_format = :text
|
6
|
+
|
4
7
|
module Interferon::Destinations
|
5
8
|
class Datadog
|
6
9
|
include ::Interferon::Logging
|
@@ -32,14 +35,18 @@ module Interferon::Destinations
|
|
32
35
|
@dog = Dogapi::Client.new(*args)
|
33
36
|
|
34
37
|
@existing_alerts = nil
|
38
|
+
@dry_run = options['dry_run']
|
35
39
|
|
36
40
|
# create datadog alerts 10 at a time
|
37
41
|
@concurrency = 10
|
38
42
|
|
39
43
|
@stats = {
|
40
44
|
:alerts_created => 0,
|
45
|
+
:alerts_to_be_created => 0,
|
41
46
|
:alerts_updated => 0,
|
47
|
+
:alerts_to_be_updated => 0,
|
42
48
|
:alerts_deleted => 0,
|
49
|
+
:alerts_to_be_deleted => 0,
|
43
50
|
:alerts_silenced => 0,
|
44
51
|
:api_successes => 0,
|
45
52
|
:api_client_errors => 0,
|
@@ -52,6 +59,10 @@ module Interferon::Destinations
|
|
52
59
|
@api_errors ||= []
|
53
60
|
end
|
54
61
|
|
62
|
+
def generate_message(message, people)
|
63
|
+
[message, ALERT_KEY, people.map{ |p| "@#{p}" }].flatten.join("\n")
|
64
|
+
end
|
65
|
+
|
55
66
|
def existing_alerts
|
56
67
|
unless @existing_alerts
|
57
68
|
resp = @dog.get_all_alerts()
|
@@ -64,7 +75,16 @@ module Interferon::Destinations
|
|
64
75
|
alerts = resp[1]['alerts']
|
65
76
|
|
66
77
|
# key alerts by name
|
67
|
-
@existing_alerts =
|
78
|
+
@existing_alerts = {}
|
79
|
+
alerts.each do |alert|
|
80
|
+
existing_alert = @existing_alerts[alert['name']]
|
81
|
+
if existing_alert.nil?
|
82
|
+
alert['id'] = [alert['id']]
|
83
|
+
@existing_alerts[alert['name']] = alert
|
84
|
+
else
|
85
|
+
existing_alert['id'] << alert['id']
|
86
|
+
end
|
87
|
+
end
|
68
88
|
|
69
89
|
# count how many are manually created
|
70
90
|
@stats[:manually_created_alerts] = \
|
@@ -81,21 +101,22 @@ module Interferon::Destinations
|
|
81
101
|
|
82
102
|
def create_alert(alert, people)
|
83
103
|
# create a message which includes the notifications
|
84
|
-
message = [
|
85
|
-
alert['message'],
|
86
|
-
ALERT_KEY,
|
87
|
-
people.map{ |p| "@#{p}" }
|
88
|
-
].flatten.join("\n")
|
104
|
+
message = generate_message(alert['message'], people)
|
89
105
|
|
90
106
|
# create the hash of options to send to datadog
|
91
107
|
alert_opts = {
|
92
108
|
:name => alert['name'],
|
93
109
|
:message => message,
|
94
|
-
:silenced =>
|
110
|
+
:silenced => false,
|
95
111
|
:notify_no_data => alert['notify_no_data'],
|
96
112
|
:timeout_h => nil,
|
97
113
|
}
|
98
114
|
|
115
|
+
# Set alert to be silenced if there is a silenced set or silenced_until set
|
116
|
+
if alert['silenced'] || alert['silenced_until'] > Time.now
|
117
|
+
alert_opts[:silenced] = true
|
118
|
+
end
|
119
|
+
|
99
120
|
# allow an optional timeframe for "no data" alerts to be specified
|
100
121
|
# (this feature is supported, even though it's not documented)
|
101
122
|
alert_opts[:no_data_timeframe] = alert['no_data_timeframe'] if alert['no_data_timeframe']
|
@@ -103,10 +124,15 @@ module Interferon::Destinations
|
|
103
124
|
# timeout is in seconds, but set it to 1 hour at least
|
104
125
|
alert_opts[:timeout_h] = [1, (alert['timeout'].to_i / 3600)].max if alert['timeout']
|
105
126
|
|
127
|
+
datadog_query = alert['metric']['datadog_query'].strip
|
128
|
+
existing_alert = existing_alerts[alert['name']]
|
129
|
+
|
106
130
|
# new alert, create it
|
107
|
-
if
|
131
|
+
if existing_alert.nil?
|
108
132
|
action = :creating
|
109
|
-
|
133
|
+
@stats[:alerts_to_be_created] += 1
|
134
|
+
new_alert_text = "Query: #{datadog_query} Message: #{message.split().join(' ')}"
|
135
|
+
log.info("creating new alert #{alert['name']}: #{new_alert_text}")
|
110
136
|
|
111
137
|
resp = @dog.alert(
|
112
138
|
alert['metric']['datadog_query'].strip,
|
@@ -116,66 +142,67 @@ module Interferon::Destinations
|
|
116
142
|
# existing alert, modify it
|
117
143
|
else
|
118
144
|
action = :updating
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
145
|
+
@stats[:alerts_to_be_updated] += 1
|
146
|
+
id = existing_alert['id'][0]
|
147
|
+
|
148
|
+
new_alert_text = "Query:\n#{datadog_query}\nMessage:\n#{message}"
|
149
|
+
existing_alert_text = "Query:\n#{existing_alert['query']}\nMessage:\n#{existing_alert['message']}\n"
|
150
|
+
diff = Diffy::Diff.new(existing_alert_text, new_alert_text, :context=>1)
|
151
|
+
log.info("updating existing alert #{id} (#{alert['name']}): #{diff}")
|
152
|
+
|
153
|
+
if @dry_run
|
154
|
+
resp = @dog.alert(
|
155
|
+
alert['metric']['datadog_query'].strip,
|
156
|
+
alert_opts,
|
157
|
+
)
|
158
|
+
else
|
159
|
+
resp = @dog.update_alert(
|
160
|
+
id,
|
161
|
+
alert['metric']['datadog_query'].strip,
|
162
|
+
alert_opts
|
163
|
+
)
|
164
|
+
# Unmute existing alerts that have been unsilenced.
|
165
|
+
# Datadog does not allow updates to silencing via the update_alert API call.
|
166
|
+
if existing_alert['silenced'] && !alert_opts[:silenced]
|
167
|
+
@dog.unmute_monitor(id)
|
168
|
+
end
|
169
|
+
end
|
127
170
|
end
|
128
171
|
|
129
172
|
# log whenever we've encountered errors
|
130
173
|
code = resp[0].to_i
|
131
|
-
|
132
|
-
api_errors << "#{code.to_s} on alert #{alert['name']}"
|
133
|
-
end
|
134
|
-
|
135
|
-
# client error
|
136
|
-
if code == 400
|
137
|
-
statsd.gauge('datadog.api.unknown_error', 0, :tags => ["alert:#{alert}"])
|
138
|
-
statsd.gauge('datadog.api.client_error', 1, :tags => ["alert:#{alert}"])
|
139
|
-
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
140
|
-
|
141
|
-
@stats[:api_client_errors] += 1
|
142
|
-
log.error("client error while #{action} alert '#{alert['name']}';" \
|
143
|
-
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
144
|
-
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
145
|
-
|
146
|
-
# unknown (prob. datadog) error:
|
147
|
-
elsif code >= 400 || code == -1
|
148
|
-
statsd.gauge('datadog.api.unknown_error', 1, :tags => ["alert:#{alert}"])
|
149
|
-
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
150
|
-
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
151
|
-
|
152
|
-
@stats[:api_unknown_errors] += 1
|
153
|
-
log.error("unknown error while #{action} alert '#{alert['name']}':" \
|
154
|
-
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
155
|
-
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
174
|
+
log_datadog_response_code(resp, code, action, alert)
|
156
175
|
|
157
176
|
# assume this was a success
|
158
|
-
|
159
|
-
|
160
|
-
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
161
|
-
statsd.gauge('datadog.api.success', 1, :tags => ["alert:#{alert}"])
|
162
|
-
|
163
|
-
@stats[:api_successes] += 1
|
177
|
+
if !(code >= 400 || code == -1)
|
178
|
+
# assume this was a success
|
164
179
|
@stats[:alerts_created] += 1 if action == :creating
|
165
180
|
@stats[:alerts_updated] += 1 if action == :updating
|
166
181
|
@stats[:alerts_silenced] += 1 if alert_opts[:silenced]
|
167
182
|
end
|
168
183
|
|
169
|
-
id = resp[1].nil? ? nil : resp[1]['id']
|
184
|
+
id = resp[1].nil? ? nil : [resp[1]['id']]
|
170
185
|
# lets key alerts by their name
|
171
186
|
return [alert['name'], id]
|
172
187
|
end
|
173
188
|
|
174
189
|
def remove_alert(alert)
|
175
190
|
if alert['message'].include?(ALERT_KEY)
|
176
|
-
|
177
|
-
|
178
|
-
|
191
|
+
@stats[:alerts_to_be_deleted] += 1
|
192
|
+
log.info("deleting alert: #{alert['name']}")
|
193
|
+
|
194
|
+
if !@dry_run
|
195
|
+
alert['id'].each do |alert_id|
|
196
|
+
resp = @dog.delete_alert(alert_id)
|
197
|
+
code = resp[0].to_i
|
198
|
+
log_datadog_response_code(resp, code, :deleting)
|
199
|
+
|
200
|
+
if !(code >= 300 || code == -1)
|
201
|
+
# assume this was a success
|
202
|
+
@stats[:alerts_deleted] += 1
|
203
|
+
end
|
204
|
+
end
|
205
|
+
end
|
179
206
|
else
|
180
207
|
log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
|
181
208
|
end
|
@@ -186,17 +213,62 @@ module Interferon::Destinations
|
|
186
213
|
statsd.gauge("datadog.#{k}", v)
|
187
214
|
end
|
188
215
|
|
189
|
-
log.info "datadog: created %d updated %d and deleted %d alerts" % [
|
216
|
+
log.info "datadog: successfully created (%d/%d), updated (%d/%d), and deleted (%d/%d) alerts" % [
|
190
217
|
@stats[:alerts_created],
|
218
|
+
@stats[:alerts_to_be_created],
|
191
219
|
@stats[:alerts_updated],
|
220
|
+
@stats[:alerts_to_be_updated],
|
192
221
|
@stats[:alerts_deleted],
|
222
|
+
@stats[:alerts_to_be_deleted],
|
193
223
|
]
|
194
224
|
end
|
195
225
|
|
196
226
|
def remove_alert_by_id(alert_id)
|
227
|
+
# This should only be used by dry-run to clean up created dry-run alerts
|
197
228
|
log.debug("deleting alert, id: #{alert_id}")
|
198
|
-
@dog.delete_alert(alert_id)
|
199
|
-
|
229
|
+
resp = @dog.delete_alert(alert_id)
|
230
|
+
code = resp[0].to_i
|
231
|
+
log_datadog_response_code(resp, code, :deleting)
|
232
|
+
end
|
233
|
+
|
234
|
+
def log_datadog_response_code(resp, code, action, alert=nil)
|
235
|
+
# log whenever we've encountered errors
|
236
|
+
if code != 200 && !alert.nil?
|
237
|
+
api_errors << "#{code.to_s} on alert #{alert['name']}"
|
238
|
+
end
|
239
|
+
|
240
|
+
# client error
|
241
|
+
if code == 400
|
242
|
+
@stats[:api_client_errors] += 1
|
243
|
+
if !alert.nil?
|
244
|
+
statsd.gauge('datadog.api.unknown_error', 0, :tags => ["alert:#{alert}"])
|
245
|
+
statsd.gauge('datadog.api.client_error', 1, :tags => ["alert:#{alert}"])
|
246
|
+
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
247
|
+
log.error("client error while #{action} alert '#{alert['name']}';" \
|
248
|
+
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
249
|
+
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
250
|
+
end
|
251
|
+
|
252
|
+
# unknown (prob. datadog) error:
|
253
|
+
elsif code > 400 || code == -1
|
254
|
+
@stats[:api_unknown_errors] += 1
|
255
|
+
if !alert.nil?
|
256
|
+
statsd.gauge('datadog.api.unknown_error', 1, :tags => ["alert:#{alert}"])
|
257
|
+
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
258
|
+
statsd.gauge('datadog.api.success', 0, :tags => ["alert:#{alert}"])
|
259
|
+
log.error("unknown error while #{action} alert '#{alert['name']}':" \
|
260
|
+
" query was '#{alert['metric']['datadog_query'].strip}'" \
|
261
|
+
" response was #{resp[0]}:'#{resp[1].inspect}'")
|
262
|
+
end
|
263
|
+
else
|
264
|
+
@stats[:api_successes] += 1
|
265
|
+
if !alert.nil?
|
266
|
+
statsd.gauge('datadog.api.unknown_error', 0, :tags => ["alert:#{alert}"])
|
267
|
+
statsd.gauge('datadog.api.client_error', 0, :tags => ["alert:#{alert}"])
|
268
|
+
statsd.gauge('datadog.api.success', 1, :tags => ["alert:#{alert}"])
|
269
|
+
end
|
270
|
+
end
|
200
271
|
end
|
272
|
+
|
201
273
|
end
|
202
274
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
include ::Interferon::Logging
|
1
2
|
|
2
3
|
module Interferon::GroupSources
|
3
4
|
class Filesystem
|
@@ -10,6 +11,7 @@ module Interferon::GroupSources
|
|
10
11
|
|
11
12
|
def list_groups
|
12
13
|
groups = {}
|
14
|
+
aliases = {}
|
13
15
|
|
14
16
|
@paths.each do |path|
|
15
17
|
path = File.expand_path(path)
|
@@ -18,7 +20,7 @@ module Interferon::GroupSources
|
|
18
20
|
next
|
19
21
|
end
|
20
22
|
|
21
|
-
Dir.glob(File.join(path, '*.{json,yml,yaml}')) do |group_file|
|
23
|
+
Dir.glob(File.join(path, '*.{json,yml,yaml}')).each do |group_file|
|
22
24
|
begin
|
23
25
|
group = YAML::parse(File.read(group_file))
|
24
26
|
rescue YAML::SyntaxError => e
|
@@ -27,11 +29,25 @@ module Interferon::GroupSources
|
|
27
29
|
log.warn "error reading group file #{group_file}: #{e}"
|
28
30
|
else
|
29
31
|
group = group.to_ruby
|
30
|
-
|
32
|
+
if group['people']
|
33
|
+
groups[group['name']] = group['people'] || []
|
34
|
+
elsif group['alias_for']
|
35
|
+
aliases[group['name']] = {:group => group['alias_for'], :group_file => group_file}
|
36
|
+
end
|
31
37
|
end
|
32
38
|
end
|
33
39
|
end
|
34
40
|
|
41
|
+
aliases.each do |aliased_group, group_info|
|
42
|
+
group = group_info[:group]
|
43
|
+
group_file = group_info[:group_file]
|
44
|
+
if groups.include?(group)
|
45
|
+
groups[aliased_group] = groups[group]
|
46
|
+
else
|
47
|
+
log.warn "Alias not found for #{group} but used by #{aliased_group} in #{group_file}"
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
35
51
|
return groups
|
36
52
|
end
|
37
53
|
end
|
data/lib/interferon/version.rb
CHANGED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'tzinfo'
|
2
|
+
|
3
|
+
module Interferon
  # Answers "is this instant inside working hours?" for a configurable set of
  # week days, hours of the day and timezone.
  class WorkHoursHelper
    # Week days counted as work days, using Time#wday numbering (0 = Sunday).
    DEFAULT_WORK_DAYS = (1..5)
    # Hours of the day (0..23) counted as work hours; the range is inclusive,
    # so 9..16 covers 09:00 up to 16:59 local time.
    DEFAULT_WORK_HOURS = (9..16)
    # Timezone identifier handed to TZInfo::Timezone.get.
    DEFAULT_WORK_TIMEZONE = 'America/Los_Angeles'
    # Values used for any option the caller does not supply.
    DEFAULT_WORK_ARGS = {
      :hours => DEFAULT_WORK_HOURS,
      :days => DEFAULT_WORK_DAYS,
      :timezone => DEFAULT_WORK_TIMEZONE,
    }.freeze

    # Checks whether +time+ falls on a work day and within work hours.
    #
    # @param time [Time] the instant to test; it is passed to
    #   TZInfo's period_for_utc, so it is expected to be expressed in UTC
    # @param args [Hash] optional overrides:
    #   :hours    => range of work hours (0 to 23), e.g. (9..16)
    #   :days     => range of week days (0 = Sunday), e.g. (1..5)
    #   :timezone => timezone identifier, e.g. 'America/Los_Angeles'
    # @return [Boolean] true when both the local week day and the local hour
    #   are included in the configured ranges
    def self.is_work_hour?(time, args = {})
      # Defaults are the receiver so that caller-supplied values win; merging
      # the other way round would discard every override.
      args = DEFAULT_WORK_ARGS.merge(args)
      tz = TZInfo::Timezone.get(args[:timezone])
      # utc_total_offset includes the daylight-saving component; the base
      # utc_offset alone is one hour off while DST is in effect.
      time_in_tz = time + tz.period_for_utc(time).utc_total_offset
      args[:days].include?(time_in_tz.wday) && args[:hours].include?(time_in_tz.hour)
    end
  end
end
|