interferon 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +11 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +52 -0
- data/LICENSE +21 -0
- data/README.md +96 -0
- data/bin/interferon +66 -0
- data/config.example.yaml +37 -0
- data/groups/data.yaml +11 -0
- data/groups/dataeng.yaml +4 -0
- data/groups/datainfra.yaml +10 -0
- data/groups/devhap.yaml +6 -0
- data/groups/discover.yaml +13 -0
- data/groups/growth.yaml +17 -0
- data/groups/host.yaml +12 -0
- data/groups/internalproducts.yml +13 -0
- data/groups/logstash.yaml +4 -0
- data/groups/mobile.yaml +17 -0
- data/groups/pagerduty_sysops.yaml +5 -0
- data/groups/panda.yaml +10 -0
- data/groups/payments.yaml +16 -0
- data/groups/payments_finance.yaml +8 -0
- data/groups/prodinfra.yaml +15 -0
- data/groups/search.yaml +10 -0
- data/groups/security.yaml +8 -0
- data/groups/sre.yaml +16 -0
- data/groups/teamx.yaml +8 -0
- data/groups/tns.yaml +14 -0
- data/groups/tools.yml +11 -0
- data/interferon.gemspec +26 -0
- data/lib/interferon.rb +241 -0
- data/lib/interferon/alert.rb +33 -0
- data/lib/interferon/alert_dsl.rb +94 -0
- data/lib/interferon/destinations/datadog.rb +169 -0
- data/lib/interferon/group_sources/filesystem.rb +38 -0
- data/lib/interferon/host_sources/aws_dynamo.rb +51 -0
- data/lib/interferon/host_sources/aws_elasticache.rb +69 -0
- data/lib/interferon/host_sources/aws_rds.rb +92 -0
- data/lib/interferon/host_sources/optica.rb +35 -0
- data/lib/interferon/host_sources/optica_services.rb +68 -0
- data/lib/interferon/loaders.rb +123 -0
- data/lib/interferon/logging.rb +26 -0
- data/lib/interferon/version.rb +3 -0
- data/script/convert.rb +29 -0
- data/script/pre-commit +73 -0
- data/spec/spec_helper.rb +62 -0
- metadata +179 -0
@@ -0,0 +1,169 @@
|
|
1
|
+
require 'dogapi'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module Interferon::Destinations
|
5
|
+
class Datadog
|
6
|
+
include Interferon::Logging
|
7
|
+
|
8
|
+
attr_accessor :concurrency
|
9
|
+
ALERT_KEY = 'This alert was created via the alerts framework'
|
10
|
+
|
11
|
+
# Set up a Datadog destination from a string-keyed options hash.
#
# @param options [Hash] must contain 'api_key' and 'app_key'; a truthy
#   'dry_run' value is stored as a boolean in @dry_run
# @raise [ArgumentError] naming the first required key that is missing
def initialize(options)
  absent = %w{app_key api_key}.find { |key| !options[key] }
  raise ArgumentError, "missing required argument #{absent}" if absent

  @dog = Dogapi::Client.new(options['api_key'], options['app_key'])
  @dry_run = options['dry_run'] ? true : false

  # filled in by the first call to existing_alerts
  @existing_alerts = nil

  # create datadog alerts 10 at a time
  @concurrency = 10

  # every counter starts out at zero
  @stats = {}
  [
    :alerts_created,
    :alerts_updated,
    :alerts_deleted,
    :alerts_silenced,
    :api_successes,
    :api_client_errors,
    :api_unknown_errors,
    :manually_created_alerts,
  ].each { |counter| @stats[counter] = 0 }
end
|
36
|
+
|
37
|
+
# Fetch (once) and memoize the alerts currently defined in Datadog.
#
# The first call asks the API for all alerts, indexes them by name and
# records in @stats how many of them do not carry ALERT_KEY in their
# message. Later calls return the cached hash without touching the API.
#
# @return [Hash] alert name => alert data as returned by the API
# @raise [RuntimeError] when the API response contains no 'alerts' list
def existing_alerts
  return @existing_alerts if @existing_alerts

  resp = @dog.get_all_alerts()
  body = resp[1]
  alerts = body['alerts'] if body.respond_to?(:key?)

  # Fail loudly instead of crashing on nil: create_alert treats any name
  # missing from this hash as a brand-new alert.
  unless alerts
    raise "datadog: could not list existing alerts (response code #{resp[0]})"
  end

  # key alerts by name
  @existing_alerts = Hash[alerts.map{ |a| [a['name'], a] }]

  # count how many are manually created; an alert without any message
  # cannot contain the marker, so it counts as manual
  @stats[:manually_created_alerts] =
    @existing_alerts.values.count { |a| !a['message'].to_s.include?(ALERT_KEY) }

  log.info "datadog: found %d existing alerts; %d were manually created" % [
    @existing_alerts.length,
    @stats[:manually_created_alerts],
  ]

  @existing_alerts
end
|
57
|
+
|
58
|
+
# Create +alert+ in Datadog, or update the existing Datadog alert that has
# the same name.
#
# When @dry_run is set nothing is sent to the API, and therefore no API
# metrics or @stats counters are recorded.
#
# @param alert [#[]] alert definition; reads 'name', 'message', 'silenced',
#   'silenced_until', 'notify_no_data', 'no_data_timeframe', 'timeout' and
#   'metric' => 'datadog_query'
# @param people [Array<String>] handles appended to the message as "@handle"
# @return the value of alert['name'] (alerts are keyed by their name)
def create_alert(alert, people)
  # create a message which includes the notifications
  message = [
    alert['message'],
    ALERT_KEY,
    people.map{ |p| "@#{p}" }
  ].flatten.join("\n")

  # an absent 'silenced_until' means "no silence deadline", not a crash
  silenced_until = alert['silenced_until']
  silenced = alert['silenced'] || (!silenced_until.nil? && silenced_until > Time.now)

  # create the hash of options to send to datadog
  alert_opts = {
    :name => alert['name'],
    :message => message,
    :silenced => silenced,
    :notify_no_data => alert['notify_no_data'],
    :timeout_h => nil,
  }

  # allow an optional timeframe for "no data" alerts to be specified
  # (this feature is supported, even though it's not documented)
  alert_opts[:no_data_timeframe] = alert['no_data_timeframe'] if alert['no_data_timeframe']

  # timeout is in seconds, but set it to 1 hour at least
  alert_opts[:timeout_h] = [1, (alert['timeout'].to_i / 3600)].max if alert['timeout']

  resp = nil

  # new alert, create it
  if existing_alerts[alert['name']].nil?
    action = :creating
    log.debug("new alert #{alert['name']}")

    resp = @dog.alert(
      alert['metric']['datadog_query'].strip,
      alert_opts,
    ) unless @dry_run

  # existing alert, modify it
  else
    action = :updating
    id = existing_alerts[alert['name']]['id']
    log.debug("updating existing alert #{id} (#{alert['name']})")

    resp = @dog.update_alert(
      id,
      alert['metric']['datadog_query'].strip,
      alert_opts
    ) unless @dry_run
  end

  # log whenever we've encountered errors
  if resp
    code = resp[0].to_i

    # client error
    if code == 400
      report_api_outcome(alert['name'], :client_error)

      @stats[:api_client_errors] += 1
      log.error("client error while #{action} alert '#{alert['name']}';" \
        " query was '#{alert['metric']['datadog_query'].strip}'")

    # unknown (prob. datadog) error:
    elsif code >= 400 || code == -1
      report_api_outcome(alert['name'], :unknown_error)

      @stats[:api_unknown_errors] += 1
      log.error("unknown error while #{action} alert '#{alert['name']}':" \
        " query was '#{alert['metric']['datadog_query'].strip}'" \
        " response was #{resp[0]}:'#{resp[1].inspect}'")

    # assume this was a success
    else
      report_api_outcome(alert['name'], :success)

      @stats[:api_successes] += 1
      @stats[:alerts_created] += 1 if action == :creating
      @stats[:alerts_updated] += 1 if action == :updating
      @stats[:alerts_silenced] += 1 if alert_opts[:silenced]
    end
  end

  # lets key alerts by their name
  return alert['name']
end

# Emit the three per-alert API gauges: 1 for the outcome that happened and 0
# for the other two. The tag carries the alert's name, not the alert object.
def report_api_outcome(alert_name, outcome)
  tags = ["alert:#{alert_name}"]
  [:unknown_error, :client_error, :success].each do |kind|
    statsd.gauge("datadog.api.#{kind}", kind == outcome ? 1 : 0, :tags => tags)
  end
end
private :report_api_outcome
|
146
|
+
|
147
|
+
# Delete an alert from Datadog, but only if its message carries ALERT_KEY.
#
# Alerts without the marker are left alone and a warning is logged. An alert
# whose message is nil is treated the same way instead of raising.
# When @dry_run is set the delete call is skipped, but the deletion is still
# counted in @stats.
#
# @param alert [#[]] alert data; reads 'message', 'id' and 'name'
def remove_alert(alert)
  if alert['message'].to_s.include?(ALERT_KEY)
    log.debug("deleting alert #{alert['id']} (#{alert['name']})")
    @dog.delete_alert(alert['id']) unless @dry_run
    @stats[:alerts_deleted] += 1
  else
    log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
  end
end
|
156
|
+
|
157
|
+
# Publish every counter in @stats as a "datadog.<counter>" gauge, then log a
# one-line summary of how many alerts were created, updated and deleted.
def report_stats
  @stats.each_pair { |counter, value| statsd.gauge("datadog.#{counter}", value) }

  totals = [:alerts_created, :alerts_updated, :alerts_deleted].map { |counter| @stats[counter] }
  log.info(format("datadog: created %d updated %d and deleted %d alerts", *totals))
end
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
module Interferon::GroupSources
|
3
|
+
class Filesystem
|
4
|
+
# Remember which directories hold the group definition files.
#
# @param options [Hash] must contain 'paths'
# @raise [ArgumentError] when 'paths' is missing
def initialize(options)
  configured = options['paths']
  if configured
    @paths = configured
  else
    raise ArgumentError, "missing paths for loading groups from filesystem"
  end
end
|
10
|
+
|
11
|
+
# Read every group file found directly inside the configured directories.
#
# Files ending in .json, .yml or .yaml are parsed as YAML. Each one is
# expected to be a mapping with a 'name' and an optional 'people' list.
# Missing directories, unreadable or malformed files, and files that do not
# contain such a mapping are logged and skipped.
#
# @return [Hash] group name => list of people (empty list when none given)
def list_groups
  groups = {}

  @paths.each do |path|
    path = File.expand_path(path)
    # Dir.exist? replaces the deprecated Dir.exists? (removed in Ruby 3.2)
    unless Dir.exist?(path)
      log.warn "no such directory #{path} for reading group files"
      next
    end

    # .yml is matched too, so short-extension group files are not skipped
    Dir.glob(File.join(path, '*.{json,yml,yaml}')) do |group_file|
      begin
        document = YAML::parse(File.read(group_file))
      rescue YAML::SyntaxError => e
        log.error "syntax error in group file #{group_file}: #{e}"
      rescue StandardError => e
        log.warn "error reading group file #{group_file}: #{e}"
      else
        # YAML::parse yields a falsy value for an empty file
        group = document ? document.to_ruby : nil

        unless group.respond_to?(:key?) && group['name']
          log.warn "skipping group file #{group_file}: expected a mapping with a 'name'"
          next
        end

        groups[group['name']] = group['people'] || []
      end
    end
  end

  return groups
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'aws'
|
2
|
+
|
3
|
+
module Interferon::HostSources
|
4
|
+
class AwsDynamo
|
5
|
+
# Store the AWS credentials and decide which regions to scan for tables.
#
# @param options [Hash] must have the keys 'access_key_id' and
#   'secret_access_key'; a non-empty 'regions' list restricts the scan,
#   otherwise the names of all regions reported by AWS::regions are used
# @raise [ArgumentError] listing every required key that is absent
def initialize(options)
  absent = %w{access_key_id secret_access_key}.select { |key| !options.key?(key) }
  unless absent.empty?
    raise ArgumentError, "missing these required arguments for source AwsDynamo: #{absent.inspect}"
  end

  @access_key_id, @secret_access_key = options.values_at('access_key_id', 'secret_access_key')

  # initialize a list of regions to check
  requested = options['regions']
  @regions = (!requested || requested.empty?) ? AWS::regions.map(&:name) : requested
end
|
20
|
+
|
21
|
+
# Describe every DynamoDB table in the configured regions as a "host" hash.
#
# @return [Array<Hash>] one hash per table with its region, name and the
#   read/write capacity units; owners and owner_groups are always empty
def list_hosts
  @regions.each_with_object([]) do |region, found|
    dynamo = AWS::DynamoDB.new(
      :region => region,
      :access_key_id => @access_key_id,
      :secret_access_key => @secret_access_key
    )

    AWS.memoize do
      dynamo.tables.each do |table|
        entry = {
          :source => 'aws_dynamo',
          :region => region,
          :table_name => table.name,
        }
        entry[:read_capacity] = table.read_capacity_units
        entry[:write_capacity] = table.write_capacity_units

        # dynamodb does not support tagging
        entry[:owners] = []
        entry[:owner_groups] = []

        found << entry
      end
    end
  end
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'aws'
|
2
|
+
|
3
|
+
module Interferon::HostSources
|
4
|
+
class AwsElasticache
|
5
|
+
# Store the AWS credentials and decide which regions to scan for clusters.
#
# @param options [Hash] must have the keys 'access_key_id' and
#   'secret_access_key'; a non-empty 'regions' list restricts the scan,
#   otherwise the names of all regions reported by AWS::regions are used
# @raise [ArgumentError] listing every required key that is absent
def initialize(options)
  absent = %w{access_key_id secret_access_key}.select { |key| !options.key?(key) }
  unless absent.empty?
    raise ArgumentError, "missing these required arguments for source AwsElasticache: #{absent.inspect}"
  end

  @access_key_id, @secret_access_key = options.values_at('access_key_id', 'secret_access_key')

  # initialize a list of regions to check
  requested = options['regions']
  @regions = (!requested || requested.empty?) ? AWS::regions.map(&:name) : requested
end
|
20
|
+
|
21
|
+
# Describe every ElastiCache node in the configured regions as a "host" hash.
#
# Cache clusters are fetched page by page (following the response marker),
# and one hash is produced per cache node of each cluster.
#
# @return [Array<Hash>] per-node hashes carrying the cluster's id, status,
#   node type and node count plus the node's own status; owners and
#   owner_groups are always empty
def list_hosts
  hosts = []

  @regions.each do |region|
    api = AWS::ElastiCache.new(
      :region => region,
      :access_key_id => @access_key_id,
      :secret_access_key => @secret_access_key
    ).client

    AWS.memoize do
      # read the list of cache clusters; we have to do our own pagination
      clusters = []
      request = {:show_cache_node_info => true}
      loop do
        page = api.describe_cache_clusters(request).data
        clusters.concat(page[:cache_clusters])

        marker = page[:marker]
        break unless marker
        request[:marker] = marker
      end

      # iterate over the nodes in each cluster and add each one to hosts
      clusters.each do |cluster|
        shared = {
          :source => 'aws_elasticache',
          :region => region,
          :cluster_id => cluster[:cache_cluster_id],
          :cluster_status => cluster[:cache_cluster_status],
          :node_type => cluster[:cache_node_type],
          :peer_nodes => cluster[:num_cache_nodes],
        }

        cluster[:cache_nodes].each do |node|
          # elasticache does not support tagging
          hosts << shared.merge(
            :node_status => node[:cache_node_status],
            :owners => [],
            :owner_groups => []
          )
        end
      end
    end
  end

  hosts
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'aws'
|
2
|
+
|
3
|
+
module Interferon::HostSources
|
4
|
+
class AwsRds
|
5
|
+
# Store the AWS credentials and decide which regions to scan for instances.
#
# @param options [Hash] must have the keys 'access_key_id' and
#   'secret_access_key'; a non-empty 'regions' list restricts the scan,
#   otherwise the names of all regions reported by AWS::regions are used
# @raise [ArgumentError] listing every required key that is absent
def initialize(options)
  absent = %w{access_key_id secret_access_key}.select { |key| !options.key?(key) }
  unless absent.empty?
    raise ArgumentError, "missing these required arguments for source AwsRds: #{absent.inspect}"
  end

  @access_key_id, @secret_access_key = options.values_at('access_key_id', 'secret_access_key')

  # initialize a list of regions to check
  requested = options['regions']
  @regions = (!requested || requested.empty?) ? AWS::regions.map(&:name) : requested
end
|
20
|
+
|
21
|
+
# Describe every RDS instance in the configured regions as a "host" hash.
#
# For each instance the resource tags are looked up by ARN; the 'owners' and
# 'owner_groups' tags are split on commas, and 'db_env' / 'db_role' are
# passed through as-is (nil when the tag is absent).
#
# @return [Array<Hash>] one hash per instance with identity, storage,
#   replication and ownership details
def list_hosts
  hosts = []

  @regions.each do |region|
    rds = AWS::RDS.new(
      :region => region,
      :access_key_id => @access_key_id,
      :secret_access_key => @secret_access_key
    )

    AWS.memoize do
      rds.instances.each do |instance|
        # get the tags for the instance
        resource_arn = arn(region, instance.id)
        tag_list = rds.client.list_tags_for_resource(:resource_name => resource_arn)[:tag_list]
        tags = tag_list.each_with_object({}) { |tag, by_key| by_key[tag[:key]] = tag[:value] }

        # replication info
        source_id = instance.read_replica_source_db_instance_identifier
        replica_ids = instance.read_replica_db_instance_identifiers

        # build the host data for this instance
        hosts << {
          :source => 'aws_rds',
          :region => region,
          :instance_id => instance.id,
          :db_name => instance.db_name,
          :engine => instance.engine,
          :engine_version => instance.engine_version,

          # metrics
          :allocated_storage => instance.allocated_storage,
          :iops => instance.iops,

          :is_replica => !source_id.nil?,
          :replica_source_name => source_id,
          :replica_names => replica_ids.join(','),
          :replicas => replica_ids.count,

          # untagged instances get empty ownership lists
          :owners => (tags['owners'] || '').split(','),
          :owner_groups => (tags['owner_groups'] || '').split(','),

          :db_env => tags['db_env'],
          :db_role => tags['db_role'],
        }
      end
    end
  end

  hosts
end
|
71
|
+
|
72
|
+
private
|
73
|
+
# Build the RDS ARN string for a database instance, using account_number
# for the account segment.
#
# @param region region segment of the ARN
# @param instance_id identifier placed after "db:"
# @return [String]
def arn(region, instance_id)
  "arn:aws:rds:%s:%s:db:%s" % [region, account_number, instance_id]
end
|
76
|
+
|
77
|
+
# unfortunately, this appears to be the only way to get your account number
|
78
|
+
# Work out (and cache in @account_number) the account segment of the ARN
# belonging to the configured credentials.
#
# The ARN is read from IAM get_user. If that call is denied, the second
# whitespace-separated word of the AccessDenied message is used as the ARN
# instead.
#
# @return [String] fifth colon-separated field of that ARN
def account_number
  @account_number ||= begin
    caller_arn =
      begin
        iam = AWS::IAM.new(
          :access_key_id => @access_key_id,
          :secret_access_key => @secret_access_key
        )
        iam.client.get_user()[:user][:arn]
      rescue AWS::IAM::Errors::AccessDenied => denied
        denied.message.split[1]
      end

    caller_arn.split(':')[4]
  end
end
|
91
|
+
end
|
92
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module Interferon::HostSources
|
5
|
+
class Optica
|
6
|
+
include Logging
|
7
|
+
|
8
|
+
# Remember where the optica service lives.
#
# @param options [Hash] must contain 'host'; 'port' falls back to 80
# @raise [ArgumentError] when 'host' is missing
def initialize(options)
  host = options['host']
  raise ArgumentError, "missing host for optica source" unless host

  port = options['port']
  @host = host
  @port = port ? port : 80
end
|
15
|
+
|
16
|
+
# Fetch the node list from optica (GET / on @host:@port, 60 second open and
# read timeouts) and turn each entry of the response's 'nodes' into a host hash.
#
# @return [Array<Hash>] one hash per node with hostname, role, environment
#   and the people/groups from its 'ownership' (empty lists when absent)
def list_hosts
  http = Net::HTTP.new(@host, @port)
  http.open_timeout = 60
  http.read_timeout = 60

  payload = JSON.parse(http.get('/').body)

  payload['nodes'].map do |_ip, node|
    ownership = node['ownership']

    {
      :source => 'optica',
      :hostname => node['hostname'],
      :role => node['role'],
      :environment => node['environment'],

      :owners => (ownership && ownership['people']) || [],
      :owner_groups => (ownership && ownership['groups']) || [],
    }
  end
end
|
34
|
+
end
|
35
|
+
end
|