kennel 1.74.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Readme.md +244 -0
- data/lib/kennel.rb +90 -0
- data/lib/kennel/api.rb +83 -0
- data/lib/kennel/file_cache.rb +53 -0
- data/lib/kennel/github_reporter.rb +49 -0
- data/lib/kennel/importer.rb +135 -0
- data/lib/kennel/models/base.rb +29 -0
- data/lib/kennel/models/dashboard.rb +209 -0
- data/lib/kennel/models/monitor.rb +213 -0
- data/lib/kennel/models/project.rb +31 -0
- data/lib/kennel/models/record.rb +94 -0
- data/lib/kennel/models/slo.rb +92 -0
- data/lib/kennel/models/team.rb +12 -0
- data/lib/kennel/optional_validations.rb +21 -0
- data/lib/kennel/progress.rb +34 -0
- data/lib/kennel/settings_as_methods.rb +86 -0
- data/lib/kennel/subclass_tracking.rb +19 -0
- data/lib/kennel/syncer.rb +260 -0
- data/lib/kennel/tasks.rb +147 -0
- data/lib/kennel/template_variables.rb +38 -0
- data/lib/kennel/unmuted_alerts.rb +89 -0
- data/lib/kennel/utils.rb +159 -0
- data/lib/kennel/version.rb +4 -0
- data/template/Readme.md +205 -0
- metadata +109 -0
data/lib/kennel/tasks.rb
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "English"
|
3
|
+
require "kennel"
|
4
|
+
require "kennel/unmuted_alerts"
|
5
|
+
require "kennel/importer"
|
6
|
+
|
7
|
+
module Kennel
  # Helpers shared by the kennel rake tasks.
  module Tasks
    # Stop the current run with exit status 1.
    #
    # message - optional text; when given it is printed to Kennel.err and
    #           also attached to the raised SystemExit.
    #
    # Always raises SystemExit (status 1), never returns.
    def self.abort(message = nil)
      Kennel.err.puts(message) if message
      failure = SystemExit.new(1)
      raise failure, message
    end
  end
end
|
17
|
+
|
18
|
+
# Rake tasks for working with kennel; all of them live in the `kennel` namespace.
# Tasks that depend on :environment get kennel and the `.env` file loaded first.
namespace :kennel do
  desc "Ensure there are no uncommitted changes that would be hidden from PR reviewers"
  task no_diff: :generate do
    # --porcelain prints one line per changed file, so empty output means "no diff"
    result = `git status --porcelain generated/`.strip
    Kennel::Tasks.abort "Diff found:\n#{result}\nrun `rake generate` and commit the diff to fix" unless result == ""
    Kennel::Tasks.abort "Error during diffing" unless $CHILD_STATUS.success?
  end

  # ideally do this on every run, but it's slow (~1.5s) and brittle (might not find all + might find false-positives)
  # https://help.datadoghq.com/hc/en-us/requests/254114 for automatic validation
  desc "Verify that all used monitor mentions are valid"
  task validate_mentions: :environment do
    # handles datadog knows about, flattened to their plain values
    known = Kennel.send(:api)
      .send(:request, :get, "/monitor/notifications")
      .fetch(:handles)
      .values
      .flatten(1)
      .map { |v| v.fetch(:value) }

    # extra handles the user explicitly allows, comma separated
    known += ENV["KNOWN"].to_s.split(",")

    # collect [file, mention] pairs for every unknown mention in generated monitor messages
    bad = []
    Dir["generated/**/*.json"].each do |f|
      next unless message = JSON.parse(File.read(f))["message"]
      used = message.scan(/\s(@[^\s{,'"]+)/).flatten(1)
        .grep(/^@.*@|^@.*-/) # ignore @here etc handles ... datadog uses @foo@bar.com for emails and @foo-bar for integrations
      (used - known).each { |v| bad << [f, v] }
    end

    if bad.any?
      url = Kennel::Utils.path_to_url "/account/settings"
      puts "Invalid mentions found, either ignore them by adding to `KNOWN` env var or add them via #{url}"
      bad.each { |f, v| puts "Invalid mention #{v} in monitor message of #{f}" }
      Kennel::Tasks.abort
    end
  end

  desc "generate local definitions"
  task generate: :environment do
    Kennel.generate
  end

  # also generate parts so users see and commit updated generated automatically
  desc "show planned datadog changes (scope with PROJECT=name)"
  task plan: :generate do
    Kennel.plan
  end

  desc "update datadog (scope with PROJECT=name)"
  task update_datadog: :environment do
    Kennel.update
  end

  desc "update if this is a push to the default branch, otherwise plan"
  task :travis do
    on_default_branch = (ENV["TRAVIS_BRANCH"] == (ENV["DEFAULT_BRANCH"] || "master"))
    is_push = (ENV["TRAVIS_PULL_REQUEST"] == "false")
    task_name =
      if on_default_branch && is_push
        "kennel:update_datadog"
      else
        "kennel:plan" # show plan in travis logs
      end

    Rake::Task[task_name].invoke
  end

  desc "show unmuted alerts filtered by TAG, for example TAG=team:foo"
  task alerts: :environment do
    tag = ENV["TAG"] || Kennel::Tasks.abort("Call with TAG=foo:bar")
    Kennel::UnmutedAlerts.print(Kennel.send(:api), tag)
  end

  desc "show monitors with no data by TAG, for example TAG=team:foo"
  task nodata: :environment do
    tag = ENV["TAG"] || Kennel::Tasks.abort("Call with TAG=foo:bar")
    monitors = Kennel.send(:api).list("monitor", monitor_tags: tag, group_states: "no data")
    monitors.select! { |m| m[:overall_state] == "No Data" }
    monitors.reject! { |m| m[:tags].include? "nodata:ignore" }
    if monitors.any?
      Kennel.err.puts <<~TEXT
        This is a useful task to find monitors that have mis-spelled metrics or never received data at any time.
        To ignore monitors with nodata, tag the monitor with "nodata:ignore"

      TEXT
    end

    monitors.each do |m|
      Kennel.out.puts m[:name]
      Kennel.out.puts Kennel::Utils.path_to_url("/monitors/#{m[:id]}")
      Kennel.out.puts
    end
  end

  desc "Convert existing resources to copy-pasteable definitions to import existing resources (call with URL= or call with RESOURCE= and ID=)"
  task import: :environment do
    if (id = ENV["ID"]) && (resource = ENV["RESOURCE"])
      # dashboards can have alphanumeric ids, so only purely numeric ids are converted;
      # whole-string anchors + base 10 so multi-line values are not converted and "08" is not read as octal
      id = Integer(id, 10) if id.match?(/\A\d+\z/)
    elsif (url = ENV["URL"])
      resource, id = Kennel::Models::Record.parse_any_url(url) || Kennel::Tasks.abort("Unable to parse url")
    else
      possible_resources = Kennel::Models::Record.subclasses.map(&:api_resource)
      Kennel::Tasks.abort("Call with URL= or call with RESOURCE=#{possible_resources.join(" or ")} and ID=")
    end

    Kennel.out.puts Kennel::Importer.new(Kennel.send(:api)).import(resource, id)
  end

  desc "Dump ALL of datadog config as raw json ... useful for grep/search TYPE=slo|monitor|dashboard"
  task dump: :environment do
    # ENV.fetch raises KeyError when TYPE is not set
    Kennel.send(:api).list(ENV.fetch("TYPE")).each do |r|
      Kennel.out.puts JSON.pretty_generate(r)
    end
  end

  # prerequisite of most tasks: loads kennel and sources `.env` via dotenv
  task :environment do
    require "kennel"
    gem "dotenv"
    require "dotenv"
    source = ".env"

    # warn when users have things like DATADOG_TOKEN already set and it will not be loaded from .env
    unless ENV["KENNEL_SILENCE_UPDATED_ENV"]
      updated = Dotenv.parse(source).select { |k, v| ENV[k] && ENV[k] != v }
      warn "Environment variables #{updated.keys.join(", ")} need to be unset to be sourced from #{source}" if updated.any?
    end

    Dotenv.load(source)
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Kennel
  # Mixin that adds a `template_variables` setting (default: empty list) to the
  # including class, plus helpers to render and validate those variables.
  # The including class must provide `settings`, `defaults` and `invalid!`.
  module TemplateVariables
    def self.included(base)
      base.settings :template_variables
      base.defaults(template_variables: -> { [] })
    end

    private

    # Expand the configured template variables: plain strings become
    # `{ default: "*", prefix: name, name: name }`, anything else is passed through untouched.
    def render_template_variables
      (template_variables || []).map do |v|
        v.is_a?(String) ? { default: "*", prefix: v, name: v } : v
      end
    end

    # check for queries that do not use the variables and would be misleading
    # TODO: do the same check for apm_query and their group_by
    #
    # data - rendered definition, read via data[:template_variables] and data[key]
    # key  - key under which the list of widgets is stored in data
    #
    # Calls invalid! when at least one query uses none of the variables.
    def validate_template_variables(data, key)
      variables = (data[:template_variables] || []).map { |v| "$#{v.fetch(:name)}" }

      # without variables there is nothing a query could forget to use; bailing out here also
      # avoids building an empty-alternation regex that would reject queries without word characters
      return if variables.empty?

      # widgets can contain nested widgets (one level), so inspect both
      queries = data[key].flat_map do |widget|
        ([widget] + (widget.dig(:definition, :widgets) || [])).flat_map { |w| widget_queries(w) }
      end.compact

      uses_variable = /(#{variables.map { |v| Regexp.escape(v) }.join("|")})\b/
      bad = queries.grep_v(uses_variable)
      return if bad.empty?

      invalid!(
        "queries #{bad.join(", ")} must use the template variables #{variables.join(", ")}\n" \
        "If that is not possible, add `validate: -> { false } # query foo in bar does not have baz tag`"
      )
    end

    # All `q` values of a widget's requests (nil for requests without `q`).
    def widget_queries(widget)
      requests = widget.dig(:definition, :requests) || []
      (requests.is_a?(Hash) ? requests.values : requests).map { |r| r[:q] } # hostmap widgets have hash requests
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "kennel"
|
3
|
+
|
4
|
+
# Show Alerts that are not muted and their alerting scopes
|
5
|
+
module Kennel
  # Prints monitors for a tag that are alerting and not muted, one line per alerting group.
  class UnmutedAlerts
    # terminal color per group status; any other status is printed with :default
    COLORS = {
      "Alert" => :red,
      "Warn" => :yellow,
      "No Data" => :cyan
    }.freeze

    class << self
      # Write all unmuted alerts for `tag` to Kennel.out:
      # monitor name, monitor url, then status / group name / time since last trigger per group.
      def print(api, tag)
        alerting = filtered_monitors(api, tag)
        return Kennel.out.puts("No unmuted alerts found") if alerting.empty?

        alerting.each do |monitor|
          Kennel.out.puts monitor[:name]
          Kennel.out.puts Utils.path_to_url("/monitors/#{monitor[:id]}")
          monitor[:state][:groups].each do |group|
            status = group[:status]
            elapsed = time_since(group[:last_triggered_ts])
            label = Kennel::Utils.color(COLORS[status] || :default, status)
            Kennel.out.puts "#{label}\t#{group[:name]}\t#{elapsed}"
          end
          Kennel.out.puts
        end
      end

      private

      # sort pod3 before pod11
      # replaces the groups hash of the monitor with a sorted array of its values
      def sort_groups!(monitor)
        state = monitor[:state]
        state[:groups] = state[:groups].values.sort_by do |group|
          group[:name].to_s.split(",").map { |part| Utils.natural_order(part) }
        end
      end

      # Time elapsed since unix timestamp `t`, formatted as HH:MM:SS.
      def time_since(t)
        elapsed = Time.now.to_i - Integer(t)
        hours, rest = elapsed.divmod(3600)
        minutes, seconds = rest.divmod(60)
        format("%02d:%02d:%02d", hours, minutes, seconds)
      end

      # Download the monitors for `tag` and strip everything that is ok, silenced or in a downtime.
      # Raises when no monitor with that tag exists at all.
      def filtered_monitors(api, tag)
        # Download all monitors with given tag
        monitors = Progress.progress("Downloading") do
          api.list("monitor", monitor_tags: tag, group_states: "all", with_downtimes: "true")
        end
        raise "No monitors for #{tag} found, check your spelling" if monitors.empty?

        # drop monitors that are not alerting
        monitors.delete_if { |m| m[:overall_state] == "OK" }

        # drop monitors that are completely silenced
        monitors.delete_if { |m| m[:options][:silenced].key?(:*) }

        # drop groups that are not alerting
        monitors.each do |m|
          m[:state][:groups].delete_if { |_, group| %w[OK Ignored].include?(group[:status]) }
        end

        # drop alerting groups that are silenced: a group is silenced when all tags of a silenced scope are in it
        monitors.each do |m|
          muted_scopes = m[:options][:silenced].keys.map { |key| key.to_s.split(",") }
          m[:state][:groups].keep_if do |name, _|
            tags = name.to_s.split(",")
            muted_scopes.none? { |muted| (muted - tags).empty? }
          end
        end

        # drop groups that are covered by an active downtime
        monitors.each do |m|
          downtimes = m[:matching_downtimes]
          next unless downtimes
          covered = downtimes.select { |d| d[:active] }.flat_map { |d| d[:groups] }
          m[:state][:groups].delete_if { |name, _| covered.include?(name.to_s) }
        end

        # drop monitors without any alerting group left
        monitors.keep_if { |m| m[:state][:groups].any? }

        # sort group alerts
        monitors.each { |m| sort_groups!(m) }
      end
    end
  end
end
|
data/lib/kennel/utils.rb
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Kennel
  # Stateless helpers: string conversion, terminal output, output capturing,
  # shelling out and small concurrency / retry utilities.
  module Utils
    # ANSI color codes usable with .color
    COLORS = { red: 31, green: 32, yellow: 33, cyan: 36, magenta: 35, default: 0 }.freeze

    # IO that forwards everything written to it to each of the given ios.
    # IO#initialize is not called, only methods that end up in #write are usable.
    class TeeIO < IO
      def initialize(ios)
        @ios = ios
      end

      # Accepts any number of strings like IO#write does, so IO#puts / IO#print
      # can hand over all their pieces in a single call.
      def write(*strings)
        @ios.each { |io| strings.each { |string| io.write string } }
      end
    end

    class << self
      def snake_case(string)
        string
          .gsub(/::/, "_") # Foo::Bar -> foo_bar
          .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2') # FOOBar -> foo_bar
          .gsub(/([a-z\d])([A-Z])/, '\1_\2') # fooBar -> foo_bar
          .tr("-", "_") # foo-bar -> foo_bar
          .downcase
      end

      # simplified version of https://apidock.com/rails/ActiveSupport/Inflector/parameterize
      def parameterize(string)
        string
          .downcase
          .gsub(/[^a-z0-9\-_]+/, "-") # remove unsupported
          .gsub(/-{2,}/, "-") # remove duplicates
          .gsub(/^-|-$/, "") # remove leading/trailing
      end

      # value, or nil when value is nil or empty
      def presence(value)
        value.nil? || value.empty? ? nil : value
      end

      # Print question to Kennel.err and read the answer from STDIN.
      # Returns true only when the user answered with exactly "y"; exits with status 1 on Ctrl+C.
      def ask(question)
        # print instead of printf: the question is not a format string and may contain "%"
        Kennel.err.print color(:red, "#{question} - press 'y' to continue: ")
        begin
          STDIN.gets.chomp == "y"
        rescue Interrupt # do not show a backtrace if user decides to Ctrl+C here
          Kennel.err.print "\n"
          exit 1
        end
      end

      # Wrap text in the ANSI escape sequence for color, raises KeyError for colors not in COLORS
      def color(color, text)
        "\e[#{COLORS.fetch(color)}m#{text}\e[0m"
      end

      # Remove color sequences (keeping the colored text) and characters erased via backspace
      def strip_shell_control(text)
        text.gsub(/\e\[\d+m(.*?)\e\[0m/, "\\1").gsub(/.#{Regexp.escape("\b")}/, "")
      end

      # Run the block with Kennel.out replaced by a buffer, returns what was written to it
      def capture_stdout
        old = Kennel.out
        Kennel.out = StringIO.new
        yield
        Kennel.out.string
      ensure
        Kennel.out = old
      end

      # Run the block with Kennel.err replaced by a buffer, returns what was written to it
      def capture_stderr
        old = Kennel.err
        Kennel.err = StringIO.new
        yield
        Kennel.err.string
      ensure
        Kennel.err = old
      end

      # Run the block while copying everything written to Kennel.out and Kennel.err
      # into one shared buffer (output still reaches the original ios), returns the buffer content
      def tee_output
        old_stdout = Kennel.out
        old_stderr = Kennel.err
        capture = StringIO.new
        Kennel.out = TeeIO.new([capture, Kennel.out])
        Kennel.err = TeeIO.new([capture, Kennel.err])
        yield
        capture.string
      ensure
        Kennel.out = old_stdout
        Kennel.err = old_stderr
      end

      # Run command in a shell and return its combined stdout + stderr, raises when it exits unsuccessfully
      def capture_sh(command)
        result = `#{command} 2>&1`
        raise "Command failed:\n#{command}\n#{result}" unless $CHILD_STATUS.success?
        result
      end

      # Absolute datadog url for path when DATADOG_SUBDOMAIN is set, otherwise the path itself
      def path_to_url(path)
        if subdomain = ENV["DATADOG_SUBDOMAIN"]
          "https://#{subdomain}.datadoghq.com#{path}"
        else
          path
        end
      end

      # Call the block once per item on up to `max` threads.
      # Returns the block results in the order of items.
      # When a block raises a StandardError, work that was not started is dropped
      # and the error is raised after all workers finished.
      def parallel(items, max: 10)
        threads = [items.size, max].min
        work = items.each_with_index.to_a
        done = Array.new(items.size)
        workers = Array.new(threads).map do
          Thread.new do
            loop do
              item, i = work.pop
              break unless i # nothing left to do
              done[i] =
                begin
                  yield item
                rescue StandardError => e
                  work.clear
                  e
                end
            end
          end
        end
        workers.each(&:join)
        done.each { |d| raise d if d.is_a?(StandardError) }
      end

      # Split name into text and integer parts so that "pod3" sorts before "pod11"
      def natural_order(name)
        name.split(/(\d+)/).each_with_index.map { |x, i| i.odd? ? x.to_i : x }
      end

      # Run the block, re-running it up to `times` times when one of `errors` is raised.
      # Each retry is announced on Kennel.err; the last error is re-raised when no retries are left.
      def retry(*errors, times:)
        yield
      rescue *errors => e
        times -= 1
        raise if times < 0
        Kennel.err.puts "Error #{e}, #{times} retries left"
        retry
      end

      # https://stackoverflow.com/questions/20235206/ruby-get-all-keys-in-a-hash-including-sub-keys/53876255#53876255
      def all_keys(items)
        case items
        when Hash then items.keys + items.values.flat_map { |v| all_keys(v) }
        when Array then items.flat_map { |i| all_keys(i) }
        else []
        end
      end

      # TODO: use awesome-print or similar, but it has too many monkey-patches
      # https://github.com/amazing-print/amazing_print/issues/36
      #
      # object.inspect with `:key=>` rewritten to `key: ` and padding inside of braces
      def pretty_inspect(object)
        # non-destructive gsub: inspect can return a frozen string (for example for true / false)
        string = object.inspect.gsub(/:([a-z_]+)=>/, "\\1: ")
        # every pass pads one more level of nested braces, stop when nothing changed
        10.times do
          string.gsub!(/{(\S.*?\S)}/, "{ \\1 }") || break
        end
        string
      end
    end
  end
end
|
data/template/Readme.md
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
# Kennel
|
2
|
+
|
3
|
+
![](github/cage.jpg?raw=true)
|
4
|
+
|
5
|
+
Manage datadog monitors/dashboards/slos as code
|
6
|
+
|
7
|
+
- Documented, reusable, and searchable
|
8
|
+
- Changes are PR reviewed and auditable
|
9
|
+
- Updating shows diff before applying
|
10
|
+
- Automated import of existing monitors/dashboards/slos
|
11
|
+
|
12
|
+
![](github/screen.png?raw=true)
|
13
|
+
|
14
|
+
## Structure
|
15
|
+
|
16
|
+
- `projects/` monitors/dashboards/etc scoped by project
|
17
|
+
- `teams/` team definitions
|
18
|
+
- `parts/` monitors/dashboards/etc that are used by multiple projects
|
19
|
+
- `generated/` projects as json, to show current state and proposed changes in PRs
|
20
|
+
|
21
|
+
## Workflows
|
22
|
+
|
23
|
+
### Setup
|
24
|
+
- clone the repo
|
25
|
+
- `gem install bundler && bundle install`
|
26
|
+
- `cp .env.example .env`
|
27
|
+
- open [Datadog API Settings](https://app.datadoghq.com/account/settings#api)
|
28
|
+
- copy any `API Key` and add it to `.env` as `DATADOG_API_KEY`
|
29
|
+
- find or create (check last page) your personal "Application Key" and add it to `.env` as `DATADOG_APP_KEY=`
|
30
|
+
- change the `DATADOG_SUBDOMAIN=app` in `.env` to your company's subdomain if you have one
|
31
|
+
- verify it works by running `rake plan`, it might show some diff, but should not crash
|
32
|
+
|
33
|
+
### Adding a team
|
34
|
+
|
35
|
+
- `mention` is used for all team monitors via `super()`
|
36
|
+
- `renotify_interval` is used for all team monitors (defaults to `0` / off)
|
37
|
+
- `tags` is used for all team monitors/dashboards (defaults to `team:<team-name>`)
|
38
|
+
|
39
|
+
```Ruby
|
40
|
+
# teams/my_team.rb
|
41
|
+
module Teams
|
42
|
+
class MyTeam < Kennel::Models::Team
|
43
|
+
defaults(
|
44
|
+
mention: -> { "@slack-my-team" }
|
45
|
+
)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
```
|
49
|
+
|
50
|
+
### Adding a new monitor
|
51
|
+
- use [datadog monitor UI](https://app.datadoghq.com/monitors#create) to create a monitor
|
52
|
+
- see below
|
53
|
+
|
54
|
+
### Updating an existing monitor
|
55
|
+
- use [datadog monitor UI](https://app.datadoghq.com/monitors/manage) to find a monitor
|
56
|
+
- get the `id` from the url
|
57
|
+
- run `URL='https://app.datadoghq.com/monitors/123' bundle exec rake kennel:import` and copy the output
|
58
|
+
- find or create a project in `projects/`
|
59
|
+
- add the monitor to `parts: [` list, for example:
|
60
|
+
```Ruby
|
61
|
+
# projects/my_project.rb
|
62
|
+
class MyProject < Kennel::Models::Project
|
63
|
+
defaults(
|
64
|
+
team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
|
65
|
+
parts: -> {
|
66
|
+
[
|
67
|
+
Kennel::Models::Monitor.new(
|
68
|
+
self,
|
69
|
+
id: -> { 123456 }, # id from datadog url, not necessary when creating a new monitor
|
70
|
+
type: -> { "query alert" },
|
71
|
+
kennel_id: -> { "load-too-high" }, # make up a unique name
|
72
|
+
name: -> { "Foobar Load too high" }, # nice descriptive name that will show up in alerts and emails
|
73
|
+
message: -> {
|
74
|
+
# Explain what behavior to expect and how to fix the cause
|
75
|
+
# Use #{super()} to add team notifications.
|
76
|
+
<<~TEXT
|
77
|
+
Foobar will be slow and that could cause Barfoo to go down.
|
78
|
+
Add capacity or debug why it is suddenly slow.
|
79
|
+
#{super()}
|
80
|
+
TEXT
|
81
|
+
},
|
82
|
+
query: -> { "avg(last_5m):avg:system.load.5{hostgroup:api} by {pod} > #{critical}" }, # replace actual value with #{critical} to keep them in sync
|
83
|
+
critical: -> { 20 }
|
84
|
+
)
|
85
|
+
]
|
86
|
+
}
|
87
|
+
)
|
88
|
+
end
|
89
|
+
```
|
90
|
+
- run `PROJECT=my_project bundle exec rake plan`, an Update to the existing monitor should be shown (not Create / Delete)
|
91
|
+
- alternatively: `bundle exec rake generate` to only locally update the generated `json` files
|
92
|
+
- review changes then `git commit`
|
93
|
+
- make a PR ... get reviewed ... merge
|
94
|
+
- datadog is updated by travis
|
95
|
+
|
96
|
+
### Adding a new dashboard
|
97
|
+
- go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) and click on _New Dashboard_ to create a dashboard
|
98
|
+
- see below
|
99
|
+
|
100
|
+
### Updating an existing dashboard
|
101
|
+
- go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) to find a dashboard
|
102
|
+
- get the `id` from the url
|
103
|
+
- run `URL='https://app.datadoghq.com/dashboard/bet-foo-bar' bundle exec rake kennel:import` and copy the output
|
104
|
+
- find or create a project in `projects/`
|
105
|
+
- add a dashboard to `parts: [` list, for example:
|
106
|
+
```Ruby
|
107
|
+
class MyProject < Kennel::Models::Project
|
108
|
+
defaults(
|
109
|
+
team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
|
110
|
+
parts: -> {
|
111
|
+
[
|
112
|
+
Kennel::Models::Dashboard.new(
|
113
|
+
self,
|
114
|
+
id: -> { "abc-def-ghi" }, # id from datadog url, not needed when creating a new dashboard
|
115
|
+
title: -> { "My Dashboard" },
|
116
|
+
description: -> { "Overview of foobar" },
|
117
|
+
template_variables: -> { ["environment"] }, # see https://docs.datadoghq.com/api/?lang=ruby#timeboards
|
118
|
+
kennel_id: -> { "overview-dashboard" }, # make up a unique name
|
119
|
+
layout_type: -> { "ordered" },
|
120
|
+
definitions: -> {
|
121
|
+
[ # An array of arrays, each one is a graph in the dashboard, alternatively a hash for finer control
|
122
|
+
[
|
123
|
+
# title, viz, type, query, edit an existing graph and see the json definition
|
124
|
+
"Graph name", "timeseries", "area", "sum:mystats.foobar{$environment}"
|
125
|
+
],
|
126
|
+
[
|
127
|
+
# queries can be an Array as well, this will generate multiple requests
|
128
|
+
# for a single graph
|
129
|
+
"Graph name", "timeseries", "area", ["sum:mystats.foobar{$environment}", "sum:mystats.success{$environment}"],
|
130
|
+
# add events too ...
|
131
|
+
events: [{q: "tags:foobar,deploy", tags_execution: "and"}]
|
132
|
+
]
|
133
|
+
]
|
134
|
+
}
|
135
|
+
)
|
136
|
+
]
|
137
|
+
}
|
138
|
+
)
|
139
|
+
end
|
140
|
+
```
|
141
|
+
|
142
|
+
### Skipping validations
|
143
|
+
|
144
|
+
Some validations might be too strict for your use case or just wrong, please [open an issue](https://github.com/grosser/kennel/issues) and
|
145
|
+
to unblock use the `validate: -> { false }` option.
|
146
|
+
|
147
|
+
### Linking with kennel_ids
|
148
|
+
|
149
|
+
To link to existing monitors via their kennel_id
|
150
|
+
|
151
|
+
- Screens `uptime` widgets can use `monitor: {id: "foo:bar"}`
|
152
|
+
- Screens `alert_graph` widgets can use `alert_id: "foo:bar"`
|
153
|
+
- Monitors `composite` can use `query: -> { "%{foo:bar} || %{foo:baz}" }`
|
154
|
+
|
155
|
+
### Debugging changes locally
|
156
|
+
|
157
|
+
- rebase on updated `master` to not undo other changes
|
158
|
+
- figure out project name by converting the class name to snake-case
|
159
|
+
- run `PROJECT=foo bundle exec rake kennel:update_datadog` to test changes for a single project
|
160
|
+
|
161
|
+
### Listing un-muted alerts
|
162
|
+
|
163
|
+
Run `rake kennel:alerts TAG=service:my-service` to see all un-muted alerts for a given datadog monitor tag.
|
164
|
+
|
165
|
+
### Validating mentions work
|
166
|
+
|
167
|
+
`rake kennel:validate_mentions` should run as part of CI
|
168
|
+
|
169
|
+
### Grepping through all of datadog
|
170
|
+
|
171
|
+
`TYPE=monitor rake kennel:dump`
|
172
|
+
|
173
|
+
### Find all monitors with No-Data
|
174
|
+
|
175
|
+
`rake kennel:nodata TAG=team:foo`
|
176
|
+
|
177
|
+
## Examples
|
178
|
+
|
179
|
+
### Reusable monitors/dashes/etc
|
180
|
+
|
181
|
+
Add to `parts/<folder>`.
|
182
|
+
|
183
|
+
```Ruby
|
184
|
+
module Monitors
|
185
|
+
class LoadTooHigh < Kennel::Models::Monitor
|
186
|
+
defaults(
|
187
|
+
name: -> { "#{project.name} load too high" },
|
188
|
+
message: -> { "Shut it down!" },
|
189
|
+
type: -> { "query alert" },
|
190
|
+
query: -> { "avg(last_5m):avg:system.load.5{hostgroup:#{project.kennel_id}} by {pod} > #{critical}" }
|
191
|
+
)
|
192
|
+
end
|
193
|
+
end
|
194
|
+
```
|
195
|
+
|
196
|
+
Reuse it in multiple projects.
|
197
|
+
|
198
|
+
```Ruby
|
199
|
+
class Database < Kennel::Models::Project
|
200
|
+
defaults(
|
201
|
+
team: -> { Kennel::Models::Team.new(mention: -> { '@slack-foo' }, kennel_id: -> { 'foo' }) },
|
202
|
+
parts: -> { [Monitors::LoadTooHigh.new(self, critical: -> { 13 })] }
|
203
|
+
)
|
204
|
+
end
|
205
|
+
```
|