kennel 1.74.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Readme.md +244 -0
- data/lib/kennel.rb +90 -0
- data/lib/kennel/api.rb +83 -0
- data/lib/kennel/file_cache.rb +53 -0
- data/lib/kennel/github_reporter.rb +49 -0
- data/lib/kennel/importer.rb +135 -0
- data/lib/kennel/models/base.rb +29 -0
- data/lib/kennel/models/dashboard.rb +209 -0
- data/lib/kennel/models/monitor.rb +213 -0
- data/lib/kennel/models/project.rb +31 -0
- data/lib/kennel/models/record.rb +94 -0
- data/lib/kennel/models/slo.rb +92 -0
- data/lib/kennel/models/team.rb +12 -0
- data/lib/kennel/optional_validations.rb +21 -0
- data/lib/kennel/progress.rb +34 -0
- data/lib/kennel/settings_as_methods.rb +86 -0
- data/lib/kennel/subclass_tracking.rb +19 -0
- data/lib/kennel/syncer.rb +260 -0
- data/lib/kennel/tasks.rb +147 -0
- data/lib/kennel/template_variables.rb +38 -0
- data/lib/kennel/unmuted_alerts.rb +89 -0
- data/lib/kennel/utils.rb +159 -0
- data/lib/kennel/version.rb +4 -0
- data/template/Readme.md +205 -0
- metadata +109 -0
data/lib/kennel/tasks.rb
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "English"
|
3
|
+
require "kennel"
|
4
|
+
require "kennel/unmuted_alerts"
|
5
|
+
require "kennel/importer"
|
6
|
+
|
7
|
+
module Kennel
  # Helpers shared by the rake tasks defined below.
  module Tasks
    # Stop the current run with exit status 1.
    #
    # When a message is given it is written to Kennel.err first and also
    # attached to the raised SystemExit.
    def self.abort(message = nil)
      Kennel.err.puts(message) if message
      failure = SystemExit.new(1)
      raise failure, message
    end
  end
end
|
17
|
+
|
18
|
+
namespace :kennel do
|
19
|
+
# Runs `generate` first, then fails when git reports uncommitted changes under generated/.
desc "Ensure there are no uncommitted changes that would be hidden from PR reviewers"
task no_diff: :generate do
  result = `git status --porcelain generated/`.strip
  # check the exit status first: a failed git call must not be reported as a diff
  Kennel::Tasks.abort "Error during diffing" unless $CHILD_STATUS.success?
  Kennel::Tasks.abort "Diff found:\n#{result}\nrun `rake generate` and commit the diff to fix" unless result == ""
end
|
25
|
+
|
26
|
+
# ideally do this on every run, but it's slow (~1.5s) and brittle (might not find all + might find false-positives)
# https://help.datadoghq.com/hc/en-us/requests/254114 for automatic validation
desc "Verify that all used monitor mentions are valid"
task validate_mentions: :environment do
  # values of all handles returned by the notifications endpoint
  known = Kennel.send(:api)
    .send(:request, :get, "/monitor/notifications")
    .fetch(:handles)
    .values
    .flatten(1)
    .map { |v| v.fetch(:value) }

  # additional comma separated handles from the KNOWN env var are accepted as-is
  known += ENV["KNOWN"].to_s.split(",")

  bad = []
  Dir["generated/**/*.json"].each do |f|
    next unless message = JSON.parse(File.read(f))["message"]
    # a mention may also be the very first thing in the message, not only follow whitespace
    used = message.scan(/(?:\A|\s)(@[^\s{,'"]+)/).flatten(1)
      .grep(/^@.*@|^@.*-/) # ignore @here etc handles ... datadog uses @foo@bar.com for emails and @foo-bar for integrations
    (used - known).each { |v| bad << [f, v] }
  end

  if bad.any?
    url = Kennel::Utils.path_to_url "/account/settings"
    puts "Invalid mentions found, either ignore them by adding to `KNOWN` env var or add them via #{url}"
    bad.each { |f, v| puts "Invalid mention #{v} in monitor message of #{f}" }
    Kennel::Tasks.abort
  end
end
|
54
|
+
|
55
|
+
desc "generate local definitions"
task(generate: :environment) { Kennel.generate }

# plan depends on generate, so the generated files are refreshed and can be committed with the change
desc "show planned datadog changes (scope with PROJECT=name)"
task(plan: :generate) { Kennel.plan }

desc "update datadog (scope with PROJECT=name)"
task(update_datadog: :environment) { Kennel.update }
|
70
|
+
|
71
|
+
desc "update if this is a push to the default branch, otherwise plan"
task :travis do
  default_branch = ENV["DEFAULT_BRANCH"] || "master"
  pushed_to_default = ENV["TRAVIS_BRANCH"] == default_branch && ENV["TRAVIS_PULL_REQUEST"] == "false"
  # everything that is not a push to the default branch only shows the plan in the travis logs
  Rake::Task[pushed_to_default ? "kennel:update_datadog" : "kennel:plan"].invoke
end
|
84
|
+
|
85
|
+
desc "show unmuted alerts filtered by TAG, for example TAG=team:foo"
task alerts: :environment do
  tag = ENV["TAG"]
  Kennel::Tasks.abort("Call with TAG=foo:bar") unless tag
  Kennel::UnmutedAlerts.print(Kennel.send(:api), tag)
end
|
90
|
+
|
91
|
+
desc "show monitors with no data by TAG, for example TAG=team:foo"
task nodata: :environment do
  tag = ENV["TAG"]
  Kennel::Tasks.abort("Call with TAG=foo:bar") unless tag

  monitors = Kennel.send(:api).list("monitor", monitor_tags: tag, group_states: "no data")
  # keep monitors whose overall state is "No Data" unless they opted out via the nodata:ignore tag
  monitors.select! { |m| m[:overall_state] == "No Data" && !m[:tags].include?("nodata:ignore") }

  unless monitors.empty?
    Kennel.err.puts <<~TEXT
      This is a useful task to find monitors that have mis-spelled metrics or never received data at any time.
      To ignore monitors with nodata, tag the monitor with "nodata:ignore"

    TEXT
  end

  # name, link and a separating blank line per monitor
  monitors.each do |monitor|
    Kennel.out.puts monitor[:name]
    Kennel.out.puts Kennel::Utils.path_to_url("/monitors/#{monitor[:id]}")
    Kennel.out.puts
  end
end
|
111
|
+
|
112
|
+
desc "Convert existing resources to copy-pasteable definitions to import existing resources (call with URL= or call with RESOURCE= and ID=)"
task import: :environment do
  id = ENV["ID"]
  resource = ENV["RESOURCE"]
  url = ENV["URL"]

  if id && resource
    id = Integer(id) if id =~ /^\d+$/ # dashboards can have alphanumeric ids
  elsif url
    # the url replaces whatever was read from ID/RESOURCE
    resource, id = Kennel::Models::Record.parse_any_url(url) || Kennel::Tasks.abort("Unable to parse url")
  else
    resources = Kennel::Models::Record.subclasses.map(&:api_resource)
    Kennel::Tasks.abort("Call with URL= or call with RESOURCE=#{resources.join(" or ")} and ID=")
  end

  importer = Kennel::Importer.new(Kennel.send(:api))
  Kennel.out.puts importer.import(resource, id)
end
|
125
|
+
|
126
|
+
desc "Dump ALL of datadog config as raw json ... useful for grep/search TYPE=slo|monitor|dashboard"
task dump: :environment do
  # abort with a usage hint like the TAG based tasks do, instead of surfacing a raw KeyError
  type = ENV["TYPE"] || Kennel::Tasks.abort("Call with TYPE=slo|monitor|dashboard")
  Kennel.send(:api).list(type).each do |r|
    Kennel.out.puts JSON.pretty_generate(r)
  end
end
|
132
|
+
|
133
|
+
# Prerequisite task: requires kennel and dotenv, then hands `.env` to Dotenv.load.
task :environment do
  require "kennel"
  gem "dotenv"
  require "dotenv"
  source = ".env"

  # warn when users have things like DATADOG_TOKEN already set and it will not be loaded from .env
  if ENV["KENNEL_SILENCE_UPDATED_ENV"].nil?
    conflicting = Dotenv.parse(source).select { |key, value| ENV.key?(key) && ENV[key] != value }
    unless conflicting.empty?
      warn "Environment variables #{conflicting.keys.join(", ")} need to be unset to be sourced from #{source}"
    end
  end

  Dotenv.load(source)
end
|
147
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Kennel
  # Mixin that registers a `template_variables` setting (default: empty list) on the
  # including class and provides helpers to render and validate those variables.
  module TemplateVariables
    def self.included(base)
      base.settings :template_variables
      base.defaults(template_variables: -> { [] })
    end

    private

    # Expand plain String variables into the hash form; non-String entries are passed through.
    def render_template_variables
      (template_variables || []).map do |v|
        v.is_a?(String) ? { default: "*", prefix: v, name: v } : v
      end
    end

    # check for queries that do not use the variables and would be misleading
    # TODO: do the same check for apm_query and their group_by
    #
    # data: hash with :template_variables and the widgets stored under `key`
    # Calls invalid! when a query uses none of the variables, otherwise returns nil.
    def validate_template_variables(data, key)
      variables = (data[:template_variables] || []).map { |v| "$#{v.fetch(:name)}" }
      # nothing to enforce without variables: the alternation below would be empty and
      # flag every query that happens to contain no word boundary
      return if variables.empty?

      queries = data[key].flat_map do |widget|
        # group widgets carry their children under definition.widgets
        ([widget] + (widget.dig(:definition, :widgets) || [])).flat_map { |w| widget_queries(w) }
      end.compact
      bad = queries.grep_v(/(#{variables.map { |v| Regexp.escape(v) }.join("|")})\b/)
      return if bad.empty?

      invalid!(
        "queries #{bad.join(", ")} must use the template variables #{variables.join(", ")}\n" \
        "If that is not possible, add `validate: -> { false } # query foo in bar does not have baz tag`"
      )
    end

    # Collect the :q of every request of a widget (entries may be nil).
    def widget_queries(widget)
      requests = widget.dig(:definition, :requests) || []
      (requests.is_a?(Hash) ? requests.values : requests).map { |r| r[:q] } # hostmap widgets have hash requests
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
require "kennel"
|
3
|
+
|
4
|
+
# Show Alerts that are not muted and their alerting scopes
|
5
|
+
module Kennel
|
6
|
+
class UnmutedAlerts
|
7
|
+
COLORS = {
|
8
|
+
"Alert" => :red,
|
9
|
+
"Warn" => :yellow,
|
10
|
+
"No Data" => :cyan
|
11
|
+
}.freeze
|
12
|
+
|
13
|
+
class << self
|
14
|
+
# Write every unmuted alerting monitor for `tag` to Kennel.out: name, url and one
# line per remaining group (colored status, group name, time since last trigger).
def print(api, tag)
  monitors = filtered_monitors(api, tag)
  return Kennel.out.puts("No unmuted alerts found") if monitors.empty?

  monitors.each do |monitor|
    Kennel.out.puts monitor[:name]
    Kennel.out.puts Utils.path_to_url("/monitors/#{monitor[:id]}")
    monitor[:state][:groups].each do |group|
      status = group[:status]
      # statuses without a configured color use the :default color
      shade = COLORS[status] || :default
      elapsed = time_since(group[:last_triggered_ts])
      Kennel.out.puts "#{Kennel::Utils.color(shade, status)}\t#{group[:name]}\t#{elapsed}"
    end
    Kennel.out.puts
  end
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
# sort pod3 before pod11
# Replaces the monitor's groups hash with an array of its values in natural order of
# the comma separated parts of each group name.
def sort_groups!(monitor)
  state = monitor[:state]
  state[:groups] = state[:groups].values.sort_by do |group|
    group[:name].to_s.split(",").map { |part| Utils.natural_order(part) }
  end
end
|
40
|
+
|
41
|
+
# Format the seconds elapsed since unix timestamp `t` as HH:MM:SS.
def time_since(t)
  elapsed = Time.now.to_i - Integer(t)
  hours, remainder = elapsed.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
|
45
|
+
|
46
|
+
# Download the monitors for `tag` and reduce them to the ones that still have
# alerting groups that are neither silenced nor covered by an active downtime.
# Raises when no monitor carries the tag at all.
def filtered_monitors(api, tag)
  # Download all monitors with given tag
  monitors = Progress.progress("Downloading") do
    api.list("monitor", monitor_tags: tag, group_states: "all", with_downtimes: "true")
  end

  raise "No monitors for #{tag} found, check your spelling" if monitors.empty?

  # drop monitors that are not alerting or that are completely silenced
  monitors.delete_if { |m| m[:overall_state] == "OK" || m[:options][:silenced].key?(:*) }

  monitors.each do |m|
    groups = m[:state][:groups]

    # only keep groups that are alerting
    groups.delete_if { |_, group| ["OK", "Ignored"].include?(group[:status]) }

    # only keep alerting groups that are not silenced:
    # a group is silenced when every part of a silenced scope is part of the group's scope
    silenced_scopes = m[:options][:silenced].keys.map { |key| key.to_s.split(",") }
    groups.keep_if do |name, _|
      scope = name.to_s.split(",")
      silenced_scopes.none? { |silenced| (silenced - scope).empty? }
    end

    # only keep groups that are not covered by an active downtime
    downtimes = m[:matching_downtimes]
    next unless downtimes
    covered = downtimes.select { |d| d[:active] }.flat_map { |d| d[:groups] }
    groups.delete_if { |name, _| covered.include?(name.to_s) }
  end

  # only keep monitors with alerting groups
  monitors.keep_if { |m| !m[:state][:groups].empty? }

  # sort group alerts
  monitors.each { |m| sort_groups!(m) }
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/lib/kennel/utils.rb
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Kennel
|
3
|
+
module Utils
|
4
|
+
COLORS = { red: 31, green: 32, yellow: 33, cyan: 36, magenta: 35, default: 0 }.freeze
|
5
|
+
|
6
|
+
# IO subclass that forwards everything written to it to each of the given ios.
class TeeIO < IO
  # ios: objects responding to #write that receive every write.
  # IO#initialize is not called, only the overridden #write is meant to be used.
  def initialize(ios)
    @ios = ios
  end

  # Accepts any number of strings like IO#write does, so `puts`/`print` do not have to
  # fall back to the outdated single-argument interface.
  # Returns the number of bytes handed to each target.
  def write(*strings)
    @ios.each do |io|
      strings.each { |string| io.write string }
    end
    strings.sum { |string| string.to_s.bytesize }
  end
end
|
15
|
+
|
16
|
+
class << self
|
17
|
+
# Convert class-like or dashed names to snake_case, applying the rewrites in order.
def snake_case(string)
  rewrites = [
    [/::/, "_"], # Foo::Bar -> foo_bar
    [/([A-Z]+)([A-Z][a-z])/, '\1_\2'], # FOOBar -> foo_bar
    [/([a-z\d])([A-Z])/, '\1_\2'] # fooBar -> foo_bar
  ]
  underscored = rewrites.reduce(string) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
  underscored.tr("-", "_").downcase # foo-bar -> foo_bar
end
|
25
|
+
|
26
|
+
# simplified version of https://apidock.com/rails/ActiveSupport/Inflector/parameterize
def parameterize(string)
  slug = string.downcase
  slug = slug.gsub(/[^a-z0-9\-_]+/, "-") # remove unsupported
  slug = slug.gsub(/-{2,}/, "-") # remove duplicates
  slug.gsub(/^-|-$/, "") # remove leading/trailing
end
|
34
|
+
|
35
|
+
# Return value, or nil when it is nil or empty.
def presence(value)
  return nil if value.nil?

  value.empty? ? nil : value
end
|
38
|
+
|
39
|
+
# Prompt on Kennel.err (in red) and read one line from STDIN.
# Returns true only when the answer is exactly "y"; exits with status 1 on Ctrl+C.
def ask(question)
  # print instead of printf: the question is data, a "%" in it must not be treated as a format directive
  Kennel.err.print color(:red, "#{question} - press 'y' to continue: ")
  begin
    # gets returns nil at end of input, which counts as "no"
    STDIN.gets.to_s.chomp == "y"
  rescue Interrupt # do not show a backtrace if user decides to Ctrl+C here
    Kennel.err.print "\n"
    exit 1
  end
end
|
48
|
+
|
49
|
+
# Wrap text in the escape sequence for the named color, followed by a reset.
# Raises KeyError for names missing from COLORS.
def color(color, text)
  code = COLORS.fetch(color)
  "\e[#{code}m#{text}\e[0m"
end
|
52
|
+
|
53
|
+
# Remove color escape sequences and resolve backspaces by dropping the character before each one.
def strip_shell_control(text)
  without_colors = text.gsub(/\e\[\d+m(.*?)\e\[0m/, "\\1")
  without_colors.gsub(/.#{Regexp.escape("\b")}/, "")
end
|
56
|
+
|
57
|
+
# Swap Kennel.out for a StringIO while the block runs and return what was written to it.
# The previous Kennel.out is restored even when the block raises.
def capture_stdout
  previous = Kennel.out
  Kennel.out = StringIO.new
  yield
  captured = Kennel.out.string
  captured
ensure
  Kennel.out = previous
end
|
65
|
+
|
66
|
+
# Swap Kennel.err for a StringIO while the block runs and return what was written to it.
# The previous Kennel.err is restored even when the block raises.
def capture_stderr
  previous = Kennel.err
  Kennel.err = StringIO.new
  yield
  captured = Kennel.err.string
  captured
ensure
  Kennel.err = previous
end
|
74
|
+
|
75
|
+
# Run the block with Kennel.out and Kennel.err each wrapped in a TeeIO that also
# writes into one shared buffer; returns that buffer's content.
# Both streams are restored afterwards, even when the block raises.
def tee_output
  previous_out = Kennel.out
  previous_err = Kennel.err
  combined = StringIO.new
  Kennel.out = TeeIO.new([combined, previous_out])
  Kennel.err = TeeIO.new([combined, previous_err])
  yield
  combined.string
ensure
  Kennel.out = previous_out
  Kennel.err = previous_err
end
|
87
|
+
|
88
|
+
def capture_sh(command)
|
89
|
+
result = `#{command} 2>&1`
|
90
|
+
raise "Command failed:\n#{command}\n#{result}" unless $CHILD_STATUS.success?
|
91
|
+
result
|
92
|
+
end
|
93
|
+
|
94
|
+
# Prefix path with the datadog host for DATADOG_SUBDOMAIN; without that
# environment variable the path is returned unchanged.
def path_to_url(path)
  subdomain = ENV["DATADOG_SUBDOMAIN"]
  return path unless subdomain

  "https://#{subdomain}.datadoghq.com#{path}"
end
|
101
|
+
|
102
|
+
# Yield every item from up to `max` threads and return the block results in item order.
# The first StandardError found in the results is re-raised after all threads finished.
def parallel(items, max: 10)
  pool_size = [items.size, max].min
  queue = items.each_with_index.to_a
  results = Array.new(items.size)

  pool = Array.new(pool_size) do
    Thread.new do
      loop do
        item, index = queue.pop
        break if index.nil? # queue is drained

        results[index] =
          begin
            yield item
          rescue StandardError => error
            queue.clear # stop handing out the remaining work
            error
          end
      end
    end
  end

  pool.each(&:join)
  results.each { |result| raise result if result.is_a?(StandardError) }
end
|
124
|
+
|
125
|
+
# Split name into alternating text and number parts, with the digit runs converted to
# integers, e.g. "pod11" -> ["pod", 11].
def natural_order(name)
  name.split(/(\d+)/).map.with_index do |part, position|
    position.even? ? part : part.to_i
  end
end
|
128
|
+
|
129
|
+
# Run the block, re-running it up to `times` more times when one of `errors` is raised.
# Every retry is announced on Kennel.err; the last error is re-raised once retries are used up.
def retry(*errors, times:)
  remaining = times
  begin
    yield
  rescue *errors => e
    remaining -= 1
    raise if remaining.negative?

    Kennel.err.puts "Error #{e}, #{remaining} retries left"
    retry
  end
end
|
137
|
+
|
138
|
+
# https://stackoverflow.com/questions/20235206/ruby-get-all-keys-in-a-hash-including-sub-keys/53876255#53876255
# Collect the keys of a hash together with the keys of all hashes nested in its
# values or inside arrays; anything else contributes no keys.
def all_keys(items)
  if items.is_a?(Hash)
    items.keys + items.values.flat_map { |value| all_keys(value) }
  elsif items.is_a?(Array)
    items.flat_map { |item| all_keys(item) }
  else
    []
  end
end
|
146
|
+
|
147
|
+
# TODO: use awesome-print or similar, but it has too many monkey-patches
# https://github.com/amazing-print/amazing_print/issues/36
# Inspect object, rewrite `:key=>` as `key: ` and pad the inside of braces with spaces.
def pretty_inspect(object)
  text = object.inspect
  text.gsub!(/:([a-z_]+)=>/, "\\1: ")
  # each pass pads one nesting level; stop after 10 passes or once nothing changes
  passes = 0
  passes += 1 while passes < 10 && text.gsub!(/{(\S.*?\S)}/, "{ \\1 }")
  text
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
data/template/Readme.md
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
# Kennel
|
2
|
+
|
3
|
+

|
4
|
+
|
5
|
+
Manage datadog monitors/dashboards/slos as code
|
6
|
+
|
7
|
+
- Documented, reusable, and searchable
|
8
|
+
- Changes are PR reviewed and auditable
|
9
|
+
- Updating shows diff before applying
|
10
|
+
- Automated import of existing monitors/dashboards/slos
|
11
|
+
|
12
|
+

|
13
|
+
|
14
|
+
## Structure
|
15
|
+
|
16
|
+
- `projects/` monitors/dashboards/etc scoped by project
|
17
|
+
- `teams/` team definitions
|
18
|
+
- `parts/` monitors/dashboards/etc that are used by multiple projects
|
19
|
+
- `generated/` projects as json, to show current state and proposed changes in PRs
|
20
|
+
|
21
|
+
## Workflows
|
22
|
+
|
23
|
+
### Setup
|
24
|
+
- clone the repo
|
25
|
+
- `gem install bundler && bundle install`
|
26
|
+
- `cp .env.example .env`
|
27
|
+
- open [Datadog API Settings](https://app.datadoghq.com/account/settings#api)
|
28
|
+
- copy any `API Key` and add it to `.env` as `DATADOG_API_KEY`
|
29
|
+
- find or create (check last page) your personal "Application Key" and add it to `.env` as `DATADOG_APP_KEY=`
|
30
|
+
- change the `DATADOG_SUBDOMAIN=app` in `.env` to your company's subdomain if you have one
|
31
|
+
- verify it works by running `rake plan`, it might show some diff, but should not crash
|
32
|
+
|
33
|
+
### Adding a team
|
34
|
+
|
35
|
+
- `mention` is used for all team monitors via `super()`
|
36
|
+
- `renotify_interval` is used for all team monitors (defaults to `0` / off)
|
37
|
+
- `tags` is used for all team monitors/dashboards (defaults to `team:<team-name>`)
|
38
|
+
|
39
|
+
```Ruby
|
40
|
+
# teams/my_team.rb
|
41
|
+
module Teams
|
42
|
+
class MyTeam < Kennel::Models::Team
|
43
|
+
defaults(
|
44
|
+
mention: -> { "@slack-my-team" }
|
45
|
+
)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
```
|
49
|
+
|
50
|
+
### Adding a new monitor
|
51
|
+
- use [datadog monitor UI](https://app.datadoghq.com/monitors#create) to create a monitor
|
52
|
+
- see below
|
53
|
+
|
54
|
+
### Updating an existing monitor
|
55
|
+
- use [datadog monitor UI](https://app.datadoghq.com/monitors/manage) to find a monitor
|
56
|
+
- get the `id` from the url
|
57
|
+
- run `URL='https://app.datadoghq.com/monitors/123' bundle exec rake kennel:import` and copy the output
|
58
|
+
- find or create a project in `projects/`
|
59
|
+
- add the monitor to `parts: [` list, for example:
|
60
|
+
```Ruby
|
61
|
+
# projects/my_project.rb
|
62
|
+
class MyProject < Kennel::Models::Project
|
63
|
+
defaults(
|
64
|
+
team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
|
65
|
+
parts: -> {
|
66
|
+
[
|
67
|
+
Kennel::Models::Monitor.new(
|
68
|
+
self,
|
69
|
+
id: -> { 123456 }, # id from datadog url, not necessary when creating a new monitor
|
70
|
+
type: -> { "query alert" },
|
71
|
+
kennel_id: -> { "load-too-high" }, # make up a unique name
|
72
|
+
name: -> { "Foobar Load too high" }, # nice descriptive name that will show up in alerts and emails
|
73
|
+
message: -> {
|
74
|
+
# Explain what behavior to expect and how to fix the cause
|
75
|
+
# Use #{super()} to add team notifications.
|
76
|
+
<<~TEXT
|
77
|
+
Foobar will be slow and that could cause Barfoo to go down.
|
78
|
+
Add capacity or debug why it is suddenly slow.
|
79
|
+
#{super()}
|
80
|
+
TEXT
|
81
|
+
},
|
82
|
+
query: -> { "avg(last_5m):avg:system.load.5{hostgroup:api} by {pod} > #{critical}" }, # replace actual value with #{critical} to keep them in sync
|
83
|
+
critical: -> { 20 }
|
84
|
+
)
|
85
|
+
]
|
86
|
+
}
|
87
|
+
)
|
88
|
+
end
|
89
|
+
```
|
90
|
+
- run `PROJECT=my_project bundle exec rake plan`, an Update to the existing monitor should be shown (not Create / Delete)
|
91
|
+
- alternatively: `bundle exec rake generate` to only locally update the generated `json` files
|
92
|
+
- review changes then `git commit`
|
93
|
+
- make a PR ... get reviewed ... merge
|
94
|
+
- datadog is updated by travis
|
95
|
+
|
96
|
+
### Adding a new dashboard
|
97
|
+
- go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) and click on _New Dashboard_ to create a dashboard
|
98
|
+
- see below
|
99
|
+
|
100
|
+
### Updating an existing dashboard
|
101
|
+
- go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) to find a dashboard
|
102
|
+
- get the `id` from the url
|
103
|
+
- run `URL='https://app.datadoghq.com/dashboard/bet-foo-bar' bundle exec rake kennel:import` and copy the output
|
104
|
+
- find or create a project in `projects/`
|
105
|
+
- add a dashboard to `parts: [` list, for example:
|
106
|
+
```Ruby
|
107
|
+
class MyProject < Kennel::Models::Project
|
108
|
+
defaults(
|
109
|
+
team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
|
110
|
+
parts: -> {
|
111
|
+
[
|
112
|
+
Kennel::Models::Dashboard.new(
|
113
|
+
self,
|
114
|
+
id: -> { "abc-def-ghi" }, # id from datadog url, not needed when creating a new dashboard
|
115
|
+
title: -> { "My Dashboard" },
|
116
|
+
description: -> { "Overview of foobar" },
|
117
|
+
template_variables: -> { ["environment"] }, # see https://docs.datadoghq.com/api/?lang=ruby#timeboards
|
118
|
+
kennel_id: -> { "overview-dashboard" }, # make up a unique name
|
119
|
+
layout_type: -> { "ordered" },
|
120
|
+
definitions: -> {
|
121
|
+
[ # An array of arrays, each one is a graph in the dashboard, alternatively a hash for finer control
|
122
|
+
[
|
123
|
+
# title, viz, type, query, edit an existing graph and see the json definition
|
124
|
+
"Graph name", "timeseries", "area", "sum:mystats.foobar{$environment}"
|
125
|
+
],
|
126
|
+
[
|
127
|
+
# queries can be an Array as well, this will generate multiple requests
|
128
|
+
# for a single graph
|
129
|
+
"Graph name", "timeseries", "area", ["sum:mystats.foobar{$environment}", "sum:mystats.success{$environment}"],
|
130
|
+
# add events too ...
|
131
|
+
events: [{q: "tags:foobar,deploy", tags_execution: "and"}]
|
132
|
+
]
|
133
|
+
]
|
134
|
+
}
|
135
|
+
)
|
136
|
+
]
|
137
|
+
}
|
138
|
+
)
|
139
|
+
end
|
140
|
+
```
|
141
|
+
|
142
|
+
### Skipping validations
|
143
|
+
|
144
|
+
Some validations might be too strict for your use case or just wrong, please [open an issue](https://github.com/grosser/kennel/issues) and
|
145
|
+
to unblock use the `validate: -> { false }` option.
|
146
|
+
|
147
|
+
### Linking with kennel_ids
|
148
|
+
|
149
|
+
To link to existing monitors via their kennel_id
|
150
|
+
|
151
|
+
- Screens `uptime` widgets can use `monitor: {id: "foo:bar"}`
|
152
|
+
- Screens `alert_graph` widgets can use `alert_id: "foo:bar"`
|
153
|
+
- Monitors `composite` can use `query: -> { "%{foo:bar} || %{foo:baz}" }`
|
154
|
+
|
155
|
+
### Debugging changes locally
|
156
|
+
|
157
|
+
- rebase on updated `master` to not undo other changes
|
158
|
+
- figure out project name by converting the class name to snake-case
|
159
|
+
- run `PROJECT=foo bundle exec rake kennel:update_datadog` to test changes for a single project
|
160
|
+
|
161
|
+
### Listing un-muted alerts
|
162
|
+
|
163
|
+
Run `rake kennel:alerts TAG=service:my-service` to see all un-muted alerts for a given datadog monitor tag.
|
164
|
+
|
165
|
+
### Validating mentions work
|
166
|
+
|
167
|
+
`rake kennel:validate_mentions` should run as part of CI
|
168
|
+
|
169
|
+
### Grepping through all of datadog
|
170
|
+
|
171
|
+
`TYPE=monitor rake kennel:dump`
|
172
|
+
|
173
|
+
### Find all monitors with No-Data
|
174
|
+
|
175
|
+
`rake kennel:nodata TAG=team:foo`
|
176
|
+
|
177
|
+
## Examples
|
178
|
+
|
179
|
+
### Reusable monitors/dashes/etc
|
180
|
+
|
181
|
+
Add to `parts/<folder>`.
|
182
|
+
|
183
|
+
```Ruby
|
184
|
+
module Monitors
|
185
|
+
class LoadTooHigh < Kennel::Models::Monitor
|
186
|
+
defaults(
|
187
|
+
name: -> { "#{project.name} load too high" },
|
188
|
+
message: -> { "Shut it down!" },
|
189
|
+
type: -> { "query alert" },
|
190
|
+
query: -> { "avg(last_5m):avg:system.load.5{hostgroup:#{project.kennel_id}} by {pod} > #{critical}" }
|
191
|
+
)
|
192
|
+
end
|
193
|
+
end
|
194
|
+
```
|
195
|
+
|
196
|
+
Reuse it in multiple projects.
|
197
|
+
|
198
|
+
```Ruby
|
199
|
+
class Database < Kennel::Models::Project
|
200
|
+
defaults(
|
201
|
+
team: -> { Kennel::Models::Team.new(mention: -> { '@slack-foo' }, kennel_id: -> { 'foo' }) },
|
202
|
+
parts: -> { [Monitors::LoadTooHigh.new(self, critical: -> { 13 })] }
|
203
|
+
)
|
204
|
+
end
|
205
|
+
```
|