kennel 1.75.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
module Kennel
  # Mixin that adds a `template_variables` setting plus helpers to render the
  # variables and to validate that widget queries actually use them.
  # The including class must provide `settings`, `defaults` and `invalid!`.
  module TemplateVariables
    # Registers the `template_variables` setting on the including class,
    # defaulting to an empty list.
    def self.included(base)
      base.settings :template_variables
      base.defaults(template_variables: -> { [] })
    end

    private

    # Expands the `template_variables` setting into a list of hashes.
    # A String "foo" becomes `{ default: "*", prefix: "foo", name: "foo" }`,
    # any other entry is passed through unchanged. `nil` renders as `[]`.
    def render_template_variables
      (template_variables || []).map do |v|
        v.is_a?(String) ? { default: "*", prefix: v, name: v } : v
      end
    end

    # check for queries that do not use the variables and would be misleading
    # TODO: do the same check for apm_query and their group_by
    #
    # data - rendered resource; reads `data[:template_variables]` (hashes with a :name) and `data[key]`
    # key  - key under which the list of widgets is stored in `data`
    #
    # Calls `invalid!` when at least one query uses none of the variables, otherwise returns nil.
    def validate_template_variables(data, key)
      variables = (data[:template_variables] || []).map { |v| "$#{v.fetch(:name)}" }

      # Without variables the regex below would degrade to /()\b/, which only tests
      # "contains a word boundary" and rejects queries like "*" with an empty variable list.
      return if variables.empty?

      queries = data[key].flat_map do |widget|
        # group widgets nest their children under definition.widgets
        ([widget] + (widget.dig(:definition, :widgets) || [])).flat_map { |w| widget_queries(w) }
      end.compact

      # \b makes sure `$env` does not count as a use of `$environment`
      used = /(#{variables.map { |v| Regexp.escape(v) }.join("|")})\b/
      bad = queries.grep_v(used)
      return if bad.empty?

      invalid!(
        "queries #{bad.join(", ")} must use the template variables #{variables.join(", ")}\n" \
        "If that is not possible, add `validate: -> { false } # query foo in bar does not have baz tag`"
      )
    end

    # Returns the `q` of every request of a widget (nil for requests without `q`).
    def widget_queries(widget)
      requests = widget.dig(:definition, :requests) || []
      (requests.is_a?(Hash) ? requests.values : requests).map { |r| r[:q] } # hostmap widgets have hash requests
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+ require "kennel"
3
+
4
+ # Show Alerts that are not muted and their alerting scopes
5
module Kennel
  # Prints monitors that are alerting and not muted, together with their alerting groups.
  class UnmutedAlerts
    # group status -> color name passed to Utils.color, unknown statuses use :default
    COLORS = {
      "Alert" => :red,
      "Warn" => :yellow,
      "No Data" => :cyan
    }.freeze

    class << self
      # Writes every unmuted alerting monitor carrying `tag` to Kennel.out:
      # name, url and one line per alerting group (status, group name, time since last trigger).
      #
      # api - object responding to `list(type, **params)`
      # tag - monitor tag to filter by
      #
      # Raises RuntimeError when no monitor with that tag exists.
      def print(api, tag)
        monitors = filtered_monitors(api, tag)
        if monitors.empty?
          Kennel.out.puts "No unmuted alerts found"
        else
          monitors.each do |m|
            Kennel.out.puts m[:name]
            Kennel.out.puts Utils.path_to_url("/monitors/#{m[:id]}")
            m[:state][:groups].each do |g|
              color = COLORS[g[:status]] || :default
              since = "\t#{time_since(g[:last_triggered_ts])}"
              Kennel.out.puts "#{Kennel::Utils.color(color, g[:status])}\t#{g[:name]}#{since}"
            end
            Kennel.out.puts
          end
        end
      end

      private

      # sort pod3 before pod11
      # Replaces the groups hash of the monitor with a naturally sorted array of its values.
      def sort_groups!(monitor)
        groups = monitor[:state][:groups].values
        groups.sort_by! { |g| g[:name].to_s.split(",").map { |w| Utils.natural_order(w) } }
        monitor[:state][:groups] = groups
      end

      # Formats the seconds elapsed since unix timestamp `t` as HH:MM:SS.
      def time_since(t)
        diff = Time.now.to_i - Integer(t)
        "%02d:%02d:%02d" % [diff / 3600, diff / 60 % 60, diff % 60]
      end

      # Silenced scopes of a monitor as a hash, empty when the monitor has no `silenced` option
      # (previously a missing option raised NoMethodError).
      def silenced_scopes(monitor)
        monitor.dig(:options, :silenced) || {}
      end

      # Downloads all monitors with the given tag and reduces them to the ones that have
      # at least one alerting group that is neither silenced nor in an active downtime.
      # Groups of the returned monitors are sorted arrays (see sort_groups!).
      def filtered_monitors(api, tag)
        # Download all monitors with given tag
        monitors = Progress.progress("Downloading") do
          api.list("monitor", monitor_tags: tag, group_states: "all", with_downtimes: "true")
        end

        raise "No monitors for #{tag} found, check your spelling" if monitors.empty?

        # only keep monitors that are alerting
        monitors.reject! { |m| m[:overall_state] == "OK" }

        # only keep monitors that are not completely silenced
        monitors.reject! { |m| silenced_scopes(m).key?(:*) }

        # only keep groups that are alerting
        monitors.each { |m| m[:state][:groups].reject! { |_, g| g[:status] == "OK" || g[:status] == "Ignored" } }

        # only keep alerting groups that are not silenced
        monitors.each do |m|
          silenced = silenced_scopes(m).keys.map { |k| k.to_s.split(",") }
          m[:state][:groups].select! do |k, _|
            scope = k.to_s.split(",")
            # a group is silenced when every tag of a silenced scope is part of the group scope
            silenced.none? { |s| (s - scope).empty? }
          end
        end

        # only keep groups that are not covered by an active downtime
        monitors.each do |m|
          next unless m[:matching_downtimes]
          downtime_groups = m[:matching_downtimes].select { |d| d[:active] }.flat_map { |d| d[:groups] }
          m[:state][:groups].reject! do |k, _|
            downtime_groups.include?(k.to_s)
          end
        end

        # only keep monitors with alerting groups
        monitors.select! { |m| m[:state][:groups].any? }

        # sort group alerts
        monitors.each { |m| sort_groups!(m) }
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
module Kennel
  # Stateless helpers for string handling, terminal output, output capturing and small
  # collection utilities. All helpers are singleton methods: `Kennel::Utils.snake_case("Foo")`.
  module Utils
    # color name -> ANSI code used by `color`
    COLORS = { red: 31, green: 32, yellow: 33, cyan: 36, magenta: 35, default: 0 }.freeze

    # IO that forwards everything written to it to each of the given ios.
    # `super` is not called on purpose, only `write` is expected to be used.
    class TeeIO < IO
      def initialize(ios)
        @ios = ios
      end

      def write(string)
        @ios.each { |io| io.write string }
      end
    end

    class << self
      # Converts class-like names to snake_case, see inline examples.
      def snake_case(string)
        string
          .gsub(/::/, "_") # Foo::Bar -> foo_bar
          .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2') # FOOBar -> foo_bar
          .gsub(/([a-z\d])([A-Z])/, '\1_\2') # fooBar -> foo_bar
          .tr("-", "_") # foo-bar -> foo_bar
          .downcase
      end

      # simplified version of https://apidock.com/rails/ActiveSupport/Inflector/parameterize
      def parameterize(string)
        string
          .downcase
          .gsub(/[^a-z0-9\-_]+/, "-") # remove unsupported
          .gsub(/-{2,}/, "-") # remove duplicates
          .gsub(/^-|-$/, "") # remove leading/trailing
      end

      # Returns nil for nil or empty values, otherwise the value itself.
      def presence(value)
        value.nil? || value.empty? ? nil : value
      end

      # Prints the question in red to Kennel.err and reads one line from STDIN.
      # Returns true only when the answer is exactly "y"; end of input counts as "no".
      # Exits with status 1 when interrupted.
      def ask(question)
        # print instead of printf: the question is text, not a format string, so "%" must be safe
        Kennel.err.print color(:red, "#{question} - press 'y' to continue: ")
        begin
          STDIN.gets.to_s.chomp == "y" # gets returns nil on EOF
        rescue Interrupt # do not show a backtrace if user decides to Ctrl+C here
          Kennel.err.print "\n"
          exit 1
        end
      end

      # Wraps text in the ANSI escape sequence for the given color name.
      # Raises KeyError for colors that are not in COLORS.
      def color(color, text)
        "\e[#{COLORS.fetch(color)}m#{text}\e[0m"
      end

      # Removes color sequences produced by `color` and "character + backspace" pairs.
      def strip_shell_control(text)
        text.gsub(/\e\[\d+m(.*?)\e\[0m/, "\\1").gsub(/.#{Regexp.escape("\b")}/, "")
      end

      # Runs the block with Kennel.out replaced by a StringIO and returns what was written.
      # Kennel.out is restored afterwards, even when the block raises.
      def capture_stdout
        old = Kennel.out
        Kennel.out = StringIO.new
        yield
        Kennel.out.string
      ensure
        Kennel.out = old
      end

      # Same as capture_stdout, but for Kennel.err.
      def capture_stderr
        old = Kennel.err
        Kennel.err = StringIO.new
        yield
        Kennel.err.string
      ensure
        Kennel.err = old
      end

      # Runs the block while copying everything written to Kennel.out and Kennel.err
      # into one shared buffer, the original outputs still receive the writes.
      # Returns the combined output.
      def tee_output
        old_stdout = Kennel.out
        old_stderr = Kennel.err
        capture = StringIO.new
        Kennel.out = TeeIO.new([capture, Kennel.out])
        Kennel.err = TeeIO.new([capture, Kennel.err])
        yield
        capture.string
      ensure
        Kennel.out = old_stdout
        Kennel.err = old_stderr
      end

      # Runs a shell command and returns its combined stdout + stderr.
      # Raises RuntimeError including command and output when it fails.
      # Relies on $CHILD_STATUS (stdlib "English") being loaded by the caller.
      def capture_sh(command)
        result = `#{command} 2>&1`
        raise "Command failed:\n#{command}\n#{result}" unless $CHILD_STATUS.success?
        result
      end

      # Prefixes the path with the datadog host of ENV["DATADOG_SUBDOMAIN"],
      # returns the bare path when the variable is not set.
      def path_to_url(path)
        if subdomain = ENV["DATADOG_SUBDOMAIN"]
          "https://#{subdomain}.datadoghq.com#{path}"
        else
          path
        end
      end

      # Calls the block for every item using up to `max` threads and returns the
      # results in item order. When a block raises a StandardError, remaining work
      # is dropped and the error is re-raised after all threads finished.
      def parallel(items, max: 10)
        threads = [items.size, max].min
        work = items.each_with_index.to_a
        done = Array.new(items.size)
        workers = Array.new(threads).map do
          Thread.new do
            loop do
              item, i = work.pop
              break unless i # queue is empty
              done[i] =
                begin
                  yield item
                rescue StandardError => e
                  work.clear # stop other workers from picking up more items
                  e
                end
            end
          end
        end
        workers.each(&:join)
        done.each { |d| raise d if d.is_a?(StandardError) }
      end

      # Splits a name into text and integer parts so that sorting puts pod3 before pod11:
      # "pod11" -> ["pod", 11]
      def natural_order(name)
        name.split(/(\d+)/).each_with_index.map { |x, i| i.odd? ? x.to_i : x }
      end

      # Runs the block and retries it up to `times` times when one of the given
      # errors is raised, logging each retry to Kennel.err. Re-raises once retries are used up.
      def retry(*errors, times:)
        yield
      rescue *errors => e
        times -= 1
        raise if times < 0
        Kennel.err.puts "Error #{e}, #{times} retries left"
        retry
      end

      # https://stackoverflow.com/questions/20235206/ruby-get-all-keys-in-a-hash-including-sub-keys/53876255#53876255
      # Returns the keys of a hash followed by the keys of all hashes nested in its values or in arrays.
      def all_keys(items)
        case items
        when Hash then items.keys + items.values.flat_map { |v| all_keys(v) }
        when Array then items.flat_map { |i| all_keys(i) }
        else []
        end
      end

      # TODO: use awesome-print or similar, but it has too many monkey-patches
      # https://github.com/amazing-print/amazing_print/issues/36
      # Returns `object.inspect` with `:key=>` rewritten to `key: ` and padded braces.
      def pretty_inspect(object)
        # work on a copy: inspect may return a frozen string, which gsub! cannot modify
        string = object.inspect.dup
        string.gsub!(/:([a-z_]+)=>/, "\\1: ")
        # each pass pads one more level of nested braces, stop as soon as nothing changes
        10.times do
          string.gsub!(/{(\S.*?\S)}/, "{ \\1 }") || break
        end
        string
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+ module Kennel
3
+ VERSION = "1.75.0"
4
+ end
@@ -0,0 +1,247 @@
1
+ ![](github/cage.jpg?raw=true)
2
+
3
+ Manage Datadog Monitors / Dashboards / SLOs as code
4
+
5
+ - DRY, searchable, audited, documented
6
+ - Changes are PR reviewed and applied on merge
7
+ - Updating shows diff before applying
8
+ - Automated import of existing resources
9
+ - Resources are grouped into projects that belong to teams and inherit tags
10
+ - No copy-pasting of ids to create new resources
11
+ - Automated cleanup when removing code
12
+ - [Helpers](#helpers) for automating common tasks
13
+
14
+ ### Applying changes
15
+
16
+ ![](github/screen.png?raw=true)
17
+
18
+ ### Example code
19
+
20
+ ```Ruby
21
+ # teams/foo.rb
22
+ module Teams
23
+ class Foo < Kennel::Models::Team
24
+ defaults(mention: -> { "@slack-my-team" })
25
+ end
26
+ end
27
+
28
+ # projects/bar.rb
29
+ class Bar < Kennel::Models::Project
30
+ defaults(
31
+ team: -> { Teams::Foo.new }, # use mention and tags from the team
32
+ parts: -> {
33
+ [
34
+ Kennel::Models::Monitor.new(
35
+ self, # the current project
36
+ type: -> { "query alert" },
37
+ kennel_id: -> { "load-too-high" }, # pick a unique name
38
+ name: -> { "Foobar Load too high" }, # nice descriptive name that will show up in alerts and emails
39
+ message: -> {
40
+ <<~TEXT
41
+ This is bad!
42
+ #{super()} # inserts mention from team
43
+ TEXT
44
+ },
45
+ query: -> { "avg(last_5m):avg:system.load.5{hostgroup:api} by {pod} > #{critical}" },
46
+ critical: -> { 20 }
47
+ )
48
+ ]
49
+ }
50
+ )
51
+ end
52
+ ```
53
+
54
+
55
+ ## Structure
56
+
57
+ - `projects/` monitors/dashboards/etc scoped by project
58
+ - `teams/` team definitions
59
+ - `parts/` monitors/dashboards/etc that are used by multiple projects
60
+ - `generated/` projects as json, to show current state and proposed changes in PRs
61
+
62
+ ## Workflows
63
+
64
+ ### Setup
65
+ - clone the repo
66
+ - `gem install bundler && bundle install`
67
+ - `cp .env.example .env`
68
+ - open [Datadog API Settings](https://app.datadoghq.com/account/settings#api)
69
+ - copy any `API Key` and add it to `.env` as `DATADOG_API_KEY`
70
+ - find or create (check last page) your personal "Application Key" and add it to `.env` as `DATADOG_APP_KEY=`
71
+ - change the `DATADOG_SUBDOMAIN=app` in `.env` to your company's subdomain if you have one
72
+ - verify it works by running `rake plan`, it might show some diff, but should not crash
73
+
74
+ ### Adding a team
75
+
76
+ - `mention` is used for all team monitors via `super()`
77
+ - `renotify_interval` is used for all team monitors (defaults to `0` / off)
78
+ - `tags` is used for all team monitors/dashboards (defaults to `team:<team-name>`)
79
+
80
+ ```Ruby
81
+ # teams/my_team.rb
82
+ module Teams
83
+ class MyTeam < Kennel::Models::Team
84
+ defaults(
85
+ mention: -> { "@slack-my-team" }
86
+ )
87
+ end
88
+ end
89
+ ```
90
+
91
+ ### Adding a new monitor
92
+ - use [datadog monitor UI](https://app.datadoghq.com/monitors#create) to create a monitor
93
+ - see below
94
+
95
+ ### Updating an existing monitor
96
+ - use [datadog monitor UI](https://app.datadoghq.com/monitors/manage) to find a monitor
97
+ - get the `id` from the url
98
+ - run `URL='https://app.datadoghq.com/monitors/123' bundle exec rake kennel:import` and copy the output
99
+ - find or create a project in `projects/`
100
+ - add the monitor to `parts: [` list, for example:
101
+ ```Ruby
102
+ # projects/my_project.rb
103
+ class MyProject < Kennel::Models::Project
104
+ defaults(
105
+ team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
106
+ parts: -> {
107
+ [
108
+ Kennel::Models::Monitor.new(
109
+ self,
110
+ id: -> { 123456 }, # id from datadog url, not necessary when creating a new monitor
111
+ type: -> { "query alert" },
112
+ kennel_id: -> { "load-too-high" }, # make up a unique name
113
+ name: -> { "Foobar Load too high" }, # nice descriptive name that will show up in alerts and emails
114
+ message: -> {
115
+ # Explain what behavior to expect and how to fix the cause
116
+ # Use #{super()} to add team notifications.
117
+ <<~TEXT
118
+ Foobar will be slow and that could cause Barfoo to go down.
119
+ Add capacity or debug why it is suddenly slow.
120
+ #{super()}
121
+ TEXT
122
+ },
123
+ query: -> { "avg(last_5m):avg:system.load.5{hostgroup:api} by {pod} > #{critical}" }, # replace actual value with #{critical} to keep them in sync
124
+ critical: -> { 20 }
125
+ )
126
+ ]
127
+ }
128
+ )
129
+ end
130
+ ```
131
+ - run `PROJECT=my_project bundle exec rake plan`, an Update to the existing monitor should be shown (not Create / Delete)
132
+ - alternatively: `bundle exec rake generate` to only locally update the generated `json` files
133
+ - review changes then `git commit`
134
+ - make a PR ... get reviewed ... merge
135
+ - datadog is updated by CI
136
+
137
+ ### Adding a new dashboard
138
+ - go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) and click on _New Dashboard_ to create a dashboard
139
+ - see below
140
+
141
+ ### Updating an existing dashboard
142
+ - go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) to find a dashboard
143
+ - get the `id` from the url
144
+ - run `URL='https://app.datadoghq.com/dashboard/bet-foo-bar' bundle exec rake kennel:import` and copy the output
145
+ - find or create a project in `projects/`
146
+ - add a dashboard to `parts: [` list, for example:
147
+ ```Ruby
148
+ class MyProject < Kennel::Models::Project
149
+ defaults(
150
+ team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
151
+ parts: -> {
152
+ [
153
+ Kennel::Models::Dashboard.new(
154
+ self,
155
+ id: -> { "abc-def-ghi" }, # id from datadog url, not needed when creating a new dashboard
156
+ title: -> { "My Dashboard" },
157
+ description: -> { "Overview of foobar" },
158
+ template_variables: -> { ["environment"] }, # see https://docs.datadoghq.com/api/?lang=ruby#timeboards
159
+ kennel_id: -> { "overview-dashboard" }, # make up a unique name
160
+ layout_type: -> { "ordered" },
161
+ definitions: -> {
162
+ [ # An array of arrays, each one is a graph in the dashboard, alternatively a hash for finer control
163
+ [
164
+ # title, viz, type, query, edit an existing graph and see the json definition
165
+ "Graph name", "timeseries", "area", "sum:mystats.foobar{$environment}"
166
+ ],
167
+ [
168
+ # queries can be an Array as well, this will generate multiple requests
169
+ # for a single graph
170
+ "Graph name", "timeseries", "area", ["sum:mystats.foobar{$environment}", "sum:mystats.success{$environment}"],
171
+ # add events too ...
172
+ events: [{q: "tags:foobar,deploy", tags_execution: "and"}]
173
+ ]
174
+ ]
175
+ }
176
+ )
177
+ ]
178
+ }
179
+ )
180
+ end
181
+ ```
182
+
183
+ ### Skipping validations
184
+
185
+ Some validations might be too strict for your use case or just wrong, please [open an issue](https://github.com/grosser/kennel/issues) and
186
+ to unblock use the `validate: -> { false }` option.
187
+
188
+ ### Linking with kennel_ids
189
+
190
+ To link to existing monitors via their kennel_id
191
+
192
+ - Screens `uptime` widgets can use `monitor: {id: "foo:bar"}`
193
+ - Screens `alert_graph` widgets can use `alert_id: "foo:bar"`
194
+ - Monitors `composite` can use `query: -> { "%{foo:bar} || %{foo:baz}" }`
195
+
196
+ ### Debugging changes locally
197
+
198
+ - rebase on updated `master` to not undo other changes
199
+ - figure out project name by converting the class name to snake-case
200
+ - run `PROJECT=foo bundle exec rake kennel:update_datadog` to test changes for a single project
201
+
202
+ ### Reuse
203
+
204
+ Add to `parts/<folder>`.
205
+
206
+ ```Ruby
207
+ module Monitors
208
+ class LoadTooHigh < Kennel::Models::Monitor
209
+ defaults(
210
+ name: -> { "#{project.name} load too high" },
211
+ message: -> { "Shut it down!" },
212
+ type: -> { "query alert" },
213
+ query: -> { "avg(last_5m):avg:system.load.5{hostgroup:#{project.kennel_id}} by {pod} > #{critical}" }
214
+ )
215
+ end
216
+ end
217
+ ```
218
+
219
+ Reuse it in multiple projects.
220
+
221
+ ```Ruby
222
+ class Database < Kennel::Models::Project
223
+ defaults(
224
+ team: -> { Kennel::Models::Team.new(mention: -> { '@slack-foo' }, kennel_id: -> { 'foo' }) },
225
+ parts: -> { [Monitors::LoadTooHigh.new(self, critical: -> { 13 })] }
226
+ )
227
+ end
228
+ ```
229
+
230
+ ## Helpers
231
+
232
+ ### Listing un-muted alerts
233
+
234
+ Run `rake kennel:alerts TAG=service:my-service` to see all un-muted alerts for a given datadog monitor tag.
235
+
236
+ ### Validating mentions work
237
+
238
+ `rake kennel:validate_mentions` should run as part of CI
239
+
240
+ ### Grepping through all of datadog
241
+
242
+ `TYPE=monitor rake kennel:dump`
243
+
244
+ ### Find all monitors with No-Data
245
+
246
+ `rake kennel:nodata TAG=team:foo`
247
+