kennel 1.75.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
module Kennel
  # Mixin that adds a `template_variables` setting to the including class,
  # together with helpers to render the variables and to check that widget
  # queries actually use them.
  module TemplateVariables
    # Registers the `template_variables` setting on the including class and
    # gives it an empty list as default.
    def self.included(base)
      base.settings :template_variables
      base.defaults(template_variables: -> { [] })
    end

    private

    # Expands the configured template variables into hashes.
    # A String `v` is shorthand for `{ default: "*", prefix: v, name: v }`,
    # any other entry is passed through untouched.
    def render_template_variables
      (template_variables || []).map do |v|
        v.is_a?(String) ? { default: "*", prefix: v, name: v } : v
      end
    end

    # check for queries that do not use the variables and would be misleading
    # TODO: do the same check for apm_query and their group_by
    #
    # data - Hash holding the rendered `:template_variables` and, under `key`,
    #        the list of widgets to inspect
    # key  - key in `data` under which the widgets are stored
    #
    # Calls `invalid!` (provided by the including class) when at least one
    # query references none of the variables, otherwise returns nil.
    def validate_template_variables(data, key)
      variables = (data[:template_variables] || []).map { |v| "$#{v.fetch(:name)}" }

      # Without variables there is nothing a query could be missing; the
      # pattern below would also degenerate to /()\b/ and flag queries that
      # contain no word boundary (for example "*").
      return if variables.empty?

      queries = data[key].flat_map do |widget|
        # group widgets nest further widgets under definition.widgets
        ([widget] + (widget.dig(:definition, :widgets) || [])).flat_map { |w| widget_queries(w) }
      end.compact

      # \b makes sure `$env` is not accepted as a use of `$environment`'s prefix
      uses_variable = /(#{variables.map { |v| Regexp.escape(v) }.join("|")})\b/
      bad = queries.grep_v(uses_variable)
      return if bad.empty?

      invalid!(
        "queries #{bad.join(", ")} must use the template variables #{variables.join(", ")}\n" \
        "If that is not possible, add `validate: -> { false } # query foo in bar does not have baz tag`"
      )
    end

    # Returns the `:q` of every request of the widget (nil for requests
    # without one), or an empty list when the widget has no requests.
    def widget_queries(widget)
      requests = widget.dig(:definition, :requests) || []
      (requests.is_a?(Hash) ? requests.values : requests).map { |r| r[:q] } # hostmap widgets have hash requests
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+ require "kennel"
3
+
4
+ # Show Alerts that are not muted and their alerting scopes
5
module Kennel
  # Lists monitors that are alerting without being muted, together with the
  # scopes (groups) that are currently alerting.
  class UnmutedAlerts
    # group status => color used when printing that status
    COLORS = { "Alert" => :red, "Warn" => :yellow, "No Data" => :cyan }.freeze

    class << self
      # Prints name, url and alerting groups of every unmuted alerting monitor
      # carrying `tag` to Kennel.out, or a notice when there are none.
      def print(api, tag)
        alerting = filtered_monitors(api, tag)
        return Kennel.out.puts("No unmuted alerts found") if alerting.empty?

        alerting.each do |monitor|
          Kennel.out.puts monitor[:name]
          Kennel.out.puts Utils.path_to_url("/monitors/#{monitor[:id]}")
          monitor[:state][:groups].each do |group|
            status = group[:status]
            elapsed = time_since(group[:last_triggered_ts])
            label = Kennel::Utils.color(COLORS.fetch(status, :default), status)
            Kennel.out.puts "#{label}\t#{group[:name]}\t#{elapsed}"
          end
          Kennel.out.puts
        end
      end

      private

      # sort pod3 before pod11
      # Replaces the groups hash of the monitor with its values as a sorted array.
      def sort_groups!(monitor)
        state = monitor[:state]
        state[:groups] = state[:groups].values.sort_by do |group|
          group[:name].to_s.split(",").map { |part| Utils.natural_order(part) }
        end
      end

      # Time elapsed since the unix timestamp `t`, formatted as HH:MM:SS.
      def time_since(t)
        elapsed = Time.now.to_i - Integer(t)
        minutes, seconds = elapsed.divmod(60)
        hours, minutes = minutes.divmod(60)
        format("%02d:%02d:%02d", hours, minutes, seconds)
      end

      # Downloads all monitors with the given tag and narrows them down, in
      # place, to the ones with alerting groups that are neither silenced nor
      # covered by an active downtime.
      def filtered_monitors(api, tag)
        monitors = Progress.progress("Downloading") do
          api.list("monitor", monitor_tags: tag, group_states: "all", with_downtimes: "true")
        end
        raise "No monitors for #{tag} found, check your spelling" if monitors.empty?

        # drop monitors that are not alerting or are completely silenced
        monitors.delete_if { |m| m[:overall_state] == "OK" || m[:options][:silenced].key?(:*) }

        monitors.each do |m|
          groups = m[:state][:groups]

          # only keep groups that are alerting
          groups.delete_if { |_, g| ["OK", "Ignored"].include?(g[:status]) }

          # a group is silenced when every part of a silenced scope is part of its own scope
          silenced_scopes = m[:options][:silenced].keys.map { |k| k.to_s.split(",") }
          groups.delete_if do |k, _|
            scope = k.to_s.split(",")
            silenced_scopes.any? { |s| (s - scope).empty? }
          end

          # drop groups that are listed by an active downtime
          downtimes = m[:matching_downtimes]
          next unless downtimes
          covered = downtimes.select { |d| d[:active] }.flat_map { |d| d[:groups] }
          groups.delete_if { |k, _| covered.include?(k.to_s) }
        end

        # only keep monitors that still have alerting groups, then sort those groups
        monitors.reject! { |m| m[:state][:groups].empty? }
        monitors.each { |m| sort_groups!(m) }
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
module Kennel
  # Stateless helpers for string conversion, terminal interaction, output
  # capturing and small collection/concurrency utilities.
  module Utils
    # color name => ANSI SGR code used by `color`
    COLORS = { red: 31, green: 32, yellow: 33, cyan: 36, magenta: 35, default: 0 }.freeze

    # IO-like object that forwards everything written to it to several ios.
    class TeeIO < IO
      # ios - the objects to forward writes to.
      # Deliberately does not call super: no file descriptor backs this object.
      def initialize(ios)
        @ios = ios
      end

      # Writes all given strings to every io.
      # Accepts multiple strings and returns the number of bytes written,
      # matching the contract of IO#write.
      def write(*strings)
        @ios.each { |io| io.write(*strings) }
        strings.sum { |string| string.to_s.bytesize }
      end
    end

    class << self
      # Converts class-like or dashed names to snake_case.
      def snake_case(string)
        string
          .gsub(/::/, "_") # Foo::Bar -> foo_bar
          .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2') # FOOBar -> foo_bar
          .gsub(/([a-z\d])([A-Z])/, '\1_\2') # fooBar -> foo_bar
          .tr("-", "_") # foo-bar -> foo_bar
          .downcase
      end

      # simplified version of https://apidock.com/rails/ActiveSupport/Inflector/parameterize
      def parameterize(string)
        string
          .downcase
          .gsub(/[^a-z0-9\-_]+/, "-") # remove unsupported
          .gsub(/-{2,}/, "-") # remove duplicates
          .gsub(/^-|-$/, "") # remove leading/trailing
      end

      # Returns nil for nil or empty values, the value itself otherwise.
      def presence(value)
        value.nil? || value.empty? ? nil : value
      end

      # Prompts on Kennel.err and returns true only when the user answers "y".
      # Exits with status 1 when the user interrupts the prompt.
      def ask(question)
        # print instead of printf: the question is data, a "%" inside it must
        # not be interpreted as a format directive
        Kennel.err.print color(:red, "#{question} - press 'y' to continue: ")
        begin
          # gets returns nil when stdin is closed (EOF), which counts as "no"
          $stdin.gets.to_s.chomp == "y"
        rescue Interrupt # do not show a backtrace if user decides to Ctrl+C here
          Kennel.err.print "\n"
          exit 1
        end
      end

      # Wraps text in the ANSI escape sequence for the given color.
      # Raises KeyError for colors that are not part of COLORS.
      def color(color, text)
        "\e[#{COLORS.fetch(color)}m#{text}\e[0m"
      end

      # Removes the color sequences produced by `color` and any
      # "character followed by backspace" pairs from text.
      def strip_shell_control(text)
        text.gsub(/\e\[\d+m(.*?)\e\[0m/, "\\1").gsub(/.#{Regexp.escape("\b")}/, "")
      end

      # Runs the block with Kennel.out replaced by a buffer and returns
      # what was written to it. Kennel.out is always restored.
      def capture_stdout
        old = Kennel.out
        Kennel.out = StringIO.new
        yield
        Kennel.out.string
      ensure
        Kennel.out = old
      end

      # Runs the block with Kennel.err replaced by a buffer and returns
      # what was written to it. Kennel.err is always restored.
      def capture_stderr
        old = Kennel.err
        Kennel.err = StringIO.new
        yield
        Kennel.err.string
      ensure
        Kennel.err = old
      end

      # Runs the block while copying everything written to Kennel.out and
      # Kennel.err into one shared buffer (the original outputs still receive
      # it) and returns the buffer content. Both outputs are always restored.
      def tee_output
        old_stdout = Kennel.out
        old_stderr = Kennel.err
        capture = StringIO.new
        Kennel.out = TeeIO.new([capture, Kennel.out])
        Kennel.err = TeeIO.new([capture, Kennel.err])
        yield
        capture.string
      ensure
        Kennel.out = old_stdout
        Kennel.err = old_stderr
      end

      # Runs the shell command and returns its combined stdout/stderr.
      # Raises when the command exits unsuccessfully.
      # NOTE(review): `$CHILD_STATUS` needs `require "English"`, which is not
      # done in this file - presumably loaded elsewhere, confirm.
      def capture_sh(command)
        result = `#{command} 2>&1`
        raise "Command failed:\n#{command}\n#{result}" unless $CHILD_STATUS.success?
        result
      end

      # Turns a path into a full datadog url when DATADOG_SUBDOMAIN is set,
      # returns the path unchanged otherwise.
      def path_to_url(path)
        subdomain = ENV["DATADOG_SUBDOMAIN"]
        subdomain ? "https://#{subdomain}.datadoghq.com#{path}" : path
      end

      # Calls the block once per item on up to `max` threads and returns the
      # results in item order. When a block call raises a StandardError, no
      # further items are started and the error is re-raised after all
      # workers have finished.
      def parallel(items, max: 10)
        threads = [items.size, max].min
        work = items.each_with_index.to_a
        done = Array.new(items.size)
        workers = Array.new(threads).map do
          Thread.new do
            loop do
              item, i = work.pop
              break unless i # queue is empty
              done[i] =
                begin
                  yield item
                rescue StandardError => e
                  work.clear # stop other workers from picking up more work
                  e
                end
            end
          end
        end
        workers.each(&:join)
        done.each { |d| raise d if d.is_a?(StandardError) }
      end

      # Splits a name into text and Integer parts so that sorting by the
      # result orders "pod3" before "pod11".
      def natural_order(name)
        name.split(/(\d+)/).each_with_index.map { |x, i| i.odd? ? x.to_i : x }
      end

      # Runs the block and re-runs it up to `times` more times when it raises
      # one of the given errors, reporting each failure on Kennel.err.
      # The last error is re-raised once the retries are used up.
      def retry(*errors, times:)
        yield
      rescue *errors => e
        times -= 1
        raise if times < 0
        Kennel.err.puts "Error #{e}, #{times} retries left"
        retry
      end

      # https://stackoverflow.com/questions/20235206/ruby-get-all-keys-in-a-hash-including-sub-keys/53876255#53876255
      # Collects the keys of all hashes nested anywhere inside items.
      def all_keys(items)
        case items
        when Hash then items.keys + items.values.flat_map { |v| all_keys(v) }
        when Array then items.flat_map { |i| all_keys(i) }
        else []
        end
      end

      # TODO: use awesome-print or similar, but it has too many monkey-patches
      # https://github.com/amazing-print/amazing_print/issues/36
      # Rewrites `inspect` output: `:key=>` becomes `key: ` and braces get
      # inner padding.
      def pretty_inspect(object)
        string = object.inspect
        string.gsub!(/:([a-z_]+)=>/, "\\1: ")
        # each pass pads another batch of braces; gsub! returns nil once nothing is left to pad
        10.times do
          string.gsub!(/{(\S.*?\S)}/, "{ \\1 }") || break
        end
        string
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
module Kennel
  # version of the kennel gem
  VERSION = "1.75.0"
end
@@ -0,0 +1,247 @@
1
+ ![](github/cage.jpg?raw=true)
2
+
3
+ Manage Datadog Monitors / Dashboards / SLOs as code
4
+
5
+ - DRY, searchable, audited, documented
6
+ - Changes are PR reviewed and applied on merge
7
+ - Updating shows diff before applying
8
+ - Automated import of existing resources
9
+ - Resources are grouped into projects that belong to teams and inherit tags
10
+ - No copy-pasting of ids to create new resources
11
+ - Automated cleanup when removing code
12
+ - [Helpers](#helpers) for automating common tasks
13
+
14
+ ### Applying changes
15
+
16
+ ![](github/screen.png?raw=true)
17
+
18
+ ### Example code
19
+
20
+ ```Ruby
21
+ # teams/foo.rb
22
+ module Teams
23
+ class Foo < Kennel::Models::Team
24
+ defaults(mention: -> { "@slack-my-team" })
25
+ end
26
+ end
27
+
28
+ # projects/bar.rb
29
+ class Bar < Kennel::Models::Project
30
+ defaults(
31
+ team: -> { Teams::Foo.new }, # use mention and tags from the team
32
+ parts: -> {
33
+ [
34
+ Kennel::Models::Monitor.new(
35
+ self, # the current project
36
+ type: -> { "query alert" },
37
+ kennel_id: -> { "load-too-high" }, # pick a unique name
38
+ name: -> { "Foobar Load too high" }, # nice descriptive name that will show up in alerts and emails
39
+ message: -> {
40
+ <<~TEXT
41
+ This is bad!
42
+ #{super()} # inserts mention from team
43
+ TEXT
44
+ },
45
+ query: -> { "avg(last_5m):avg:system.load.5{hostgroup:api} by {pod} > #{critical}" },
46
+ critical: -> { 20 }
47
+ )
48
+ ]
49
+ }
50
+ )
51
+ end
52
+ ```
53
+
54
+
55
+ ## Structure
56
+
57
+ - `projects/` monitors/dashboards/etc scoped by project
58
+ - `teams/` team definitions
59
+ - `parts/` monitors/dashboards/etc that are used by multiple projects
60
+ - `generated/` projects as json, to show current state and proposed changes in PRs
61
+
62
+ ## Workflows
63
+
64
+ ### Setup
65
+ - clone the repo
66
+ - `gem install bundler && bundle install`
67
+ - `cp .env.example .env`
68
+ - open [Datadog API Settings](https://app.datadoghq.com/account/settings#api)
69
+ - copy any `API Key` and add it to `.env` as `DATADOG_API_KEY`
70
+ - find or create (check last page) your personal "Application Key" and add it to `.env` as `DATADOG_APP_KEY=`
71
+ - change the `DATADOG_SUBDOMAIN=app` in `.env` to your company's subdomain if you have one
72
+ - verify it works by running `rake plan`, it might show some diff, but should not crash
73
+
74
+ ### Adding a team
75
+
76
+ - `mention` is used for all team monitors via `super()`
77
+ - `renotify_interval` is used for all team monitors (defaults to `0` / off)
78
+ - `tags` is used for all team monitors/dashboards (defaults to `team:<team-name>`)
79
+
80
+ ```Ruby
81
+ # teams/my_team.rb
82
+ module Teams
83
+ class MyTeam < Kennel::Models::Team
84
+ defaults(
85
+ mention: -> { "@slack-my-team" }
86
+ )
87
+ end
88
+ end
89
+ ```
90
+
91
+ ### Adding a new monitor
92
+ - use [datadog monitor UI](https://app.datadoghq.com/monitors#create) to create a monitor
93
+ - see below
94
+
95
+ ### Updating an existing monitor
96
+ - use [datadog monitor UI](https://app.datadoghq.com/monitors/manage) to find a monitor
97
+ - get the `id` from the url
98
+ - run `URL='https://app.datadoghq.com/monitors/123' bundle exec rake kennel:import` and copy the output
99
+ - find or create a project in `projects/`
100
+ - add the monitor to `parts: [` list, for example:
101
+ ```Ruby
102
+ # projects/my_project.rb
103
+ class MyProject < Kennel::Models::Project
104
+ defaults(
105
+ team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
106
+ parts: -> {
107
+ [
108
+ Kennel::Models::Monitor.new(
109
+ self,
110
+ id: -> { 123456 }, # id from datadog url, not necessary when creating a new monitor
111
+ type: -> { "query alert" },
112
+ kennel_id: -> { "load-too-high" }, # make up a unique name
113
+ name: -> { "Foobar Load too high" }, # nice descriptive name that will show up in alerts and emails
114
+ message: -> {
115
+ # Explain what behavior to expect and how to fix the cause
116
+ # Use #{super()} to add team notifications.
117
+ <<~TEXT
118
+ Foobar will be slow and that could cause Barfoo to go down.
119
+ Add capacity or debug why it is suddenly slow.
120
+ #{super()}
121
+ TEXT
122
+ },
123
+ query: -> { "avg(last_5m):avg:system.load.5{hostgroup:api} by {pod} > #{critical}" }, # replace actual value with #{critical} to keep them in sync
124
+ critical: -> { 20 }
125
+ )
126
+ ]
127
+ }
128
+ )
129
+ end
130
+ ```
131
+ - run `PROJECT=my_project bundle exec rake plan`, an Update to the existing monitor should be shown (not Create / Delete)
132
+ - alternatively: `bundle exec rake generate` to only locally update the generated `json` files
133
+ - review changes then `git commit`
134
+ - make a PR ... get reviewed ... merge
135
+ - datadog is updated by CI
136
+
137
+ ### Adding a new dashboard
138
+ - go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) and click on _New Dashboard_ to create a dashboard
139
+ - see below
140
+
141
+ ### Updating an existing dashboard
142
+ - go to [datadog dashboard UI](https://app.datadoghq.com/dashboard/lists) to find a dashboard
143
+ - get the `id` from the url
144
+ - run `URL='https://app.datadoghq.com/dashboard/bet-foo-bar' bundle exec rake kennel:import` and copy the output
145
+ - find or create a project in `projects/`
146
+ - add a dashboard to `parts: [` list, for example:
147
+ ```Ruby
148
+ class MyProject < Kennel::Models::Project
149
+ defaults(
150
+ team: -> { Teams::MyTeam.new }, # use existing team or create new one in teams/
151
+ parts: -> {
152
+ [
153
+ Kennel::Models::Dashboard.new(
154
+ self,
155
+ id: -> { "abc-def-ghi" }, # id from datadog url, not needed when creating a new dashboard
156
+ title: -> { "My Dashboard" },
157
+ description: -> { "Overview of foobar" },
158
+ template_variables: -> { ["environment"] }, # see https://docs.datadoghq.com/api/?lang=ruby#timeboards
159
+ kennel_id: -> { "overview-dashboard" }, # make up a unique name
160
+ layout_type: -> { "ordered" },
161
+ definitions: -> {
162
+ [ # An array of arrays, each one is a graph in the dashboard, alternatively a hash for finer control
163
+ [
164
+ # title, viz, type, query, edit an existing graph and see the json definition
165
+ "Graph name", "timeseries", "area", "sum:mystats.foobar{$environment}"
166
+ ],
167
+ [
168
+ # queries can be an Array as well, this will generate multiple requests
169
+ # for a single graph
170
+ "Graph name", "timeseries", "area", ["sum:mystats.foobar{$environment}", "sum:mystats.success{$environment}"],
171
+ # add events too ...
172
+ events: [{q: "tags:foobar,deploy", tags_execution: "and"}]
173
+ ]
174
+ ]
175
+ }
176
+ )
177
+ ]
178
+ }
179
+ )
180
+ end
181
+ ```
182
+
183
+ ### Skipping validations
184
+
185
+ Some validations might be too strict for your use case or just wrong. Please [open an issue](https://github.com/grosser/kennel/issues) and,
186
+ to unblock yourself in the meantime, use the `validate: -> { false }` option.
187
+
188
+ ### Linking with kennel_ids
189
+
190
+ To link to existing monitors via their kennel_id
191
+
192
+ - Screens `uptime` widgets can use `monitor: {id: "foo:bar"}`
193
+ - Screens `alert_graph` widgets can use `alert_id: "foo:bar"`
194
+ - Monitors `composite` can use `query: -> { "%{foo:bar} || %{foo:baz}" }`
195
+
196
+ ### Debugging changes locally
197
+
198
+ - rebase on updated `master` to not undo other changes
199
+ - figure out project name by converting the class name to snake-case
200
+ - run `PROJECT=foo bundle exec rake kennel:update_datadog` to test changes for a single project
201
+
202
+ ### Reuse
203
+
204
+ Add to `parts/<folder>`.
205
+
206
+ ```Ruby
207
+ module Monitors
208
+ class LoadTooHigh < Kennel::Models::Monitor
209
+ defaults(
210
+ name: -> { "#{project.name} load too high" },
211
+ message: -> { "Shut it down!" },
212
+ type: -> { "query alert" },
213
+ query: -> { "avg(last_5m):avg:system.load.5{hostgroup:#{project.kennel_id}} by {pod} > #{critical}" }
214
+ )
215
+ end
216
+ end
217
+ ```
218
+
219
+ Reuse it in multiple projects.
220
+
221
+ ```Ruby
222
+ class Database < Kennel::Models::Project
223
+ defaults(
224
+ team: -> { Kennel::Models::Team.new(mention: -> { '@slack-foo' }, kennel_id: -> { 'foo' }) },
225
+ parts: -> { [Monitors::LoadTooHigh.new(self, critical: -> { 13 })] }
226
+ )
227
+ end
228
+ ```
229
+
230
+ ## Helpers
231
+
232
+ ### Listing un-muted alerts
233
+
234
+ Run `rake kennel:alerts TAG=service:my-service` to see all un-muted alerts for a given datadog monitor tag.
235
+
236
+ ### Validating mentions work
237
+
238
+ `rake kennel:validate_mentions` should run as part of CI
239
+
240
+ ### Grepping through all of datadog
241
+
242
+ `TYPE=monitor rake kennel:dump`
243
+
244
+ ### Find all monitors with No-Data
245
+
246
+ `rake kennel:nodata TAG=team:foo`
247
+