@hiiretail/gcp-infra-cli 0.73.0 → 0.74.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ const ejs = require('ejs');
2
+
3
/**
 * Renders one alert policy template with the prompt answers and appends the
 * result to the list of existing alert policies.
 *
 * The template is looked up as `templates[answers.alertResource][answers.alert]`.
 * Each string value inside it is rendered with EJS on its own, rather than
 * rendering the JSON serialization of the whole template, so answers that
 * contain quotes, backslashes or newlines cannot produce invalid JSON.
 *
 * @param {Array<object>} alerts - Existing alert policies. The new policy is
 *   pushed onto this array.
 * @param {object} templates - Alert templates keyed by resource, then by alert.
 * @param {object} answers - Prompt answers. `alertResource` and `alert` select
 *   the template; the whole object is passed to EJS as template data.
 * @returns {Array<object>} The same `alerts` array, including the new policy.
 * @throws {TypeError} When no template exists for the selected resource/alert.
 */
const handleAlerts = (alerts, templates, answers) => {
  const { alertResource, alert } = answers;
  const resourceTemplates = templates[alertResource];
  const template = resourceTemplates ? resourceTemplates[alert] : undefined;
  if (template === undefined || template === null) {
    throw new TypeError(`No alert template found for "${alertResource}.${alert}"`);
  }

  // Recursively render string leaves; numbers, booleans and null pass through.
  const renderValue = (value) => {
    if (typeof value === 'string') return ejs.render(value, answers);
    if (Array.isArray(value)) return value.map(renderValue);
    if (value !== null && typeof value === 'object') {
      const rendered = {};
      for (const key of Object.keys(value)) {
        rendered[key] = renderValue(value[key]);
      }
      return rendered;
    }
    return value;
  };

  // The JSON round-trip deep-copies the shared template and keeps the same
  // value normalization the previous stringify/parse implementation applied.
  const plainTemplate = JSON.parse(JSON.stringify(template));

  alerts.push(renderValue(plainTemplate));
  return alerts;
};
10
+
11
+ module.exports = handleAlerts;
@@ -1,36 +1,86 @@
1
1
  const path = require('path');
2
2
  const chalk = require('chalk');
3
3
  const fs = require('fs');
4
+ const yaml = require('js-yaml');
4
5
  const BaseGenerator = require('../../../src/BaseGenerator');
5
6
  const { required } = require('../../../src/validators');
6
7
  const helper = require('./validate');
7
8
  const handleSlosFile = require('./handle-slos');
8
9
  const handleUptimeFile = require('./handle-uptime');
10
+ const handleAlerts = require('./handle-alerts');
11
+
12
+ const alertTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/alerts/alerts.yaml`));
9
13
 
10
14
  module.exports = class extends BaseGenerator {
11
- prompting() {
12
- const prompts = [
15
+ async prompting() {
16
+ this.answers = await this.prompt([
13
17
  {
14
18
  type: 'list',
15
19
  name: 'monitoringResource',
16
20
  message: 'Select the resource you want to create',
17
- default: 'uptime-checks',
18
- choices: ['uptime-checks', 'slos'],
21
+ choices: ['alerts', 'uptime-checks', 'slos'],
19
22
  },
20
23
  {
21
- when: (response) => response.monitoringResource === 'uptime-checks' || 'slos',
24
+ when: (response) => response.monitoringResource === 'alerts',
25
+ type: 'list',
26
+ name: 'alertResource',
27
+ choices: Object.keys(alertTemplates),
28
+ },
29
+ {
30
+ when: (response) => response.monitoringResource === 'alerts',
31
+ type: 'list',
32
+ name: 'alert',
33
+ choices: (answers) => Object.keys(alertTemplates[`${answers.alertResource}`]),
34
+ },
35
+ {
36
+ when: (response) => response.monitoringResource === ('alerts' || 'slos' || 'uptime-checks'),
22
37
  type: 'input',
23
38
  name: 'systemName',
24
39
  message: 'Please provide three-letter system name as defined in Styra',
25
40
  validate: required && helper.validSystemName,
26
41
  },
27
42
  {
28
- when: (response) => response.monitoringResource === 'uptime-checks' || 'slos',
43
+ when: (response) => response.monitoringResource === ('slos' || 'uptime-checks') || response.alertResource === 'cloud_run',
29
44
  type: 'input',
30
45
  name: 'serviceName',
31
46
  message: 'Please provide the namespace where the service resides',
32
47
  validate: required,
33
48
  },
49
+ {
50
+ when: (response) => response.monitoringResource === 'alerts',
51
+ type: 'input',
52
+ name: 'runbookLink',
53
+ message: 'Please provide the full URL to your runbook in confluence (Leave empty if none)',
54
+ validate: required && helper.validUrl,
55
+ },
56
+ {
57
+ when: (response) => response.alertResource === 'cloud_scheduler',
58
+ type: 'input',
59
+ name: 'jobId',
60
+ message: 'Please provide the "job id"',
61
+ validate: required,
62
+ },
63
+ {
64
+ when: (response) => response.alertResource === 'cloud_sql',
65
+ type: 'input',
66
+ name: 'databaseId',
67
+ message: 'Please provide the "database id"',
68
+ validate: required,
69
+ },
70
+ {
71
+ when: (response) => response.alertResource === 'memorystore',
72
+ type: 'input',
73
+ name: 'instanceId',
74
+ message: 'Please provide the "instance id"',
75
+ validate: required,
76
+ },
77
+ {
78
+ when: (response) => response.alertResource === 'pub_sub',
79
+ type: 'input',
80
+ name: 'subscriptionId',
81
+ message: 'Please provide the "subscription id"',
82
+ validate: required,
83
+ },
34
84
  {
35
85
  when: (response) => response.monitoringResource === 'uptime-checks',
36
86
  type: 'input',
@@ -50,7 +100,6 @@ module.exports = class extends BaseGenerator {
50
100
  type: 'list',
51
101
  name: 'sli',
52
102
  message: 'Please select the SLI',
53
- default: 'availability',
54
103
  choices: ['availability', 'error-rate', 'latency'],
55
104
  },
56
105
  {
@@ -67,11 +116,7 @@ module.exports = class extends BaseGenerator {
67
116
  name: 'info',
68
117
  message: 'WARNING: Make sure that an uptime check has been created before applying availability SLI',
69
118
  },
70
- ];
71
-
72
- return this.prompt(prompts).then((props) => {
73
- this.answers = props;
74
- });
119
+ ]);
75
120
  }
76
121
 
77
122
  async writing() {
@@ -84,20 +129,18 @@ module.exports = class extends BaseGenerator {
84
129
  burnRateAlerts,
85
130
  } = this.answers;
86
131
 
87
- const serviceFolderName = serviceName.replace(/ /g, '-').toLowerCase();
88
- const serviceDir = path.join(process.cwd(), 'infra', 'prod', 'monitoring', monitoringResource, serviceFolderName);
89
- const uptimeDirPath = path.join(process.cwd(), 'infra', 'prod', 'monitoring', monitoringResource);
132
+ const resourcePath = path.join(process.cwd(), 'infra', 'prod', 'monitoring', monitoringResource);
90
133
 
91
134
  if (monitoringResource === 'uptime-checks') {
92
- if (!fs.existsSync(uptimeDirPath)) {
93
- fs.mkdirSync(uptimeDirPath, { recursive: true });
135
+ if (!fs.existsSync(resourcePath)) {
136
+ fs.mkdirSync(resourcePath, { recursive: true });
94
137
  }
95
138
 
96
- const uptimeYamlFile = `${uptimeDirPath}/uptime-checks.yaml`;
139
+ const uptimeYamlFile = `${resourcePath}/uptime-checks.yaml`;
97
140
  if (!fs.existsSync(uptimeYamlFile)) {
98
141
  this.copyDir(
99
142
  'uptime-checks',
100
- uptimeDirPath,
143
+ resourcePath,
101
144
  {
102
145
  ...this.answers,
103
146
  serviceName,
@@ -111,6 +154,8 @@ module.exports = class extends BaseGenerator {
111
154
  }
112
155
 
113
156
  if (monitoringResource === 'slos') {
157
+ const serviceFolderName = serviceName.replace(/ /g, '-').toLowerCase();
158
+ const serviceDir = path.join(process.cwd(), 'infra', 'prod', 'monitoring', monitoringResource, serviceFolderName);
114
159
  const fileContainsFilter = (fileName, str) => {
115
160
  const contents = fs.readFileSync(fileName, 'utf-8');
116
161
  const result = contents.includes(str);
@@ -167,6 +212,26 @@ module.exports = class extends BaseGenerator {
167
212
  await handleSlosFile(this.answers, sloYamlFile);
168
213
  }
169
214
  }
215
+
216
+ if (monitoringResource === 'alerts') {
217
+ const yamlPath = `${resourcePath}/alerts.yaml`;
218
+ const terraPath = `${resourcePath}/terragrunt.hcl`;
219
+ if (!fs.existsSync(resourcePath)) fs.mkdirSync(resourcePath, { recursive: true });
220
+ if (!fs.existsSync(yamlPath)) fs.writeFileSync(yamlPath, '');
221
+
222
+ if (!fs.existsSync(terraPath)) {
223
+ this.fs.copyTpl(
224
+ this.templatePath('alerts/terragrunt.hcl'),
225
+ this.destinationPath(terraPath),
226
+ this.answers,
227
+ );
228
+ }
229
+
230
+ const oldYaml = yaml.load(fs.readFileSync(yamlPath, 'utf8')) || [];
231
+ const newYaml = await handleAlerts(oldYaml, alertTemplates, this.answers);
232
+
233
+ fs.writeFileSync(yamlPath, yaml.dump(newYaml));
234
+ }
170
235
  }
171
236
 
172
237
  end() {
@@ -0,0 +1,257 @@
1
+ cloud_run:
2
+ error_count:
3
+ display_name: "[P3] <%-systemName%>.<%-serviceName%> | 5xx Error Request Count above 1"
4
+ conditions:
5
+ - display_name: Cloud Run Anthos - 5xx error Request Count above 1
6
+ condition_threshold:
7
+ filter: |
8
+ resource.type="knative_revision"
9
+ resource.labels.service_name="<%-serviceName%>"
10
+ metric.type="knative.dev/serving/revision/request_count"
11
+ metric.labels.response_code_class="5xx"
12
+ threshold_value: 1
13
+ aggregations:
14
+ - alignment_period: 60s
15
+ cross_series_reducer: REDUCE_SUM
16
+ group_by_fields:
17
+ - metric.label.response_code_class
18
+ per_series_aligner: ALIGN_DELTA
19
+ documentation:
20
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
21
# Alert when more than 3% of the service's requests in a 10 minute window are 5xx.
# The MQL filter uses the prompted service name so the policy watches the
# service named in display_name rather than one fixed service.
error_rate:
  display_name: "[P3] <%-systemName%>.<%-serviceName%> | High 5xx Error Rate"
  conditions:
    - display_name: Cloud Run Anthos - 3% of all requests during 10min are 5xx
      condition_monitoring_query_language:
        query: |
          fetch knative_revision::knative.dev/serving/revision/request_count
          | filter service_name = "<%-serviceName%>"
          | align int_mean_aligner(10m)
          | group_by [], sum(if(metric.response_code_class == '5xx', val(), 0)) / sum(val())
          | condition val() > 0.03
          | every 10m
  documentation:
    content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
35
+ request_latency:
36
+ display_name: "[P3] <%-systemName%>.<%-serviceName%> | High Request Latency"
37
+ conditions:
38
+ - display_name: Cloud Run Anthos - Response Time (95%) above 1s for 5 min
39
+ condition_threshold:
40
+ filter: |
41
+ resource.type="knative_revision"
42
+ resource.labels.service_name="<%-serviceName%>"
43
+ metric.type="knative.dev/serving/revision/request_latencies"
44
+ threshold_value: 1000
45
+ duration: 300s
46
+ aggregations:
47
+ - alignment_period: 60s
48
+ cross_series_reducer: REDUCE_NONE
49
+ per_series_aligner: ALIGN_PERCENTILE_95
50
+ documentation:
51
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
52
+ cloud_scheduler:
53
+ failed_job:
54
+ display_name: "[P4] <%-systemName%> - Cloud Scheduler | <%-jobId%> - Job Failed"
55
+ conditions:
56
+ - display_name: Cloud Scheduler Job - Log entries with SEVERITY=Error exceed threshold
57
+ condition_threshold:
58
+ filter: |
59
+ resource.type="cloud_scheduler_job"
60
+ resource.labels.job_id="<%-jobId%>"
61
+ metric.type="logging.googleapis.com/log_entry_count"
62
+ metric.labels.severity="ERROR"
63
+ threshold_value: 1
64
+ aggregations:
65
+ - alignment_period: 60s
66
+ cross_series_reducer: REDUCE_NONE
67
+ per_series_aligner: ALIGN_COUNT
68
+ documentation:
69
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
70
+ cloud_sql:
71
+ cpu_over_65:
72
+ display_name: "[P3] <%-systemName%> - CloudSQL | <%-databaseId%> - CPU over 65%"
73
+ conditions:
74
+ - display_name: Cloud SQL Database - CPU utilization above 65% over 5 min
75
+ condition_threshold:
76
+ filter: |
77
+ resource.type="cloudsql_database"
78
+ resource.labels.database_id="<%-databaseId%>"
79
+ metric.type="cloudsql.googleapis.com/database/cpu/utilization"
80
+ threshold_value: 0.65
81
+ duration: 300s
82
+ aggregations:
83
+ - alignment_period: 60s
84
+ cross_series_reducer: REDUCE_NONE
85
+ per_series_aligner: ALIGN_MAX
86
+ documentation:
87
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
88
+ cpu_over_85:
89
+ display_name: "[P3] <%-systemName%> - CloudSQL | <%-databaseId%> - CPU over 85%"
90
+ conditions:
91
+ - display_name: "Cloud SQL Database - CPU-usage above 85% over 1 min"
92
+ condition_threshold:
93
+ filter: |
94
+ resource.type="cloudsql_database"
95
+ resource.labels.database_id="<%-databaseId%>"
96
+ metric.type="cloudsql.googleapis.com/database/cpu/utilization"
97
+ threshold_value: 0.85
98
+ duration: 60s
99
+ aggregations:
100
+ - alignment_period: 60s
101
+ cross_series_reducer: REDUCE_NONE
102
+ per_series_aligner: ALIGN_MAX
103
+ documentation:
104
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
105
+ cpu_over_90:
106
+ display_name: "[P3] <%-systemName%> - CloudSQL | <%-databaseId%> - CPU over 90%"
107
+ conditions:
108
+ - display_name: Cloud SQL Database - CPU-usage above 90%
109
+ condition_threshold:
110
+ filter: |
111
+ resource.type="cloudsql_database"
112
+ resource.labels.database_id="<%-databaseId%>"
113
+ metric.type="cloudsql.googleapis.com/database/cpu/utilization"
114
+ threshold_value: 0.9
115
+ aggregations:
116
+ - alignment_period: 60s
117
+ cross_series_reducer: REDUCE_NONE
118
+ per_series_aligner: ALIGN_MAX
119
+ documentation:
120
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
121
# Cloud SQL memory utilization above 50% for 5 minutes.
# database/memory/utilization is reported as a fraction (0.0-1.0), like the
# CPU utilization metric used by the cpu_over_* alerts, so 50% is 0.5.
memory_over_50:
  display_name: "[P3] <%-systemName%> - CloudSQL | <%-databaseId%> - Memory over 50%"
  conditions:
    - display_name: Cloud SQL Database - Memory utilization above 50% over 5 min
      condition_threshold:
        filter: |
          resource.type="cloudsql_database"
          resource.labels.database_id="<%-databaseId%>"
          metric.type="cloudsql.googleapis.com/database/memory/utilization"
        threshold_value: 0.5
        duration: 300s
        aggregations:
          - alignment_period: 60s
            cross_series_reducer: REDUCE_NONE
            per_series_aligner: ALIGN_MAX
  documentation:
    content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
138
# Cloud SQL memory utilization above 75% for 5 minutes.
# database/memory/utilization is reported as a fraction (0.0-1.0), like the
# CPU utilization metric used by the cpu_over_* alerts, so 75% is 0.75.
memory_over_75:
  display_name: "[P3] <%-systemName%> - CloudSQL | <%-databaseId%> - Memory over 75%"
  conditions:
    - display_name: Cloud SQL Database - Memory utilization above 75% over 5 min
      condition_threshold:
        filter: |
          resource.type="cloudsql_database"
          resource.labels.database_id="<%-databaseId%>"
          metric.type="cloudsql.googleapis.com/database/memory/utilization"
        threshold_value: 0.75
        duration: 300s
        aggregations:
          - alignment_period: 60s
            cross_series_reducer: REDUCE_NONE
            per_series_aligner: ALIGN_MAX
  documentation:
    content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
155
# Cloud SQL memory utilization above 90% for 1 minute.
# database/memory/utilization is reported as a fraction (0.0-1.0), like the
# CPU utilization metric used by the cpu_over_* alerts, so 90% is 0.9.
memory_over_90:
  display_name: "[P3] <%-systemName%> - CloudSQL | <%-databaseId%> - Memory over 90%"
  conditions:
    - display_name: Cloud SQL Database - Memory utilization above 90%
      condition_threshold:
        filter: |
          resource.type="cloudsql_database"
          resource.labels.database_id="<%-databaseId%>"
          metric.type="cloudsql.googleapis.com/database/memory/utilization"
        threshold_value: 0.9
        duration: 60s
        aggregations:
          - alignment_period: 60s
            cross_series_reducer: REDUCE_NONE
            per_series_aligner: ALIGN_MAX
  documentation:
    content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
172
+ query_over_1s:
173
+ display_name: "[P4] <%-systemName%> - CloudSQL | <%-databaseId%> - Query resolve time"
174
+ conditions:
175
+ - display_name: Cloud SQL Instance Database - Per query execution times above 1000 ms
176
+ condition_threshold:
177
+ filter: |
178
+ resource.type="cloudsql_instance_database"
179
+ resource.labels.resource_id="<%-databaseId%>"
180
+ metric.type="cloudsql.googleapis.com/database/postgresql/insights/perquery/execution_time"
181
+ threshold_value: 1000000
182
+ aggregations:
183
+ - alignment_period: 60s
184
+ cross_series_reducer: REDUCE_NONE
185
+ per_series_aligner: ALIGN_DELTA
186
+ documentation:
187
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
188
+ memorystore:
189
+ memory_over_50:
190
+ display_name: "[P4] <%-systemName%> - Memorystore | <%-instanceId%> - Memory over 50%"
191
+ conditions:
192
+ - display_name: Memorystore Redis Instance - Memory Usage above 50% over 5 min
193
+ condition_threshold:
194
+ filter: |
195
+ resource.type="redis_instance"
196
+ resource.labels.instance_id="<%-instanceId%>"
197
+ metric.type="redis.googleapis.com/stats/memory/usage_ratio"
198
+ threshold_value: 0.5
199
+ duration: 300s
200
+ aggregations:
201
+ - alignment_period: 60s
202
+ cross_series_reducer: REDUCE_NONE
203
+ per_series_aligner: ALIGN_MAX
204
+ documentation:
205
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
206
+ memory_over_75:
207
+ display_name: "[P4] <%-systemName%> - Memorystore | <%-instanceId%> - Memory over 75%"
208
+ conditions:
209
+ - display_name: Memorystore Redis Instance - Memory Usage above 75% for 5min
210
+ condition_threshold:
211
+ filter: |
212
+ resource.type="redis_instance"
213
+ resource.labels.instance_id="<%-instanceId%>"
214
+ metric.type="redis.googleapis.com/stats/memory/usage_ratio"
215
+ threshold_value: 0.75
216
+ duration: 300s
217
+ aggregations:
218
+ - alignment_period: 60s
219
+ cross_series_reducer: REDUCE_NONE
220
+ per_series_aligner: ALIGN_MAX
221
+ documentation:
222
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
223
+ memory_over_90:
224
+ display_name: "[P2] <%-systemName%> - Memorystore | <%-instanceId%> - Memory over 90%"
225
+ conditions:
226
+ - display_name: Memorystore Redis Instance - Memory Usage above 90%
227
+ condition_threshold:
228
+ filter: |
229
+ resource.type="redis_instance"
230
+ resource.labels.instance_id="<%-instanceId%>"
231
+ metric.type="redis.googleapis.com/stats/memory/usage_ratio"
232
+ threshold_value: 0.90
233
+ duration: 60s
234
+ aggregations:
235
+ - alignment_period: 60s
236
+ cross_series_reducer: REDUCE_NONE
237
+ per_series_aligner: ALIGN_MAX
238
+ documentation:
239
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
240
+ pub_sub:
241
+ unacknowledged_messages:
242
+ display_name: "[P4] <%-systemName%> - Pub/Sub | <%-subscriptionId%> - Undelivered message(s)"
243
+ conditions:
244
+ - display_name: Cloud Pub/Sub Subscription - Undelivered messages above 1 for 5 min
245
+ condition_threshold:
246
+ filter: |
247
+ resource.type="pubsub_subscription"
248
+ resource.labels.subscription_id="<%-subscriptionId%>"
249
+ metric.type="pubsub.googleapis.com/subscription/num_undelivered_messages"
250
+ threshold_value: 1
251
+ duration: 300s
252
+ aggregations:
253
+ - alignment_period: 60s
254
+ cross_series_reducer: REDUCE_NONE
255
+ per_series_aligner: ALIGN_MEAN
256
+ documentation:
257
+ content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
@@ -0,0 +1,33 @@
1
+ # Terragrunt will copy the Terraform configurations specified by the source parameter, along with any files in the
2
+ # working directory, into a temporary folder, and execute your Terraform commands in that folder.
3
+ terraform {
4
+ source = "git::https://github.com/extenda/tf-module-gcp-alert-policy//?ref=v0.1.0"
5
+ }
6
+
7
+ # Include all settings from the root terragrunt.hcl file
8
+ include {
9
+ path = find_in_parent_folders("terragrunt_root.hcl")
10
+ }
11
+
12
+ dependency "notification_channels" {
13
+ config_path = "../notification-channels"
14
+ mock_outputs = {
15
+ notification_channels = ["dummy-channel"]
16
+ }
17
+ }
18
+
19
+ locals {
20
+ project_vars = read_terragrunt_config(find_in_parent_folders("project.hcl"))
21
+ common_vars = read_terragrunt_config(find_in_parent_folders("common.hcl"))
22
+ }
23
+
24
+ # These are the variables we have to pass in to use the module specified in the terragrunt configuration above
25
+ inputs = {
26
+ monitoring_project_id = local.project_vars.locals.monitoring_project_id,
27
+ notification_channels = dependency.notification_channels.outputs.notification_channels,
28
+ policies = yamldecode(file("${get_terragrunt_dir()}/alerts.yaml")),
29
+ user_labels = {
30
+ cc = local.common_vars.locals.cost_center
31
+ clan = local.common_vars.locals.clan_name
32
+ },
33
+ }
@@ -1,3 +1,3 @@
1
- - service_name: "<%-systemName%>.<%-serviceName%>"
2
- hostname: "<%-hostname%>"
3
- path: "<%-path%>"
1
+ - service_name: <%-systemName%>.<%-serviceName%>
2
+ hostname: <%-hostname%>
3
+ path: <%-path%>
@@ -15,4 +15,11 @@ helper.validSystemName = (input) => {
15
15
  return 'System name must be 3 characters';
16
16
  };
17
17
 
18
/**
 * Prompt validator for the optional runbook link.
 *
 * Accepts an empty answer (no runbook) or any well-formed `https://` URL.
 * Parsing is delegated to the WHATWG `URL` class so hosts and paths with
 * digits, dots, hyphens or query strings (e.g. Confluence page ids) are
 * accepted, which a letters-only pattern would reject.
 *
 * @param {string} input - The raw answer typed by the user.
 * @returns {true|string} `true` when valid, otherwise the error message to show.
 */
helper.validUrl = (input) => {
  const invalidMessage = 'Enter a valid URL';

  // An empty answer means "no runbook" and is explicitly allowed.
  if (input === '') return true;

  // The URL parser silently strips some whitespace and tolerates a missing
  // "//" after the scheme, so those cases are rejected before parsing.
  if (typeof input !== 'string' || /\s/.test(input) || !/^https:\/\//i.test(input)) {
    return invalidMessage;
  }

  try {
    const { protocol, hostname } = new URL(input);
    return protocol === 'https:' && hostname !== '' ? true : invalidMessage;
  } catch (err) {
    // `new URL` throws on unparsable input; that is the expected "invalid" path.
    return invalidMessage;
  }
};
24
+
18
25
  module.exports = helper;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hiiretail/gcp-infra-cli",
3
- "version": "0.73.0",
3
+ "version": "0.74.0",
4
4
  "description": "Infrastructure as code generator for GCP.",
5
5
  "main": "src/cli.js",
6
6
  "bin": {