@hiiretail/gcp-infra-cli 0.80.0 → 0.80.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2,47 +2,88 @@
|
|
|
2
2
|
|
|
3
3
|
## General
|
|
4
4
|
|
|
5
|
-
<!-- Describe in short what the
|
|
5
|
+
<!-- Describe, in short, what the system does and the interactions with external and/or third party systems.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
|
|
9
|
+
Transaction Repository is a centralized product which receives transactions from many parts of Hii Retail. The intention is to store transactions long term and to expose a search API to be able to search for transactions.
|
|
10
|
+
-->
|
|
6
11
|
|
|
7
12
|
## Architecture
|
|
8
13
|
|
|
9
|
-
<!-- Include C4 diagrams
|
|
14
|
+
<!-- Include the C4 diagrams and a link to the Software Guidebook. -->
|
|
10
15
|
|
|
11
16
|
## Business Continuity and Disaster Recovery Plan
|
|
12
17
|
|
|
13
|
-
<!--
|
|
18
|
+
<!-- Add a link to the Business Continuity and Disaster Recovery Plan. -->
|
|
14
19
|
|
|
15
20
|
## Services
|
|
16
21
|
|
|
17
|
-
<!--
|
|
22
|
+
<!-- List all internal services that are a part of the system with a short explanation of their purpose and a link to the logs of the service.
|
|
23
|
+
-->
|
|
18
24
|
|
|
19
25
|
## Dashboard
|
|
20
26
|
|
|
21
|
-
<!--
|
|
27
|
+
<!-- Add link(s) to the dashboard(s) that are set up. Write a short explanation of what each dashboard displays and its purpose.
|
|
28
|
+
-->
|
|
22
29
|
|
|
23
30
|
## Service Level Objectives
|
|
24
31
|
|
|
25
|
-
<!--
|
|
32
|
+
<!-- Add a link to, or include, the SLOs that are defined for each service. -->
|
|
26
33
|
|
|
27
34
|
## Alerts
|
|
28
35
|
|
|
29
|
-
<!--
|
|
36
|
+
<!-- List the alerts according to the format:
|
|
37
|
+
|
|
38
|
+
Alert name
|
|
39
|
+
* Description
|
|
40
|
+
* Notification channels
|
|
41
|
+
* Remediation steps
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
|
|
45
|
+
[P1] che.checkout-engine-isrg-nl-checkout-api - Service is offline
|
|
46
|
+
|
|
47
|
+
Description: Triggers when the uptime check fails
|
|
48
|
+
|
|
49
|
+
Notification channels:
|
|
50
|
+
* Slack, #monitoring-channel
|
|
51
|
+
* SMS
|
|
52
|
+
* Jira
|
|
53
|
+
|
|
54
|
+
Remediation steps:
|
|
55
|
+
1. Check if the memory usage (Link to where to check that) is higher than usual (Is there a threshold?)
|
|
56
|
+
2. Check if the number of requests (Link to where to check that) is higher than usual
|
|
57
|
+
3. Follow the Contact & Escalation Matrix
|
|
58
|
+
-->
|
|
30
59
|
|
|
31
60
|
## Health Checks
|
|
32
61
|
|
|
33
|
-
<!--
|
|
62
|
+
<!-- Add links to the configured health checks in GCP -->
|
|
63
|
+
|
|
64
|
+
## Accessibility (GCP)
|
|
65
|
+
|
|
66
|
+
<!-- What permissions are required to access the GCP resources that are used by the system?
|
|
67
|
+
|
|
68
|
+
Example:
|
|
69
|
+
* Cloud SQL - roles/cloudsql.editor (txengine-prod-1c85)
|
|
70
|
+
* Secret Manager - roles/secretmanager.admin (cardpayment-prod-d5b4)
|
|
71
|
+
|
|
72
|
+
Add examples of how to use the Just-In-Time Access system (https://jit-access.retailsvc.com/)
|
|
73
|
+
-->
|
|
34
74
|
|
|
35
75
|
## How do I..?
|
|
36
76
|
|
|
37
|
-
<!-- Good to know things
|
|
77
|
+
<!-- Good to know things, such as "How do I connect to the database?", "How do I find a specific item?" -->
|
|
38
78
|
|
|
39
79
|
## Known Issues
|
|
40
80
|
|
|
41
|
-
<!-- Are there any known issues?
|
|
81
|
+
<!-- Are there any known issues that require manual intervention? Is there a workaround for the issue? There should be a short description of the issue and a link to the Jira issue where more details can be found.
|
|
82
|
+
-->
|
|
42
83
|
|
|
43
84
|
## Contact & Escalation Matrix
|
|
44
85
|
|
|
45
|
-
<!-- If the team is unable to resolve
|
|
86
|
+
<!-- If the team is unable to resolve an incident or needs to escalate it, who is the first to contact?
|
|
46
87
|
|
|
47
88
|
| # | Name | Role | E-Mail | Phone number |
|
|
48
89
|
| --- | --- | --- | --- | --- |
|
|
@@ -7,10 +7,12 @@ const { required } = require('../../../src/validators');
|
|
|
7
7
|
const validate = require('./validate');
|
|
8
8
|
const { handleSlos, handleAlerts, handleUptimeChecks } = require('./handle-yaml');
|
|
9
9
|
const { getProjectId } = require('../pubsub/get-gcp-projects');
|
|
10
|
+
const getTribeAndClanName = require('../../init/clan-infra/tribe-clan-repo');
|
|
10
11
|
|
|
11
12
|
const uptimeCheckTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/uptime-checks/uptime-checks.yaml`));
|
|
12
13
|
const alertTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/alerts/alerts.yaml`));
|
|
13
14
|
const sloTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/slos/slos.yaml`));
|
|
15
|
+
const { clan: defaultClan } = getTribeAndClanName();
|
|
14
16
|
const projectId = getProjectId('prod');
|
|
15
17
|
|
|
16
18
|
module.exports = class extends BaseGenerator {
|
|
@@ -35,7 +37,7 @@ module.exports = class extends BaseGenerator {
|
|
|
35
37
|
choices: (answers) => Object.keys(alertTemplates[`${answers.alertResource}`]),
|
|
36
38
|
},
|
|
37
39
|
{
|
|
38
|
-
when: (response) => ['
|
|
40
|
+
when: (response) => ['slos', 'uptime-checks'].includes(response.monitoringResource) || response.alertResource === 'cloud_run',
|
|
39
41
|
type: 'input',
|
|
40
42
|
name: 'systemName',
|
|
41
43
|
message: 'Please provide three-letter system name as defined in Styra (example: sre, ptf, sda, che, pnp, iam...)',
|
|
@@ -146,8 +148,10 @@ module.exports = class extends BaseGenerator {
|
|
|
146
148
|
const yamlPath = `${resourceDir}/alerts.yaml`;
|
|
147
149
|
|
|
148
150
|
copyTemplate('alerts', resourceDir, yamlPath);
|
|
151
|
+
|
|
149
152
|
const oldYaml = yaml.load(fs.readFileSync(yamlPath, 'utf8')) || [];
|
|
150
|
-
const newYaml = await handleAlerts(oldYaml, alertTemplates,
|
|
153
|
+
const newYaml = await handleAlerts(oldYaml, alertTemplates,
|
|
154
|
+
{ ...this.answers, clan: defaultClan });
|
|
151
155
|
|
|
152
156
|
fs.writeFileSync(yamlPath, yaml.dump(newYaml));
|
|
153
157
|
}
|
|
@@ -18,7 +18,7 @@ cloud_run:
|
|
|
18
18
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
19
19
|
cloud_scheduler:
|
|
20
20
|
failed_job:
|
|
21
|
-
display_name: "[P4] <%-
|
|
21
|
+
display_name: "[P4] <%-clan%> - Cloud Scheduler | Job Failed"
|
|
22
22
|
conditions:
|
|
23
23
|
- display_name: Cloud Scheduler Job - Log entries with SEVERITY=Error exceed threshold
|
|
24
24
|
condition_threshold:
|
|
@@ -36,7 +36,7 @@ cloud_scheduler:
|
|
|
36
36
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
37
37
|
cloud_sql:
|
|
38
38
|
cpu_over_65:
|
|
39
|
-
display_name: "[P3] <%-
|
|
39
|
+
display_name: "[P3] <%-clan%> - CloudSQL | CPU over 65%"
|
|
40
40
|
conditions:
|
|
41
41
|
- display_name: Cloud SQL Database - CPU utilization above 65% over 5 min
|
|
42
42
|
condition_threshold:
|
|
@@ -53,7 +53,7 @@ cloud_sql:
|
|
|
53
53
|
documentation:
|
|
54
54
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
55
55
|
cpu_over_85:
|
|
56
|
-
display_name: "[P3] <%-
|
|
56
|
+
display_name: "[P3] <%-clan%> - CloudSQL | CPU over 85%"
|
|
57
57
|
conditions:
|
|
58
58
|
- display_name: "Cloud SQL Database - CPU-usage above 85% over 1 min"
|
|
59
59
|
condition_threshold:
|
|
@@ -70,7 +70,7 @@ cloud_sql:
|
|
|
70
70
|
documentation:
|
|
71
71
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
72
72
|
cpu_over_90:
|
|
73
|
-
display_name: "[P3] <%-
|
|
73
|
+
display_name: "[P3] <%-clan%> - CloudSQL | CPU over 90%"
|
|
74
74
|
conditions:
|
|
75
75
|
- display_name: Cloud SQL Database - CPU-usage above 90%
|
|
76
76
|
condition_threshold:
|
|
@@ -86,7 +86,7 @@ cloud_sql:
|
|
|
86
86
|
documentation:
|
|
87
87
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
88
88
|
query_over_1s:
|
|
89
|
-
display_name: "[P4] <%-
|
|
89
|
+
display_name: "[P4] <%-clan%> - CloudSQL | Query resolve time"
|
|
90
90
|
conditions:
|
|
91
91
|
- display_name: Cloud SQL Instance Database - Per query execution times above 1000 ms
|
|
92
92
|
condition_threshold:
|
|
@@ -103,7 +103,7 @@ cloud_sql:
|
|
|
103
103
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
104
104
|
memorystore:
|
|
105
105
|
memory_over_50:
|
|
106
|
-
display_name: "[P4] <%-
|
|
106
|
+
display_name: "[P4] <%-clan%> - Memorystore | Memory over 50%"
|
|
107
107
|
conditions:
|
|
108
108
|
- display_name: Memorystore Redis Instance - Memory Usage above 50% over 5 min
|
|
109
109
|
condition_threshold:
|
|
@@ -120,7 +120,7 @@ memorystore:
|
|
|
120
120
|
documentation:
|
|
121
121
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
122
122
|
memory_over_75:
|
|
123
|
-
display_name: "[P4] <%-
|
|
123
|
+
display_name: "[P4] <%-clan%> - Memorystore | Memory over 75%"
|
|
124
124
|
conditions:
|
|
125
125
|
- display_name: Memorystore Redis Instance - Memory Usage above 75% for 5min
|
|
126
126
|
condition_threshold:
|
|
@@ -137,7 +137,7 @@ memorystore:
|
|
|
137
137
|
documentation:
|
|
138
138
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
139
139
|
memory_over_90:
|
|
140
|
-
display_name: "[P2] <%-
|
|
140
|
+
display_name: "[P2] <%-clan%> - Memorystore | Memory over 90%"
|
|
141
141
|
conditions:
|
|
142
142
|
- display_name: Memorystore Redis Instance - Memory Usage above 90%
|
|
143
143
|
condition_threshold:
|
|
@@ -155,7 +155,7 @@ memorystore:
|
|
|
155
155
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
156
156
|
pub_sub:
|
|
157
157
|
unacknowledged_messages:
|
|
158
|
-
display_name: "[P3] <%-
|
|
158
|
+
display_name: "[P3] <%-clan%> - Pub/Sub | Undelivered message(s)"
|
|
159
159
|
conditions:
|
|
160
160
|
- display_name: Cloud Pub/Sub Subscription - Undelivered messages above 1 for 5 min
|
|
161
161
|
condition_threshold:
|
|
@@ -172,7 +172,7 @@ pub_sub:
|
|
|
172
172
|
documentation:
|
|
173
173
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
174
174
|
messages_in_dlq:
|
|
175
|
-
display_name: "[P3] <%-
|
|
175
|
+
display_name: "[P3] <%-clan%> - Pub/Sub | Message(s) in DLQ"
|
|
176
176
|
conditions:
|
|
177
177
|
- display_name: Cloud Pub/Sub Subscription - Number of undelivered message(s) forwarded to DLQ
|
|
178
178
|
condition_threshold:
|
|
@@ -189,7 +189,7 @@ pub_sub:
|
|
|
189
189
|
documentation:
|
|
190
190
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
191
191
|
latency:
|
|
192
|
-
display_name: "[P3] <%-
|
|
192
|
+
display_name: "[P3] <%-clan%> - Pub/Sub | Response latency distribution"
|
|
193
193
|
conditions:
|
|
194
194
|
- display_name: Cloud Pub/Sub Subscription - Latency above 3s
|
|
195
195
|
condition_threshold:
|
|
@@ -208,7 +208,7 @@ pub_sub:
|
|
|
208
208
|
content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
|
|
209
209
|
cloud_function:
|
|
210
210
|
failed_execution:
|
|
211
|
-
display_name: "[P2] <%-
|
|
211
|
+
display_name: "[P2] <%-clan%> - Cloud Function | Failed job execution"
|
|
212
212
|
conditions:
|
|
213
213
|
- display_name: Cloud Function - Execution error count
|
|
214
214
|
condition_threshold:
|