@hiiretail/gcp-infra-cli 0.80.0 → 0.80.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,47 +2,88 @@
2
2
 
3
3
  ## General
4
4
 
5
- <!-- Describe in short what the purpose of the solution is. -->
5
+ <!-- Describe, in short, what the system does and its interactions with external and/or third-party systems.
6
+
7
+ Example:
8
+
9
+ Transaction Repository is a centralized product which receives transactions from many parts of Hii Retail. The intention is to store transactions long term and to expose a search API to be able to search for transactions.
10
+ -->
6
11
 
7
12
  ## Architecture
8
13
 
9
- <!-- Include C4 diagrams or links to the Software Guidebook. -->
14
+ <!-- Include the C4 diagrams and a link to the Software Guidebook. -->
10
15
 
11
16
  ## Business Continuity and Disaster Recovery Plan
12
17
 
13
- <!-- Link to the Business Continuity and Disaster Recovery Plan documentation. -->
18
+ <!-- Add a link to the Business Continuity and Disaster Recovery Plan. -->
14
19
 
15
20
  ## Services
16
21
 
17
- <!-- A short description of what the purpose of each service is. Links to the log files of all the services that are included in the solution. -->
22
+ <!-- List all internal services that are a part of the system with a short explanation of their purpose and a link to the logs of the service.
23
+ -->
18
24
 
19
25
  ## Dashboard
20
26
 
21
- <!-- Links to one or multiple dashboards. -->
27
+ <!-- Add link(s) to the dashboard(s) that are set up. Write a short explanation of what each dashboard displays and its purpose.
28
+ -->
22
29
 
23
30
  ## Service Level Objectives
24
31
 
25
- <!-- What are the SLOs? -->
32
+ <!-- Add a link to, or include, the SLOs that are defined for each service. -->
26
33
 
27
34
  ## Alerts
28
35
 
29
- <!-- What are the alerts that has been setup, where is alert sent to and what are the steps to mitigate the issue? -->
36
+ <!-- List the alerts according to the format:
37
+
38
+ Alert name
39
+ * Description
40
+ * Notification channels
41
+ * Remediation steps
42
+
43
+ Example:
44
+
45
+ [P1] che.checkout-engine-isrg-nl-checkout-api - Service is offline
46
+
47
+ Description: Triggers when the uptime check fails
48
+
49
+ Notification channels:
50
+ * Slack, #monitoring-channel
51
+ * SMS
52
+ * Jira
53
+
54
+ Remediation steps:
55
+ 1. Check if the memory usage (Link to where to check that) is higher than usual (Is there a threshold?)
56
+ 2. Check if the number of requests (Link to where to check that) is higher than usual
57
+ 3. Follow the Contact & Escalation Matrix
58
+ -->
30
59
 
31
60
  ## Health Checks
32
61
 
33
- <!-- Links to the configured uptime checks that has been setup in GCP. -->
62
+ <!-- Add links to the configured health checks in GCP -->
63
+
64
+ ## Accessibility (GCP)
65
+
66
+ <!-- What permissions are required to access the GCP resources that are used by the system?
67
+
68
+ Example:
69
+ * Cloud SQL - roles/cloudsql.editor (txengine-prod-1c85)
70
+ * Secret Manager - roles/secretmanager.admin (cardpayment-prod-d5b4)
71
+
72
+ Add examples of how to use the Just-In-Time Access system (https://jit-access.retailsvc.com/)
73
+ -->
34
74
 
35
75
  ## How do I..?
36
76
 
37
- <!-- Good to know things. Such as `How do I check the price for a specific item?` -->
77
+ <!-- Good to know things, such as "How do I connect to the database?", "How do I find a specific item?" -->
38
78
 
39
79
  ## Known Issues
40
80
 
41
- <!-- Are there any known issues? If yes, what is the workaround to solve them? -->
81
+ <!-- Are there any known issues that require manual intervention? Is there a workaround for the issue? There should be a short description of the issue and a link to the Jira issue where more details can be found.
82
+ -->
42
83
 
43
84
  ## Contact & Escalation Matrix
44
85
 
45
- <!-- If the team is unable to resolve the issue, who is the first in line to contact?
86
+ <!-- If the team is unable to resolve an incident or needs to escalate it, who is the first person to contact?
46
87
 
47
88
  | # | Name | Role | E-Mail | Phone number |
48
89
  | --- | --- | --- | --- | --- |
@@ -7,10 +7,12 @@ const { required } = require('../../../src/validators');
7
7
  const validate = require('./validate');
8
8
  const { handleSlos, handleAlerts, handleUptimeChecks } = require('./handle-yaml');
9
9
  const { getProjectId } = require('../pubsub/get-gcp-projects');
10
+ const getTribeAndClanName = require('../../init/clan-infra/tribe-clan-repo');
10
11
 
11
12
  const uptimeCheckTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/uptime-checks/uptime-checks.yaml`));
12
13
  const alertTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/alerts/alerts.yaml`));
13
14
  const sloTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/slos/slos.yaml`));
15
+ const { clan: defaultClan } = getTribeAndClanName();
14
16
  const projectId = getProjectId('prod');
15
17
 
16
18
  module.exports = class extends BaseGenerator {
@@ -35,7 +37,7 @@ module.exports = class extends BaseGenerator {
35
37
  choices: (answers) => Object.keys(alertTemplates[`${answers.alertResource}`]),
36
38
  },
37
39
  {
38
- when: (response) => ['alerts', 'slos', 'uptime-checks'].includes(response.monitoringResource),
40
+ when: (response) => ['slos', 'uptime-checks'].includes(response.monitoringResource) || response.alertResource === 'cloud_run',
39
41
  type: 'input',
40
42
  name: 'systemName',
41
43
  message: 'Please provide three-letter system name as defined in Styra (example: sre, ptf, sda, che, pnp, iam...)',
@@ -146,8 +148,10 @@ module.exports = class extends BaseGenerator {
146
148
  const yamlPath = `${resourceDir}/alerts.yaml`;
147
149
 
148
150
  copyTemplate('alerts', resourceDir, yamlPath);
151
+
149
152
  const oldYaml = yaml.load(fs.readFileSync(yamlPath, 'utf8')) || [];
150
- const newYaml = await handleAlerts(oldYaml, alertTemplates, this.answers);
153
+ const newYaml = await handleAlerts(oldYaml, alertTemplates,
154
+ { ...this.answers, clan: defaultClan });
151
155
 
152
156
  fs.writeFileSync(yamlPath, yaml.dump(newYaml));
153
157
  }
@@ -18,7 +18,7 @@ cloud_run:
18
18
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
19
19
  cloud_scheduler:
20
20
  failed_job:
21
- display_name: "[P4] <%-systemName%> - Cloud Scheduler | Job Failed"
21
+ display_name: "[P4] <%-clan%> - Cloud Scheduler | Job Failed"
22
22
  conditions:
23
23
  - display_name: Cloud Scheduler Job - Log entries with SEVERITY=Error exceed threshold
24
24
  condition_threshold:
@@ -36,7 +36,7 @@ cloud_scheduler:
36
36
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
37
37
  cloud_sql:
38
38
  cpu_over_65:
39
- display_name: "[P3] <%-systemName%> - CloudSQL | CPU over 65%"
39
+ display_name: "[P3] <%-clan%> - CloudSQL | CPU over 65%"
40
40
  conditions:
41
41
  - display_name: Cloud SQL Database - CPU utilization above 65% over 5 min
42
42
  condition_threshold:
@@ -53,7 +53,7 @@ cloud_sql:
53
53
  documentation:
54
54
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
55
55
  cpu_over_85:
56
- display_name: "[P3] <%-systemName%> - CloudSQL | CPU over 85%"
56
+ display_name: "[P3] <%-clan%> - CloudSQL | CPU over 85%"
57
57
  conditions:
58
58
  - display_name: "Cloud SQL Database - CPU-usage above 85% over 1 min"
59
59
  condition_threshold:
@@ -70,7 +70,7 @@ cloud_sql:
70
70
  documentation:
71
71
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
72
72
  cpu_over_90:
73
- display_name: "[P3] <%-systemName%> - CloudSQL | CPU over 90%"
73
+ display_name: "[P3] <%-clan%> - CloudSQL | CPU over 90%"
74
74
  conditions:
75
75
  - display_name: Cloud SQL Database - CPU-usage above 90%
76
76
  condition_threshold:
@@ -86,7 +86,7 @@ cloud_sql:
86
86
  documentation:
87
87
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
88
88
  query_over_1s:
89
- display_name: "[P4] <%-systemName%> - CloudSQL | Query resolve time"
89
+ display_name: "[P4] <%-clan%> - CloudSQL | Query resolve time"
90
90
  conditions:
91
91
  - display_name: Cloud SQL Instance Database - Per query execution times above 1000 ms
92
92
  condition_threshold:
@@ -103,7 +103,7 @@ cloud_sql:
103
103
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
104
104
  memorystore:
105
105
  memory_over_50:
106
- display_name: "[P4] <%-systemName%> - Memorystore | Memory over 50%"
106
+ display_name: "[P4] <%-clan%> - Memorystore | Memory over 50%"
107
107
  conditions:
108
108
  - display_name: Memorystore Redis Instance - Memory Usage above 50% over 5 min
109
109
  condition_threshold:
@@ -120,7 +120,7 @@ memorystore:
120
120
  documentation:
121
121
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
122
122
  memory_over_75:
123
- display_name: "[P4] <%-systemName%> - Memorystore | Memory over 75%"
123
+ display_name: "[P4] <%-clan%> - Memorystore | Memory over 75%"
124
124
  conditions:
125
125
  - display_name: Memorystore Redis Instance - Memory Usage above 75% for 5min
126
126
  condition_threshold:
@@ -137,7 +137,7 @@ memorystore:
137
137
  documentation:
138
138
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
139
139
  memory_over_90:
140
- display_name: "[P2] <%-systemName%> - Memorystore | Memory over 90%"
140
+ display_name: "[P2] <%-clan%> - Memorystore | Memory over 90%"
141
141
  conditions:
142
142
  - display_name: Memorystore Redis Instance - Memory Usage above 90%
143
143
  condition_threshold:
@@ -155,7 +155,7 @@ memorystore:
155
155
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
156
156
  pub_sub:
157
157
  unacknowledged_messages:
158
- display_name: "[P3] <%-systemName%> - Pub/Sub | Undelivered message(s)"
158
+ display_name: "[P3] <%-clan%> - Pub/Sub | Undelivered message(s)"
159
159
  conditions:
160
160
  - display_name: Cloud Pub/Sub Subscription - Undelivered messages above 1 for 5 min
161
161
  condition_threshold:
@@ -172,7 +172,7 @@ pub_sub:
172
172
  documentation:
173
173
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
174
174
  messages_in_dlq:
175
- display_name: "[P3] <%-systemName%> - Pub/Sub | Message(s) in DLQ"
175
+ display_name: "[P3] <%-clan%> - Pub/Sub | Message(s) in DLQ"
176
176
  conditions:
177
177
  - display_name: Cloud Pub/Sub Subscription - Number of undelivered message(s) forwarded to DLQ
178
178
  condition_threshold:
@@ -189,7 +189,7 @@ pub_sub:
189
189
  documentation:
190
190
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
191
191
  latency:
192
- display_name: "[P3] <%-systemName%> - Pub/Sub | Response latency distribution"
192
+ display_name: "[P3] <%-clan%> - Pub/Sub | Response latency distribution"
193
193
  conditions:
194
194
  - display_name: Cloud Pub/Sub Subscription - Latency above 3s
195
195
  condition_threshold:
@@ -208,7 +208,7 @@ pub_sub:
208
208
  content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
209
209
  cloud_function:
210
210
  failed_execution:
211
- display_name: "[P2] <%-systemName%> - Cloud Function | Failed job execution"
211
+ display_name: "[P2] <%-clan%> - Cloud Function | Failed job execution"
212
212
  conditions:
213
213
  - display_name: Cloud Function - Execution error count
214
214
  condition_threshold:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hiiretail/gcp-infra-cli",
3
- "version": "0.80.0",
3
+ "version": "0.80.2",
4
4
  "description": "Infrastructure as code generator for GCP.",
5
5
  "main": "src/cli.js",
6
6
  "bin": {