npm - @hiiretail/gcp-infra-cli - Versions diffs - 0.80.0 → 0.80.2 - Mend

@hiiretail/gcp-infra-cli 0.80.0 → 0.80.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/generators/docs/runbook/templates/docs/runbook.md CHANGED Viewed

@@ -2,47 +2,88 @@
 ## General
-<!-- Describe in short what the purpose of the solution is. -->
+<!-- Describe, in short, what the system does and the interactions with external and/or third party systems.
+Example:
+Transaction Repository is a centralized product which receives transactions from many parts of Hii Retail. The intention is to store transactions long term and to expose a search API to be able to search for transactions.
+-->
 ## Architecture
-<!-- Include C4 diagrams or links to the Software Guidebook. -->
+<!-- Include the C4 diagrams and a link to the Software Guidebook. -->
 ## Business Continuity and Disaster Recovery Plan
-<!-- Link to the Business Continuity and Disaster Recovery Plan documentation. -->
+<!-- Add a link to the Business Continuity and Disaster Recovery Plan. -->
 ## Services
-<!-- A short description of what the purpose of each service is. Links to the log files of all the services that are included in the solution. -->
+<!-- List all internal services that are a part of the system with a short explanation of their purpose and a link to the logs of the service.
+-->
 ## Dashboard
-<!-- Links to one or multiple dashboards. -->
+<!-- Add link(s) to the dashboard(s) that are set up. Write a short explanation of what each dashboard displays and its purpose.
+-->
 ## Service Level Objectives
-<!-- What are the SLOs? -->
+<!-- Add a link to, or include, the SLOs that are defined for each service. -->
 ## Alerts
-<!-- What are the alerts that has been setup, where is alert sent to and what are the steps to mitigate the issue? -->
+<!-- List the alerts according to the format:
+Alert name
+  * Description
+  * Notification channels
+  * Remediation steps
+Example:
+[P1] che.checkout-engine-isrg-nl-checkout-api - Service is offline
+Description: Triggers when the uptime check fails
+Notification channels:
+  * Slack, #monitoring-channel
+  * SMS
+  * Jira
+Remediation steps:
+  1. Check if the memory usage (Link to where to check that) is higher than usual (Is there a threshold?)
+  2. Check if the number of requests (Link to where to check that) is higher than usual
+  3. Follow the Contact & Escalation Matrix
+-->
 ## Health Checks
-<!-- Links to the configured uptime checks that has been setup in GCP. -->
+<!-- Add links to the configured health checks in GCP -->
+## Accessibility (GCP)
+<!-- What permissions are required to access the GCP resources that are used by the system?
+Example:
+  * Cloud SQL - roles/cloudsql.editor (txengine-prod-1c85)
+  * Secret Manager - roles/secretmanager.admin (cardpayment-prod-d5b4)
+Add examples of how to use the Just-In-Time Access system (https://jit-access.retailsvc.com/)
+-->
 ## How do I..?
-<!-- Good to know things. Such as `How do I check the price for a specific item?` -->
+<!-- Good to know things, such as "How do I connect to the database?", "How do I find a specific item?"  -->
 ## Known Issues
-<!-- Are there any known issues? If yes, what is the workaround to solve them? -->
+<!-- Are there any known issues that require manual intervention? Is there a workaround for the issue? There should be a short description of the issue and a link to the Jira where more details can be found.
+-->
 ## Contact & Escalation Matrix
-<!-- If the team is unable to resolve the issue, who is the first in line to contact?
+<!-- If the team is unable to resolve an incident or needs to escalate it, who is the first person to contact?
 | #  | Name | Role | E-Mail | Phone number |
 | --- | --- | --- | --- | --- |

package/generators/resources/monitoring/index.js CHANGED Viewed

@@ -7,10 +7,12 @@ const { required } = require('../../../src/validators');
 const validate = require('./validate');
 const { handleSlos, handleAlerts, handleUptimeChecks } = require('./handle-yaml');
 const { getProjectId } = require('../pubsub/get-gcp-projects');
+const getTribeAndClanName = require('../../init/clan-infra/tribe-clan-repo');
 const uptimeCheckTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/uptime-checks/uptime-checks.yaml`));
 const alertTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/alerts/alerts.yaml`));
 const sloTemplates = yaml.load(fs.readFileSync(`${__dirname}/templates/slos/slos.yaml`));
+const { clan: defaultClan } = getTribeAndClanName();
 const projectId = getProjectId('prod');
 module.exports = class extends BaseGenerator {
@@ -35,7 +37,7 @@ module.exports = class extends BaseGenerator {
         choices: (answers) => Object.keys(alertTemplates[`${answers.alertResource}`]),
       },
       {
-        when: (response) => ['alerts', 'slos', 'uptime-checks'].includes(response.monitoringResource),
+        when: (response) => ['slos', 'uptime-checks'].includes(response.monitoringResource) || response.alertResource === 'cloud_run',
         type: 'input',
         name: 'systemName',
         message: 'Please provide three-letter system name as defined in Styra (example: sre, ptf, sda, che, pnp, iam...)',
@@ -146,8 +148,10 @@ module.exports = class extends BaseGenerator {
       const yamlPath = `${resourceDir}/alerts.yaml`;
       copyTemplate('alerts', resourceDir, yamlPath);
       const oldYaml = yaml.load(fs.readFileSync(yamlPath, 'utf8')) || [];
-      const newYaml = await handleAlerts(oldYaml, alertTemplates, this.answers);
+      const newYaml = await handleAlerts(oldYaml, alertTemplates,
+        { ...this.answers, clan: defaultClan });
       fs.writeFileSync(yamlPath, yaml.dump(newYaml));
     }

package/generators/resources/monitoring/templates/alerts/alerts.yaml CHANGED Viewed

@@ -18,7 +18,7 @@ cloud_run:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
 cloud_scheduler:
   failed_job:
-    display_name: "[P4] <%-systemName%> - Cloud Scheduler | Job Failed"
+    display_name: "[P4] <%-clan%> - Cloud Scheduler | Job Failed"
     conditions:
       - display_name: Cloud Scheduler Job - Log entries with SEVERITY=Error exceed threshold
         condition_threshold:
@@ -36,7 +36,7 @@ cloud_scheduler:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
 cloud_sql:
   cpu_over_65:
-    display_name: "[P3] <%-systemName%> - CloudSQL | CPU over 65%"
+    display_name: "[P3] <%-clan%> - CloudSQL | CPU over 65%"
     conditions:
       - display_name: Cloud SQL Database - CPU utilization above 65% over 5 min
         condition_threshold:
@@ -53,7 +53,7 @@ cloud_sql:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   cpu_over_85:
-    display_name: "[P3] <%-systemName%> - CloudSQL | CPU over 85%"
+    display_name: "[P3] <%-clan%> - CloudSQL | CPU over 85%"
     conditions:
       - display_name: "Cloud SQL Database - CPU-usage above 85% over 1 min"
         condition_threshold:
@@ -70,7 +70,7 @@ cloud_sql:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   cpu_over_90:
-    display_name: "[P3] <%-systemName%> - CloudSQL | CPU over 90%"
+    display_name: "[P3] <%-clan%> - CloudSQL | CPU over 90%"
     conditions:
       - display_name: Cloud SQL Database - CPU-usage above 90%
         condition_threshold:
@@ -86,7 +86,7 @@ cloud_sql:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   query_over_1s:
-    display_name: "[P4] <%-systemName%> - CloudSQL | Query resolve time"
+    display_name: "[P4] <%-clan%> - CloudSQL | Query resolve time"
     conditions:
       - display_name: Cloud SQL Instance Database - Per query execution times above 1000 ms
         condition_threshold:
@@ -103,7 +103,7 @@ cloud_sql:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
 memorystore:
   memory_over_50:
-    display_name: "[P4] <%-systemName%> - Memorystore | Memory over 50%"
+    display_name: "[P4] <%-clan%> - Memorystore | Memory over 50%"
     conditions:
       - display_name: Memorystore Redis Instance - Memory Usage above 50% over 5 min
         condition_threshold:
@@ -120,7 +120,7 @@ memorystore:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   memory_over_75:
-    display_name: "[P4] <%-systemName%> - Memorystore | Memory over 75%"
+    display_name: "[P4] <%-clan%> - Memorystore | Memory over 75%"
     conditions:
       - display_name: Memorystore Redis Instance - Memory Usage above 75% for 5min
         condition_threshold:
@@ -137,7 +137,7 @@ memorystore:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   memory_over_90:
-    display_name: "[P2] <%-systemName%> - Memorystore | Memory over 90%"
+    display_name: "[P2] <%-clan%> - Memorystore | Memory over 90%"
     conditions:
       - display_name: Memorystore Redis Instance - Memory Usage above 90%
         condition_threshold:
@@ -155,7 +155,7 @@ memorystore:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
 pub_sub:
   unacknowledged_messages:
-    display_name: "[P3] <%-systemName%> - Pub/Sub | Undelivered message(s)"
+    display_name: "[P3] <%-clan%> - Pub/Sub | Undelivered message(s)"
     conditions:
       - display_name: Cloud Pub/Sub Subscription - Undelivered messages above 1 for 5 min
         condition_threshold:
@@ -172,7 +172,7 @@ pub_sub:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   messages_in_dlq:
-    display_name: "[P3] <%-systemName%> - Pub/Sub | Message(s) in DLQ"
+    display_name: "[P3] <%-clan%> - Pub/Sub | Message(s) in DLQ"
     conditions:
       - display_name: Cloud Pub/Sub Subscription - Number of undelivered message(s) forwarded to DLQ
         condition_threshold:
@@ -189,7 +189,7 @@ pub_sub:
     documentation:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
   latency:
-    display_name: "[P3] <%-systemName%> - Pub/Sub | Response latency distribution"
+    display_name: "[P3] <%-clan%> - Pub/Sub | Response latency distribution"
     conditions:
       - display_name: Cloud Pub/Sub Subscription - Latency above 3s
         condition_threshold:
@@ -208,7 +208,7 @@ pub_sub:
       content: <% if (runbookLink) { %>[Runbook](<%-runbookLink%>)<%} else { %> <% } %>
 cloud_function:
   failed_execution:
-    display_name: "[P2] <%-systemName%> - Cloud Function | Failed job execution"
+    display_name: "[P2] <%-clan%> - Cloud Function | Failed job execution"
     conditions:
       - display_name: Cloud Function - Execution error count
         condition_threshold:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@hiiretail/gcp-infra-cli",
-  "version": "0.80.0",
+  "version": "0.80.2",
   "description": "Infrastructure as code generator for GCP.",
   "main": "src/cli.js",
   "bin": {