npm - @hiiretail/gcp-infra-cli - Versions diffs - 0.101.0 → 0.102.1 - Mend

@hiiretail/gcp-infra-cli 0.101.0 → 0.102.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/generators/common-resources/monitoring/templates/alerts/generic-infra.yaml CHANGED Viewed

@@ -430,16 +430,16 @@ cloud_function:
       subject: 'Function: `$${resource.label.function_name}`'
 cloud_run:
   error_count:
-    display_name: "[P3] <%-clan%> - Cloud Run | Error Count"
+    display_name: "[P3] <%-clan%> - Cloud Run | Error rate"
     conditions:
-      - display_name: Error count above threshold for more than 5 minutes
+      - display_name: Error rate above threshold for more than 5 minutes
         condition_threshold:
           filter: |
             resource.type="cloud_run_revision"
             metric.type="run.googleapis.com/request_count"
             metric.label.response_code_class="5xx"
             resource.labels.project_id="<%-projectId%>"
-          threshold_value: 50
+          threshold_value: 0.05
           duration: 300s
           aggregations:
             - alignment_period: 60s
@@ -527,57 +527,61 @@ cloud_run:
       subject: 'Service: `$${resource.label.service_name}`'
 spanner:
   cpu_utilization_by_priority:
-    display_name: "[P1] <%-clan%> - Spanner | CPU Utilization by Priority"
+    display_name: "[P2] <%-clan%> - Spanner | CPU Utilization"
     conditions:
-      - display_name: Spanner - CPU Utilization by Priority
+      - display_name: Cloud Spanner Database - CPU Utilization
         condition_threshold:
           filter: |
-            resource.type="spanner_instance"
-            metric.type="spanner.googleapis.com/instance/cpu/utilization_by_priority"
+            resource.type = "spanner_instance"
+            metric.type = "spanner.googleapis.com/instance/cpu/utilization"
             resource.labels.project_id="<%-projectId%>"
-          threshold_value: 0.9
+          threshold_value: 0.8  # presumably a 0-1 utilization fraction (80%) - confirm
           duration: 300s
           aggregations:
             - alignment_period: 60s
-              per_series_aligner: ALIGN_MAX
-              cross_series_reducer: REDUCE_SUM
+              cross_series_reducer: REDUCE_MEAN
+              per_series_aligner: ALIGN_MEAN
               group_by_fields:
-                - resource.label.instance_id
+                - metric.label.database
     documentation:
-      subject: 'Instance: `$${resource.label.instance_id}`'
-  api_request_count:
-    display_name: "[P2] <%-clan%> - Spanner | API Request Count"
+      subject: 'Instance: `$${resource.label.instance_id}`, Database: `$${metric.label.database}`'
+  api_request_error_rate:
+    display_name: "[P2] <%-clan%> - Spanner | API request error rate"
     conditions:
-      - display_name: Spanner - API Request Count
+      - display_name: Cloud Spanner Database - API request error rate
         condition_threshold:
           filter: |
-            resource.type="spanner_instance"
-            metric.type="spanner.googleapis.com/api/api_request_count"
-            resource.labels.project_id="<%-projectId%>"
-          threshold_value: 1000
+            resource.type = "spanner_instance"
+            metric.type = "spanner.googleapis.com/api/api_request_count"
+            metric.labels.status != "OK"
+          threshold_value: 1  # compared against the rate series produced by ALIGN_RATE below
           duration: 300s
           aggregations:
             - alignment_period: 60s
-              per_series_aligner: ALIGN_SUM
+              cross_series_reducer: REDUCE_MEAN
+              per_series_aligner: ALIGN_RATE
               group_by_fields:
-                - resource.label.instance_id
+                - metric.label.status
+                - metric.label.database
     documentation:
-      subject: 'Instance: `$${resource.label.instance_id}`'
+      subject: 'Instance: `$${resource.label.instance_id}`, Database: `$${metric.label.database}`'
   request_latencies:
-    display_name: "[P3] <%-clan%> - Spanner | Request Latencies"
+    display_name: '[P3] <%-clan%> - Spanner | API transaction latency'
     conditions:
-      - display_name: Spanner - Request Latencies
+      - display_name: Cloud Spanner Instance - Request latencies by transaction type
         condition_threshold:
           filter: |
-            resource.type="spanner_instance"
-            metric.type="spanner.googleapis.com/api/request_latencies"
-            resource.labels.project_id="<%-projectId%>"
-          threshold_value: 500
+            resource.type = "spanner_instance"
+            metric.type = "spanner.googleapis.com/api/request_latencies_by_transaction_type"
+            metric.labels.transaction_type = "READ_WRITE"
           duration: 300s
+          threshold_value: 100  # NOTE(review): Spanner latency metrics appear to be reported in seconds - confirm unit
           aggregations:
             - alignment_period: 60s
-              per_series_aligner: ALIGN_MEAN
+              cross_series_reducer: REDUCE_PERCENTILE_95
               group_by_fields:
-                - resource.label.instance_id
+                - metric.label.database
+                - metric.label.transaction_type
+              per_series_aligner: ALIGN_PERCENTILE_95
     documentation:
-      subject: 'Instance: `$${resource.label.instance_id}`'
+      subject: 'Instance: `$${resource.label.instance_id}`, Database: `$${metric.label.database}`'

package/generators/docs/rca/templates/docs/rca_external.md CHANGED Viewed

@@ -3,19 +3,41 @@
 ## Summary
 <!--
-This section should be written last, when all of the other bullets in the RCA has been written. The purpose of this section is to have a section that can be
-shared with external customers as well as internal stakeholders. Some guidelines and pointers:
+Briefly summarize what happened, who was impacted (e.g., end-users, stores), and the business consequences (e.g., data loss, incorrect pricing, downtime).
+  * Incident Description: Brief overview of what happened (2-3 sentences)
+  * Root Cause: One-sentence explanation of the fundamental issue
+  * Resolution: Brief description of how the issue was resolved
+  * Resolution Time: When the issue was fully resolved
+-->
+## Technical Details
+<!--
+Provide a clear explanation of the technical root cause. Include:
+  * What change or event triggered the incident
+  * How the system behaved unexpectedly
+  * Details about any configuration changes, bugs, or missing logic
+  * Any dependencies or services involved
+-->
-* Think of it as the back cover of a book, a short but descriptive story
-* The people who will read this will most likeley not have the technical know-how that we do, keep it simple
-* Include what the problem was and how the customer(s) was affected
-* Do NOT include any customer names
-* Do NOT include any Hii Retail service names or underlying technologies that we are using
-* Include some general action points, such as if additional alerts needs to be added. Do not add details
-* Don't make promises, if there are any actions that we will look at, just mention them. Don't include target dates
+## Impact
-Example:
-On January 25 2023 from 14:31 CET to 15:59 CET some customers may have experienced delays with card payments. The issue was traced back to a misconfiguration of a recent deployment. At 15:31 CET a fix was deployed and monitored. At 15:59 CET the issue was resolved for all affected customers.
+<!--
+Describe the direct and indirect consequences of the incident.
+  * Customer Impact: How customers/end-users were affected
+  * Business Impact: Financial or operational consequences
+  * Duration: Total time the issue persisted
+  * Scope: Which systems/users were affected
+-->
+## Lessons Learned
+<!--
+Highlight what could have prevented the incident:
-We will look into the process of how we update our configuration in production to mitigate these issues going forward.
+  * Gaps in testing
+  * Gaps in monitoring
+  * Gaps in requirements
+  * etc.
 -->

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@hiiretail/gcp-infra-cli",
-  "version": "0.101.0",
+  "version": "0.102.1",
   "description": "Infrastructure as code generator for GCP.",
   "main": "src/cli.js",
   "bin": {