@hiiretail/gcp-infra-cli 0.101.0 → 0.102.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -430,16 +430,16 @@ cloud_function:
430
430
  subject: 'Function: `$${resource.label.function_name}`'
431
431
  cloud_run:
432
432
  error_count:
433
- display_name: "[P3] <%-clan%> - Cloud Run | Error Count"
433
+ display_name: "[P3] <%-clan%> - Cloud Run | Error rate"
434
434
  conditions:
435
- - display_name: Error count above threshold for more than 5 minutes
435
+ - display_name: Error rate above threshold for more than 5 minutes
436
436
  condition_threshold:
437
437
  filter: |
438
438
  resource.type="cloud_run_revision"
439
439
  metric.type="run.googleapis.com/request_count"
440
440
  metric.label.response_code_class="5xx"
441
441
  resource.labels.project_id="<%-projectId%>"
442
- threshold_value: 50
442
+ threshold_value: 0.05
443
443
  duration: 300s
444
444
  aggregations:
445
445
  - alignment_period: 60s
@@ -527,57 +527,61 @@ cloud_run:
527
527
  subject: 'Service: `$${resource.label.service_name}`'
528
528
  spanner:
529
529
  cpu_utilization_by_priority:
530
- display_name: "[P1] <%-clan%> - Spanner | CPU Utilization by Priority"
530
+ display_name: "[P2] <%-clan%> - Spanner | CPU Utilization"
531
531
  conditions:
532
- - display_name: Spanner - CPU Utilization by Priority
532
+ - display_name: Cloud Spanner Database - CPU Utilization
533
533
  condition_threshold:
534
534
  filter: |
535
- resource.type="spanner_instance"
536
- metric.type="spanner.googleapis.com/instance/cpu/utilization_by_priority"
535
+ resource.type = "spanner_instance"
536
+ metric.type = "spanner.googleapis.com/instance/cpu/utilization"
537
537
  resource.labels.project_id="<%-projectId%>"
538
- threshold_value: 0.9
538
+ threshold_value: 0.8
539
539
  duration: 300s
540
540
  aggregations:
541
541
  - alignment_period: 60s
542
- per_series_aligner: ALIGN_MAX
543
- cross_series_reducer: REDUCE_SUM
542
+ cross_series_reducer: REDUCE_MEAN
543
+ per_series_aligner: ALIGN_MEAN
544
544
  group_by_fields:
545
- - resource.label.instance_id
545
+ - metric.label.database
546
546
  documentation:
547
- subject: 'Instance: `$${resource.label.instance_id}`'
548
- api_request_count:
549
- display_name: "[P2] <%-clan%> - Spanner | API Request Count"
547
+ subject: 'Instance: `$${resource.label.instance_id}`, Database: `$${metric.label.database}`'
548
+ api_request_error_rate:
549
+ display_name: "[P2] <%-clan%> - Spanner | API request error rate"
550
550
  conditions:
551
- - display_name: Spanner - API Request Count
551
+ - display_name: Cloud Spanner Database - API request error rate
552
552
  condition_threshold:
553
553
  filter: |
554
- resource.type="spanner_instance"
555
- metric.type="spanner.googleapis.com/api/api_request_count"
556
- resource.labels.project_id="<%-projectId%>"
557
- threshold_value: 1000
554
+ resource.type = "spanner_instance"
555
+ metric.type = "spanner.googleapis.com/api/api_request_count"
556
+ metric.labels.status != "OK"
557
+ threshold_value: 1
558
558
  duration: 300s
559
559
  aggregations:
560
560
  - alignment_period: 60s
561
- per_series_aligner: ALIGN_SUM
561
+ cross_series_reducer: REDUCE_MEAN
562
+ per_series_aligner: ALIGN_RATE
562
563
  group_by_fields:
563
- - resource.label.instance_id
564
+ - metric.label.status
565
+ - metric.label.database
564
566
  documentation:
565
- subject: 'Instance: `$${resource.label.instance_id}`'
567
+ subject: 'Instance: `$${resource.label.instance_id}`, Database: `$${resource.label.database}`'
566
568
  request_latencies:
567
- display_name: "[P3] <%-clan%> - Spanner | Request Latencies"
569
+ display_name: '[P3] <%-clan%> - Spanner | API transaction latency'
568
570
  conditions:
569
- - display_name: Spanner - Request Latencies
571
+ - display_name: Cloud Spanner Instance - Request latencies by transaction type
570
572
  condition_threshold:
571
573
  filter: |
572
- resource.type="spanner_instance"
573
- metric.type="spanner.googleapis.com/api/request_latencies"
574
- resource.labels.project_id="<%-projectId%>"
575
- threshold_value: 500
574
+ resource.type = "spanner_instance"
575
+ metric.type = "spanner.googleapis.com/api/request_latencies_by_transaction_type"
576
+ metric.labels.transaction_type = "READ_WRITE"
576
577
  duration: 300s
578
+ threshold_value: 100
577
579
  aggregations:
578
580
  - alignment_period: 60s
579
- per_series_aligner: ALIGN_MEAN
581
+ cross_series_reducer: REDUCE_PERCENTILE_95
580
582
  group_by_fields:
581
- - resource.label.instance_id
583
+ - metric.label.database
584
+ - metric.label.transaction_type
585
+ per_series_aligner: ALIGN_PERCENTILE_95
582
586
  documentation:
583
- subject: 'Instance: `$${resource.label.instance_id}`'
587
+ subject: 'Instance: `$${resource.label.instance_id}`, Database: `$${resource.label.database}`'
@@ -3,19 +3,41 @@
3
3
  ## Summary
4
4
 
5
5
  <!--
6
- This section should be written last, when all of the other bullets in the RCA has been written. The purpose of this section is to have a section that can be
7
- shared with external customers as well as internal stakeholders. Some guidelines and pointers:
6
+ Briefly summarize what happened, who was impacted (e.g., end-users, stores), and the business consequences (e.g., data loss, incorrect pricing, downtime).
7
+ * Incident Description: Brief overview of what happened (2-3 sentences)
8
+ * Root Cause: One-sentence explanation of the fundamental issue
9
+ * Resolution: Brief description of how the issue was resolved
10
+ * Resolution Time: When the issue was fully resolved
11
+ -->
12
+
13
+ ## Technical Details
14
+
15
+ <!--
16
+ Provide a clear explanation of the technical root cause. Include:
17
+ * What change or event triggered the incident
18
+ * How the system behaved unexpectedly
19
+ * Details about any configuration changes, bugs, or missing logic
20
+
21
+ Any dependencies or services involved.
22
+ -->
8
23
 
9
- * Think of it as the back cover of a book, a short but descriptive story
10
- * The people who will read this will most likely not have the technical know-how that we do, keep it simple
11
- * Include what the problem was and how the customer(s) was affected
12
- * Do NOT include any customer names
13
- * Do NOT include any Hii Retail service names or underlying technologies that we are using
14
- * Include some general action points, such as if additional alerts needs to be added. Do not add details
15
- * Don't make promises, if there are any actions that we will look at, just mention them. Don't include target dates
24
+ ## Impact
16
25
 
17
- Example:
18
- On January 25 2023 from 14:31 CET to 15:59 CET some customers may have experienced delays with card payments. The issue was traced back to a misconfiguration of a recent deployment. At 15:31 CET a fix was deployed and monitored. At 15:59 CET the issue was resolved for all affected customers.
26
+ <!--
27
+ Describe the direct and indirect consequences of the incident.
28
+ * Customer Impact: How customers/end-users were affected
29
+ * Business Impact: Financial or operational consequences
30
+ * Duration: Total time the issue persisted
31
+ * Scope: Which systems/users were affected
32
+ -->
33
+
34
+ ## Lessons Learned
35
+
36
+ <!--
37
+ Highlight what could have prevented the incident:
19
38
 
20
- We will look into the process of how we update our configuration in production to mitigate these issues going forward.
39
+ * Gaps in testing
40
+ * Gaps in monitoring
41
+ * Gaps in requirements
42
+ * etc.
21
43
  -->
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hiiretail/gcp-infra-cli",
3
- "version": "0.101.0",
3
+ "version": "0.102.1",
4
4
  "description": "Infrastructure as code generator for GCP.",
5
5
  "main": "src/cli.js",
6
6
  "bin": {