@cdklabs/multi-az-observability 0.0.1-alpha.2 → 0.0.1-alpha.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/.jsii +372 -115
  2. package/API.md +292 -74
  3. package/README.md +21 -14
  4. package/cdk.json +1 -1
  5. package/lib/alarmsandrules/AvailabilityAndLatencyAlarmsAndRules.d.ts +6 -6
  6. package/lib/alarmsandrules/AvailabilityAndLatencyAlarmsAndRules.js +31 -28
  7. package/lib/alarmsandrules/BaseOperationZonalAlarmsAndRules.d.ts +0 -4
  8. package/lib/alarmsandrules/BaseOperationZonalAlarmsAndRules.js +6 -5
  9. package/lib/alarmsandrules/CanaryOperationZonalAlarmsAndRules.js +8 -6
  10. package/lib/alarmsandrules/IBaseOperationZonalAlarmsAndRules.d.ts +0 -4
  11. package/lib/alarmsandrules/IBaseOperationZonalAlarmsAndRules.js +1 -1
  12. package/lib/alarmsandrules/OperationAlarmsAndRules.js +10 -12
  13. package/lib/alarmsandrules/ServerSideOperationZonalAlarmsAndRules.js +14 -12
  14. package/lib/alarmsandrules/props/BaseOperationZonalAlarmsAndRulesProps.d.ts +2 -2
  15. package/lib/alarmsandrules/props/BaseOperationZonalAlarmsAndRulesProps.js +1 -1
  16. package/lib/azmapper/AvailabilityZoneMapper.js +4 -3
  17. package/lib/basic_observability/BasicServiceDashboard.d.ts +1 -2
  18. package/lib/basic_observability/BasicServiceDashboard.js +58 -78
  19. package/lib/basic_observability/BasicServiceMultiAZObservability.d.ts +8 -9
  20. package/lib/basic_observability/BasicServiceMultiAZObservability.js +100 -312
  21. package/lib/basic_observability/props/ApplicationLoadBalancerDetectionProps.d.ts +50 -0
  22. package/lib/basic_observability/props/ApplicationLoadBalancerDetectionProps.js +3 -0
  23. package/lib/basic_observability/props/BasicServiceDashboardProps.d.ts +27 -13
  24. package/lib/basic_observability/props/BasicServiceDashboardProps.js +1 -1
  25. package/lib/basic_observability/props/BasicServiceMultiAZObservabilityProps.d.ts +10 -38
  26. package/lib/basic_observability/props/BasicServiceMultiAZObservabilityProps.js +1 -1
  27. package/lib/basic_observability/props/NatGatewayDetectionProps.d.ts +31 -0
  28. package/lib/basic_observability/props/NatGatewayDetectionProps.js +3 -0
  29. package/lib/canaries/CanaryFunction.js +7 -6
  30. package/lib/canaries/src/canary.zip +0 -0
  31. package/lib/dashboards/OperationAvailabilityAndLatencyDashboard.d.ts +3 -6
  32. package/lib/dashboards/OperationAvailabilityAndLatencyDashboard.js +359 -498
  33. package/lib/dashboards/ServiceAvailabilityAndLatencyDashboard.d.ts +0 -1
  34. package/lib/dashboards/ServiceAvailabilityAndLatencyDashboard.js +56 -113
  35. package/lib/dashboards/props/OperationAvailabilityAndLatencyDashboardProps.d.ts +4 -8
  36. package/lib/dashboards/props/OperationAvailabilityAndLatencyDashboardProps.js +1 -1
  37. package/lib/dashboards/props/OperationAvailabilityWidgetProps.d.ts +7 -2
  38. package/lib/dashboards/props/OperationAvailabilityWidgetProps.js +1 -1
  39. package/lib/dashboards/props/OperationLatencyWidgetProps.d.ts +7 -2
  40. package/lib/dashboards/props/OperationLatencyWidgetProps.js +1 -1
  41. package/lib/index.d.ts +5 -0
  42. package/lib/index.js +8 -2
  43. package/lib/metrics/ApplicationLoadBalancerMetrics.d.ts +65 -4
  44. package/lib/metrics/ApplicationLoadBalancerMetrics.js +629 -40
  45. package/lib/metrics/AvailabilityAndLatencyMetrics.d.ts +23 -0
  46. package/lib/metrics/AvailabilityAndLatencyMetrics.js +115 -24
  47. package/lib/metrics/NatGatewayMetrics.d.ts +110 -0
  48. package/lib/metrics/NatGatewayMetrics.js +300 -0
  49. package/lib/metrics/RegionalLatencyMetrics.d.ts +1 -1
  50. package/lib/metrics/RegionalLatencyMetrics.js +27 -20
  51. package/lib/metrics/ZonalAvailabilityMetrics.d.ts +2 -8
  52. package/lib/metrics/ZonalAvailabilityMetrics.js +10 -25
  53. package/lib/metrics/ZonalLatencyMetrics.d.ts +2 -1
  54. package/lib/metrics/ZonalLatencyMetrics.js +33 -23
  55. package/lib/metrics/props/AvailabilityAndLatencyMetricProps.d.ts +4 -0
  56. package/lib/metrics/props/AvailabilityAndLatencyMetricProps.js +1 -1
  57. package/lib/metrics/props/LatencyMetricProps.d.ts +6 -0
  58. package/lib/metrics/props/LatencyMetricProps.js +1 -1
  59. package/lib/metrics/props/ZonalAvailabilityMetricProps.d.ts +4 -0
  60. package/lib/metrics/props/ZonalAvailabilityMetricProps.js +1 -1
  61. package/lib/metrics/props/ZonalLatencyMetricProps.d.ts +4 -0
  62. package/lib/metrics/props/ZonalLatencyMetricProps.js +1 -1
  63. package/lib/monitoring/src/monitoring-layer.zip +0 -0
  64. package/lib/outlier-detection/ApplicationLoadBalancerAvailabilityOutlierAlgorithm.d.ts +10 -0
  65. package/lib/outlier-detection/ApplicationLoadBalancerAvailabilityOutlierAlgorithm.js +15 -0
  66. package/lib/outlier-detection/ApplicationLoadBalancerLatencyOutlierAlgorithm.d.ts +18 -0
  67. package/lib/outlier-detection/ApplicationLoadBalancerLatencyOutlierAlgorithm.js +23 -0
  68. package/lib/outlier-detection/OutlierDetectionFunction.js +6 -5
  69. package/lib/outlier-detection/PacketLossOutlierAlgorithm.d.ts +10 -0
  70. package/lib/outlier-detection/PacketLossOutlierAlgorithm.js +15 -0
  71. package/lib/outlier-detection/src/outlier-detection.zip +0 -0
  72. package/lib/outlier-detection/src/scipy-layer.zip +0 -0
  73. package/lib/services/CanaryMetrics.js +1 -1
  74. package/lib/services/CanaryTestMetricsOverride.js +1 -1
  75. package/lib/services/ContributorInsightRuleDetails.js +1 -1
  76. package/lib/services/InstrumentedServiceMultiAZObservability.js +3 -3
  77. package/lib/services/Operation.js +1 -1
  78. package/lib/services/OperationMetricDetails.js +1 -1
  79. package/lib/services/Service.js +1 -1
  80. package/lib/services/ServiceMetricDetails.js +1 -1
  81. package/lib/services/props/MetricDimensions.js +1 -1
  82. package/lib/utilities/MetricsHelper.d.ts +14 -10
  83. package/lib/utilities/MetricsHelper.js +18 -11
  84. package/package.json +8 -8
package/API.md CHANGED
@@ -788,6 +788,136 @@ The timeout for each individual HTTP request.
788
788
 
789
789
  ---
790
790
 
791
+ ### ApplicationLoadBalancerDetectionProps <a name="ApplicationLoadBalancerDetectionProps" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps"></a>
792
+
793
+ The properties for performing zonal impact detection with ALB(s).
794
+
795
+ #### Initializer <a name="Initializer" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.Initializer"></a>
796
+
797
+ ```typescript
798
+ import { ApplicationLoadBalancerDetectionProps } from '@cdklabs/multi-az-observability'
799
+
800
+ const applicationLoadBalancerDetectionProps: ApplicationLoadBalancerDetectionProps = { ... }
801
+ ```
802
+
803
+ #### Properties <a name="Properties" id="Properties"></a>
804
+
805
+ | **Name** | **Type** | **Description** |
806
+ | --- | --- | --- |
807
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.applicationLoadBalancers">applicationLoadBalancers</a></code> | <code>aws-cdk-lib.aws_elasticloadbalancingv2.IApplicationLoadBalancer[]</code> | The application load balancers to collect metrics from. |
808
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.faultCountPercentThreshold">faultCountPercentThreshold</a></code> | <code>number</code> | The percentage of faults for a single ALB to consider an AZ to be unhealthy, a number between 0 and 100. |
809
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyStatistic">latencyStatistic</a></code> | <code>string</code> | The statistic used to measure target response latency, like p99, which can be specified using Stats.percentile(99) or "p99". |
810
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyThreshold">latencyThreshold</a></code> | <code>number</code> | The threshold in milliseconds for ALB targets whose responses are slower than this value at the specified percentile statistic. |
811
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.availabilityOutlierAlgorithm">availabilityOutlierAlgorithm</a></code> | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerAvailabilityOutlierAlgorithm">ApplicationLoadBalancerAvailabilityOutlierAlgorithm</a></code> | The method used to determine if an AZ is an outlier for availability for Application Load Balancer metrics. |
812
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.availabilityOutlierThreshold">availabilityOutlierThreshold</a></code> | <code>number</code> | The threshold for the outlier detection algorithm. |
813
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyOutlierAlgorithm">latencyOutlierAlgorithm</a></code> | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm">ApplicationLoadBalancerLatencyOutlierAlgorithm</a></code> | The method used to determine if an AZ is an outlier for latency for Application Load Balancer metrics. |
814
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyOutlierThreshold">latencyOutlierThreshold</a></code> | <code>number</code> | The threshold for the outlier detection algorithm. |
815
+
816
+ ---
817
+
818
+ ##### `applicationLoadBalancers`<sup>Required</sup> <a name="applicationLoadBalancers" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.applicationLoadBalancers"></a>
819
+
820
+ ```typescript
821
+ public readonly applicationLoadBalancers: IApplicationLoadBalancer[];
822
+ ```
823
+
824
+ - *Type:* aws-cdk-lib.aws_elasticloadbalancingv2.IApplicationLoadBalancer[]
825
+
826
+ The application load balancers to collect metrics from.
827
+
828
+ ---
829
+
830
+ ##### `faultCountPercentThreshold`<sup>Required</sup> <a name="faultCountPercentThreshold" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.faultCountPercentThreshold"></a>
831
+
832
+ ```typescript
833
+ public readonly faultCountPercentThreshold: number;
834
+ ```
835
+
836
+ - *Type:* number
837
+
838
+ The percentage of faults for a single ALB to consider an AZ to be unhealthy, a number between 0 and 100.
839
+
840
+ This should align with your availability goal. For example
841
+ 1% or 5%, provided as 1 or 5.
842
+
843
+ ---
844
+
845
+ ##### `latencyStatistic`<sup>Required</sup> <a name="latencyStatistic" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyStatistic"></a>
846
+
847
+ ```typescript
848
+ public readonly latencyStatistic: string;
849
+ ```
850
+
851
+ - *Type:* string
852
+
853
+ The statistic used to measure target response latency, like p99, which can be specified using Stats.percentile(99) or "p99".
854
+
855
+ ---
856
+
857
+ ##### `latencyThreshold`<sup>Required</sup> <a name="latencyThreshold" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyThreshold"></a>
858
+
859
+ ```typescript
860
+ public readonly latencyThreshold: number;
861
+ ```
862
+
863
+ - *Type:* number
864
+
865
+ The threshold in milliseconds for ALB targets whose responses are slower than this value at the specified percentile statistic.
866
+
867
+ ---
868
+
869
+ ##### `availabilityOutlierAlgorithm`<sup>Optional</sup> <a name="availabilityOutlierAlgorithm" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.availabilityOutlierAlgorithm"></a>
870
+
871
+ ```typescript
872
+ public readonly availabilityOutlierAlgorithm: ApplicationLoadBalancerAvailabilityOutlierAlgorithm;
873
+ ```
874
+
875
+ - *Type:* <a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerAvailabilityOutlierAlgorithm">ApplicationLoadBalancerAvailabilityOutlierAlgorithm</a>
876
+ - *Default:* STATIC
877
+
878
+ The method used to determine if an AZ is an outlier for availability for Application Load Balancer metrics.
879
+
880
+ ---
881
+
882
+ ##### `availabilityOutlierThreshold`<sup>Optional</sup> <a name="availabilityOutlierThreshold" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.availabilityOutlierThreshold"></a>
883
+
884
+ ```typescript
885
+ public readonly availabilityOutlierThreshold: number;
886
+ ```
887
+
888
+ - *Type:* number
889
+ - *Default:* "This depends on the algorithm used. STATIC: 66"
890
+
891
+ The threshold for the outlier detection algorithm.
892
+
893
+ ---
894
+
895
+ ##### `latencyOutlierAlgorithm`<sup>Optional</sup> <a name="latencyOutlierAlgorithm" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyOutlierAlgorithm"></a>
896
+
897
+ ```typescript
898
+ public readonly latencyOutlierAlgorithm: ApplicationLoadBalancerLatencyOutlierAlgorithm;
899
+ ```
900
+
901
+ - *Type:* <a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm">ApplicationLoadBalancerLatencyOutlierAlgorithm</a>
902
+ - *Default:* Z_SCORE
903
+
904
+ The method used to determine if an AZ is an outlier for latency for Application Load Balancer metrics.
905
+
906
+ ---
907
+
908
+ ##### `latencyOutlierThreshold`<sup>Optional</sup> <a name="latencyOutlierThreshold" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps.property.latencyOutlierThreshold"></a>
909
+
910
+ ```typescript
911
+ public readonly latencyOutlierThreshold: number;
912
+ ```
913
+
914
+ - *Type:* number
915
+ - *Default:* "This depends on the algorithm used. STATIC: 66. Z_SCORE: 3."
916
+
917
+ The threshold for the outlier detection algorithm.
918
+
919
+ ---
920
+
791
921
  ### AvailabilityZoneMapperProps <a name="AvailabilityZoneMapperProps" id="@cdklabs/multi-az-observability.AvailabilityZoneMapperProps"></a>
792
922
 
793
923
  Properties for the AZ mapper.
@@ -839,17 +969,13 @@ const basicServiceMultiAZObservabilityProps: BasicServiceMultiAZObservabilityPro
839
969
  | --- | --- | --- |
840
970
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.datapointsToAlarm">datapointsToAlarm</a></code> | <code>number</code> | The number of datapoints to alarm on for latency and availability alarms. |
841
971
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.evaluationPeriods">evaluationPeriods</a></code> | <code>number</code> | The number of evaluation periods for latency and availability alarms. |
842
- | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.faultCountPercentageThreshold">faultCountPercentageThreshold</a></code> | <code>number</code> | The percentage of faults for a single ALB to consider an AZ to be unhealthy, this should align with your availability goal. |
843
- | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.latencyStatistic">latencyStatistic</a></code> | <code>string</code> | The statistic used to measure target response latency, like p99, which can be specified using Stats.percentile(99) or "p99". |
844
- | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.latencyThreshold">latencyThreshold</a></code> | <code>number</code> | The threshold in seconds for ALB targets whose responses are slower than this value at the specified percentile statistic. |
845
972
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.serviceName">serviceName</a></code> | <code>string</code> | The service's name. |
846
- | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.applicationLoadBalancers">applicationLoadBalancers</a></code> | <code>aws-cdk-lib.aws_elasticloadbalancingv2.IApplicationLoadBalancer[]</code> | The application load balancers being used by the service. |
973
+ | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.applicationLoadBalancerProps">applicationLoadBalancerProps</a></code> | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps">ApplicationLoadBalancerDetectionProps</a></code> | Properties for ALBs to detect single AZ impact. |
847
974
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.assetsBucketParameterName">assetsBucketParameterName</a></code> | <code>string</code> | If you are not using a static bucket to deploy assets, for example you are synthing this and it gets uploaded to a bucket whose name is unknown to you (maybe used as part of a central CI/CD system) and is provided as a parameter to your stack, specify that parameter name here. |
848
975
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.assetsBucketPrefixParameterName">assetsBucketPrefixParameterName</a></code> | <code>string</code> | If you are not using a static bucket to deploy assets, for example you are synthing this and it gets uploaded to a bucket that uses a prefix that is unknown to you (maybe used as part of a central CI/CD system) and is provided as a parameter to your stack, specify that parameter name here. |
849
976
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.createDashboard">createDashboard</a></code> | <code>boolean</code> | Whether to create a dashboard displaying the metrics and alarms. |
850
977
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.interval">interval</a></code> | <code>aws-cdk-lib.Duration</code> | Dashboard interval. |
851
- | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.natGateways">natGateways</a></code> | <code>{[ key: string ]: aws-cdk-lib.aws_ec2.CfnNatGateway[]}</code> | (Optional) A map of Availability Zone name to the NAT Gateways in that AZ. |
852
- | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.packetLossImpactPercentageThreshold">packetLossImpactPercentageThreshold</a></code> | <code>number</code> | The amount of packet loss in a NAT GW to determine if an AZ is actually impacted, recommendation is 0.01%. |
978
+ | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.natGatewayProps">natGatewayProps</a></code> | <code><a href="#@cdklabs/multi-az-observability.NatGatewayDetectionProps">NatGatewayDetectionProps</a></code> | Properties for NAT Gateways to detect single AZ impact. |
853
979
  | <code><a href="#@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.period">period</a></code> | <code>aws-cdk-lib.Duration</code> | The period to evaluate metrics. |
854
980
 
855
981
  ---
@@ -878,45 +1004,6 @@ The number of evaluation periods for latency and availability alarms.
878
1004
 
879
1005
  ---
880
1006
 
881
- ##### `faultCountPercentageThreshold`<sup>Required</sup> <a name="faultCountPercentageThreshold" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.faultCountPercentageThreshold"></a>
882
-
883
- ```typescript
884
- public readonly faultCountPercentageThreshold: number;
885
- ```
886
-
887
- - *Type:* number
888
-
889
- The percentage of faults for a single ALB to consider an AZ to be unhealthy, this should align with your availability goal.
890
-
891
- For example
892
- 1% or 5%, specify as 1 or 5.
893
-
894
- ---
895
-
896
- ##### `latencyStatistic`<sup>Required</sup> <a name="latencyStatistic" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.latencyStatistic"></a>
897
-
898
- ```typescript
899
- public readonly latencyStatistic: string;
900
- ```
901
-
902
- - *Type:* string
903
-
904
- The statistic used to measure target response latency, like p99, which can be specified using Stats.percentile(99) or "p99".
905
-
906
- ---
907
-
908
- ##### `latencyThreshold`<sup>Required</sup> <a name="latencyThreshold" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.latencyThreshold"></a>
909
-
910
- ```typescript
911
- public readonly latencyThreshold: number;
912
- ```
913
-
914
- - *Type:* number
915
-
916
- The threshold in seconds for ALB targets whose responses are slower than this value at the specified percentile statistic.
917
-
918
- ---
919
-
920
1007
  ##### `serviceName`<sup>Required</sup> <a name="serviceName" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.serviceName"></a>
921
1008
 
922
1009
  ```typescript
@@ -929,20 +1016,19 @@ The service's name.
929
1016
 
930
1017
  ---
931
1018
 
932
- ##### `applicationLoadBalancers`<sup>Optional</sup> <a name="applicationLoadBalancers" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.applicationLoadBalancers"></a>
1019
+ ##### `applicationLoadBalancerProps`<sup>Optional</sup> <a name="applicationLoadBalancerProps" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.applicationLoadBalancerProps"></a>
933
1020
 
934
1021
  ```typescript
935
- public readonly applicationLoadBalancers: IApplicationLoadBalancer[];
1022
+ public readonly applicationLoadBalancerProps: ApplicationLoadBalancerDetectionProps;
936
1023
  ```
937
1024
 
938
- - *Type:* aws-cdk-lib.aws_elasticloadbalancingv2.IApplicationLoadBalancer[]
939
- - *Default:* "No alarms for ALBs will be created"
1025
+ - *Type:* <a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerDetectionProps">ApplicationLoadBalancerDetectionProps</a>
1026
+ - *Default:* "No ALBs will be used to calculate impact."
940
1027
 
941
- The application load balancers being used by the service.
1028
+ Properties for ALBs to detect single AZ impact.
942
1029
 
943
- There will be an alarm created for
944
- each AZ for each ALB. Then, there will be a composite alarm for AZ created from the input
945
- of all ALBs. You must either specify an ALB or a NAT GW.
1030
+ You must specify this
1031
+ and/or natGatewayProps.
946
1032
 
947
1033
  ---
948
1034
 
@@ -1011,33 +1097,19 @@ Dashboard interval.
1011
1097
 
1012
1098
  ---
1013
1099
 
1014
- ##### `natGateways`<sup>Optional</sup> <a name="natGateways" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.natGateways"></a>
1100
+ ##### `natGatewayProps`<sup>Optional</sup> <a name="natGatewayProps" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.natGatewayProps"></a>
1015
1101
 
1016
1102
  ```typescript
1017
- public readonly natGateways: {[ key: string ]: CfnNatGateway[]};
1103
+ public readonly natGatewayProps: NatGatewayDetectionProps;
1018
1104
  ```
1019
1105
 
1020
- - *Type:* {[ key: string ]: aws-cdk-lib.aws_ec2.CfnNatGateway[]}
1021
- - *Default:* "No alarms for NAT Gateways will be created"
1106
+ - *Type:* <a href="#@cdklabs/multi-az-observability.NatGatewayDetectionProps">NatGatewayDetectionProps</a>
1107
+ - *Default:* "No NAT Gateways will be used to calculate impact."
1022
1108
 
1023
- (Optional) A map of Availability Zone name to the NAT Gateways in that AZ.
1109
+ Properties for NAT Gateways to detect single AZ impact.
1024
1110
 
1025
- One alarm per NAT GW will be created. If multiple NAT GWs
1026
- are provided for a single AZ, those alarms will be aggregated into
1027
- a composite alarm for the AZ. You must either specify an ALB or a NAT GW.
1028
-
1029
- ---
1030
-
1031
- ##### `packetLossImpactPercentageThreshold`<sup>Optional</sup> <a name="packetLossImpactPercentageThreshold" id="@cdklabs/multi-az-observability.BasicServiceMultiAZObservabilityProps.property.packetLossImpactPercentageThreshold"></a>
1032
-
1033
- ```typescript
1034
- public readonly packetLossImpactPercentageThreshold: number;
1035
- ```
1036
-
1037
- - *Type:* number
1038
- - *Default:* "0.01 (as in 0.01%)"
1039
-
1040
- The amount of packet loss in a NAT GW to determine if an AZ is actually impacted, recommendation is 0.01%.
1111
+ You must specify
1112
+ this and/or applicationLoadBalancerProps.
1041
1113
 
1042
1114
  ---
1043
1115
 
@@ -1451,6 +1523,80 @@ MAD: 3
1451
1523
 
1452
1524
  ---
1453
1525
 
1526
+ ### NatGatewayDetectionProps <a name="NatGatewayDetectionProps" id="@cdklabs/multi-az-observability.NatGatewayDetectionProps"></a>
1527
+
1528
+ The properties for performing zonal impact detection with NAT Gateway(s).
1529
+
1530
+ #### Initializer <a name="Initializer" id="@cdklabs/multi-az-observability.NatGatewayDetectionProps.Initializer"></a>
1531
+
1532
+ ```typescript
1533
+ import { NatGatewayDetectionProps } from '@cdklabs/multi-az-observability'
1534
+
1535
+ const natGatewayDetectionProps: NatGatewayDetectionProps = { ... }
1536
+ ```
1537
+
1538
+ #### Properties <a name="Properties" id="Properties"></a>
1539
+
1540
+ | **Name** | **Type** | **Description** |
1541
+ | --- | --- | --- |
1542
+ | <code><a href="#@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.natGateways">natGateways</a></code> | <code>{[ key: string ]: aws-cdk-lib.aws_ec2.CfnNatGateway[]}</code> | A list of NAT Gateways per Availability Zone (using the AZ name as the key). |
1543
+ | <code><a href="#@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.packetLossOutlierAlgorithm">packetLossOutlierAlgorithm</a></code> | <code><a href="#@cdklabs/multi-az-observability.PacketLossOutlierAlgorithm">PacketLossOutlierAlgorithm</a></code> | The algorithm to use to calculate an AZ as an outlier for packet loss. |
1544
+ | <code><a href="#@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.packetLossOutlierThreshold">packetLossOutlierThreshold</a></code> | <code>number</code> | The threshold used with the outlier calculation. |
1545
+ | <code><a href="#@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.packetLossPercentThreshold">packetLossPercentThreshold</a></code> | <code>number</code> | The percentage of packet loss at which you consider there to be impact. |
1546
+
1547
+ ---
1548
+
1549
+ ##### `natGateways`<sup>Required</sup> <a name="natGateways" id="@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.natGateways"></a>
1550
+
1551
+ ```typescript
1552
+ public readonly natGateways: {[ key: string ]: CfnNatGateway[]};
1553
+ ```
1554
+
1555
+ - *Type:* {[ key: string ]: aws-cdk-lib.aws_ec2.CfnNatGateway[]}
1556
+
1557
+ A list of NAT Gateways per Availability Zone (using the AZ name as the key).
1558
+
1559
+ ---
1560
+
1561
+ ##### `packetLossOutlierAlgorithm`<sup>Optional</sup> <a name="packetLossOutlierAlgorithm" id="@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.packetLossOutlierAlgorithm"></a>
1562
+
1563
+ ```typescript
1564
+ public readonly packetLossOutlierAlgorithm: PacketLossOutlierAlgorithm;
1565
+ ```
1566
+
1567
+ - *Type:* <a href="#@cdklabs/multi-az-observability.PacketLossOutlierAlgorithm">PacketLossOutlierAlgorithm</a>
1568
+ - *Default:* PacketLossOutlierAlgorithm.STATIC
1569
+
1570
+ The algorithm to use to calculate an AZ as an outlier for packet loss.
1571
+
1572
+ ---
1573
+
1574
+ ##### `packetLossOutlierThreshold`<sup>Optional</sup> <a name="packetLossOutlierThreshold" id="@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.packetLossOutlierThreshold"></a>
1575
+
1576
+ ```typescript
1577
+ public readonly packetLossOutlierThreshold: number;
1578
+ ```
1579
+
1580
+ - *Type:* number
1581
+ - *Default:* "This depends on the outlier algorithm. STATIC: 66. Z-SCORE: 3."
1582
+
1583
+ The threshold used with the outlier calculation.
1584
+
1585
+ ---
1586
+
1587
+ ##### `packetLossPercentThreshold`<sup>Optional</sup> <a name="packetLossPercentThreshold" id="@cdklabs/multi-az-observability.NatGatewayDetectionProps.property.packetLossPercentThreshold"></a>
1588
+
1589
+ ```typescript
1590
+ public readonly packetLossPercentThreshold: number;
1591
+ ```
1592
+
1593
+ - *Type:* number
1594
+ - *Default:* 0.01 (as in 0.01%)
1595
+
1596
+ The percentage of packet loss at which you consider there to be impact.
1597
+
1598
+ ---
1599
+
1454
1600
  ### NetworkConfigurationProps <a name="NetworkConfigurationProps" id="@cdklabs/multi-az-observability.NetworkConfigurationProps"></a>
1455
1601
 
1456
1602
  The network configuration for the canary function.
@@ -5043,6 +5189,59 @@ metrics this will typically just be "Sum".
5043
5189
 
5044
5190
  ## Enums <a name="Enums" id="Enums"></a>
5045
5191
 
5192
+ ### ApplicationLoadBalancerAvailabilityOutlierAlgorithm <a name="ApplicationLoadBalancerAvailabilityOutlierAlgorithm" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerAvailabilityOutlierAlgorithm"></a>
5193
+
5194
+ The options for calculating if an ALB is an outlier for availability.
5195
+
5196
+ #### Members <a name="Members" id="Members"></a>
5197
+
5198
+ | **Name** | **Description** |
5199
+ | --- | --- |
5200
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerAvailabilityOutlierAlgorithm.STATIC">STATIC</a></code> | This will take the availability threshold and calculate if one AZ is responsible for that percentage of errors. |
5201
+
5202
+ ---
5203
+
5204
+ ##### `STATIC` <a name="STATIC" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerAvailabilityOutlierAlgorithm.STATIC"></a>
5205
+
5206
+ This will take the availability threshold and calculate if one AZ is responsible for that percentage of errors.
5207
+
5208
+ ---
5209
+
5210
+
5211
+ ### ApplicationLoadBalancerLatencyOutlierAlgorithm <a name="ApplicationLoadBalancerLatencyOutlierAlgorithm" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm"></a>
5212
+
5213
+ The options for calculating if an AZ is an outlier for latency for ALBs.
5214
+
5215
+ #### Members <a name="Members" id="Members"></a>
5216
+
5217
+ | **Name** | **Description** |
5218
+ | --- | --- |
5219
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm.STATIC">STATIC</a></code> | This will take the latency threshold, count the number of requests per AZ that exceed this threshold, and then calculate what percentage of the requests exceeding this threshold belongs to each AZ. |
5220
+ | <code><a href="#@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm.Z_SCORE">Z_SCORE</a></code> | This calculates the z score of latency in one AZ against the other AZs. |
5221
+
5222
+ ---
5223
+
5224
+ ##### `STATIC` <a name="STATIC" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm.STATIC"></a>
5225
+
5226
+ This will take the latency threshold, count the number of requests per AZ that exceed this threshold, and then calculate what percentage of the requests exceeding this threshold belongs to each AZ.
5227
+
5228
+ This provides a static comparison
5229
+ of the number of high latency requests in one AZ versus the others.
5230
+
5231
+ ---
5232
+
5233
+
5234
+ ##### `Z_SCORE` <a name="Z_SCORE" id="@cdklabs/multi-az-observability.ApplicationLoadBalancerLatencyOutlierAlgorithm.Z_SCORE"></a>
5235
+
5236
+ This calculates the z score of latency in one AZ against the other AZs.
5237
+
5238
+ It uses
5239
+ the target response time of all requests to calculate the standard deviation and
5240
+ average for all AZs. This is the default.
5241
+
5242
+ ---
5243
+
5244
+
5046
5245
  ### OutlierDetectionAlgorithm <a name="OutlierDetectionAlgorithm" id="@cdklabs/multi-az-observability.OutlierDetectionAlgorithm"></a>
5047
5246
 
5048
5247
  Available algorithms for performing outlier detection.
@@ -5107,3 +5306,22 @@ A common default value threshold 3
5107
5306
 
5108
5307
  ---
5109
5308
 
5309
+
5310
+ ### PacketLossOutlierAlgorithm <a name="PacketLossOutlierAlgorithm" id="@cdklabs/multi-az-observability.PacketLossOutlierAlgorithm"></a>
5311
+
5312
+ The options for calculating if a NAT Gateway is an outlier for packet loss.
5313
+
5314
+ #### Members <a name="Members" id="Members"></a>
5315
+
5316
+ | **Name** | **Description** |
5317
+ | --- | --- |
5318
+ | <code><a href="#@cdklabs/multi-az-observability.PacketLossOutlierAlgorithm.STATIC">STATIC</a></code> | This will take the availability threshold and calculate if one AZ is responsible for that percentage of packet loss. |
5319
+
5320
+ ---
5321
+
5322
+ ##### `STATIC` <a name="STATIC" id="@cdklabs/multi-az-observability.PacketLossOutlierAlgorithm.STATIC"></a>
5323
+
5324
+ This will take the availability threshold and calculate if one AZ is responsible for that percentage of packet loss.
5325
+
5326
+ ---
5327
+
package/README.md CHANGED
@@ -1,3 +1,5 @@
1
+ ![Build Workflow](https://github.com/cdklabs/cdk-multi-az-observability/actions/workflows/build.yml/badge.svg) ![Release Workflow](https://github.com/cdklabs/cdk-multi-az-observability/actions/workflows/release.yml/badge.svg)
2
+
1
3
  # multi-az-observability
2
4
  This is a CDK construct for multi-AZ observability to help detect single-AZ impairments. This is currently an `alpha` version, but is being used in the AWS [Advanced Multi-AZ Resilience Patterns](https://catalog.workshops.aws/multi-az-gray-failures/en-US) workshop.
3
5
 
@@ -151,25 +153,30 @@ You define some characteristics of the service, default values for metrics and a
151
153
  If you don't have service specific logs and custom metrics with per-AZ dimensions, you can still use the construct to evaluate ALB and NAT Gateway metrics to find single AZ faults.
152
154
 
153
155
  ```csharp
154
- BasicServiceMultiAZObservability multiAvailabilityZoneObservability = new BasicServiceMultiAZObservability(this, "MultiAZObservability", new BasicServiceMultiAZObservabilityProps() {
155
- ApplicationLoadBalancers = new IApplicationLoadBalancer[] { loadBalancer },
156
- NatGateways = new Dictionary<string, CfnNatGateway>() {
157
- { "us-east-1a", natGateway1},
158
- { "us-east-1b", natGateway2},
159
- { "us-east-1c", natGateway3},
156
+ BasicServiceMultiAZObservability multiAZObservability = new BasicServiceMultiAZObservability(this, "basic-service-", new BasicServiceMultiAZObservabilityProps() {
157
+ ApplicationLoadBalancerProps = new ApplicationLoadBalancerDetectionProps() {
158
+ ApplicationLoadBalancers = [ myALB ],
159
+ LatencyStatistic = Stats.Percentile(99),
160
+ FaultCountPercentThreshold = 1,
161
+ LatencyThreshold = 500
162
+ },
163
+ NatGatewayProps = new NatGatewayDetectionProps() {
164
+ PacketLossPercentThreshold = 0.01,
165
+ NatGateways = {
166
+ { "us-east-1a", [ natGateway1 ] },
167
+ { "us-east-1b", [ natGateway2 ] },
168
+ { "us-east-1c", [ natGateway3 ] }
169
+ },
160
170
  },
161
171
  CreateDashboard = true,
162
- OutlierDetectionAlgorithm = OutlierDetectionAlgorithm.STATIC,
163
- FaultCountPercentageThreshold = 1.0, // The fault rate to alarm on for errors seen from the ALBs in the same AZ
164
- PacketLossImpactPercentageThreshold = 0.01, // The percentage of packet loss to alarm on for the NAT Gateways in the same AZ
172
+ DatapointsToAlarm = 2,
173
+ EvaluationPeriods = 3,
165
174
  ServiceName = "WildRydes",
166
- Period = Duration.Seconds(60), // The period for metric evaluation
167
- Interval = Duration.Minutes(60) // The interval for the dashboards
168
- EvaluationPeriods = 5,
169
- DatapointsToAlarm = 3
175
+ Period = Duration.Seconds(60),
176
+ Interval = Duration.Minutes(60),
170
177
  });
171
178
  ```
172
179
 
173
180
  If you provide a load balancer, the construct assumes it is deployed in each AZ of the VPC the load balancer is associated with and will look for HTTP metrics using those AZs as dimensions.
174
181
 
175
- Both options support running workloads on EC2, ECS, Lambda, and EKS.
182
+ Both options support running workloads on EC2, ECS, Lambda, and EKS.
package/cdk.json CHANGED
@@ -1,5 +1,5 @@
1
1
  {
2
- "app": "npx ts-node --prefer-ts-exts test/synth-test.ts",
2
+ "app": "npx ts-node --prefer-ts-exts test/basic-service-synth-test.ts",
3
3
  "watch": {
4
4
  "include": [
5
5
  "**"
@@ -22,7 +22,7 @@ export declare class AvailabilityAndLatencyAlarmsAndRules {
22
22
  * @param counter
23
23
  * @returns
24
24
  */
25
- static createZonalAvailabilityAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, counter: number, nameSuffix?: string): IAlarm;
25
+ static createZonalAvailabilityAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZone: string, availabilityZoneId: string, counter: number, nameSuffix?: string): IAlarm;
26
26
  /**
27
27
  * Creates a zonal latency alarm
28
28
  * @param scope
@@ -32,7 +32,7 @@ export declare class AvailabilityAndLatencyAlarmsAndRules {
32
32
  * @param counter
33
33
  * @returns
34
34
  */
35
- static createZonalLatencyAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, counter: number, nameSuffix?: string): IAlarm;
35
+ static createZonalLatencyAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZone: string, availabilityZoneId: string, counter: number, nameSuffix?: string): IAlarm;
36
36
  /**
37
37
  * Creates a composite alarm when either latency or availability is breached in the Availability Zone
38
38
  * @param scope
@@ -55,7 +55,7 @@ export declare class AvailabilityAndLatencyAlarmsAndRules {
55
55
  * @param outlierThreshold
56
56
  * @returns
57
57
  */
58
- static createZonalFaultRateStaticOutlierAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
58
+ static createZonalFaultRateStaticOutlierAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZone: string, availabilityZoneId: string, counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
59
59
  /**
60
60
  * An alarm that compares error rate in this AZ to the overall region error based only on metric data.
61
61
  * This is different for canaries because the metrics they test at the regional level are different
@@ -69,15 +69,15 @@ export declare class AvailabilityAndLatencyAlarmsAndRules {
69
69
  * @param outlierThreshold
70
70
  * @returns
71
71
  */
72
- static createZonalFaultRateStaticOutlierAlarmForCanaries(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, availabilityZones: string[], counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
72
+ static createZonalFaultRateStaticOutlierAlarmForCanaries(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZone: string, availabilityZoneId: string, availabilityZoneIds: string[], counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
73
73
  static createZonalFaultRateOutlierAlarm(scope: IConstruct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, allAvailabilityZoneIds: string[], outlierThreshold: number, outlierDetectionFunction: IFunction, outlierDetectionAlgorithm: OutlierDetectionAlgorithm, counter: number, nameSuffix?: string): IAlarm;
74
74
  static createZonalFaultRateOutlierAlarmForAlb(scope: IConstruct, loadBalancers: IApplicationLoadBalancer[], availabilityZoneId: string, outlierThreshold: number, outlierDetectionFunction: IFunction, outlierDetectionAlgorithm: OutlierDetectionAlgorithm, azMapper: IAvailabilityZoneMapper, counter: number, evaluationPeriods: number, datapointsToAlarm: number, period: Duration, nameSuffix?: string): IAlarm;
75
75
  static createZonalFaultRateOutlierAlarmForNatGW(scope: IConstruct, natGateways: {
76
76
  [key: string]: CfnNatGateway[];
77
77
  }, availabilityZoneId: string, outlierThreshold: number, outlierDetectionFunction: IFunction, outlierDetectionAlgorithm: OutlierDetectionAlgorithm, azMapper: IAvailabilityZoneMapper, counter: number, evaluationPeriods: number, datapointsToAlarm: number, period: Duration, nameSuffix?: string): IAlarm;
78
78
  static createZonalHighLatencyOutlierAlarm(scope: IConstruct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, allAvailabilityZoneIds: string[], outlierThreshold: number, outlierDetectionFunction: IFunction, outlierDetectionAlgorithm: OutlierDetectionAlgorithm, counter: number, nameSuffix?: string): IAlarm;
79
- static createZonalHighLatencyStaticOutlierAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
80
- static createZonalHighLatencyStaticOutlierAlarmForCanaries(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZoneId: string, availabilityZones: string[], counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
79
+ static createZonalHighLatencyStaticOutlierAlarm(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZone: string, availabilityZoneId: string, counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
80
+ static createZonalHighLatencyStaticOutlierAlarmForCanaries(scope: Construct, metricDetails: IOperationMetricDetails, availabilityZone: string, availabilityZoneId: string, availabilityZones: string[], counter: number, outlierThreshold: number, nameSuffix?: string): IAlarm;
81
81
  /**
82
82
  * An insight rule that calculates how many instances are responding to requests in
83
83
  * the specified AZ. Only useful for server-side metrics since the canary doesn't record instance id metrics.