awsidr 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. awsidr-1.0.0/.github/pull_request_template.md +13 -0
  2. awsidr-1.0.0/.github/workflows/pypi_upload.yml +27 -0
  3. awsidr-1.0.0/.github/workflows/version_bump.yml +44 -0
  4. awsidr-1.0.0/CHANGELOG.md +0 -0
  5. awsidr-1.0.0/CODE_OF_CONDUCT.md +4 -0
  6. awsidr-1.0.0/Guide/README.md +23 -0
  7. awsidr-1.0.0/Guide/appendix.md +231 -0
  8. awsidr-1.0.0/Guide/cli-usage/alarm-ingestion.md +441 -0
  9. awsidr-1.0.0/Guide/cli-usage/apm-integration.md +426 -0
  10. awsidr-1.0.0/Guide/cli-usage/cloudwatch-alarms.md +290 -0
  11. awsidr-1.0.0/Guide/cli-usage/workload-registration.md +226 -0
  12. awsidr-1.0.0/Guide/examples/alarm-creation-examples.md +93 -0
  13. awsidr-1.0.0/Guide/examples/alarm-ingestion-examples.md +126 -0
  14. awsidr-1.0.0/Guide/examples/workload-registration-examples.md +56 -0
  15. awsidr-1.0.0/Guide/faq.md +141 -0
  16. awsidr-1.0.0/Guide/getting-started.md +88 -0
  17. awsidr-1.0.0/Guide/iam-policies/apm-saas.json +136 -0
  18. awsidr-1.0.0/Guide/iam-policies/apm-sns.json +141 -0
  19. awsidr-1.0.0/Guide/iam-policies/apm-webhook.json +159 -0
  20. awsidr-1.0.0/Guide/iam-policies/general-cli.json +59 -0
  21. awsidr-1.0.0/Guide/iam-policies.md +121 -0
  22. awsidr-1.0.0/Guide/support-case-attachment.md +61 -0
  23. awsidr-1.0.0/Guide/unattended-mode.md +401 -0
  24. awsidr-1.0.0/Guide/workflows.md +167 -0
  25. awsidr-1.0.0/LICENSE.md +188 -0
  26. awsidr-1.0.0/NOTICE.md +2 -0
  27. awsidr-1.0.0/PKG-INFO +124 -0
  28. awsidr-1.0.0/README.md +103 -0
  29. awsidr-1.0.0/SECURITY.md +11 -0
  30. awsidr-1.0.0/pyproject.toml +33 -0
  31. awsidr-1.0.0/src/aws_idr_customer_cli/__init__.py +0 -0
  32. awsidr-1.0.0/src/aws_idr_customer_cli/_version.py +1 -0
  33. awsidr-1.0.0/src/aws_idr_customer_cli/clients/__init__.py +0 -0
  34. awsidr-1.0.0/src/aws_idr_customer_cli/clients/ec2.py +42 -0
  35. awsidr-1.0.0/src/aws_idr_customer_cli/clients/iam.py +33 -0
  36. awsidr-1.0.0/src/aws_idr_customer_cli/clients/s3.py +34 -0
  37. awsidr-1.0.0/src/aws_idr_customer_cli/clients/sts.py +29 -0
  38. awsidr-1.0.0/src/aws_idr_customer_cli/commands/__init__.py +0 -0
  39. awsidr-1.0.0/src/aws_idr_customer_cli/commands/create_alarm/__init__.py +0 -0
  40. awsidr-1.0.0/src/aws_idr_customer_cli/commands/create_alarm/command.py +339 -0
  41. awsidr-1.0.0/src/aws_idr_customer_cli/commands/ingest_alarms/__init__.py +0 -0
  42. awsidr-1.0.0/src/aws_idr_customer_cli/commands/ingest_alarms/command.py +215 -0
  43. awsidr-1.0.0/src/aws_idr_customer_cli/commands/register_workload/__init__.py +0 -0
  44. awsidr-1.0.0/src/aws_idr_customer_cli/commands/register_workload/command.py +363 -0
  45. awsidr-1.0.0/src/aws_idr_customer_cli/commands/setup_apm/__init__.py +0 -0
  46. awsidr-1.0.0/src/aws_idr_customer_cli/commands/setup_apm/command.py +222 -0
  47. awsidr-1.0.0/src/aws_idr_customer_cli/core/__init__.py +0 -0
  48. awsidr-1.0.0/src/aws_idr_customer_cli/core/command_base.py +15 -0
  49. awsidr-1.0.0/src/aws_idr_customer_cli/core/decorators.py +89 -0
  50. awsidr-1.0.0/src/aws_idr_customer_cli/core/interactive/__init__.py +0 -0
  51. awsidr-1.0.0/src/aws_idr_customer_cli/core/interactive/ui.py +168 -0
  52. awsidr-1.0.0/src/aws_idr_customer_cli/core/registry.py +163 -0
  53. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/__init__.py +0 -0
  54. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/alarm_accessor.py +217 -0
  55. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/apigateway_accessor.py +39 -0
  56. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/base_accessor.py +49 -0
  57. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/cloudformation_accessor.py +311 -0
  58. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/eventbridge_accessor.py +121 -0
  59. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/logs_accessor.py +91 -0
  60. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/resource_tagging_accessor.py +197 -0
  61. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/sns_accessor.py +70 -0
  62. awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/support_case_accessor.py +160 -0
  63. awsidr-1.0.0/src/aws_idr_customer_cli/exceptions.py +99 -0
  64. awsidr-1.0.0/src/aws_idr_customer_cli/input/__init__.py +0 -0
  65. awsidr-1.0.0/src/aws_idr_customer_cli/input/input_resource_discovery.py +256 -0
  66. awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/__init__.py +0 -0
  67. awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/file_cache_service.py +93 -0
  68. awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/input_service.py +48 -0
  69. awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/mlo_selection_manager.py +54 -0
  70. awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/resource_finder_service.py +61 -0
  71. awsidr-1.0.0/src/aws_idr_customer_cli/main.py +35 -0
  72. awsidr-1.0.0/src/aws_idr_customer_cli/models/__init__.py +0 -0
  73. awsidr-1.0.0/src/aws_idr_customer_cli/models/alarm_models.py +157 -0
  74. awsidr-1.0.0/src/aws_idr_customer_cli/models/mlo_selection_manager.py +20 -0
  75. awsidr-1.0.0/src/aws_idr_customer_cli/models/non_interactive_config.py +145 -0
  76. awsidr-1.0.0/src/aws_idr_customer_cli/modules/__init__.py +0 -0
  77. awsidr-1.0.0/src/aws_idr_customer_cli/modules/accessors.py +98 -0
  78. awsidr-1.0.0/src/aws_idr_customer_cli/modules/base.py +13 -0
  79. awsidr-1.0.0/src/aws_idr_customer_cli/modules/boto_clients.py +46 -0
  80. awsidr-1.0.0/src/aws_idr_customer_cli/modules/file_cache.py +22 -0
  81. awsidr-1.0.0/src/aws_idr_customer_cli/modules/injector_config.py +26 -0
  82. awsidr-1.0.0/src/aws_idr_customer_cli/modules/input.py +54 -0
  83. awsidr-1.0.0/src/aws_idr_customer_cli/modules/logging.py +70 -0
  84. awsidr-1.0.0/src/aws_idr_customer_cli/modules/service_clients.py +140 -0
  85. awsidr-1.0.0/src/aws_idr_customer_cli/modules/session.py +19 -0
  86. awsidr-1.0.0/src/aws_idr_customer_cli/modules/validation.py +19 -0
  87. awsidr-1.0.0/src/aws_idr_customer_cli/services/__init__.py +0 -0
  88. awsidr-1.0.0/src/aws_idr_customer_cli/services/apm/_init_.py +0 -0
  89. awsidr-1.0.0/src/aws_idr_customer_cli/services/apm/apm_service.py +603 -0
  90. awsidr-1.0.0/src/aws_idr_customer_cli/services/apm/cfn_stack_processor.py +129 -0
  91. awsidr-1.0.0/src/aws_idr_customer_cli/services/create_alarm/__init__.py +0 -0
  92. awsidr-1.0.0/src/aws_idr_customer_cli/services/create_alarm/alarm_recommendation_service.py +678 -0
  93. awsidr-1.0.0/src/aws_idr_customer_cli/services/create_alarm/alarm_service.py +551 -0
  94. awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/__init__.py +0 -0
  95. awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/data.py +230 -0
  96. awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/file_cache_deserializer.py +65 -0
  97. awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/file_cache_migration_service.py +124 -0
  98. awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/file_cache_service.py +447 -0
  99. awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/idr-cx-cli_20251231115959.json +117 -0
  100. awsidr-1.0.0/src/aws_idr_customer_cli/services/input_module/__init__.py +0 -0
  101. awsidr-1.0.0/src/aws_idr_customer_cli/services/input_module/input_service.py +72 -0
  102. awsidr-1.0.0/src/aws_idr_customer_cli/services/input_module/resource_finder_service.py +291 -0
  103. awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_alarm_ingestion_service.py +732 -0
  104. awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_alarm_service.py +578 -0
  105. awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_base_service.py +308 -0
  106. awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_workload_service.py +157 -0
  107. awsidr-1.0.0/src/aws_idr_customer_cli/services/support_case_service.py +356 -0
  108. awsidr-1.0.0/src/aws_idr_customer_cli/utils/__init__.py +0 -0
  109. awsidr-1.0.0/src/aws_idr_customer_cli/utils/alarm_contact_collection.py +363 -0
  110. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/apm_config.py +145 -0
  111. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/apm_constants.py +510 -0
  112. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/apm_stack_helpers.py +80 -0
  113. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/__init__.py +0 -0
  114. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/non_saas_integration.json +421 -0
  115. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/saas_integration.json +176 -0
  116. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/sns_integration.json +154 -0
  117. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/lambda_code/non_saas_lambda.py +234 -0
  118. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/lambda_code/saas_lambda.py +51 -0
  119. awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/lambda_code/sns_lambda.py +135 -0
  120. awsidr-1.0.0/src/aws_idr_customer_cli/utils/arn_utils.py +74 -0
  121. awsidr-1.0.0/src/aws_idr_customer_cli/utils/attachment_splitter.py +344 -0
  122. awsidr-1.0.0/src/aws_idr_customer_cli/utils/constants.py +49 -0
  123. awsidr-1.0.0/src/aws_idr_customer_cli/utils/context.py +25 -0
  124. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/alarm_service_config.py +424 -0
  125. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/Keyspaces.yaml +137 -0
  126. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/__init__.py +0 -0
  127. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/alb.yaml +157 -0
  128. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/apigateway.yaml +139 -0
  129. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/cloudfront.yaml +40 -0
  130. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/dax.yaml +104 -0
  131. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/directconnect.yaml +36 -0
  132. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/dynamodb.yaml +135 -0
  133. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/ec2.yaml +104 -0
  134. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/efs.yaml +35 -0
  135. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/eks.yaml +536 -0
  136. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/elasticache.yaml +101 -0
  137. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/kinesis.yaml +167 -0
  138. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/lambda.yaml +169 -0
  139. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/medialive.yaml +35 -0
  140. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/mediapackage.yaml +68 -0
  141. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/rds.yaml +167 -0
  142. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/redshift.yaml +167 -0
  143. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/s3.yaml +37 -0
  144. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/sns.yaml +168 -0
  145. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/sqs.yaml +68 -0
  146. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/stepfunctions.yaml +101 -0
  147. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/transitgateway.yaml +68 -0
  148. awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/metric_namespace_validator.py +155 -0
  149. awsidr-1.0.0/src/aws_idr_customer_cli/utils/execution_mode.py +25 -0
  150. awsidr-1.0.0/src/aws_idr_customer_cli/utils/feature_flags.py +123 -0
  151. awsidr-1.0.0/src/aws_idr_customer_cli/utils/hash_utils.py +42 -0
  152. awsidr-1.0.0/src/aws_idr_customer_cli/utils/log_formatter.py +38 -0
  153. awsidr-1.0.0/src/aws_idr_customer_cli/utils/log_handlers.py +9 -0
  154. awsidr-1.0.0/src/aws_idr_customer_cli/utils/mlo.py +539 -0
  155. awsidr-1.0.0/src/aws_idr_customer_cli/utils/mlo_adapter.py +136 -0
  156. awsidr-1.0.0/src/aws_idr_customer_cli/utils/resource_discovery_utils.py +479 -0
  157. awsidr-1.0.0/src/aws_idr_customer_cli/utils/resource_filtering/__init__.py +0 -0
  158. awsidr-1.0.0/src/aws_idr_customer_cli/utils/resource_filtering/functional_resource_config.py +221 -0
  159. awsidr-1.0.0/src/aws_idr_customer_cli/utils/service_linked_role_utils.py +68 -0
  160. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/__init__.py +0 -0
  161. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/alarm_creation_session.py +546 -0
  162. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/alarm_ingestion_session.py +856 -0
  163. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/apm_setup_session.py +695 -0
  164. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/interactive_session.py +381 -0
  165. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/session_store.py +291 -0
  166. awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/workload_session.py +483 -0
  167. awsidr-1.0.0/src/aws_idr_customer_cli/utils/support_case_utils.py +55 -0
  168. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/__init__.py +0 -0
  169. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/alarm_validation_constants.py +189 -0
  170. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/alarm_validation_models.py +33 -0
  171. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/alarm_validator.py +1322 -0
  172. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/__init__.py +0 -0
  173. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/apm_validation.py +442 -0
  174. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/aws_validation_context.py +74 -0
  175. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/base_validation_context.py +33 -0
  176. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/contact_validation_context.py +41 -0
  177. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/validator.py +410 -0
  178. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/workload_validation.py +61 -0
  179. awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/workload_validation_context.py +40 -0
  180. awsidr-1.0.0/src/aws_idr_customer_cli/utils/workload_meta_data_collection_utils.py +276 -0
@@ -0,0 +1,13 @@
1
+ ## Description
2
+ <!-- Briefly describe your changes -->
3
+
4
+ ## Type of Change
5
+ - [ ] Bug fix
6
+ - [ ] New feature
7
+ - [ ] Documentation update
8
+
9
+ ### Pre-merge Checklist
10
+ - [ ] CI/CD pipeline passes with no error
11
+ - [ ] Branch is up to date with main
12
+ - [ ] GitHub Sync Script has been run and tests have passed
13
+ - [ ] Approved by oncall engineer
@@ -0,0 +1,27 @@
1
+ name: PyPI Upload
2
+
3
+ on:
4
+ release:
5
+ types: [ published ]
6
+
7
+ jobs:
8
+ upload:
9
+ runs-on: ubuntu-latest
10
+ environment: release
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: '3.10'
20
+
21
+ - name: Build package
22
+ run: |
23
+ python -m pip install --upgrade pip build
24
+ python -m build
25
+
26
+ - name: Upload to PyPI
27
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,44 @@
1
+ # This is a workflow to help you bump the version automatically
2
+
3
+ name: Bump Version
4
+
5
+ permissions:
6
+ contents: write
7
+ pull-requests: write
8
+
9
+ # Controls when the workflow will run
10
+ on:
11
+ workflow_dispatch:
12
+ inputs:
13
+ release_type:
14
+ description: 'Select version bump type'
15
+ required: true
16
+ default: 'patch'
17
+ type: choice
18
+ options:
19
+ - patch
20
+ - minor
21
+ - major
22
+
23
+ # A workflow run is made up of one or more jobs that can run sequentially or in parallel
24
+ jobs:
25
+ # This workflow contains a single job called "bump"
26
+ bump:
27
+ name: 'Bump Version on master'
28
+ # The type of runner that the job will run on
29
+ runs-on: ubuntu-latest
30
+
31
+ # Steps represent a sequence of tasks that will be executed as part of the job
32
+ steps:
33
+ - name: 'Checkout source code'
34
+ uses: actions/checkout@v5
35
+ with:
36
+ fetch-depth: 0
37
+
38
+ - name: 'Automated Version Bump'
39
+ id: version-bump
40
+ uses: 'taj54/universal-version-bump@v0.14.0'
41
+ env:
42
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
43
+ with:
44
+ release_type: ${{ inputs.release_type }}
File without changes
@@ -0,0 +1,4 @@
1
+ ## Code of Conduct
2
+ This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3
+ For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4
+ opensource-codeofconduct@amazon.com with any additional questions or comments.
@@ -0,0 +1,23 @@
1
+ # IDR CLI Guide
2
+
3
+ This guide provides documentation for using the IDR CLI tool.
4
+
5
+ ## Getting Started
6
+
7
+ Please read [getting-started.md](getting-started.md) first.
8
+
9
+ ## Directory Structure
10
+
11
+ - [cli-usage/](cli-usage/) - Command-line usage guides for individual CLI commands.
12
+ - [examples/](examples/) - Examples demonstrating use cases of unattended mode.
13
+ - [iam-policies/](iam-policies/) - Customized IAM policy templates required for CLI operations.
14
+
15
+ ## Documentation Files
16
+
17
+ - [getting-started.md](getting-started.md) - Initial setup and basic usage instructions.
18
+ - [workflows.md](workflows.md) - Workflow examples for common tasks.
19
+ - [iam-policies.md](iam-policies.md) - IAM permissions setup and configuration.
20
+ - [unattended-mode.md](unattended-mode.md) - Running the CLI in unattended mode.
21
+ - [support-case-attachment.md](support-case-attachment.md) - Explanation of CLI attachments to AWS Support cases.
22
+ - [faq.md](faq.md) - Frequently asked questions and troubleshooting.
23
+ - [appendix.md](appendix.md) - Additional reference information and resources.
@@ -0,0 +1,231 @@
1
+ # Appendix
2
+
3
+ ## Progress Saving
4
+
5
+ The CLI stores progress information in a local file cache: a local directory containing files that record progress and onboarding information during execution of the IDR Customer CLI. You can find this directory at `~/.aws-idr/cache`.
6
+ A typical local cache file name looks like idr-cx-cli_20250805163942882934.enc
7
+ These cache files are encrypted and not meant to be edited or accessed directly.
8
+ Cache file names are consistent with the session numbers. In this example, you can resume the session from that specific cache by running the CLI with the flag `--resume idr-cx-cli_20250805163942882934`.
9
+
10
+ ## Resources IDR Customer CLI does not onboard
11
+
12
+ If the resource count is lower than expected during AWS Resource Discovery, it is likely because the missing resources are categorized as non-functional resources. The CLI will not create alarms for these resources and will not display them to you. For the complete list of functional and non-functional resources, see [functional_resource_config.py](../src/aws_idr_customer_cli/utils/resource_filtering/functional_resource_config.py).
13
+
14
+ ## APM Integrations
15
+
16
+ ### Integration Resources
17
+
18
+ #### Resources Created by Integration Type
19
+
20
+ **Common Resources (All Types)**
21
+
22
+ | Resource | Purpose |
23
+ |----------|---------|
24
+ | Custom EventBus | Routes normalized events to IDR |
25
+ | Transform Lambda | Extracts incident identifier from APM payload |
26
+ | IAM Execution Role | Lambda permissions for EventBridge and CloudWatch |
27
+
28
+ Created by all integration types: 3 core resources
29
+
30
+ **Type-Specific Resources**
31
+
32
+ | Integration Type | Additional Resources | Total Resources |
33
+ |-----------------|---------------------|-----------------|
34
+ | EventBridge (SaaS) | EventBridge Rule | 4 resources |
35
+ | SNS | SNS Topic Subscription | 4 resources |
36
+ | Webhook | API Gateway (4 components)<br>Secrets Manager<br>Lambda Authorizer<br>Authorizer IAM Role | 10 resources |
37
+
38
+ #### Resource Naming Pattern
39
+
40
+ All resources follow: `{APMName}-AWSIncidentDetectionResponse-{ResourceType}`
41
+
42
+ Example for Dynatrace:
43
+ * EventBus: `Dynatrace-AWSIncidentDetectionResponse-EventBus`
44
+ * Transform Lambda: `Dynatrace-AWSIncidentDetectionResponse-Lambda-Transform`
45
+ * API Gateway: `Dynatrace-AWSIncidentDetectionResponse-APIGW` (webhook only)
46
+
47
+ ## IDR Alarm Recommendations
48
+
49
+ ### ALB
50
+
51
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
52
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
53
+ | RELIABILITY | ALB | AWS/ApplicationELB | HTTPErrorRate | Reactive | Native | > 5.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors ratio of HTTP errors (4XX+5XX) to total requests for application stability |
54
+ | RELIABILITY | ALB | AWS/ApplicationELB | RejectedConnectionCount | Proactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors rejected connections when load balancer reaches maximum capacity |
55
+ | RELIABILITY | ALB | AWS/ApplicationELB | TargetResponseTime | Reactive | Native | >= 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors time elapsed from request leaving load balancer until target starts sending response headers |
56
+
57
+ ### API Gateway
58
+
59
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
60
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
61
+ | RELIABILITY | API Gateway | AWS/ApiGateway | (m1/m2)*100<br>m1 = Errors<br>m2 = Invocations | Reactive | Native | >= 5.0 | Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors API error rates as a percentage of total traffic using a math expression |
62
+ | PERFORMANCE | API Gateway | AWS/ApiGateway | Latency | Reactive | Native | >= 5000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Response time monitoring provides insight into end user experience and performance |
63
+ | PERFORMANCE | API Gateway | AWS/ApiGateway | IntegrationLatency | Reactive | Native | >= 3000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Measures the time taken between when API Gateway forwards a request to the backend and receives a response |
64
+
65
+ ### CloudFront
66
+
67
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
68
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
69
+ | RELIABILITY | CloudFront | AWS/CloudFront | (m1/m2)*100<br>m1 = Errors<br>m2 = Invocations | Reactive | Native | > 5.0 | Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the ratio of HTTP 5xx server error responses to total requests |
70
+
71
+ ### DAX
72
+
73
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
74
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
75
+ | RELIABILITY | DAX | AWS/DAX | FaultRequestCount | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors HTTP 500 server errors from DAX |
76
+ | RELIABILITY | DAX | AWS/DAX | FailedRequestCount | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed requests from DAX |
77
+ | RELIABILITY | DAX | AWS/DAX | ThrottledRequestCount | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors throttled requests from DAX |
78
+
79
+ ### Direct Connect
80
+
81
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
82
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
83
+ | RELIABILITY | Direct Connect | AWS/DX | ConnectionState | Reactive | Native | < 0.5 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | The state of the connection. 1 indicates up and 0 indicates down |
84
+
85
+ ### DynamoDB
86
+
87
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
88
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
89
+ | RELIABILITY | DynamoDB | AWS/DynamoDB | ReadThrottleEvents | Proactive | Native | >= 5.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors read throttle events to detect capacity issues |
90
+ | RELIABILITY | DynamoDB | AWS/DynamoDB | WriteThrottleEvents | Proactive | Native | >= 5.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors write throttle events to detect capacity issues |
91
+ | RELIABILITY | DynamoDB | AWS/DynamoDB | SuccessfulRequestLatency | Reactive | Native | >= 100.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors successful request latency to detect performance degradation |
92
+ | RELIABILITY | DynamoDB | AWS/DynamoDB | ReplicationLatency | Reactive | Conditional | >= 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors replication latency for global tables |
93
+
94
+ ### EC2
95
+
96
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
97
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
98
+ | RELIABILITY | EC2 | AWS/EC2 | StatusCheckFailed_Instance | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Reports whether the instance has passed the instance status check |
99
+ | RELIABILITY | EC2 | AWS/EC2 | StatusCheckFailed_System | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Reports whether the instance has passed the system status check |
100
+ | RELIABILITY | EC2 | AWS/EC2 | StatusCheckFailed_AttachedEBS | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Reports whether the instance has passed the attached EBS status check |
101
+
102
+ ### EFS
103
+
104
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
105
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
106
+ | RELIABILITY | EFS | AWS/EFS | PercentIOLimit | Reactive | Native | >= 80.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Shows how close a file system is to reaching the I/O limit of the General Purpose performance mode |
107
+
108
+ ### EKS
109
+
110
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
111
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
112
+ | REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_crash_loop_back_off | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers stuck in CrashLoopBackOff |
113
+ | REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_create_container_config_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with CreateContainerConfigError |
114
+ | REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_create_container_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with CreateContainerError |
115
+ | REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_image_pull_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with image pull errors |
116
+ | REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_start_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with start errors |
117
+ | REACTIVE | EKS | ContainerInsights | cluster_failed_node_count | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed worker nodes in the EKS cluster |
118
+ | REACTIVE | EKS | ContainerInsights | pod_status_unknown | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors pods with unknown status |
119
+ | REACTIVE | EKS | ContainerInsights | apiserver_admission_webhook_admission_duration_seconds | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors admission webhook latency |
120
+ | PROACTIVE | EKS | ContainerInsights/Prometheus | apiserver_admission_controller_admission_duration_seconds | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors admission controller latency |
121
+ | PROACTIVE | EKS | ContainerInsights/Prometheus | apiserver_authorization_webhook_duration_seconds | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors authorization webhook duration |
122
+ | REACTIVE | EKS | ContainerInsights/Prometheus | apiserver_clusterip_repair_ip_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors cluster IP repair errors |
123
+ | REACTIVE | EKS | ContainerInsights/Prometheus | apiserver_nodeport_repair_port_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors node port repair errors |
124
+ | REACTIVE | EKS | ContainerInsights/Prometheus | kubelet_started_containers_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors container start errors |
125
+ | REACTIVE | EKS | ContainerInsights/Prometheus | kubelet_runtime_operations_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors kubelet runtime operation errors |
126
+ | REACTIVE | EKS | ContainerInsights/Prometheus | kubelet_started_pods_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors pod start errors |
127
+ | REACTIVE | EKS | ContainerInsights/Prometheus | node_collector_zone_health | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors node health percentage per zone |
128
+
129
+ ### ElastiCache
130
+
131
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
132
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
133
+ | PERFORMANCE | ElastiCache | AWS/ElastiCache | FreeableMemory | Reactive | Native | < 100000000 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors low freeable memory which can indicate spike in connections or high memory pressure |
134
+ | PERFORMANCE | ElastiCache | AWS/ElastiCache | DatabaseMemoryUsagePercentage | Reactive | Native | >= 90.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the percentage of memory utilization for Redis clusters |
135
+ | PERFORMANCE | ElastiCache | AWS/ElastiCache | CurrConnections | Reactive | Native | > 1000 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the number of client connections, excluding connections from read replicas |
136
+
137
+ ### Elemental Media Services
138
+
139
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
140
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
141
+ | CUSTOMER_EXPERIENCE | Elemental Media Services | AWS/MediaLive | SvqTime | Reactive | Native | > 80.0 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Percentage of time MediaLive had to reduce quality optimizations to emit output in real time |
142
+ | RELIABILITY | Elemental Media Services | AWS/MediaPackage | EgressResponseTime | Reactive | Native | > 5000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Time that it takes MediaPackage to process each output request |
143
+ | RELIABILITY | Elemental Media Services | AWS/MediaPackage | IngressResponseTime | Reactive | Native | > 5000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Time that it takes MediaPackage to process each input request |
144
+
145
+ ### Keyspaces
146
+
147
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
148
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
149
+ | RELIABILITY | Keyspaces | AWS/Cassandra | PerConnectionRequestRateExceeded | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the per-connection request rate quota |
150
+ | RELIABILITY | Keyspaces | AWS/Cassandra | ReadThrottleEvents | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the provisioned read capacity |
151
+ | RELIABILITY | Keyspaces | AWS/Cassandra | WriteThrottleEvents | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the provisioned write capacity |
152
+ | RELIABILITY | Keyspaces | AWS/Cassandra | ReplicationLatency | Reactive | Conditional | > 1000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the time it took to replicate updates, inserts, or deletes from one replica table to another replica table in a multi-Region keyspace |
153
+ | RELIABILITY | Keyspaces | AWS/Cassandra | StoragePartitionThroughputCapacityExceeded | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the throughput capacity of the storage partition |
154
+
155
+ ### Kinesis
156
+
157
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
158
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
159
+ | RELIABILITY | Kinesis | AWS/Kinesis | ReadProvisionedThroughputExceeded | Proactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the number of times read operations exceed the provisioned read throughput capacity |
160
+ | RELIABILITY | Kinesis | AWS/Kinesis | WriteProvisionedThroughputExceeded | Proactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Identifies when write capacity limits are hit causing data ingestion delays or failures |
161
+ | RELIABILITY | Kinesis | AWS/Kinesis | GetRecords.IteratorAgeMilliseconds | Reactive | Native | > 600000.0 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks how long records have been in the stream before being processed |
162
+ | RELIABILITY | Kinesis | AWS/Kinesis | PutRecords.FailedRecords | Reactive | Native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the number of records that fail to be processed |
163
+ | RELIABILITY | Kinesis | AWS/Kinesis | PutRecords.ThrottledRecords | Proactive | Native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors write throttling to detect capacity issues |
164
+
165
+ ### Lambda
166
+
167
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
168
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
169
+ | RELIABILITY | Lambda | AWS/Lambda | (m1/m2)*100<br>m1 = Errors<br>m2 = Invocations | Reactive | Native | > 5.0 | Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the percentage of Lambda invocations that result in errors |
170
+ | RELIABILITY | Lambda | AWS/Lambda | Throttles | Reactive | Native | >= 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the number of invocation requests that are throttled |
171
+ | RELIABILITY | Lambda | AWS/Lambda | DeadLetterErrors | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors when messages fail to reach the DLQ |
172
+ | RELIABILITY | Lambda | AWS/Lambda | ConcurrentExecutions | Reactive | Conditional | > 900.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the number of function instances running concurrently |
173
+
174
+ ### RDS
175
+
176
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
177
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
178
+ | RELIABILITY | RDS | AWS/RDS | DiskQueueDepth | Reactive | Native | > 25.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors outstanding IOs waiting to be processed - identifies performance bottlenecks |
179
+ | RELIABILITY | RDS | AWS/RDS | FreeStorageSpace | Proactive | Native | < 2147483648.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Proactively monitors available storage capacity to prevent database outages |
180
+ | RELIABILITY | RDS | AWS/RDS | ReplicaLag | Reactive | Conditional | > 30.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors replication lag times on RDS read replicas to ensure data freshness |
181
+ | LATENCY | RDS | AWS/RDS | ReadLatency | Reactive | Native | > 0.2 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors average time to read data from database storage |
182
+ | LATENCY | RDS | AWS/RDS | WriteLatency | Reactive | Native | > 0.2 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors average time to write data to database storage |
183
+
184
+ ### Redshift
185
+
186
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
187
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
188
+ | RELIABILITY | Redshift | AWS/Redshift | DatabaseConnections | Reactive | Native | > 90.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Tracks the number of database connections to a cluster |
189
+ | RELIABILITY | Redshift | AWS/Redshift | HealthStatus | Reactive | Native | < 1.0 | Statistic = Minimum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Indicates the health of the cluster |
190
+ | RELIABILITY | Redshift | AWS/Redshift | PercentageDiskSpaceUsed | Reactive | Native | > 95.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Tracks the percent of disk space used |
191
+ | LATENCY | Redshift | AWS/Redshift | ReadLatency | Reactive | Native | > 20.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the average amount of time taken for disk read I/O operations |
192
+ | LATENCY | Redshift | AWS/Redshift | WriteLatency | Reactive | Native | > 20.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the average amount of time taken for disk write I/O operations |
193
+
194
+ ### S3
195
+
196
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
197
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
198
+ | PERFORMANCE | S3 | AWS/S3 | TotalRequestLatency | Reactive | Conditional | > 1000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Measures the total time taken to process requests to an S3 bucket |
199
+
200
+ ### SNS
201
+
202
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
203
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
204
+ | RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFailed | Reactive | Native | >= 5.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed notifications to detect delivery issues |
205
+ | RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFilteredOut-InvalidAttributes | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications filtered due to invalid attributes |
206
+ | RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFilteredOut-NoMessageAttributes | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications filtered due to missing message attributes |
207
+ | RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFilteredOut-InvalidMessageBody | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications filtered due to invalid message body |
208
+ | RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsRedrivenToDlq | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications redirected to a Dead Letter Queue |
209
+ | RELIABILITY | SNS | AWS/SNS | SMSSuccessRate | Reactive | Conditional | < 90.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the rate of successful SMS message deliveries |
210
+
211
+ ### SQS
212
+
213
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
214
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
215
+ | RELIABILITY | SQS | AWS/SQS | ApproximateNumberOfMessagesVisible | Reactive | Native | >= 1000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the number of visible messages in the SQS queue that are awaiting processing |
216
+ | RELIABILITY | SQS | AWS/SQS | ApproximateAgeOfOldestMessage | Reactive | Native | >= 900.0 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Measures the duration that the oldest message has been in the queue without being processed |
217
+
218
+ ### Step Functions
219
+
220
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
221
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
222
+ | RELIABILITY | Step Functions | AWS/States | ExecutionsFailed | Reactive | Native | >= 5.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed executions to detect workflow issues |
223
+ | RELIABILITY | Step Functions | AWS/States | ExecutionsTimedOut | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors timed out executions to detect timeout issues |
224
+ | LATENCY | Step Functions | AWS/States | ExecutionThrottled | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks executions throttled due to exceeding AWS service limits |
225
+
226
+ ### VPC Transit Gateway
227
+
228
+ | Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
229
+ |---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
230
+ | RELIABILITY | VPC Transit Gateway | AWS/TransitGateway | BytesDropCountBlackhole | Reactive | Native | > 1000000.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Number of bytes dropped because they matched a blackhole route |
231
+ | RELIABILITY | VPC Transit Gateway | AWS/TransitGateway | BytesDropCountNoRoute | Reactive | Native | > 1000000.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Number of bytes dropped because they did not match a route |