awsidr 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awsidr-1.0.0/.github/pull_request_template.md +13 -0
- awsidr-1.0.0/.github/workflows/pypi_upload.yml +27 -0
- awsidr-1.0.0/.github/workflows/version_bump.yml +44 -0
- awsidr-1.0.0/CHANGELOG.md +0 -0
- awsidr-1.0.0/CODE_OF_CONDUCT.md +4 -0
- awsidr-1.0.0/Guide/README.md +23 -0
- awsidr-1.0.0/Guide/appendix.md +231 -0
- awsidr-1.0.0/Guide/cli-usage/alarm-ingestion.md +441 -0
- awsidr-1.0.0/Guide/cli-usage/apm-integration.md +426 -0
- awsidr-1.0.0/Guide/cli-usage/cloudwatch-alarms.md +290 -0
- awsidr-1.0.0/Guide/cli-usage/workload-registration.md +226 -0
- awsidr-1.0.0/Guide/examples/alarm-creation-examples.md +93 -0
- awsidr-1.0.0/Guide/examples/alarm-ingestion-examples.md +126 -0
- awsidr-1.0.0/Guide/examples/workload-registration-examples.md +56 -0
- awsidr-1.0.0/Guide/faq.md +141 -0
- awsidr-1.0.0/Guide/getting-started.md +88 -0
- awsidr-1.0.0/Guide/iam-policies/apm-saas.json +136 -0
- awsidr-1.0.0/Guide/iam-policies/apm-sns.json +141 -0
- awsidr-1.0.0/Guide/iam-policies/apm-webhook.json +159 -0
- awsidr-1.0.0/Guide/iam-policies/general-cli.json +59 -0
- awsidr-1.0.0/Guide/iam-policies.md +121 -0
- awsidr-1.0.0/Guide/support-case-attachment.md +61 -0
- awsidr-1.0.0/Guide/unattended-mode.md +401 -0
- awsidr-1.0.0/Guide/workflows.md +167 -0
- awsidr-1.0.0/LICENSE.md +188 -0
- awsidr-1.0.0/NOTICE.md +2 -0
- awsidr-1.0.0/PKG-INFO +124 -0
- awsidr-1.0.0/README.md +103 -0
- awsidr-1.0.0/SECURITY.md +11 -0
- awsidr-1.0.0/pyproject.toml +33 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/_version.py +1 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/clients/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/clients/ec2.py +42 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/clients/iam.py +33 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/clients/s3.py +34 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/clients/sts.py +29 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/create_alarm/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/create_alarm/command.py +339 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/ingest_alarms/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/ingest_alarms/command.py +215 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/register_workload/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/register_workload/command.py +363 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/setup_apm/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/commands/setup_apm/command.py +222 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/core/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/core/command_base.py +15 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/core/decorators.py +89 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/core/interactive/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/core/interactive/ui.py +168 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/core/registry.py +163 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/alarm_accessor.py +217 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/apigateway_accessor.py +39 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/base_accessor.py +49 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/cloudformation_accessor.py +311 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/eventbridge_accessor.py +121 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/logs_accessor.py +91 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/resource_tagging_accessor.py +197 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/sns_accessor.py +70 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/data_accessors/support_case_accessor.py +160 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/exceptions.py +99 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/input/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/input/input_resource_discovery.py +256 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/file_cache_service.py +93 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/input_service.py +48 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/mlo_selection_manager.py +54 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/interfaces/resource_finder_service.py +61 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/main.py +35 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/models/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/models/alarm_models.py +157 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/models/mlo_selection_manager.py +20 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/models/non_interactive_config.py +145 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/accessors.py +98 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/base.py +13 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/boto_clients.py +46 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/file_cache.py +22 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/injector_config.py +26 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/input.py +54 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/logging.py +70 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/service_clients.py +140 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/session.py +19 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/modules/validation.py +19 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/apm/_init_.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/apm/apm_service.py +603 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/apm/cfn_stack_processor.py +129 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/create_alarm/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/create_alarm/alarm_recommendation_service.py +678 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/create_alarm/alarm_service.py +551 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/data.py +230 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/file_cache_deserializer.py +65 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/file_cache_migration_service.py +124 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/file_cache_service.py +447 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/file_cache/idr-cx-cli_20251231115959.json +117 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/input_module/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/input_module/input_service.py +72 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/input_module/resource_finder_service.py +291 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_alarm_ingestion_service.py +732 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_alarm_service.py +578 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_base_service.py +308 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/non_interactive_workload_service.py +157 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/services/support_case_service.py +356 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/alarm_contact_collection.py +363 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/apm_config.py +145 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/apm_constants.py +510 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/apm_stack_helpers.py +80 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/non_saas_integration.json +421 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/saas_integration.json +176 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/cfn_templates/sns_integration.json +154 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/lambda_code/non_saas_lambda.py +234 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/lambda_code/saas_lambda.py +51 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/apm/lambda_code/sns_lambda.py +135 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/arn_utils.py +74 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/attachment_splitter.py +344 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/constants.py +49 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/context.py +25 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/alarm_service_config.py +424 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/Keyspaces.yaml +137 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/alb.yaml +157 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/apigateway.yaml +139 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/cloudfront.yaml +40 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/dax.yaml +104 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/directconnect.yaml +36 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/dynamodb.yaml +135 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/ec2.yaml +104 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/efs.yaml +35 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/eks.yaml +536 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/elasticache.yaml +101 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/kinesis.yaml +167 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/lambda.yaml +169 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/medialive.yaml +35 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/mediapackage.yaml +68 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/rds.yaml +167 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/redshift.yaml +167 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/s3.yaml +37 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/sns.yaml +168 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/sqs.yaml +68 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/stepfunctions.yaml +101 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/idr_alarm_templates/transitgateway.yaml +68 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/create_alarm/metric_namespace_validator.py +155 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/execution_mode.py +25 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/feature_flags.py +123 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/hash_utils.py +42 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/log_formatter.py +38 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/log_handlers.py +9 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/mlo.py +539 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/mlo_adapter.py +136 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/resource_discovery_utils.py +479 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/resource_filtering/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/resource_filtering/functional_resource_config.py +221 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/service_linked_role_utils.py +68 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/alarm_creation_session.py +546 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/alarm_ingestion_session.py +856 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/apm_setup_session.py +695 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/interactive_session.py +381 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/session_store.py +291 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/session/workload_session.py +483 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/support_case_utils.py +55 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/alarm_validation_constants.py +189 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/alarm_validation_models.py +33 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validate_alarm/alarm_validator.py +1322 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/__init__.py +0 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/apm_validation.py +442 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/aws_validation_context.py +74 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/base_validation_context.py +33 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/contact_validation_context.py +41 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/validator.py +410 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/workload_validation.py +61 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/validation/workload_validation_context.py +40 -0
- awsidr-1.0.0/src/aws_idr_customer_cli/utils/workload_meta_data_collection_utils.py +276 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
## Description
|
|
2
|
+
<!-- Briefly describe your changes -->
|
|
3
|
+
|
|
4
|
+
## Type of Change
|
|
5
|
+
- [ ] Bug fix
|
|
6
|
+
- [ ] New feature
|
|
7
|
+
- [ ] Documentation update
|
|
8
|
+
|
|
9
|
+
### Pre-merge Checklist
|
|
10
|
+
- [ ] CI/CD pipeline passes with no errors
|
|
11
|
+
- [ ] Branch is up to date with main
|
|
12
|
+
- [ ] GitHub Sync Script is run and tests have passed
|
|
13
|
+
- [ ] Approved by oncall engineer
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: PyPI Upload
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [ published ]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
upload:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment: release
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v4
|
|
18
|
+
with:
|
|
19
|
+
python-version: '3.10'
|
|
20
|
+
|
|
21
|
+
- name: Build package
|
|
22
|
+
run: |
|
|
23
|
+
python -m pip install --upgrade pip build
|
|
24
|
+
python -m build
|
|
25
|
+
|
|
26
|
+
- name: Upload to PyPI
|
|
27
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# This is a workflow to help you bump the version automatically
|
|
2
|
+
|
|
3
|
+
name: Bump Version
|
|
4
|
+
|
|
5
|
+
permissions:
|
|
6
|
+
contents: write
|
|
7
|
+
pull-requests: write
|
|
8
|
+
|
|
9
|
+
# Controls when the workflow will run
|
|
10
|
+
on:
|
|
11
|
+
workflow_dispatch:
|
|
12
|
+
inputs:
|
|
13
|
+
release_type:
|
|
14
|
+
description: 'Select version bump type'
|
|
15
|
+
required: true
|
|
16
|
+
default: 'patch'
|
|
17
|
+
type: choice
|
|
18
|
+
options:
|
|
19
|
+
- patch
|
|
20
|
+
- minor
|
|
21
|
+
- major
|
|
22
|
+
|
|
23
|
+
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
|
|
24
|
+
jobs:
|
|
25
|
+
# This workflow contains a single job called "bump"
|
|
26
|
+
bump:
|
|
27
|
+
name: 'Bump Version on master'
|
|
28
|
+
# The type of runner that the job will run on
|
|
29
|
+
runs-on: ubuntu-latest
|
|
30
|
+
|
|
31
|
+
# Steps represent a sequence of tasks that will be executed as part of the job
|
|
32
|
+
steps:
|
|
33
|
+
- name: 'Checkout source code'
|
|
34
|
+
uses: actions/checkout@v5
|
|
35
|
+
with:
|
|
36
|
+
fetch-depth: 0
|
|
37
|
+
|
|
38
|
+
- name: 'Automated Version Bump'
|
|
39
|
+
id: version-bump
|
|
40
|
+
uses: 'taj54/universal-version-bump@v0.14.0'
|
|
41
|
+
env:
|
|
42
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
43
|
+
with:
|
|
44
|
+
release_type: ${{ inputs.release_type }}
|
|
File without changes
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
## Code of Conduct
|
|
2
|
+
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
|
|
3
|
+
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
|
|
4
|
+
opensource-codeofconduct@amazon.com with any additional questions or comments.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# IDR CLI Guide
|
|
2
|
+
|
|
3
|
+
This guide provides documentation for using the IDR CLI tool.
|
|
4
|
+
|
|
5
|
+
## Getting Started
|
|
6
|
+
|
|
7
|
+
Please read [Getting started.md](getting-started.md) first.
|
|
8
|
+
|
|
9
|
+
## Directory Structure
|
|
10
|
+
|
|
11
|
+
- [cli-usage/](cli-usage/) - Command-line usage guides for individual CLI commands.
|
|
12
|
+
- [examples/](examples/) - Examples demonstrating use cases of unattended mode.
|
|
13
|
+
- [iam-policies/](iam-policies/) - Customized IAM policy templates required for CLI operations.
|
|
14
|
+
|
|
15
|
+
## Documentation Files
|
|
16
|
+
|
|
17
|
+
- [getting-started.md](getting-started.md) - Initial setup and basic usage instructions.
|
|
18
|
+
- [workflows.md](workflows.md) - Workflow examples for common tasks.
|
|
19
|
+
- [iam-policies.md](iam-policies.md) - IAM permissions setup and configuration.
|
|
20
|
+
- [unattended-mode.md](unattended-mode.md) - Running the CLI in unattended mode.
|
|
21
|
+
- [support-case-attachment.md](support-case-attachment.md) - Explanation of CLI attachments to AWS Support cases.
|
|
22
|
+
- [faq.md](faq.md) - Frequently asked questions and troubleshooting.
|
|
23
|
+
- [appendix.md](appendix.md) - Additional reference information and resources.
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# Appendix
|
|
2
|
+
|
|
3
|
+
## Progress Saving
|
|
4
|
+
|
|
5
|
+
The CLI stores progress information in a local file cache: a local directory containing files that record progress and onboarding information during execution of the IDR Customer CLI. You can find this directory at `~/.aws-idr/cache`.
|
|
6
|
+
A typical local cache file name looks like idr-cx-cli_20250805163942882934.enc
|
|
7
|
+
These cache files are encrypted and not meant to be edited or accessed directly.
|
|
8
|
+
Cache file names are consistent with the session numbers, so in this example you can resume the session from that specific cache by running the CLI with the flag `--resume idr-cx-cli_20250805163942882934`.
|
|
9
|
+
|
|
10
|
+
## Resources IDR Customer CLI does not onboard
|
|
11
|
+
|
|
12
|
+
If the resource count is lower than expected during AWS Resource Discovery, it is likely because some resources fall into the category considered non-functional. The CLI does not create alarms for these resources and does not display them to you. For the complete list of functional and non-functional resources, see [functional_resource_config.py](../src/aws_idr_customer_cli/utils/resource_filtering/functional_resource_config.py).
|
|
13
|
+
|
|
14
|
+
## APM Integrations
|
|
15
|
+
|
|
16
|
+
### Integration Resources
|
|
17
|
+
|
|
18
|
+
#### Resources Created by Integration Type
|
|
19
|
+
|
|
20
|
+
**Common Resources (All Types)**
|
|
21
|
+
|
|
22
|
+
| Resource | Purpose |
|
|
23
|
+
|----------|---------|
|
|
24
|
+
| Custom EventBus | Routes normalized events to IDR |
|
|
25
|
+
| Transform Lambda | Extracts incident identifier from APM payload |
|
|
26
|
+
| IAM Execution Role | Lambda permissions for EventBridge and CloudWatch |
|
|
27
|
+
|
|
28
|
+
Created by all integration types: 3 core resources
|
|
29
|
+
|
|
30
|
+
**Type-Specific Resources**
|
|
31
|
+
|
|
32
|
+
| Integration Type | Additional Resources | Total Resources |
|
|
33
|
+
|-----------------|---------------------|-----------------|
|
|
34
|
+
| EventBridge (SaaS) | EventBridge Rule | 4 resources |
|
|
35
|
+
| SNS | SNS Topic Subscription | 4 resources |
|
|
36
|
+
| Webhook | API Gateway (4 components)<br>Secrets Manager<br>Lambda Authorizer<br>Authorizer IAM Role | 10 resources |
|
|
37
|
+
|
|
38
|
+
#### Resource Naming Pattern
|
|
39
|
+
|
|
40
|
+
All resources follow: `{APMName}-AWSIncidentDetectionResponse-{ResourceType}`
|
|
41
|
+
|
|
42
|
+
Example for Dynatrace:
|
|
43
|
+
* EventBus: `Dynatrace-AWSIncidentDetectionResponse-EventBus`
|
|
44
|
+
* Transform Lambda: `Dynatrace-AWSIncidentDetectionResponse-Lambda-Transform`
|
|
45
|
+
* API Gateway: `Dynatrace-AWSIncidentDetectionResponse-APIGW` (webhook only)
|
|
46
|
+
|
|
47
|
+
## IDR Alarm Recommendations
|
|
48
|
+
|
|
49
|
+
### ALB
|
|
50
|
+
|
|
51
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
52
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
53
|
+
| RELIABILITY | ALB | AWS/ApplicationELB | HTTPErrorRate | Reactive | Native | > 5.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors ratio of HTTP errors (4XX+5XX) to total requests for application stability |
|
|
54
|
+
| RELIABILITY | ALB | AWS/ApplicationELB | RejectedConnectionCount | Proactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors rejected connections when load balancer reaches maximum capacity |
|
|
55
|
+
| RELIABILITY | ALB | AWS/ApplicationELB | TargetResponseTime | Reactive | Native | >= 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors time elapsed from request leaving load balancer until target starts sending response headers |
|
|
56
|
+
|
|
57
|
+
### API Gateway
|
|
58
|
+
|
|
59
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
60
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
61
|
+
| RELIABILITY | API Gateway | AWS/ApiGateway | (m1/m2)*100<br>m1 = Errors<br>m2 = Invocations | Reactive | Native | >= 5.0 | Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors API error rates as a percentage of total traffic using math expression |
|
|
62
|
+
| PERFORMANCE | API Gateway | AWS/ApiGateway | Latency | Reactive | Native | >= 5000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Response time monitoring provides insight into end user experience and performance |
|
|
63
|
+
| PERFORMANCE | API Gateway | AWS/ApiGateway | IntegrationLatency | Reactive | Native | >= 3000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Measures the time taken between when API Gateway forwards a request to the backend and receives a response |
|
|
64
|
+
|
|
65
|
+
### CloudFront
|
|
66
|
+
|
|
67
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
68
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
69
|
+
| RELIABILITY | CloudFront | AWS/CloudFront | (m1/m2)*100<br>m1 = Errors<br>m2 = Invocations | Reactive | Native | > 5.0 | Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the ratio of HTTP 5xx server error responses to total requests |
|
|
70
|
+
|
|
71
|
+
### DAX
|
|
72
|
+
|
|
73
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
74
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
75
|
+
| RELIABILITY | DAX | AWS/DAX | FaultRequestCount | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors HTTP 500 server errors from DAX |
|
|
76
|
+
| RELIABILITY | DAX | AWS/DAX | FailedRequestCount | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed requests from DAX |
|
|
77
|
+
| RELIABILITY | DAX | AWS/DAX | ThrottledRequestCount | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors throttled requests from DAX |
|
|
78
|
+
|
|
79
|
+
### Direct Connect
|
|
80
|
+
|
|
81
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
82
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
83
|
+
| RELIABILITY | Direct Connect | AWS/DX | ConnectionState | Reactive | Native | < 0.5 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | The state of the connection. 1 indicates up and 0 indicates down |
|
|
84
|
+
|
|
85
|
+
### DynamoDB
|
|
86
|
+
|
|
87
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
88
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
89
|
+
| RELIABILITY | DynamoDB | AWS/DynamoDB | ReadThrottleEvents | Proactive | Native | >= 5.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors read throttle events to detect capacity issues |
|
|
90
|
+
| RELIABILITY | DynamoDB | AWS/DynamoDB | WriteThrottleEvents | Proactive | Native | >= 5.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors write throttle events to detect capacity issues |
|
|
91
|
+
| RELIABILITY | DynamoDB | AWS/DynamoDB | SuccessfulRequestLatency | Reactive | Native | >= 100.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors successful request latency to detect performance degradation |
|
|
92
|
+
| RELIABILITY | DynamoDB | AWS/DynamoDB | ReplicationLatency | Reactive | Conditional | >= 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors replication latency for global tables |
|
|
93
|
+
|
|
94
|
+
### EC2
|
|
95
|
+
|
|
96
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
97
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
98
|
+
| RELIABILITY | EC2 | AWS/EC2 | StatusCheckFailed_Instance | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Reports whether the instance has passed the instance status check |
|
|
99
|
+
| RELIABILITY | EC2 | AWS/EC2 | StatusCheckFailed_System | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Reports whether the instance has passed the system status check |
|
|
100
|
+
| RELIABILITY | EC2 | AWS/EC2 | StatusCheckFailed_AttachedEBS | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Reports whether the instance has passed the attached EBS status check |
|
|
101
|
+
|
|
102
|
+
### EFS
|
|
103
|
+
|
|
104
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
105
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
106
|
+
| RELIABILITY | EFS | AWS/EFS | PercentIOLimit | Reactive | Native | >= 80.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Shows how close a file system is to reaching the I/O limit of the General Purpose performance mode |
|
|
107
|
+
|
|
108
|
+
### EKS
|
|
109
|
+
|
|
110
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
111
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
112
|
+
| REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_crash_loop_back_off | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers stuck in CrashLoopBackOff |
|
|
113
|
+
| REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_create_container_config_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with CreateContainerConfigError |
|
|
114
|
+
| REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_create_container_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with CreateContainerError |
|
|
115
|
+
| REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_image_pull_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with image pull errors |
|
|
116
|
+
| REACTIVE | EKS | ContainerInsights | pod_container_status_waiting_reason_start_error | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors containers with start errors |
|
|
117
|
+
| REACTIVE | EKS | ContainerInsights | cluster_failed_node_count | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed worker nodes in the EKS cluster |
|
|
118
|
+
| REACTIVE | EKS | ContainerInsights | pod_status_unknown | Reactive | Native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors pods with unknown status |
|
|
119
|
+
| REACTIVE | EKS | ContainerInsights | apiserver_admission_webhook_admission_duration_seconds | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors admission webhook latency |
|
|
120
|
+
| PROACTIVE | EKS | ContainerInsights/Prometheus | apiserver_admission_controller_admission_duration_seconds | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors admission controller latency |
|
|
121
|
+
| PROACTIVE | EKS | ContainerInsights/Prometheus | apiserver_authorization_webhook_duration_seconds | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors authorization webhook duration |
|
|
122
|
+
| REACTIVE | EKS | ContainerInsights/Prometheus | apiserver_clusterip_repair_ip_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors cluster IP repair errors |
|
|
123
|
+
| REACTIVE | EKS | ContainerInsights/Prometheus | apiserver_nodeport_repair_port_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors node port repair errors |
|
|
124
|
+
| REACTIVE | EKS | ContainerInsights/Prometheus | kubelet_started_containers_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors container start errors |
|
|
125
|
+
| REACTIVE | EKS | ContainerInsights/Prometheus | kubelet_runtime_operations_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors kubelet runtime operation errors |
|
|
126
|
+
| REACTIVE | EKS | ContainerInsights/Prometheus | kubelet_started_pods_errors_total | Reactive | Non-native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors pod start errors |
|
|
127
|
+
| REACTIVE | EKS | ContainerInsights/Prometheus | node_collector_zone_health | Reactive | Non-native | > 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors node health percentage per zone |
|
|
128
|
+
|
|
129
|
+
### ElastiCache
|
|
130
|
+
|
|
131
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
132
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
133
|
+
| PERFORMANCE | ElastiCache | AWS/ElastiCache | FreeableMemory | Reactive | Native | < 100000000 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors low freeable memory, which can indicate a spike in connections or high memory pressure |
|
|
134
|
+
| PERFORMANCE | ElastiCache | AWS/ElastiCache | DatabaseMemoryUsagePercentage | Reactive | Native | >= 90.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the percentage of memory utilization for Redis clusters |
|
|
135
|
+
| PERFORMANCE | ElastiCache | AWS/ElastiCache | CurrConnections | Reactive | Native | > 1000 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the number of client connections, excluding connections from read replicas |
|
|
136
|
+
|
|
137
|
+
### Elemental Media Services
|
|
138
|
+
|
|
139
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
140
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
141
|
+
| CUSTOMER_EXPERIENCE | Elemental Media Services | AWS/MediaLive | SvqTime | Reactive | Native | > 80.0 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Percentage of time MediaLive had to reduce quality optimizations to emit output in real time |
|
|
142
|
+
| RELIABILITY | Elemental Media Services | AWS/MediaPackage | EgressResponseTime | Reactive | Native | > 5000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Time that it takes MediaPackage to process each output request |
|
|
143
|
+
| RELIABILITY | Elemental Media Services | AWS/MediaPackage | IngressResponseTime | Reactive | Native | > 5000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Time that it takes MediaPackage to process each input request |
|
|
144
|
+
|
|
145
|
+
### Keyspaces
|
|
146
|
+
|
|
147
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
148
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
149
|
+
| RELIABILITY | Keyspaces | AWS/Cassandra | PerConnectionRequestRateExceeded | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the per-connection request rate quota |
|
|
150
|
+
| RELIABILITY | Keyspaces | AWS/Cassandra | ReadThrottleEvents | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the provisioned read capacity |
|
|
151
|
+
| RELIABILITY | Keyspaces | AWS/Cassandra | WriteThrottleEvents | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the provisioned write capacity |
|
|
152
|
+
| RELIABILITY | Keyspaces | AWS/Cassandra | ReplicationLatency | Reactive | Conditional | > 1000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the time it took to replicate updates, inserts, or deletes from one replica table to another replica table in a multi-Region keyspace |
|
|
153
|
+
| RELIABILITY | Keyspaces | AWS/Cassandra | StoragePartitionThroughputCapacityExceeded | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the requests that exceed the throughput capacity of the storage partition |
|
|
154
|
+
|
|
155
|
+
### Kinesis
|
|
156
|
+
|
|
157
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
158
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
159
|
+
| RELIABILITY | Kinesis | AWS/Kinesis | ReadProvisionedThroughputExceeded | Proactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the number of times read operations exceed the provisioned read throughput capacity |
|
|
160
|
+
| RELIABILITY | Kinesis | AWS/Kinesis | WriteProvisionedThroughputExceeded | Proactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Identifies when write capacity limits are hit causing data ingestion delays or failures |
|
|
161
|
+
| RELIABILITY | Kinesis | AWS/Kinesis | GetRecords.IteratorAgeMilliseconds | Reactive | Native | > 600000.0 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks how long records have been in the stream before being processed |
|
|
162
|
+
| RELIABILITY | Kinesis | AWS/Kinesis | PutRecords.FailedRecords | Reactive | Native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the number of records that fail to be processed |
|
|
163
|
+
| RELIABILITY | Kinesis | AWS/Kinesis | PutRecords.ThrottledRecords | Proactive | Native | > 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors write throttling of PutRecords requests to detect capacity issues |
|
|
164
|
+
|
|
165
|
+
### Lambda
|
|
166
|
+
|
|
167
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
168
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
169
|
+
| RELIABILITY | Lambda | AWS/Lambda | (m1/m2)*100<br>m1 = Errors<br>m2 = Invocations | Reactive | Native | > 5.0 | Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the percentage of Lambda invocations that result in errors |
|
|
170
|
+
| RELIABILITY | Lambda | AWS/Lambda | Throttles | Reactive | Native | >= 1.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors invocation requests that are throttled |
|
|
171
|
+
| RELIABILITY | Lambda | AWS/Lambda | DeadLetterErrors | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors when messages fail to reach the DLQ |
|
|
172
|
+
| RELIABILITY | Lambda | AWS/Lambda | ConcurrentExecutions | Reactive | Conditional | > 900.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the number of function instances running concurrently |
|
|
173
|
+
|
|
174
|
+
### RDS
|
|
175
|
+
|
|
176
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
177
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
178
|
+
| RELIABILITY | RDS | AWS/RDS | DiskQueueDepth | Reactive | Native | > 25.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors outstanding IOs waiting to be processed - identifies performance bottlenecks |
|
|
179
|
+
| RELIABILITY | RDS | AWS/RDS | FreeStorageSpace | Proactive | Native | < 2147483648.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Proactively monitors available storage capacity to prevent database outages |
|
|
180
|
+
| RELIABILITY | RDS | AWS/RDS | ReplicaLag | Reactive | Conditional | > 30.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors replication lag times on RDS read replicas to ensure data freshness |
|
|
181
|
+
| LATENCY | RDS | AWS/RDS | ReadLatency | Reactive | Native | > 0.2 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors average time to read data from database storage |
|
|
182
|
+
| LATENCY | RDS | AWS/RDS | WriteLatency | Reactive | Native | > 0.2 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors average time to write data to database storage |
|
|
183
|
+
|
|
184
|
+
### Redshift
|
|
185
|
+
|
|
186
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
187
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
188
|
+
| RELIABILITY | Redshift | AWS/Redshift | DatabaseConnections | Reactive | Native | > 90.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Tracks the number of database connections to a cluster |
|
|
189
|
+
| RELIABILITY | Redshift | AWS/Redshift | HealthStatus | Reactive | Native | < 1.0 | Statistic = Minimum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Indicates the health of the cluster |
|
|
190
|
+
| RELIABILITY | Redshift | AWS/Redshift | PercentageDiskSpaceUsed | Reactive | Native | > 95.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Tracks the percent of disk space used |
|
|
191
|
+
| LATENCY | Redshift | AWS/Redshift | ReadLatency | Reactive | Native | > 20.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the average amount of time taken for disk read I/O operations |
|
|
192
|
+
| LATENCY | Redshift | AWS/Redshift | WriteLatency | Reactive | Native | > 20.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks the average amount of time taken for disk write I/O operations |
|
|
193
|
+
|
|
194
|
+
### S3
|
|
195
|
+
|
|
196
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
197
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
198
|
+
| PERFORMANCE | S3 | AWS/S3 | TotalRequestLatency | Reactive | Conditional | > 1000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Measures the total time taken to process requests to an S3 bucket |
|
|
199
|
+
|
|
200
|
+
### SNS
|
|
201
|
+
|
|
202
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
203
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
204
|
+
| RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFailed | Reactive | Native | >= 5.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed notifications to detect delivery issues |
|
|
205
|
+
| RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFilteredOut-InvalidAttributes | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications filtered due to invalid attributes |
|
|
206
|
+
| RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFilteredOut-NoMessageAttributes | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications filtered due to missing message attributes |
|
|
207
|
+
| RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsFilteredOut-InvalidMessageBody | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications filtered due to invalid message body |
|
|
208
|
+
| RELIABILITY | SNS | AWS/SNS | NumberOfNotificationsRedrivenToDlq | Reactive | Conditional | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Monitors notifications redirected to a Dead Letter Queue |
|
|
209
|
+
| RELIABILITY | SNS | AWS/SNS | SMSSuccessRate | Reactive | Conditional | < 90.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the rate of successful SMS message deliveries |
|
|
210
|
+
|
|
211
|
+
### SQS
|
|
212
|
+
|
|
213
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
214
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
215
|
+
| RELIABILITY | SQS | AWS/SQS | ApproximateNumberOfMessagesVisible | Reactive | Native | >= 1000.0 | Statistic = Average<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors the number of visible messages in the SQS queue that are awaiting processing |
|
|
216
|
+
| RELIABILITY | SQS | AWS/SQS | ApproximateAgeOfOldestMessage | Reactive | Native | >= 900.0 | Statistic = Maximum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Measures the duration that the oldest message has been in the queue without being processed |
|
|
217
|
+
|
|
218
|
+
### Step Functions
|
|
219
|
+
|
|
220
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
221
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
222
|
+
| RELIABILITY | Step Functions | AWS/States | ExecutionsFailed | Reactive | Native | >= 5.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors failed executions to detect workflow issues |
|
|
223
|
+
| RELIABILITY | Step Functions | AWS/States | ExecutionsTimedOut | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Monitors timed out executions to detect timeout issues |
|
|
224
|
+
| LATENCY | Step Functions | AWS/States | ExecutionThrottled | Reactive | Native | >= 1.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = notBreaching | Tracks executions throttled due to exceeding AWS service limits |
|
|
225
|
+
|
|
226
|
+
### VPC Transit Gateway
|
|
227
|
+
|
|
228
|
+
| Business Objectives | AWS Service | Namespace | Metric name | Reactive/Proactive | Metric Classification | Threshold | Recommended alarm configuration | Use case |
|
|
229
|
+
|---------------------|-------------|-----------|-------------|--------------------|-----------------------|-----------|---------------------------------|----------|
|
|
230
|
+
| RELIABILITY | VPC Transit Gateway | AWS/TransitGateway | BytesDropCountBlackhole | Reactive | Native | > 1000000.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Number of bytes dropped because they matched a blackhole route |
|
|
231
|
+
| RELIABILITY | VPC Transit Gateway | AWS/TransitGateway | BytesDropCountNoRoute | Reactive | Native | > 1000000.0 | Statistic = Sum<br>Period = 60 seconds<br>DatapointsToAlarm = 5<br>TreatMissingData = breaching | Number of bytes dropped because they did not match a route |
|