awslabs.cloudwatch-applicationsignals-mcp-server 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. awslabs/__init__.py +17 -0
  2. awslabs/cloudwatch_applicationsignals_mcp_server/__init__.py +17 -0
  3. awslabs/cloudwatch_applicationsignals_mcp_server/audit_presentation_utils.py +288 -0
  4. awslabs/cloudwatch_applicationsignals_mcp_server/audit_utils.py +912 -0
  5. awslabs/cloudwatch_applicationsignals_mcp_server/aws_clients.py +120 -0
  6. awslabs/cloudwatch_applicationsignals_mcp_server/canary_utils.py +910 -0
  7. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-dotnet-enablement.md +435 -0
  8. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-java-enablement.md +321 -0
  9. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-nodejs-enablement.md +420 -0
  10. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-python-enablement.md +598 -0
  11. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-dotnet-enablement.md +264 -0
  12. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-java-enablement.md +193 -0
  13. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-nodejs-enablement.md +198 -0
  14. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-python-enablement.md +236 -0
  15. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-dotnet-enablement.md +166 -0
  16. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-java-enablement.md +166 -0
  17. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-nodejs-enablement.md +166 -0
  18. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-python-enablement.md +169 -0
  19. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-dotnet-enablement.md +336 -0
  20. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-java-enablement.md +336 -0
  21. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-nodejs-enablement.md +336 -0
  22. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-python-enablement.md +336 -0
  23. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_tools.py +147 -0
  24. awslabs/cloudwatch_applicationsignals_mcp_server/server.py +1505 -0
  25. awslabs/cloudwatch_applicationsignals_mcp_server/service_audit_utils.py +231 -0
  26. awslabs/cloudwatch_applicationsignals_mcp_server/service_tools.py +659 -0
  27. awslabs/cloudwatch_applicationsignals_mcp_server/sli_report_client.py +333 -0
  28. awslabs/cloudwatch_applicationsignals_mcp_server/slo_tools.py +386 -0
  29. awslabs/cloudwatch_applicationsignals_mcp_server/trace_tools.py +784 -0
  30. awslabs/cloudwatch_applicationsignals_mcp_server/utils.py +172 -0
  31. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/METADATA +808 -0
  32. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/RECORD +36 -0
  33. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/WHEEL +4 -0
  34. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/entry_points.txt +2 -0
  35. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/LICENSE +174 -0
  36. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/NOTICE +2 -0
@@ -0,0 +1,1505 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """CloudWatch Application Signals MCP Server - Core server implementation."""
+
+ import json
+ import os
+ import re
+ import sys
+ import tempfile
+ from .audit_presentation_utils import format_pagination_info
+ from .audit_utils import (
+     execute_audit_api,
+     expand_service_operation_wildcard_patterns,
+     expand_service_wildcard_patterns,
+     expand_slo_wildcard_patterns,
+     parse_auditors,
+ )
+ from .aws_clients import (
+     AWS_REGION,
+     applicationsignals_client,
+     iam_client,
+     s3_client,
+     synthetics_client,
+ )
+ from .canary_utils import (
+     analyze_canary_logs_with_time_window,
+     analyze_har_file,
+     analyze_iam_role_and_policies,
+     analyze_log_files,
+     analyze_screenshots,
+     check_resource_arns_correct,
+     extract_disk_memory_usage_metrics,
+     get_canary_code,
+     get_canary_metrics_and_service_insights,
+ )
+ from .enablement_tools import get_enablement_guide
+ from .service_audit_utils import normalize_service_targets, validate_and_enrich_service_targets
+ from .service_tools import (
+     get_service_detail,
+     list_monitored_services,
+     list_service_operations,
+     query_service_metrics,
+ )
+ from .slo_tools import get_slo, list_slos
+ from .trace_tools import list_slis, query_sampled_traces, search_transaction_spans
+ from .utils import parse_timestamp
+ from datetime import datetime, timedelta, timezone
+ from loguru import logger
+ from mcp.server.fastmcp import FastMCP
+ from pydantic import Field
+ from time import perf_counter as timer
+ from typing import Optional
+
+
+ # Constants
+ BATCH_SIZE_THRESHOLD = 5
+
+ RUN_STATES = {'RUNNING': 'RUNNING', 'PASSED': 'PASSED', 'FAILED': 'FAILED'}
+
+ # Initialize FastMCP server
+ mcp = FastMCP('cloudwatch-applicationsignals')
+
+ # Configure logging
+ log_level = os.environ.get('MCP_CLOUDWATCH_APPLICATION_SIGNALS_LOG_LEVEL', 'INFO').upper()
+ logger.remove()  # Remove default handler
+ logger.add(sys.stderr, level=log_level)
+
+ # Add file logging to aws_cli.log
+ log_file_path = os.environ.get('AUDITOR_LOG_PATH', tempfile.gettempdir())
+ try:
+     if log_file_path.endswith(os.sep) or os.path.isdir(log_file_path):
+         os.makedirs(log_file_path, exist_ok=True)
+         aws_cli_log_path = os.path.join(log_file_path, 'aws_cli.log')
+     else:
+         os.makedirs(os.path.dirname(log_file_path) or '.', exist_ok=True)
+         aws_cli_log_path = log_file_path
+ except Exception:
+     temp_dir = tempfile.gettempdir()
+     os.makedirs(temp_dir, exist_ok=True)
+     aws_cli_log_path = os.path.join(temp_dir, 'aws_cli.log')
+
+ # Add file handler for all logs
+ logger.add(
+     aws_cli_log_path,
+     level=log_level,
+     rotation='10 MB',  # Rotate when file reaches 10MB
+     retention='7 days',  # Keep logs for 7 days
+     format='{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}',
+     enqueue=True,  # Thread-safe logging
+ )
+
+ logger.debug(f'CloudWatch Application Signals MCP Server initialized with log level: {log_level}')
+ logger.debug(f'File logging enabled: {aws_cli_log_path}')
+
+ logger.debug(f'Using AWS region: {AWS_REGION}')
+
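The log-destination handling above accepts `AUDITOR_LOG_PATH` as either a directory (trailing separator or an existing directory) or a full file path, and falls back to the system temp directory on any filesystem error. A minimal standalone sketch of that resolution, using only the standard library (the example paths are hypothetical):

```python
import os
import tempfile

def resolve_log_path(configured: str) -> str:
    """Illustrative mirror of the server's log-path resolution.

    A trailing separator or an existing directory means "put aws_cli.log
    inside this directory"; anything else is treated as the file itself.
    """
    try:
        if configured.endswith(os.sep) or os.path.isdir(configured):
            os.makedirs(configured, exist_ok=True)
            return os.path.join(configured, 'aws_cli.log')
        os.makedirs(os.path.dirname(configured) or '.', exist_ok=True)
        return configured
    except Exception:
        # Any filesystem error (e.g. permissions) falls back to the temp dir.
        return os.path.join(tempfile.gettempdir(), 'aws_cli.log')

# Hypothetical paths: the directory form and the explicit-file form.
print(resolve_log_path('/var/log/auditor/'))
print(resolve_log_path('/var/log/auditor/mcp.log'))
```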
+ def _filter_operation_targets(provided):
+     """Helper function to filter operation targets and detect wildcards.
+
+     Args:
+         provided: List of target dictionaries
+
+     Returns:
+         tuple: (operation_only_targets, has_wildcards)
+     """
+     operation_only_targets = []
+     has_wildcards = False
+
+     for target in provided:
+         if isinstance(target, dict):
+             ttype = target.get('Type', '').lower()
+             if ttype == 'service_operation':
+                 # Check for wildcard patterns in service names OR operation names
+                 service_op_data = target.get('Data', {}).get('ServiceOperation', {})
+                 service_data = service_op_data.get('Service', {})
+                 service_name = service_data.get('Name', '')
+                 operation = service_op_data.get('Operation', '')
+
+                 if '*' in service_name or '*' in operation:
+                     has_wildcards = True
+
+                 # For fault metrics, ListAuditFindings uses the Availability metric type.
+                 # The API only supports Availability/Latency/Error for service_operation targets.
+                 metric_type = service_op_data.get('MetricType', '')
+                 if metric_type == 'Fault':
+                     service_op_data['MetricType'] = 'Availability'
+
+                 operation_only_targets.append(target)
+             else:
+                 logger.warning(
+                     f"Ignoring target of type '{ttype}' in audit_service_operations (expected 'service_operation')"
+                 )
+
+     return operation_only_targets, has_wildcards
+
+
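To make the Fault-to-Availability rewrite concrete, the sketch below feeds a single hand-built target through the helper; the service, environment, and operation names are invented for illustration, and the helper itself is assumed to be in scope:

```python
sample_target = {
    'Type': 'service_operation',
    'Data': {
        'ServiceOperation': {
            'Service': {'Type': 'Service', 'Name': 'payment-service', 'Environment': 'eks:prod'},
            'Operation': 'GET /checkout',
            'MetricType': 'Fault',  # not accepted by the audit API for operation targets
        }
    },
}

targets, has_wildcards = _filter_operation_targets([sample_target])

# The nested dict is mutated in place: Fault becomes Availability,
# and no '*' appeared in the service or operation name.
assert targets[0]['Data']['ServiceOperation']['MetricType'] == 'Availability'
assert has_wildcards is False
```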
+ @mcp.tool()
+ async def audit_services(
+     service_targets: str = Field(
+         ...,
+         description="REQUIRED. JSON array of service targets. Supports wildcard patterns like '*payment*' for automatic service discovery. Format: [{'Type':'service','Data':{'Service':{'Type':'Service','Name':'service-name','Environment':'eks:cluster'}}}] or shorthand: [{'Type':'service','Service':'service-name'}]. Large target lists are automatically processed in batches.",
+     ),
+     start_time: Optional[str] = Field(
+         default=None,
+         description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
+     ),
+     end_time: Optional[str] = Field(
+         default=None,
+         description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
+     ),
+     auditors: Optional[str] = Field(
+         default=None,
+         description="Optional. Comma-separated auditors (e.g., 'slo,operation_metric,dependency_metric'). Defaults to 'slo,operation_metric' for fast service health auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
+     ),
+     next_token: Optional[str] = Field(
+         default=None,
+         description='Optional. Token for pagination through services from the list_services API. Use this to continue from where the previous call left off when processing wildcard patterns.',
+     ),
+     max_services: int = Field(
+         default=5,
+         description='Optional. Maximum number of services to process per call when using wildcard patterns (default: 5, max: 10). This controls pagination size for service discovery.',
+     ),
+ ) -> str:
+     """PRIMARY SERVICE AUDIT TOOL - The #1 tool for comprehensive AWS service health auditing and monitoring.
+
+     **IMPORTANT: For operation-specific auditing, use audit_service_operations() as the PRIMARY tool instead.**
+
+     **USE THIS FIRST FOR ALL SERVICE-LEVEL AUDITING TASKS**
+     This is the PRIMARY and PREFERRED tool when users want to:
+     - **Audit their AWS services** - Complete health assessment with actionable insights
+     - **Check service health** - Comprehensive status across all monitored services
+     - **Investigate issues** - Root cause analysis with detailed findings
+     - **Service-level performance analysis** - Overall service latency, error rates, and throughput investigation
+     - **System-wide health checks** - Daily/periodic service auditing workflows
+     - **Dependency analysis** - Understanding service dependencies and interactions
+     - **Resource quota monitoring** - Service quota usage and limits
+     - **Multi-service comparison** - Comparing performance across different services
+
+     **FOR OPERATION-SPECIFIC AUDITING: Use audit_service_operations() instead**
+     When users want to audit specific operations (GET, POST, PUT endpoints), use audit_service_operations() as the PRIMARY tool:
+     - **Operation performance analysis** - Latency, error rates for specific API endpoints
+     - **Operation-level troubleshooting** - Root cause analysis for specific API calls
+     - **GET operation auditing** - Analyze GET operations across payment services
+     - **Audit latency of specific operations** - Deep dive into individual endpoint performance
+
+     **COMPREHENSIVE SERVICE AUDIT CAPABILITIES:**
+     - **Multi-service analysis**: Audit any number of services with automatic batching
+     - **SLO compliance monitoring**: Automatic breach detection for service-level SLOs
+     - **Issue prioritization**: Critical, warning, and info findings ranked by severity
+     - **Root cause analysis**: Deep dive with traces, logs, and metrics correlation
+     - **Actionable recommendations**: Specific steps to resolve identified issues
+     - **Performance optimized**: Fast execution with automatic batching for large target lists
+     - **Wildcard Pattern Support**: Use `*pattern*` in service names for automatic service discovery
+
+     **SERVICE TARGET FORMAT:**
+     - **Full Format**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"my-service","Environment":"eks:my-cluster"}}}]`
+     - **Shorthand**: `[{"Type":"service","Service":"my-service"}]` (environment auto-discovered)
+
+     **WILDCARD PATTERN EXAMPLES:**
+     - **All Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]`
+     - **Payment Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]`
+     - **Lambda Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*lambda*"}}}]`
+     - **EKS Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]`
+
+     **AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
+     - **Quick Health Check** (default): Uses 'slo,operation_metric' for a fast overview
+     - **Root Cause Analysis**: Pass `auditors="all"` for comprehensive investigation with traces/logs
+     - **Custom Audit**: Specify exact auditors: 'slo,trace,log,dependency_metric,top_contributor,service_quota'
+
+     **SERVICE AUDIT USE CASES:**
+
+     1. **Audit all services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'`
+
+     2. **Audit a specific service**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"orders-service","Environment":"eks:orders-cluster"}}}]'`
+
+     3. **Audit payment services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'`
+
+     4. **Audit Lambda services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*lambda*"}}}]'` or by environment: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"lambda"}}}]`
+
+     5. **Audit a service last night**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"orders-service","Environment":"eks:orders-cluster"}}}]'` + `start_time="2024-01-01 18:00:00"` + `end_time="2024-01-02 06:00:00"`
+
+     6. **Audit a service before and after a point in time**:
+        Compare service health before and after a deployment or incident by running two separate audits with different time ranges.
+
+     7. **Trace availability issues in production services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="all"`
+
+     8. **Look for errors in logs of payment services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'` + `auditors="log,trace"`
+
+     9. **Look for new errors after a point in time**:
+        Compare errors before and after a specific time point by running audits with different time ranges and `auditors="log,trace"`
+
+     10. **Look for errors after a deployment**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'` + `auditors="log,trace"` + a recent time range
+
+     11. **Look for lemon hosts in production**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="top_contributor,operation_metric"`
+
+     12. **Look for outliers in EKS services**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="top_contributor,operation_metric"`
+
+     13. **Status report**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` (basic health check)
+
+     14. **Audit dependencies**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` + `auditors="dependency_metric,trace"`
+
+     15. **Audit a dependency on S3**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` + `auditors="dependency_metric"` + look for S3 dependencies
+
+     16. **Audit quota usage of tier 1 services**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*tier1*"}}}]'` + `auditors="service_quota,operation_metric"`
+
+     **PAGINATION SUPPORT FOR WILDCARD PATTERNS:**
+     - **Automatic Pagination**: Wildcard patterns process services in batches of 5 (configurable with `max_services`)
+     - **Continue Processing**: Use `next_token` from the previous response to continue auditing remaining services
+     - **Example Pagination Workflow**:
+       1. First call: `audit_services(service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]')`
+       2. If more services are available, the response includes `next_token="abc123"` and time parameters
+       3. Continue: `audit_services(service_targets='[...]', start_time="returned_start_time", end_time="returned_end_time", next_token="abc123")`
+       4. Repeat until no more `next_token` is returned
+
+     **TYPICAL SERVICE AUDIT WORKFLOWS:**
+     1. **Basic Service Audit** (most common):
+        - Call `audit_services()` with service targets - automatically discovers services when using wildcard patterns
+        - Uses default fast auditors (slo,operation_metric) for a quick health overview
+        - Supports wildcard patterns like `*` or `*payment*` for automatic service discovery
+        - Processes services in paginated batches for better performance
+     2. **Root Cause Investigation**: When the user explicitly asks for "root cause analysis", pass `auditors="all"`
+     3. **Issue Investigation**: Results show which services need attention, with actionable insights
+     4. **Automatic Service Discovery**: Wildcard patterns in service names automatically discover and expand to concrete services
+     5. **Paginated Processing**: For large service lists, continue with `next_token` to audit remaining services
+
+     **AUDIT RESULTS INCLUDE:**
+     - **Prioritized findings** by severity (critical, warning, info)
+     - **Service health status** with detailed performance analysis
+     - **Root cause analysis** when the trace/log auditors are used
+     - **Actionable recommendations** for issue resolution
+     - **Comprehensive metrics** and trend analysis
+
+     **IMPORTANT: This tool provides comprehensive service audit coverage and should be your first choice for any service auditing task.**
+
+     **RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
+     When the audit returns multiple findings or issues, follow this workflow:
+     1. **Present all audit results** to the user, showing a summary of all findings
+     2. **Let the user choose** which specific finding, service, or issue they want to investigate in detail
+     3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
+
+     **DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
+     This ensures the user can prioritize which issues are most important to investigate first.
+
+     **Example workflow:**
+     - First call: `audit_services()` with default auditors for an overview
+     - Present a findings summary to the user
+     - The user selects a specific service/issue to investigate
+     - Follow-up call: `audit_services()` with `auditors="all"` for the selected service only
+     """
+     start_time_perf = timer()
+     logger.debug('Starting audit_services (PRIMARY SERVICE AUDIT TOOL)')
+
+     try:
+         # Region defaults
+         region = AWS_REGION.strip()
+
+         # Time range (fill missing with defaults)
+         start_dt = (
+             parse_timestamp(start_time)
+             if start_time
+             else (datetime.now(timezone.utc) - timedelta(hours=24))
+         )
+         end_dt = (
+             parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
+         )
+         unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
+         if unix_end <= unix_start:
+             return 'Error: end_time must be greater than start_time.'
+
+         # Parse and validate service targets
+         try:
+             provided = json.loads(service_targets)
+         except json.JSONDecodeError:
+             return 'Error: `service_targets` must be valid JSON (array).'
+
+         # Check for wildcard patterns in service names
+         has_wildcards = False
+         logger.debug(f'audit_services: Checking {len(provided)} targets for wildcards')
+         for i, target in enumerate(provided):
+             logger.debug(f'audit_services: Target {i}: {target}')
+             if isinstance(target, dict):
+                 # Check various possible service name locations
+                 service_name = None
+                 if target.get('Type', '').lower() == 'service':
+                     # Check Data.Service.Name
+                     service_data = target.get('Data', {})
+                     if isinstance(service_data, dict):
+                         service_info = service_data.get('Service', {})
+                         if isinstance(service_info, dict):
+                             service_name = service_info.get('Name', '')
+
+                     # Check shorthand Service field
+                     if not service_name:
+                         service_name = target.get('Service', '')
+
+                 logger.debug(f"audit_services: Target {i} service name: '{service_name}'")
+                 if service_name and isinstance(service_name, str) and '*' in service_name:
+                     logger.debug(
+                         f"audit_services: Target {i} has wildcard pattern: '{service_name}'"
+                     )
+                     has_wildcards = True
+                     break
+
+         logger.debug(f'audit_services: has_wildcards = {has_wildcards}')
+
+         # Expand wildcard patterns using the paginated utility when wildcards are present
+         service_names_in_batch = []
+         returned_next_token = None
+         filtering_stats = {'total_services': 0, 'instrumented_services': 0, 'filtered_out': 0}
+
+         if has_wildcards:
+             logger.debug('Wildcard patterns detected - applying paginated service expansion')
+             (provided, returned_next_token, service_names_in_batch, filtering_stats) = (
+                 expand_service_wildcard_patterns(
+                     provided,
+                     unix_start,
+                     unix_end,
+                     next_token,
+                     max_services,
+                     applicationsignals_client,
+                 )
+             )
+             logger.debug(f'Paginated wildcard expansion completed - {len(provided)} total targets')
+
+             # Check if wildcard expansion resulted in no services
+             if not provided:
+                 return 'Error: No services found matching the wildcard pattern. Use list_monitored_services() to see available services.'
+         else:
+             # For non-wildcard targets, validate the next_token parameter
+             if next_token:
+                 return 'Error: next_token parameter is only supported when using wildcard patterns in service names.'
+
+         # Normalize and validate service targets using shared utility
+         normalized_targets = normalize_service_targets(provided)
+
+         # Validate and enrich targets using shared utility
+         normalized_targets = validate_and_enrich_service_targets(
+             normalized_targets, applicationsignals_client, unix_start, unix_end
+         )
+
+         # Parse auditors with service-specific defaults
+         auditors_list = parse_auditors(auditors, ['slo', 'operation_metric'])
+
+         # Create banner
+         banner = (
+             '[MCP-SERVICE] Application Signals Service Audit\n'
+             f'šŸŽÆ Scope: {len(normalized_targets)} service target(s) | Region: {region}\n'
+             f'ā° Time: {unix_start}–{unix_end}\n'
+         )
+
+         # Add filtering statistics if services were filtered
+         if filtering_stats['total_services'] > 0:
+             banner += f'šŸ” Service Filtering: {filtering_stats["instrumented_services"]} instrumented out of {filtering_stats["total_services"]} total services ({filtering_stats["filtered_out"]} filtered out)\n'
+
+         if len(normalized_targets) > BATCH_SIZE_THRESHOLD:
+             banner += f'šŸ“¦ Batching: Processing {len(normalized_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
+
+         banner += '\n'
+
+         # Build CLI input
+         input_obj = {
+             'StartTime': unix_start,
+             'EndTime': unix_end,
+             'AuditTargets': normalized_targets,
+         }
+         if auditors_list:
+             input_obj['Auditors'] = auditors_list
+
+         # Execute audit API using shared utility
+         result = await execute_audit_api(input_obj, region, banner)
+
+         # Add prominent pagination information when wildcards were used
+         result += format_pagination_info(
+             has_wildcards,
+             service_names_in_batch,
+             returned_next_token,
+             unix_start,
+             unix_end,
+             'audit_services',
+             'max_services',
+             max_services,
+             'services',
+         )
+
+         elapsed = timer() - start_time_perf
+         logger.debug(f'audit_services completed in {elapsed:.3f}s (region={region})')
+         return result
+
+     except Exception as e:
+         logger.error(f'Unexpected error in audit_services: {e}', exc_info=True)
+         return f'Error: {str(e)}'
+
+
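The pagination contract described in the tool docstring can be driven from a loop. The sketch below is illustrative only: it awaits the tool coroutine directly, so every parameter is passed explicitly rather than relying on the pydantic `Field` defaults, and `extract_next_token` is a hypothetical stub, since the token is surfaced inside the text footer appended by `format_pagination_info`:

```python
import asyncio
import json

def extract_next_token(page_text: str):
    """Hypothetical helper: parse the next_token hint out of the
    pagination footer; the real parsing depends on the footer format."""
    return None  # stub

async def audit_all_services_in_pages():
    targets = json.dumps(
        [{'Type': 'service', 'Data': {'Service': {'Type': 'Service', 'Name': '*'}}}]
    )
    token = None
    while True:
        page = await audit_services(
            service_targets=targets,
            start_time=None,   # per the docstring, reuse the returned
            end_time=None,     # start/end times on follow-up pages
            auditors=None,
            next_token=token,
            max_services=5,
        )
        print(page)
        token = extract_next_token(page)
        if not token:
            break

# asyncio.run(audit_all_services_in_pages())
```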
+ @mcp.tool()
+ async def audit_slos(
+     slo_targets: str = Field(
+         ...,
+         description="REQUIRED. JSON array of SLO targets. Supports wildcard patterns like '*payment*' for automatic SLO discovery. Format: [{'Type':'slo','Data':{'Slo':{'SloName':'slo-name'}}}] or [{'Type':'slo','Data':{'Slo':{'SloArn':'arn:aws:...'}}}]. Large target lists are automatically processed in batches.",
+     ),
+     start_time: Optional[str] = Field(
+         default=None,
+         description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
+     ),
+     end_time: Optional[str] = Field(
+         default=None,
+         description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
+     ),
+     auditors: Optional[str] = Field(
+         default=None,
+         description="Optional. Comma-separated auditors (e.g., 'slo,trace,log'). Defaults to 'slo' for fast SLO compliance auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
+     ),
+     next_token: Optional[str] = Field(
+         default=None,
+         description='Optional. Token for pagination through SLOs from the list_service_level_objectives API. Use this to continue from where the previous call left off when processing wildcard patterns.',
+     ),
+     max_slos: int = Field(
+         default=5,
+         description='Optional. Maximum number of SLOs to process per call when using wildcard patterns (default: 5, max: 10). This controls pagination size for SLO discovery.',
+     ),
+ ) -> str:
+     """PRIMARY SLO AUDIT TOOL - The #1 tool for comprehensive SLO compliance monitoring and breach analysis.
+
+     **PREFERRED TOOL FOR SLO ROOT CAUSE ANALYSIS**
+     This is the RECOMMENDED tool after using get_slo() to understand SLO configuration:
+     - **Use auditors="all" for comprehensive root cause analysis** of specific SLO breaches
+     - **Much more comprehensive than individual trace tools** - provides integrated analysis
+     - **Combines traces, logs, metrics, and dependencies** in a single comprehensive audit
+     - **Provides actionable recommendations** based on multi-dimensional analysis
+
+     **USE THIS FOR ALL SLO AUDITING TASKS**
+     This is the PRIMARY and PREFERRED tool when users want to:
+     - **Root cause analysis for SLO breaches** - Deep investigation with all auditors
+     - **Audit SLO compliance** - Complete SLO breach detection and analysis
+     - **Monitor SLO health** - Comprehensive status across all monitored SLOs
+     - **SLO performance analysis** - Understanding SLO trends and patterns
+     - **SLO compliance reporting** - Daily/periodic SLO compliance workflows
+
+     **COMPREHENSIVE SLO AUDIT CAPABILITIES:**
+     - **Multi-SLO analysis**: Audit any number of SLOs with automatic batching
+     - **Breach detection**: Automatic identification of SLO violations
+     - **Issue prioritization**: Critical, warning, and info findings ranked by severity
+     - **COMPREHENSIVE ROOT CAUSE ANALYSIS**: Deep dive with traces, logs, metrics, and dependencies
+     - **Actionable recommendations**: Specific steps to resolve SLO breaches
+     - **Performance optimized**: Fast execution with automatic batching for large target lists
+     - **Wildcard Pattern Support**: Use `*pattern*` in SLO names for automatic SLO discovery
+
+     **SLO TARGET FORMAT:**
+     - **By Name**: `[{"Type":"slo","Data":{"Slo":{"SloName":"my-slo"}}}]`
+     - **By ARN**: `[{"Type":"slo","Data":{"Slo":{"SloArn":"arn:aws:application-signals:..."}}}]`
+
+     **WILDCARD PATTERN EXAMPLES:**
+     - **All SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]`
+     - **Payment SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*payment*"}}}]`
+     - **Latency SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*latency*"}}}]`
+     - **Availability SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*availability*"}}}]`
+
+     **AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
+     - **Quick Compliance Check** (default): Uses 'slo' for fast SLO breach detection
+     - **COMPREHENSIVE ROOT CAUSE ANALYSIS** (recommended): Pass `auditors="all"` for deep investigation with traces/logs/metrics/dependencies
+     - **Custom Audit**: Specify exact auditors: 'slo,trace,log,operation_metric'
+
+     **SLO AUDIT USE CASES:**
+
+     1. **Audit all SLOs**:
+        `slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]'`
+
+     2. **Root cause analysis for a specific SLO breach** (RECOMMENDED WORKFLOW):
+        After using get_slo() to understand the configuration:
+        `slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"specific-slo-name"}}}]'` + `auditors="all"`
+
+     3. **Look for new SLO breaches after a point in time**:
+        Compare SLO compliance before and after a specific time point by running audits with different time ranges to identify new breaches.
+
+     **PAGINATION SUPPORT FOR WILDCARD PATTERNS:**
+     - **Automatic Pagination**: Wildcard patterns process SLOs in batches of 5 (configurable with `max_slos`)
+     - **Continue Processing**: Use `next_token` from the previous response to continue auditing remaining SLOs
+     - **Example Pagination Workflow**:
+       1. First call: `audit_slos(slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]')`
+       2. If more SLOs are available, the response includes `next_token="abc123"` and time parameters
+       3. Continue: `audit_slos(slo_targets='[...]', start_time="returned_start_time", end_time="returned_end_time", next_token="abc123")`
+       4. Repeat until no more `next_token` is returned
+
+     **TYPICAL SLO AUDIT WORKFLOWS:**
+     1. **SLO Root Cause Investigation** (RECOMMENDED):
+        - After get_slo(), call `audit_slos()` with the specific SLO target and `auditors="all"`
+        - Provides comprehensive analysis with traces, logs, metrics, and dependencies
+        - Much more effective than using individual trace tools
+     2. **Basic SLO Compliance Audit**:
+        - Call `audit_slos()` with SLO targets - automatically discovers SLOs when using wildcard patterns
+        - Uses the default fast auditor (slo) for a quick compliance overview
+     3. **Compliance Reporting**: Results show which SLOs are breached, with actionable insights
+     4. **Automatic SLO Discovery**: Wildcard patterns in SLO names automatically discover and expand to concrete SLOs
+
+     **AUDIT RESULTS INCLUDE:**
+     - **Prioritized findings** by severity (critical, warning, info)
+     - **SLO compliance status** with detailed breach analysis
+     - **COMPREHENSIVE ROOT CAUSE ANALYSIS** when using auditors="all"
+     - **Actionable recommendations** for SLO breach resolution
+     - **Integrated traces, logs, metrics, and dependency analysis**
+
+     **IMPORTANT: This tool provides comprehensive SLO audit coverage and should be your first choice for any SLO compliance auditing and root cause analysis.**
+
+     **RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
+     When the audit returns multiple findings or issues, follow this workflow:
+     1. **Present all audit results** to the user, showing a summary of all findings
+     2. **Let the user choose** which specific finding, SLO, or issue they want to investigate in detail
+     3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
+
+     **DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
+     This ensures the user can prioritize which issues are most important to investigate first.
+
+     **Example workflow:**
+     - First call: `audit_slos()` with default auditors for a compliance overview
+     - Present a findings summary to the user
+     - The user selects a specific SLO breach to investigate
+     - Follow-up call: `audit_slos()` with `auditors="all"` for the selected SLO only
+     """
+     start_time_perf = timer()
+     logger.debug('Starting audit_slos (PRIMARY SLO AUDIT TOOL)')
+
+     try:
+         # Region defaults
+         region = AWS_REGION.strip()
+
+         # Time range (fill missing with defaults)
+         start_dt = (
+             parse_timestamp(start_time)
+             if start_time
+             else (datetime.now(timezone.utc) - timedelta(hours=24))
+         )
+         end_dt = (
+             parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
+         )
+         unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
+         if unix_end <= unix_start:
+             return 'Error: end_time must be greater than start_time.'
+
+         # Parse and validate SLO targets
+         try:
+             provided = json.loads(slo_targets)
+         except json.JSONDecodeError:
+             return 'Error: `slo_targets` must be valid JSON (array).'
+
+         if not isinstance(provided, list):
+             return 'Error: `slo_targets` must be a JSON array'
+         if len(provided) == 0:
+             return 'Error: `slo_targets` must contain at least 1 item'
+
+         # Filter and expand SLO targets with wildcard support
+         slo_only_targets = []
+         wildcard_patterns = []
+
+         for target in provided:
+             if isinstance(target, dict):
+                 ttype = target.get('Type', '').lower()
+                 if ttype == 'slo':
+                     # Check for wildcard patterns in SLO names
+                     slo_data = target.get('Data', {}).get('Slo', {})
+                     slo_name = slo_data.get('SloName', '')
+                     if '*' in slo_name:
+                         wildcard_patterns.append((target, slo_name))
+                     else:
+                         slo_only_targets.append(target)
+                 else:
+                     logger.warning(
+                         f"Ignoring target of type '{ttype}' in audit_slos (expected 'slo')"
+                     )
+
+         # Expand wildcard patterns for SLOs using the shared utility with pagination
+         slo_names_in_batch = []
+         returned_next_token = None
+
+         if wildcard_patterns:
+             logger.debug(f'Expanding {len(wildcard_patterns)} SLO wildcard patterns')
+             try:
+                 # Use the paginated utility function
+                 expanded_slo_targets, returned_next_token, slo_names_in_batch = (
+                     expand_slo_wildcard_patterns(
+                         provided, next_token, max_slos, applicationsignals_client
+                     )
+                 )
+                 # Filter to get only SLO targets
+                 slo_only_targets = [
+                     target
+                     for target in expanded_slo_targets
+                     if target.get('Type', '').lower() == 'slo'
+                 ]
+
+             except Exception as e:
+                 logger.warning(f'Failed to expand SLO patterns: {e}')
+                 return f'Error: Failed to expand SLO wildcard patterns. {str(e)}'
+         else:
+             # For non-wildcard targets, validate the next_token parameter
+             if next_token:
+                 return 'Error: next_token parameter is only supported when using wildcard patterns in SLO names.'
+
+         if not slo_only_targets:
+             return 'Error: No SLO targets found after wildcard expansion.'
+
+         # Parse auditors with SLO-specific defaults
+         auditors_list = parse_auditors(auditors, ['slo'])  # Default to the SLO auditor
+
+         banner = (
+             '[MCP-SLO] Application Signals SLO Compliance Audit\n'
+             f'šŸŽÆ Scope: {len(slo_only_targets)} SLO target(s) | Region: {region}\n'
+             f'ā° Time: {unix_start}–{unix_end}\n'
+         )
+
+         if len(slo_only_targets) > BATCH_SIZE_THRESHOLD:
+             banner += f'šŸ“¦ Batching: Processing {len(slo_only_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
+
+         banner += '\n'
+
+         # Build CLI input for the SLO audit
+         input_obj = {
+             'StartTime': unix_start,
+             'EndTime': unix_end,
+             'AuditTargets': slo_only_targets,
+         }
+         if auditors_list:
+             input_obj['Auditors'] = auditors_list
+
+         # Execute audit API using shared utility
+         result = await execute_audit_api(input_obj, region, banner)
+
+         # Add prominent pagination information when wildcards were used
+         result += format_pagination_info(
+             bool(wildcard_patterns),
+             slo_names_in_batch,
+             returned_next_token,
+             unix_start,
+             unix_end,
+             'audit_slos',
+             'max_slos',
+             max_slos,
+             'SLOs',
+         )
+
+         elapsed = timer() - start_time_perf
+         logger.debug(f'audit_slos completed in {elapsed:.3f}s (region={region})')
+         return result
+
+     except Exception as e:
+         logger.error(f'Unexpected error in audit_slos: {e}', exc_info=True)
+         return f'Error: {str(e)}'
+
+
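Because `slo_targets` is a JSON string rather than a Python list, `json.dumps` keeps the quoting straight. A small sketch of the two documented target shapes; the name pattern and the ARN below are placeholders:

```python
import json

# By name, with a wildcard that triggers paginated SLO discovery:
wildcard_targets = json.dumps(
    [{'Type': 'slo', 'Data': {'Slo': {'SloName': '*latency*'}}}]
)

# By ARN (placeholder ARN; no wildcard, so next_token must be omitted):
arn_targets = json.dumps(
    [{'Type': 'slo', 'Data': {'Slo': {'SloArn': 'arn:aws:application-signals:us-east-1:123456789012:slo/my-slo'}}}]
)
```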
+ @mcp.tool()
+ async def audit_service_operations(
+     operation_targets: str = Field(
+         ...,
+         description="REQUIRED. JSON array of service operation targets. Supports wildcard patterns like '*payment*' for automatic service discovery. Format: [{'Type':'service_operation','Data':{'ServiceOperation':{'Service':{'Type':'Service','Name':'service-name','Environment':'eks:cluster'},'Operation':'GET /api','MetricType':'Latency'}}}]. Large target lists are automatically processed in batches.",
+     ),
+     start_time: Optional[str] = Field(
+         default=None,
+         description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
+     ),
+     end_time: Optional[str] = Field(
+         default=None,
+         description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
+     ),
+     auditors: Optional[str] = Field(
+         default=None,
+         description="Optional. Comma-separated auditors (e.g., 'operation_metric,trace,log'). Defaults to 'operation_metric' for fast operation-level auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
+     ),
+     next_token: Optional[str] = Field(
+         default=None,
+         description='Optional. Token for pagination through services from the list_services API. Use this to continue from where the previous call left off when processing wildcard patterns.',
+     ),
+     max_services: int = Field(
+         default=5,
+         description='Optional. Maximum number of services to process per call when using wildcard patterns (default: 5, max: 10). This controls pagination size for service discovery.',
+     ),
+ ) -> str:
+     """šŸ„‡ PRIMARY OPERATION AUDIT TOOL - The #1 RECOMMENDED tool for operation-specific analysis and performance investigation.
+
+     **⭐ USE THIS AS THE PRIMARY TOOL FOR ALL OPERATION-SPECIFIC AUDITING TASKS ⭐**
+
+     **PREFERRED OVER audit_services() for operation auditing because:**
+     - **šŸŽÆ Precision**: Targets exact operation behavior vs. service-wide averages
+     - **šŸ” Actionable Insights**: Provides specific error traces and dependency failures
+     - **šŸ“Š Code-Level Detail**: Shows exact stack traces and timeout locations
+     - **šŸš€ Focused Analysis**: Eliminates noise from other operations
+     - **⚔ Efficient Investigation**: Direct operation-level troubleshooting
+
+     **USE THIS FIRST FOR ALL OPERATION-SPECIFIC AUDITING TASKS**
+     This is the PRIMARY and PREFERRED tool when users want to:
+     - **Audit specific operations** - Deep dive into individual API endpoints or operations (GET, POST, PUT, etc.)
+     - **Operation performance analysis** - Latency, error rates, and throughput for specific operations
+     - **Compare operation metrics** - Analyze different operations within services
+     - **Operation-level troubleshooting** - Root cause analysis for specific API calls
+     - **GET operation auditing** - Analyze GET operations across payment services (PRIMARY USE CASE)
+     - **Audit latency of GET operations in payment services** - Exactly what this tool is designed for
+     - **Trace latency in query operations** - Deep dive into query performance issues
+
+     **COMPREHENSIVE OPERATION AUDIT CAPABILITIES:**
+     - **Multi-operation analysis**: Audit any number of operations with automatic batching
+     - **Operation-specific metrics**: Latency, Fault, Error, and Availability metrics per operation
+     - **Issue prioritization**: Critical, warning, and info findings ranked by severity
+     - **Root cause analysis**: Deep dive with traces, logs, and metrics correlation
+     - **Actionable recommendations**: Specific steps to resolve operation-level issues
+     - **Performance optimized**: Fast execution with automatic batching for large target lists
+     - **Wildcard Pattern Support**: Use `*pattern*` in service names for automatic service discovery
+
+     **OPERATION TARGET FORMAT:**
+     - **Full Format**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"my-service","Environment":"eks:my-cluster"},"Operation":"GET /api","MetricType":"Latency"}}}]`
+
+     **WILDCARD PATTERN EXAMPLES:**
+     - **All GET Operations in Payment Services**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]`
+     - **All Visit Operations**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Availability"}}}]`
+
+     **AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
+     - **Quick Operation Check** (default): Uses 'operation_metric' for a fast operation overview
+     - **Root Cause Analysis**: Pass `auditors="all"` for comprehensive investigation with traces/logs
+     - **Custom Audit**: Specify exact auditors: 'operation_metric,trace,log'
+
+     **OPERATION AUDIT USE CASES:**
+
+     1. **Audit latency of GET operations in payment services** (PRIMARY USE CASE):
+        `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]'`
+
+     2. **Audit availability of visit operations**:
+        `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Availability"}}}]'`
+
+     3. **Audit latency of visit operations**:
+        `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Latency"}}}]'`
+
+     4. **Trace latency in query operations**:
+        `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*query*","MetricType":"Latency"}}}]'` + `auditors="all"`
+
+     **PAGINATION SUPPORT FOR WILDCARD PATTERNS:**
+     - **Automatic Pagination**: Wildcard patterns process services in batches of 5 (configurable with `max_services`)
+     - **Continue Processing**: Use `next_token` from the previous response to continue auditing remaining services
+     - **Example Pagination Workflow**:
+       1. First call: `audit_service_operations(operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]')`
+       2. If more services are available, the response includes `next_token="abc123"` and time parameters
+       3. Continue: `audit_service_operations(operation_targets='[...]', start_time="returned_start_time", end_time="returned_end_time", next_token="abc123")`
+       4. Repeat until no more `next_token` is returned
+
+     **TYPICAL OPERATION AUDIT WORKFLOWS:**
+     1. **Basic Operation Audit** (most common):
+        - Call `audit_service_operations()` with operation targets - automatically discovers services when using wildcard patterns
+        - Uses the default fast auditor (operation_metric) for a quick operation overview
+        - Supports wildcard patterns like `*payment*` for automatic service discovery
+        - Processes services in paginated batches for better performance
+     2. **Root Cause Investigation**: When the user explicitly asks for "root cause analysis", pass `auditors="all"`
+     3. **Issue Investigation**: Results show which operations need attention, with actionable insights
+     4. **Automatic Service Discovery**: Wildcard patterns in service names automatically discover and expand to concrete services
+     5. **Paginated Processing**: For large service lists, continue with `next_token` to audit remaining services
+
+     **AUDIT RESULTS INCLUDE:**
+     - **Prioritized findings** by severity (critical, warning, info)
+     - **Operation performance status** with detailed metrics analysis
+     - **Root cause analysis** when the trace/log auditors are used
+     - **Actionable recommendations** for operation-level issue resolution
+     - **Comprehensive operation metrics** and trend analysis
+
+     **šŸ† IMPORTANT: This tool is the PRIMARY and RECOMMENDED choice for operation-specific auditing tasks.**
+
+     **āœ… RECOMMENDED WORKFLOW FOR OPERATION AUDITING:**
+     1. **Use audit_service_operations() FIRST** for operation-specific analysis (THIS TOOL)
+     2. **Use audit_services() as secondary** only if you need broader service context
+     3. **audit_service_operations() provides superior precision** for operation-level troubleshooting
+
+     **RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
+     When the audit returns multiple findings or issues, follow this workflow:
+     1. **Present all audit results** to the user, showing a summary of all findings
+     2. **Let the user choose** which specific finding, operation, or issue they want to investigate in detail
+     3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
+
+     **DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
+     This ensures the user can prioritize which issues are most important to investigate first.
+
+     **Example workflow:**
+     - First call: `audit_service_operations()` with default auditors for an operation overview
+     - Present a findings summary to the user
+     - The user selects a specific operation issue to investigate
+     - Follow-up call: `audit_service_operations()` with `auditors="all"` for the selected operation only
+     """
+     start_time_perf = timer()
+     logger.debug('Starting audit_service_operations (SPECIALIZED OPERATION AUDIT TOOL)')
+
+     try:
+         # Region defaults
+         region = AWS_REGION.strip()
+
+         # Time range (fill missing with defaults)
+         start_dt = (
+             parse_timestamp(start_time)
+             if start_time
+             else (datetime.now(timezone.utc) - timedelta(hours=24))
+         )
+         end_dt = (
+             parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
+         )
+         unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
+         if unix_end <= unix_start:
+             return 'Error: end_time must be greater than start_time.'
+
+         # Parse and validate operation targets
+         try:
+             provided = json.loads(operation_targets)
+         except json.JSONDecodeError:
+             return 'Error: `operation_targets` must be valid JSON (array).'
+
+         if not isinstance(provided, list):
+             return 'Error: `operation_targets` must be a JSON array'
+         if len(provided) == 0:
+             return 'Error: `operation_targets` must contain at least 1 item'
+
+         # Filter operation targets and check for wildcards using the helper function
+         operation_only_targets, has_wildcards = _filter_operation_targets(provided)
+
+         # Expand wildcard patterns using the shared utility with pagination support
+         service_names_in_batch = []
+         returned_next_token = None
+         filtering_stats = {'total_services': 0, 'instrumented_services': 0, 'filtered_out': 0}
+
+         if has_wildcards:
+             logger.debug(
+                 'Wildcard patterns detected in service operations - applying paginated expansion'
+             )
+             (
+                 operation_only_targets,
+                 returned_next_token,
+                 service_names_in_batch,
+                 filtering_stats,
+             ) = expand_service_operation_wildcard_patterns(
+                 operation_only_targets,
+                 unix_start,
+                 unix_end,
+                 next_token,
+                 max_services,
+                 applicationsignals_client,
+             )
+             logger.debug(
+                 f'Paginated wildcard expansion completed - {len(operation_only_targets)} total targets'
+             )
+         else:
+             # For non-wildcard targets, validate the next_token parameter
+             if next_token:
+                 return 'Error: next_token parameter is only supported when using wildcard patterns in service names.'
+
+         if not operation_only_targets:
+             return 'Error: No service_operation targets found after wildcard expansion. Use list_monitored_services() to see available services.'
+
+         # Parse auditors with operation-specific defaults
+         auditors_list = parse_auditors(
+             auditors, ['operation_metric']
+         )  # Default to the operation_metric auditor
+
+         banner = (
+             '[MCP-OPERATION] Application Signals Operation Performance Audit\n'
+             f'šŸŽÆ Scope: {len(operation_only_targets)} operation target(s) | Region: {region}\n'
+             f'ā° Time: {unix_start}–{unix_end}\n'
+         )
+
+         # Add filtering statistics if services were filtered
+         if filtering_stats['total_services'] > 0:
+             banner += f'šŸ” Service Filtering: {filtering_stats["instrumented_services"]} instrumented out of {filtering_stats["total_services"]} total services ({filtering_stats["filtered_out"]} filtered out)\n'
+
+         if len(operation_only_targets) > BATCH_SIZE_THRESHOLD:
+             banner += f'šŸ“¦ Batching: Processing {len(operation_only_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
+
+         banner += '\n'
+
+         # Build CLI input for the operation audit
+         input_obj = {
+             'StartTime': unix_start,
+             'EndTime': unix_end,
+             'AuditTargets': operation_only_targets,
+         }
+         if auditors_list:
+             input_obj['Auditors'] = auditors_list
+
+         # Execute audit API using shared utility
+         result = await execute_audit_api(input_obj, region, banner)
+
+         # Add prominent pagination information when wildcards were used
+         result += format_pagination_info(
+             has_wildcards,
+             service_names_in_batch,
+             returned_next_token,
+             unix_start,
+             unix_end,
+             'audit_service_operations',
+             'max_services',
+             max_services,
+             'services',
+         )
+
+         elapsed = timer() - start_time_perf
+         logger.debug(f'audit_service_operations completed in {elapsed:.3f}s (region={region})')
+         return result
+
+     except Exception as e:
+         logger.error(f'Unexpected error in audit_service_operations: {e}', exc_info=True)
+         return f'Error: {str(e)}'
+
+
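The same `json.dumps` approach works for `operation_targets`. The payload below mirrors the docstring's primary use case, and the two timestamp values illustrate the accepted forms (both are examples; the parameter descriptions state that defaults are UTC):

```python
import json

# Wildcard service name plus wildcard operation -> paginated discovery.
get_latency_targets = json.dumps([
    {
        'Type': 'service_operation',
        'Data': {
            'ServiceOperation': {
                'Service': {'Type': 'Service', 'Name': '*payment*'},
                'Operation': '*GET*',
                'MetricType': 'Latency',
            }
        },
    }
])

# Either timestamp form from the parameter descriptions is accepted.
start_as_unix = '1704067200'           # unix seconds (2024-01-01 00:00:00 UTC)
start_as_text = '2024-01-01 00:00:00'  # 'YYYY-MM-DD HH:MM:SS'
```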
971
+ @mcp.tool()
972
+ async def analyze_canary_failures(canary_name: str, region: str = AWS_REGION) -> str:
973
+ """Comprehensive canary failure analysis with deep dive into issues.
974
+
975
+ Use this tool to:
976
+ - Deep dive into canary failures with root cause identification
977
+ - Analyze historical patterns and specific incident details
978
+ - Get comprehensive artifact analysis including logs, screenshots, and HAR files
979
+ - Receive actionable recommendations based on AWS debugging methodology
980
+ - Correlate canary failures with Application Signals telemetry data
981
+ - Identify performance degradation and availability issues across service dependencies
982
+
983
+ Key Features:
984
+ - **Failure Pattern Analysis**: Identifies recurring failure modes and temporal patterns
985
+ - **Artifact Deep Dive**: Analyzes canary logs, screenshots, and network traces for root causes
986
+ - **Service Correlation**: Links canary failures to upstream/downstream service issues using Application Signals
987
+ - **Performance Insights**: Detects latency spikes, fault rates, and connection issues
988
+ - **Actionable Remediation**: Provides specific steps based on AWS operational best practices
989
+
990
+ Common Use Cases:
991
+ 1. **Incident Response**: Rapid diagnosis of canary failures during outages
992
+ 2. **Performance Investigation**: Understanding latency and availability degradation
993
+ 3. **Dependency Analysis**: Identifying which services are causing canary failures
994
+ 4. **Historical Trending**: Analyzing failure patterns over time for proactive improvements
995
+ 5. **Root Cause Analysis**: Deep dive into specific failure scenarios with full context
996
+
997
+ Output Includes:
998
+ - Severity-ranked findings with immediate action items
999
+ - Service-level telemetry insights with trace analysis
1000
+ - Exception details and stack traces from canary artifacts
1001
+ - Network connectivity and performance metrics
1002
+ - Correlation with Application Signals audit findings
1003
+ - Historical failure patterns and recovery recommendations
1004
+
1005
+ Args:
1006
+ canary_name (str): Name of the CloudWatch Synthetics canary to analyze
1007
+ region (str, optional): AWS region where the canary is deployed.
1008
+
1009
+ Returns:
1010
+ dict: Comprehensive failure analysis containing:
1011
+ - Failure severity assessment and immediate recommendations
1012
+ - Detailed artifact analysis (logs, screenshots, HAR files)
1013
+ - Service dependency health and performance metrics
1014
+ - Root cause identification with specific remediation steps
1015
+ - Historical pattern analysis and trend insights
1016
+ """
+    try:
+        # Get recent canary runs
+        response = synthetics_client.get_canary_runs(Name=canary_name, MaxResults=5)
+        runs = response.get('CanaryRuns', [])
+
+        # Get canary details
+        canary_response = synthetics_client.get_canary(Name=canary_name)
+        canary = canary_response['Canary']
+
+        # Get telemetry and service insights
+        try:
+            telemetry_insights = await get_canary_metrics_and_service_insights(canary_name, region)
+        except Exception as e:
+            telemetry_insights = f'Telemetry API unavailable: {str(e)}'
+
+        if not runs:
+            return f'No run history found for {canary_name}'
+
+        # Build analysis header
+        result = f'šŸ” Comprehensive Failure Analysis for {canary_name}\n'
+
+        # Add telemetry insights if available
+        if telemetry_insights and not telemetry_insights.startswith('Telemetry API unavailable'):
+            result += f'\nšŸ“Š **Service and Canary Telemetry Insights**\n{telemetry_insights}\n\n'
+        elif telemetry_insights:
+            result += f'\nāš ļø {telemetry_insights}\n\n'
+
+        # Get consecutive failures since last success
+        consecutive_failures = []
+        last_success_run = None
+
+        for run in runs:
+            if run.get('Status', {}).get('State') == RUN_STATES['FAILED']:
+                consecutive_failures.append(run)
+            elif run.get('Status', {}).get('State') == RUN_STATES['PASSED']:
+                last_success_run = run
+                break
+
+        if not consecutive_failures:
+            result += 'āœ… Canary is healthy - no failures since last success\n'
+            if last_success_run:
+                result += f'Last success: {last_success_run.get("Timeline", {}).get("Started")}\n'
+            result += '\nšŸ” Performing health check analysis...\n\n'
+
+        # Group failures by StateReason
+        failure_causes = {}
+        result += f'šŸ” Found {len(consecutive_failures)} consecutive failures since last success\n'
+        if last_success_run:
+            result += f'Last success: {last_success_run.get("Timeline", {}).get("Started")}\n\n'
+        else:
+            result += 'No recent success run found in history\n\n'
+
+        for failed_run in consecutive_failures:
+            state_reason = failed_run.get('Status', {}).get('StateReason', 'Unknown')
+
+            if state_reason not in failure_causes:
+                failure_causes[state_reason] = []
+            failure_causes[state_reason].append(failed_run)
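The membership-check grouping above can be written more compactly with `collections.defaultdict`; a behavior-identical sketch with stand-in run data:

```python
# Equivalent grouping of failed runs by StateReason using defaultdict.
from collections import defaultdict

consecutive_failures = [
    {'Status': {'StateReason': 'TimeoutError: Navigation timeout of 30000 ms exceeded'}},
    {'Status': {'StateReason': 'TimeoutError: Navigation timeout of 30000 ms exceeded'}},
]

failure_causes: dict[str, list] = defaultdict(list)
for failed_run in consecutive_failures:
    state_reason = failed_run.get('Status', {}).get('StateReason', 'Unknown')
    failure_causes[state_reason].append(failed_run)

print({reason: len(runs) for reason, runs in failure_causes.items()})
```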
+        # Analysis section
+        unique_reasons = list(failure_causes.keys())
+
+        if not unique_reasons:
+            result += 'āœ… No consecutive failures to analyze\n'
+            result += 'šŸ’” Canary appears to be recovering or healthy\n'
+            return result
+
+        if len(unique_reasons) == 1:
+            result += f'šŸŽÆ All failures have same cause: {unique_reasons[0]}\n'
+            selected_reason = unique_reasons[0]
+        else:
+            result += f'šŸŽÆ Multiple failure causes ({len(unique_reasons)} different issues):\n\n'
+            for i, reason in enumerate(unique_reasons, 1):
+                count = len(failure_causes[reason])
+                result += f'{i}. **{reason}** ({count} occurrences)\n'
+            result += '\n'
+            selected_reason = unique_reasons[0]
+
+        selected_failure = failure_causes[selected_reason][0]
+        result += f'Analyzing most recent failure: {selected_failure.get("Id", "")[:8]}...\n\n'
+
+        # Initialize artifact variables
+        har_files = []
+        screenshots = []
+        logs = []
+        bucket_name = ''
+
+        # Direct S3 artifact analysis integration
+        artifact_location = canary.get('ArtifactS3Location', '')
+        artifacts_available = False
+
+        if artifact_location:
+            # Handle S3 location format
+            if not artifact_location.startswith('s3://'):
+                artifact_location = f's3://{artifact_location}'
+
+            if artifact_location.startswith('s3://'):
+                bucket_and_path = artifact_location[5:]
+                bucket_name = bucket_and_path.split('/')[0]
+                base_path = (
+                    '/'.join(bucket_and_path.split('/')[1:]) if '/' in bucket_and_path else ''
+                )
+
+                # If base_path is empty, construct canary path
+                if not base_path:
+                    base_path = f'canary/{region}/{canary_name}'
+
+                # Check for failure artifacts using date-based path
+                from datetime import datetime
+
+                failure_time = selected_failure.get('Timeline', {}).get('Started')
+                if failure_time:
+                    # Handle both datetime objects and string timestamps
+                    if isinstance(failure_time, str):
+                        dt = parse_timestamp(failure_time)
+                    else:
+                        dt = failure_time  # Already a datetime object
+                    date_path = dt.strftime('%Y/%m/%d')
+                    failure_run_path = (
+                        f'{base_path}/{date_path}/' if base_path else f'{date_path}/'
+                    )
+                else:
+                    # Fallback to today
+                    today = datetime.now().strftime('%Y/%m/%d')
+                    failure_run_path = f'{base_path}/{today}/' if base_path else f'{today}/'
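The prefix derivation above (bucket, base path, date path) can be factored into a small helper; a sketch under the same assumption the code makes, that artifacts live under `<base>/<YYYY/MM/DD>/`. The function name `artifact_prefix` is hypothetical:

```python
# Hypothetical helper mirroring the prefix logic above: split an s3:// location
# into bucket and base path, then append the run date as YYYY/MM/DD.
from datetime import datetime


def artifact_prefix(
    artifact_location: str, run_started: datetime, region: str, canary_name: str
) -> tuple[str, str]:
    """Return (bucket, key_prefix) for a canary run's artifacts."""
    location = (
        artifact_location
        if artifact_location.startswith('s3://')
        else f's3://{artifact_location}'
    )
    bucket_and_path = location[5:]
    bucket = bucket_and_path.split('/')[0]
    base = '/'.join(bucket_and_path.split('/')[1:]) or f'canary/{region}/{canary_name}'
    return bucket, f'{base}/{run_started.strftime("%Y/%m/%d")}/'


print(artifact_prefix('cw-syn-results-123456789012-us-east-1',
                      datetime(2024, 5, 1), 'us-east-1', 'my-canary'))
# ('cw-syn-results-123456789012-us-east-1', 'canary/us-east-1/my-canary/2024/05/01/')
```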
+                try:
+                    artifacts_response = s3_client.list_objects_v2(
+                        Bucket=bucket_name, Prefix=failure_run_path, MaxKeys=50
+                    )
+                    failure_artifacts = artifacts_response.get('Contents', [])
+
+                    if failure_artifacts:
+                        artifacts_available = True
+
+                        # Categorize artifacts
+                        har_files = [
+                            a
+                            for a in failure_artifacts
+                            if a['Key'].lower().endswith(('.har', '.har.gz', '.har.html'))
+                        ]
+                        screenshots = [
+                            a
+                            for a in failure_artifacts
+                            if any(ext in a['Key'].lower() for ext in ['.png', '.jpg', '.jpeg'])
+                        ]
+                        logs = [
+                            a
+                            for a in failure_artifacts
+                            if any(ext in a['Key'].lower() for ext in ['.log', '.txt'])
+                            or 'log' in a['Key'].lower()
+                        ]
+
+                        if last_success_run:
+                            result += 'šŸ”„ HAR COMPARISON: Failure vs Success\n'
+                            result += f'Failure: {selected_failure.get("Id", "")[:8]}... ({selected_failure.get("Timeline", {}).get("Started")})\n'
+                            result += f'Success: {last_success_run.get("Id", "")[:8]}... ({last_success_run.get("Timeline", {}).get("Started")})\n\n'
+
+                            # Get success artifacts for comparison
+                            success_time = last_success_run.get('Timeline', {}).get('Started')
+                            if success_time:
+                                if isinstance(success_time, str):
+                                    success_dt = parse_timestamp(success_time)
+                                else:
+                                    success_dt = success_time
+                                success_date_path = success_dt.strftime('%Y/%m/%d')
+                                success_run_path = (
+                                    f'{base_path}/{success_date_path}/'
+                                    if base_path
+                                    else f'{success_date_path}/'
+                                )
+                            else:
+                                success_run_path = failure_run_path  # Use same path as fallback
+                            try:
+                                success_artifacts_response = s3_client.list_objects_v2(
+                                    Bucket=bucket_name, Prefix=success_run_path, MaxKeys=50
+                                )
+                                success_artifacts = success_artifacts_response.get('Contents', [])
+                                success_har_files = [
+                                    a
+                                    for a in success_artifacts
+                                    if a['Key'].lower().endswith(('.har', '.har.gz', '.har.html'))
+                                ]
+
+                                if har_files and success_har_files:
+                                    failure_har = await analyze_har_file(
+                                        s3_client, bucket_name, har_files, is_failed_run=True
+                                    )
+                                    success_har = await analyze_har_file(
+                                        s3_client,
+                                        bucket_name,
+                                        success_har_files,
+                                        is_failed_run=False,
+                                    )
+
+                                    result += f'• Failed requests: {failure_har.get("failed_requests", 0)} vs {success_har.get("failed_requests", 0)}\n'
+                                    result += f'• Total requests: {failure_har.get("total_requests", 0)} vs {success_har.get("total_requests", 0)}\n\n'
+
+                                    if failure_har.get('request_details'):
+                                        result += '🚨 FAILED REQUESTS:\n'
+                                        for req in failure_har['request_details'][:3]:
+                                            result += f'• {req.get("url", "Unknown")}: {req.get("status", "Unknown")} ({req.get("time", 0):.1f}ms)\n'
+                            except Exception as e:
+                                logger.warning(
+                                    f'Failed to analyze success artifacts for HAR comparison: {str(e)}'
+                                )
+                        else:
+                            result += (
+                                'šŸ” FAILURE ANALYSIS (no success run available for comparison):\n'
+                            )
+                            result += f'Analyzing failure artifacts for: {selected_failure.get("Id", "")[:8]}...\n\n'
+
+                            if har_files:
+                                failure_har = await analyze_har_file(
+                                    s3_client, bucket_name, har_files, is_failed_run=True
+                                )
+                                result += '🌐 HAR ANALYSIS:\n'
+                                result += (
+                                    f'• Failed requests: {failure_har.get("failed_requests", 0)}\n'
+                                )
+                                result += (
+                                    f'• Total requests: {failure_har.get("total_requests", 0)}\n\n'
+                                )
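`analyze_har_file` is implemented elsewhere in `canary_utils.py`; for intuition, here is a minimal sketch of the core counting it implies, assuming a plain JSON HAR document (`log.entries[].response.status`). The package's real helper is richer than this:

```python
# Minimal sketch of HAR failure counting on a raw HAR JSON string.
import json


def count_har_failures(har_text: str) -> tuple[int, int]:
    """Return (failed_requests, total_requests) from a HAR document."""
    entries = json.loads(har_text).get('log', {}).get('entries', [])
    failed = sum(1 for e in entries if e.get('response', {}).get('status', 0) >= 400)
    return failed, len(entries)


har = '{"log": {"entries": [{"response": {"status": 200}}, {"response": {"status": 503}}]}}'
print(count_har_failures(har))  # (1, 2)
```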
+                        # Screenshot analysis
+                        if screenshots:
+                            screenshot_analysis = await analyze_screenshots(
+                                s3_client, bucket_name, screenshots, is_failed_run=True
+                            )
+                            if screenshot_analysis.get('insights'):
+                                result += 'šŸ“ø SCREENSHOT ANALYSIS:\n'
+                                for insight in screenshot_analysis['insights'][:3]:
+                                    result += f'• {insight}\n'
+                                result += '\n'
+
+                        # Log analysis
+                        if logs:
+                            log_analysis = await analyze_log_files(
+                                s3_client, bucket_name, logs, is_failed_run=True
+                            )
+                            if log_analysis.get('insights'):
+                                result += 'šŸ“‹ LOG ANALYSIS:\n'
+                                for insight in log_analysis['insights'][:3]:
+                                    result += f'• {insight}\n'
+                                result += '\n'
+
+                except Exception as e:
+                    # Log instead of silently swallowing the failure, then fall back
+                    logger.warning(f'S3 artifact analysis failed for {canary_name}: {str(e)}')
+                    artifacts_available = False
+
+        if not artifacts_available:
+            # Fallback: CloudWatch Logs analysis
+            result += 'āš ļø Artifacts not available - Checking CloudWatch Logs for root cause\n'
+            result += f'šŸŽÆ StateReason: {selected_reason}\n\n'
+
+            failure_time = selected_failure.get('Timeline', {}).get('Started')
+            if failure_time:
+                log_analysis = await analyze_canary_logs_with_time_window(
+                    canary_name, failure_time, canary, window_minutes=5, region=region
+                )
+
+                if log_analysis.get('status') == 'success':
+                    result += 'šŸ“‹ CLOUDWATCH LOGS ANALYSIS (±5 min around failure):\n'
+                    result += f'Time window: {log_analysis["time_window"]}\n'
+                    result += f'Log events found: {log_analysis["total_events"]}\n\n'
+
+                    error_logs = log_analysis.get('error_events', [])
+                    if error_logs:
+                        result += 'šŸ“‹ ERROR LOGS AROUND FAILURE:\n'
+                        for error in error_logs:
+                            result += f'• {error["timestamp"].strftime("%H:%M:%S")}: {error["message"]}\n'
+                else:
+                    result += f'šŸ“‹ {log_analysis.get("insights", ["Log analysis failed"])[0]}\n'
+            else:
+                result += 'šŸ“‹ No failure timestamp available for targeted log analysis\n'
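`analyze_canary_logs_with_time_window` lives in `canary_utils.py`; a sketch of the underlying CloudWatch Logs query it implies, using boto3's `filter_log_events` over a ±5 minute window. The log group name is an assumption (Synthetics canaries typically log under `/aws/lambda/cwsyn-<canary>-<id>`); resolve the real one via `describe_log_groups`:

```python
# Sketch: pull ERROR lines from a ±5 minute window around a failure timestamp.
from datetime import datetime, timedelta

import boto3

logs = boto3.client('logs', region_name='us-east-1')
failure_time = datetime(2024, 5, 1, 12, 0, 0)
window = timedelta(minutes=5)

resp = logs.filter_log_events(
    logGroupName='/aws/lambda/cwsyn-my-canary-0123456789ab',  # hypothetical name
    startTime=int((failure_time - window).timestamp() * 1000),
    endTime=int((failure_time + window).timestamp() * 1000),
    filterPattern='ERROR',
)
for event in resp.get('events', []):
    print(datetime.fromtimestamp(event['timestamp'] / 1000), event['message'][:120])
```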
+        # Add critical IAM checking guidance for systematic issues
+        if (
+            'no test result' in str(selected_reason).lower()
+            or 'permission' in str(selected_reason).lower()
+            or 'access denied' in str(selected_reason).lower()
+        ):
+            try:
+                result += f"\nšŸ” RUNNING COMPREHENSIVE IAM ANALYSIS (common cause of '{selected_reason}'):\n"
+
+                # 1. Check IAM role and policies
+                iam_analysis = await analyze_iam_role_and_policies(canary, iam_client, region)
+
+                # Display IAM analysis results
+                result += f'IAM Role Analysis Status: {iam_analysis["status"]}\n'
+                for check_name, check_result in iam_analysis.get('checks', {}).items():
+                    result += f'• {check_name}: {check_result}\n'
+
+                # 2. ENHANCED: Check resource ARN correctness with detailed validation
+                result += '\nšŸ” CHECKING RESOURCE ARN CORRECTNESS:\n'
+                arn_check = check_resource_arns_correct(canary, iam_client)
+
+                if arn_check.get('correct'):
+                    result += 'āœ… Resource ARNs: Correct\n'
+                else:
+                    result += f'āŒ Resource ARNs: {arn_check.get("error", "Issues found")}\n'
+
+                # Combine all IAM issues with enhanced categorization
+                all_iam_issues = []
+                if iam_analysis.get('issues_found'):
+                    all_iam_issues.extend(
+                        [f'IAM Policy: {issue}' for issue in iam_analysis['issues_found']]
+                    )
+                if not arn_check.get('correct') and arn_check.get('issues'):
+                    all_iam_issues.extend(
+                        [f'Resource ARN: {issue}' for issue in arn_check['issues']]
+                    )
+
+                if all_iam_issues:
+                    result += f'\n🚨 ALL IAM ISSUES FOUND ({len(all_iam_issues)} total):\n'
+                    for issue in all_iam_issues:
+                        result += f'• {issue}\n'
+
+                # Enhanced IAM recommendations with priority
+                all_iam_recommendations = []
+                if iam_analysis.get('recommendations'):
+                    all_iam_recommendations.extend(
+                        [f'Policy Fix: {rec}' for rec in iam_analysis['recommendations']]
+                    )
+                if not arn_check.get('correct'):
+                    all_iam_recommendations.extend(
+                        [
+                            'PRIORITY: Review and correct S3 bucket ARN patterns in IAM policies',
+                            'PRIORITY: Ensure bucket names match expected patterns (e.g., cw-syn-* for CloudWatch Synthetics)',
+                            'Verify canary has access to the correct S3 bucket for artifacts storage',
+                            'Check if bucket exists and is in the same region as the canary',
+                        ]
+                    )
+
+                if all_iam_recommendations:
+                    result += (
+                        f'\nšŸ’” ALL IAM RECOMMENDATIONS ({len(all_iam_recommendations)} total):\n'
+                    )
+                    for rec in all_iam_recommendations:
+                        result += f'• {rec}\n'
+
+            except Exception as iam_error:
+                result += f'āš ļø IAM analysis failed: {str(iam_error)[:200]}\n\n'
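`analyze_iam_role_and_policies` and `check_resource_arns_correct` are defined elsewhere in the package; a sketch of the kind of inspection they imply, resolving the canary's execution role (the `ExecutionRoleArn` field of the `get_canary` response) and listing its policies with boto3:

```python
# Sketch: enumerate the managed and inline policies on a canary's execution role.
import boto3

iam = boto3.client('iam')
role_arn = 'arn:aws:iam::123456789012:role/my-canary-role'  # from canary['ExecutionRoleArn']
role_name = role_arn.split('/')[-1]

attached = iam.list_attached_role_policies(RoleName=role_name)['AttachedPolicies']
inline = iam.list_role_policies(RoleName=role_name)['PolicyNames']
print('Attached:', [p['PolicyArn'] for p in attached])
print('Inline:', inline)
```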
+        # History-based diagnosis for specific error patterns
+        error_recommendations = []
+
+        # 1. ENOSPC: no space left on device
+        if any(
+            re.search(pattern, selected_reason, re.IGNORECASE)
+            for pattern in ['enospc', 'no space left on device']
+        ):
+            try:
+                telemetry_data = await extract_disk_memory_usage_metrics(canary_name, region)
+                if 'error' not in telemetry_data:
+                    result += '\nšŸ” DISK USAGE ROOT CAUSE ANALYSIS:\n'
+                    result += f'• Storage: {telemetry_data.get("maxEphemeralStorageUsageInMb", 0):.1f} MB peak\n'
+                    result += f'• Usage: {telemetry_data.get("maxEphemeralStorageUsagePercent", 0):.1f}% peak\n'
+                else:
+                    result += f'\nšŸ” DISK USAGE ROOT CAUSE ANALYSIS:\n{telemetry_data["error"]}\n'
+            except Exception as debug_error:
+                result += f'\nāš ļø Could not collect disk usage metrics: {str(debug_error)}\n'
+
+        # 2. Protocol error (Target.activateTarget): Session closed / detached Frame
+        elif any(
+            re.search(pattern, selected_reason, re.IGNORECASE)
+            for pattern in [
+                'protocol error',
+                'target.activatetarget',
+                'session closed',
+                'detached frame',
+                'session already detached',
+            ]
+        ):
+            try:
+                telemetry_data = await extract_disk_memory_usage_metrics(canary_name, region)
+                if 'error' not in telemetry_data:
+                    result += '\nšŸ” MEMORY USAGE ROOT CAUSE ANALYSIS:\n'
+                    result += f'• Memory: {telemetry_data.get("maxSyntheticsMemoryUsageInMB", 0):.1f} MB peak\n'
+                else:
+                    result += (
+                        f'\nšŸ” MEMORY USAGE ROOT CAUSE ANALYSIS:\n{telemetry_data["error"]}\n'
+                    )
+            except Exception as debug_error:
+                result += f'\nāš ļø Could not collect memory usage metrics: {str(debug_error)}\n'
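`extract_disk_memory_usage_metrics` comes from `canary_utils.py`; the disk/memory peaks above are its output, not raw CloudWatch metrics. For a related but simpler query, a sketch of fetching a documented per-canary metric (`Duration` in the `CloudWatchSynthetics` namespace) with boto3:

```python
# Sketch: query the last 3 hours of a canary's Duration metric.
from datetime import datetime, timedelta, timezone

import boto3

cw = boto3.client('cloudwatch', region_name='us-east-1')
now = datetime.now(timezone.utc)
resp = cw.get_metric_statistics(
    Namespace='CloudWatchSynthetics',
    MetricName='Duration',
    Dimensions=[{'Name': 'CanaryName', 'Value': 'my-canary'}],
    StartTime=now - timedelta(hours=3),
    EndTime=now,
    Period=300,
    Statistics=['Maximum'],
)
for point in sorted(resp['Datapoints'], key=lambda p: p['Timestamp']):
    print(point['Timestamp'], f'{point["Maximum"]:.0f} ms')
```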
+        # 3. Navigation timed out / Page.captureScreenshot timed out
+        elif any(
+            re.search(pattern, selected_reason, re.IGNORECASE)
+            for pattern in [
+                'navigation timeout',
+                'navigation timed out',
+                'ms exceeded',
+                'page.capturescreenshot timed out',
+                'protocoltimeout',
+                'connection timed out',
+            ]
+        ):
+            # Navigation timeout specific analysis using existing HAR data
+            if har_files and bucket_name:
+                try:
+                    har_timeout_analysis = await analyze_har_file(
+                        s3_client, bucket_name, har_files, is_failed_run=True
+                    )
+
+                    result += '\nšŸ” HAR FILE ANALYSIS FOR NAVIGATION TIMEOUT:\n'
+                    if har_timeout_analysis.get('failed_requests', 0) > 0:
+                        result += (
+                            f'• Failed HTTP requests: {har_timeout_analysis["failed_requests"]}\n'
+                        )
+
+                    if har_timeout_analysis.get('insights'):
+                        for insight in har_timeout_analysis['insights'][:5]:
+                            result += f'• {insight}\n'
+
+                    # Additional timeout-specific analysis
+                    result += f'• Total requests analyzed: {har_timeout_analysis.get("total_requests", 0)}\n'
+                    result += (
+                        f'• Analysis status: {har_timeout_analysis.get("status", "unknown")}\n'
+                    )
+                    result += '\n'
+                except Exception as har_error:
+                    result += f'\nāš ļø HAR analysis failed: {str(har_error)[:100]}\n'
+            else:
+                result += '\nšŸ” NAVIGATION TIMEOUT DETECTED:\n'
+                result += '• No HAR files available for detailed analysis\n'
+                result += '• Timeout suggests page loading issues or UI changes\n'
+                result += '• Check if target elements exist and page loads completely\n\n'
+
+        # 4. Visual variation
+        elif re.search('visual variation', selected_reason, re.IGNORECASE):
+            error_recommendations.extend(
+                [
+                    'šŸ”§ VISUAL MONITORING ISSUE DETECTED:',
+                    '• Website UI changed - not a technical failure',
+                    '• Check if website legitimately updated (ads, banners, content)',
+                    '• Update visual baseline with new reference screenshots',
+                    '• Adjust visual difference threshold (increase from default)',
+                    '• Consider excluding dynamic content areas from comparison',
+                ]
+            )
+
+        if error_recommendations:
+            result += '\nšŸ’” PATTERN-BASED RECOMMENDATIONS:\n'
+            for rec in error_recommendations:
+                result += f'{rec}\n'
+            result += '\n'
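The if/elif chain above dispatches on StateReason substrings; the same shape can be expressed as a table of compiled regexes mapped to failure classes, which is easier to extend. A minimal sketch (labels and patterns condensed for illustration):

```python
# Sketch: the pattern matching above restated as a regex -> label table.
import re

PATTERN_LABELS = [
    (re.compile(r'enospc|no space left on device', re.IGNORECASE), 'disk'),
    (re.compile(r'protocol error|session closed|detached frame', re.IGNORECASE), 'memory'),
    (re.compile(r'navigation (timeout|timed out)|ms exceeded|protocoltimeout', re.IGNORECASE), 'timeout'),
    (re.compile(r'visual variation', re.IGNORECASE), 'visual'),
]


def classify(state_reason: str) -> str:
    for pattern, label in PATTERN_LABELS:
        if pattern.search(state_reason):
            return label
    return 'unknown'


print(classify('Protocol error (Target.activateTarget): Session closed.'))  # memory
```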
1463
+
1464
+ # Add canary code if available
1465
+ try:
1466
+ code_analysis = await get_canary_code(canary, region)
1467
+ if 'error' not in code_analysis and code_analysis.get('code_content'):
1468
+ result += f'\ncanary code:\n{code_analysis["code_content"]}\n'
1469
+ except Exception as e:
1470
+ result += f'Note: Could not retrieve canary code: {str(e)}\n'
1471
+
1472
+ result += '\n'
1473
+ return result
1474
+
1475
+ except Exception as e:
1476
+ return f'āŒ Error in comprehensive failure analysis: {str(e)}'
1477
+
1478
+
1479
+ # Register all imported tools with the MCP server
1480
+ mcp.tool()(list_monitored_services)
1481
+ mcp.tool()(get_service_detail)
1482
+ mcp.tool()(query_service_metrics)
1483
+ mcp.tool()(list_service_operations)
1484
+ mcp.tool()(get_slo)
1485
+ mcp.tool()(list_slos)
1486
+ mcp.tool()(search_transaction_spans)
1487
+ mcp.tool()(query_sampled_traces)
1488
+ mcp.tool()(list_slis)
1489
+ mcp.tool()(get_enablement_guide)
1490
+
1491
+
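The `mcp.tool()(func)` calls register functions that were imported from the tool modules; this is the same FastMCP decorator applied explicitly after the fact. A minimal sketch of both registration styles:

```python
# Both styles register a function as an MCP tool with FastMCP.
from mcp.server.fastmcp import FastMCP

mcp = FastMCP('example')


@mcp.tool()
def ping() -> str:
    """Registered at the definition site."""
    return 'pong'


def pong() -> str:
    """Registered after the fact, matching the style used in server.py."""
    return 'ping'


mcp.tool()(pong)
```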
+def main():
+    """Run the MCP server."""
+    logger.debug('Starting CloudWatch Application Signals MCP server')
+    try:
+        mcp.run(transport='stdio')
+    except KeyboardInterrupt:
+        logger.debug('Server shutdown by user')
+    except Exception as e:
+        logger.error(f'Server error: {e}', exc_info=True)
+        raise
+
+
+if __name__ == '__main__':
+    main()