awslabs.cloudwatch-appsignals-mcp-server 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@
16
16
 
17
17
  import json
18
18
  import os
19
+ import re
19
20
  import sys
20
21
  import tempfile
21
22
  from .audit_utils import (
@@ -25,7 +26,24 @@ from .audit_utils import (
25
26
  expand_slo_wildcard_patterns,
26
27
  parse_auditors,
27
28
  )
28
- from .aws_clients import AWS_REGION, appsignals_client
29
+ from .aws_clients import (
30
+ AWS_REGION,
31
+ appsignals_client,
32
+ iam_client,
33
+ s3_client,
34
+ synthetics_client,
35
+ )
36
+ from .canary_utils import (
37
+ analyze_canary_logs_with_time_window,
38
+ analyze_har_file,
39
+ analyze_iam_role_and_policies,
40
+ analyze_log_files,
41
+ analyze_screenshots,
42
+ check_resource_arns_correct,
43
+ extract_disk_memory_usage_metrics,
44
+ get_canary_code,
45
+ get_canary_metrics_and_service_insights,
46
+ )
29
47
  from .service_audit_utils import normalize_service_targets, validate_and_enrich_service_targets
30
48
  from .service_tools import (
31
49
  get_service_detail,
@@ -47,6 +65,8 @@ from typing import Optional
47
65
  # Constants
48
66
  BATCH_SIZE_THRESHOLD = 5
49
67
 
68
# Canonical CloudWatch Synthetics canary-run states. An identity mapping is used
# (value == key) so state-string comparisons elsewhere go through one named
# constant instead of scattered literals.
RUN_STATES = {'RUNNING': 'RUNNING', 'PASSED': 'PASSED', 'FAILED': 'FAILED'}
69
+
50
70
  # Initialize FastMCP server
51
71
  mcp = FastMCP('cloudwatch-appsignals')
52
72
 
@@ -110,6 +130,12 @@ def _filter_operation_targets(provided):
110
130
  if '*' in service_name or '*' in operation:
111
131
  has_wildcards = True
112
132
 
133
+ # For fault metrics, ListAuditFindings uses Availability metric type.
134
+ # API only supports Availability/Latency/Error for service_operation targets.
135
+ metric_type = service_op_data.get('MetricType', '')
136
+ if metric_type == 'Fault':
137
+ service_op_data['MetricType'] = 'Availability'
138
+
113
139
  operation_only_targets.append(target)
114
140
  else:
115
141
  logger.warning(
@@ -792,6 +818,514 @@ async def audit_service_operations(
792
818
  return f'Error: {str(e)}'
793
819
 
794
820
 
821
# NOTE(review): this module also calls `mcp.tool()(analyze_canary_failures)` near the
# bottom of the file — confirm the tool is not registered twice with FastMCP.
@mcp.tool()
async def analyze_canary_failures(canary_name: str, region: str = AWS_REGION) -> str:
    """Comprehensive canary failure analysis with deep dive into issues.

    Use this tool to:
    - Deep dive into canary failures with root cause identification
    - Analyze historical patterns and specific incident details
    - Get comprehensive artifact analysis including logs, screenshots, and HAR files
    - Receive actionable recommendations based on AWS debugging methodology
    - Correlate canary failures with Application Signals telemetry data
    - Identify performance degradation and availability issues across service dependencies

    Key Features:
    - **Failure Pattern Analysis**: Identifies recurring failure modes and temporal patterns
    - **Artifact Deep Dive**: Analyzes canary logs, screenshots, and network traces for root causes
    - **Service Correlation**: Links canary failures to upstream/downstream service issues using Application Signals
    - **Performance Insights**: Detects latency spikes, fault rates, and connection issues
    - **Actionable Remediation**: Provides specific steps based on AWS operational best practices

    Common Use Cases:
    1. **Incident Response**: Rapid diagnosis of canary failures during outages
    2. **Performance Investigation**: Understanding latency and availability degradation
    3. **Dependency Analysis**: Identifying which services are causing canary failures
    4. **Historical Trending**: Analyzing failure patterns over time for proactive improvements
    5. **Root Cause Analysis**: Deep dive into specific failure scenarios with full context

    Output Includes:
    - Severity-ranked findings with immediate action items
    - Service-level telemetry insights with trace analysis
    - Exception details and stack traces from canary artifacts
    - Network connectivity and performance metrics
    - Correlation with Application Signals audit findings
    - Historical failure patterns and recovery recommendations

    Args:
        canary_name (str): Name of the CloudWatch Synthetics canary to analyze
        region (str, optional): AWS region where the canary is deployed.

    Returns:
        str: Human-readable failure-analysis report containing:
            - Failure severity assessment and immediate recommendations
            - Detailed artifact analysis (logs, screenshots, HAR files)
            - Service dependency health and performance metrics
            - Root cause identification with specific remediation steps
            - Historical pattern analysis and trend insights
    """
    try:
        # Get recent canary runs (most recent first per Synthetics API);
        # only the last 5 are considered for the consecutive-failure window.
        response = synthetics_client.get_canary_runs(Name=canary_name, MaxResults=5)
        runs = response.get('CanaryRuns', [])

        # Get canary details (needed later for ArtifactS3Location and IAM role).
        canary_response = synthetics_client.get_canary(Name=canary_name)
        canary = canary_response['Canary']

        # Get telemetry and service insights; failures here are non-fatal and
        # surfaced inline in the report instead of aborting the analysis.
        try:
            telemetry_insights = await get_canary_metrics_and_service_insights(canary_name, region)
        except Exception as e:
            telemetry_insights = f'Telemetry API unavailable: {str(e)}'

        if not runs:
            return f'No run history found for {canary_name}'

        # Build analysis header
        result = f'🔍 Comprehensive Failure Analysis for {canary_name}\n'

        # Add telemetry insights if available. The startswith() check distinguishes
        # real insights from the sentinel error string set in the except above.
        if telemetry_insights and not telemetry_insights.startswith('Telemetry API unavailable'):
            result += f'\n📊 **Service and Canary Telemetry Insights**\n{telemetry_insights}\n\n'
        elif telemetry_insights:
            result += f'\n⚠️ {telemetry_insights}\n\n'

        # Get consecutive failures since last success: walk runs newest-first,
        # collecting FAILED runs and stopping at the first PASSED run.
        consecutive_failures = []
        last_success_run = None

        for run in runs:
            if run.get('Status', {}).get('State') == RUN_STATES['FAILED']:
                consecutive_failures.append(run)
            elif run.get('Status', {}).get('State') == RUN_STATES['PASSED']:
                last_success_run = run
                break

        # NOTE(review): this branch does not return — the report continues into the
        # grouping below, which then returns early via the `not unique_reasons` check.
        if not consecutive_failures:
            result += '✅ Canary is healthy - no failures since last success\n'
            if last_success_run:
                result += f'Last success: {last_success_run.get("Timeline", {}).get("Started")}\n'
            result += '\n🔍 Performing health check analysis ...\n\n'

        # Group failures by StateReason
        failure_causes = {}
        result += f'🔍 Found {len(consecutive_failures)} consecutive failures since last success\n'
        if last_success_run:
            result += f'Last success: {last_success_run.get("Timeline", {}).get("Started")}\n\n'
        else:
            result += 'No recent success run found in history\n\n'

        for failed_run in consecutive_failures:
            state_reason = failed_run.get('Status', {}).get('StateReason', 'Unknown')

            if state_reason not in failure_causes:
                failure_causes[state_reason] = []
            failure_causes[state_reason].append(failed_run)

        # Analysis section
        unique_reasons = list(failure_causes.keys())

        if not unique_reasons:
            result += '✅ No consecutive failures to analyze\n'
            result += '💡 Canary appears to be recovering or healthy\n'
            return result

        if len(unique_reasons) == 1:
            result += f'🎯 All failures have same cause: {unique_reasons[0]}\n'
            selected_reason = unique_reasons[0]
        else:
            result += f'🎯 Multiple failure causes ({len(unique_reasons)} different issues):\n\n'
            for i, reason in enumerate(unique_reasons, 1):
                count = len(failure_causes[reason])
                result += f'{i}. **{reason}** ({count} occurrences)\n'
            result += '\n'
            # Even with multiple causes, deep-dive only the first (most recent) one.
            selected_reason = unique_reasons[0]

        selected_failure = failure_causes[selected_reason][0]
        result += f'Analyzing most recent failure: {selected_failure.get("Id", "")[:8]}...\n\n'

        # Initialize artifact variables (also referenced by the timeout branch below,
        # so they must exist even when no artifacts are found).
        har_files = []
        screenshots = []
        logs = []
        bucket_name = ''

        # Direct S3 artifact analysis integration
        artifact_location = canary.get('ArtifactS3Location', '')
        artifacts_available = False

        if artifact_location:
            # Handle S3 location format: Synthetics may report the location
            # with or without the s3:// scheme prefix.
            if not artifact_location.startswith('s3://'):
                artifact_location = f's3://{artifact_location}' if artifact_location else ''

            if artifact_location.startswith('s3://'):
                bucket_and_path = artifact_location[5:]
                bucket_name = bucket_and_path.split('/')[0]
                base_path = (
                    '/'.join(bucket_and_path.split('/')[1:]) if '/' in bucket_and_path else ''
                )

                # If base_path is empty, construct canary path
                # (matches the default Synthetics artifact layout — TODO confirm).
                if not base_path:
                    base_path = f'canary/{region}/{canary_name}'

                # Check for failure artifacts using date-based path
                from datetime import datetime

                failure_time = selected_failure.get('Timeline', {}).get('Started')
                if failure_time:
                    # Handle both datetime objects and string timestamps.
                    # NOTE(review): parse_timestamp is not imported in this diff —
                    # presumably in scope from an existing module import; verify.
                    if isinstance(failure_time, str):
                        dt = parse_timestamp(failure_time)
                    else:
                        dt = failure_time  # Already a datetime object
                    date_path = dt.strftime('%Y/%m/%d')
                    failure_run_path = (
                        f'{base_path}/{date_path}/' if base_path else f'{date_path}/'
                    )
                else:
                    # Fallback to today
                    today = datetime.now().strftime('%Y/%m/%d')
                    failure_run_path = f'{base_path}/{today}/' if base_path else f'{today}/'

                try:
                    artifacts_response = s3_client.list_objects_v2(
                        Bucket=bucket_name, Prefix=failure_run_path, MaxKeys=50
                    )
                    failure_artifacts = artifacts_response.get('Contents', [])

                    if failure_artifacts:
                        artifacts_available = True

                        # Categorize artifacts by extension (HAR / screenshots / logs).
                        har_files = [
                            a
                            for a in failure_artifacts
                            if a['Key'].lower().endswith(('.har', '.har.gz', '.har.html'))
                        ]
                        screenshots = [
                            a
                            for a in failure_artifacts
                            if any(ext in a['Key'].lower() for ext in ['.png', '.jpg', '.jpeg'])
                        ]
                        logs = [
                            a
                            for a in failure_artifacts
                            if any(ext in a['Key'].lower() for ext in ['.log', '.txt'])
                            or 'log' in a['Key'].lower()
                        ]

                        if last_success_run:
                            result += '🔄 HAR COMPARISON: Failure vs Success\n'
                            result += f'Failure: {selected_failure.get("Id", "")[:8]}... ({selected_failure.get("Timeline", {}).get("Started")})\n'
                            result += f'Success: {last_success_run.get("Id", "")[:8]}... ({last_success_run.get("Timeline", {}).get("Started")})\n\n'

                            # Get success artifacts for comparison
                            success_time = last_success_run.get('Timeline', {}).get('Started')
                            if success_time:
                                if isinstance(success_time, str):
                                    success_dt = parse_timestamp(success_time)
                                else:
                                    success_dt = success_time
                                success_date_path = success_dt.strftime('%Y/%m/%d')
                                success_run_path = (
                                    f'{base_path}/{success_date_path}/'
                                    if base_path
                                    else f'{success_date_path}/'
                                )
                            else:
                                success_run_path = failure_run_path  # Use same path as fallback
                            try:
                                success_artifacts_response = s3_client.list_objects_v2(
                                    Bucket=bucket_name, Prefix=success_run_path, MaxKeys=50
                                )
                                success_artifacts = success_artifacts_response.get('Contents', [])
                                success_har_files = [
                                    a
                                    for a in success_artifacts
                                    if a['Key'].lower().endswith(('.har', '.har.gz', '.har.html'))
                                ]

                                if har_files and success_har_files:
                                    failure_har = await analyze_har_file(
                                        s3_client, bucket_name, har_files, is_failed_run=True
                                    )
                                    success_har = await analyze_har_file(
                                        s3_client,
                                        bucket_name,
                                        success_har_files,
                                        is_failed_run=False,
                                    )

                                    result += f'• Failed requests: {failure_har.get("failed_requests", 0)} vs {success_har.get("failed_requests", 0)}\n'
                                    result += f'• Total requests: {failure_har.get("total_requests", 0)} vs {success_har.get("total_requests", 0)}\n\n'

                                    if failure_har.get('request_details'):
                                        result += '🚨 FAILED REQUESTS:\n'
                                        for req in failure_har['request_details'][:3]:
                                            result += f'• {req.get("url", "Unknown")}: {req.get("status", "Unknown")} ({req.get("time", 0):.1f}ms)\n'
                            except Exception as e:
                                logger.warning(
                                    f'Failed to analyze success artifacts for HAR comparison: {str(e)}'
                                )
                        else:
                            result += (
                                '🔍 FAILURE ANALYSIS (no success run available for comparison):\n'
                            )
                            result += f'Analyzing failure artifacts for: {selected_failure.get("Id", "")[:8]}...\n\n'

                            if har_files:
                                failure_har = await analyze_har_file(
                                    s3_client, bucket_name, har_files, is_failed_run=True
                                )
                                result += '🌐 HAR ANALYSIS:\n'
                                result += (
                                    f'• Failed requests: {failure_har.get("failed_requests", 0)}\n'
                                )
                                result += (
                                    f'• Total requests: {failure_har.get("total_requests", 0)}\n\n'
                                )

                        # Screenshot analysis
                        if screenshots:
                            screenshot_analysis = await analyze_screenshots(
                                s3_client, bucket_name, screenshots, is_failed_run=True
                            )
                            if screenshot_analysis.get('insights'):
                                result += '📸 SCREENSHOT ANALYSIS:\n'
                                for insight in screenshot_analysis['insights'][:3]:
                                    result += f'• {insight}\n'
                                result += '\n'

                        # Log analysis
                        if logs:
                            log_analysis = await analyze_log_files(
                                s3_client, bucket_name, logs, is_failed_run=True
                            )
                            if log_analysis.get('insights'):
                                result += '📋 LOG ANALYSIS:\n'
                                for insight in log_analysis['insights'][:3]:
                                    result += f'• {insight}\n'
                                result += '\n'

                # Any S3/analysis error degrades to the CloudWatch Logs fallback below.
                except Exception:
                    artifacts_available = False

        if not artifacts_available:
            # Fallback: CloudWatch Logs analysis around the failure timestamp.
            result += '⚠️ Artifacts not available - Checking CloudWatch Logs for root cause\n'
            result += f'🎯 StateReason: {selected_reason}\n\n'

            failure_time = selected_failure.get('Timeline', {}).get('Started')
            if failure_time:
                log_analysis = await analyze_canary_logs_with_time_window(
                    canary_name, failure_time, canary, window_minutes=5, region=region
                )

                if log_analysis.get('status') == 'success':
                    result += '📋 CLOUDWATCH LOGS ANALYSIS (±5 min around failure):\n'
                    result += f'Time window: {log_analysis["time_window"]}\n'
                    result += f'Log events found: {log_analysis["total_events"]}\n\n'

                    error_logs = log_analysis.get('error_events', [])
                    if error_logs:
                        result += '📋 ERROR LOGS AROUND FAILURE:\n'
                        for error in error_logs:
                            result += f'• {error["timestamp"].strftime("%H:%M:%S")}: {error["message"]}\n'
                else:
                    result += f'📋 {log_analysis.get("insights", ["Log analysis failed"])[0]}\n'
            else:
                result += '📋 No failure timestamp available for targeted log analysis\n'

        # Add critical IAM checking guidance for systematic issues
        # (these StateReason substrings commonly indicate permission problems).
        if (
            'no test result' in str(selected_reason).lower()
            or 'permission' in str(selected_reason).lower()
            or 'access denied' in str(selected_reason).lower()
        ):
            try:
                result += f"\n🔍 RUNNING COMPREHENSIVE IAM ANALYSIS (common cause of '{selected_reason}'):\n"

                # 1. Check IAM role and policies
                iam_analysis = await analyze_iam_role_and_policies(canary, iam_client, region)

                # Display IAM analysis results
                result += f'IAM Role Analysis Status: {iam_analysis["status"]}\n'
                for check_name, check_result in iam_analysis.get('checks', {}).items():
                    result += f'• {check_name}: {check_result}\n'

                # 2. ENHANCED: Check resource ARN correctness with detailed validation
                result += '\n🔍 CHECKING RESOURCE ARN CORRECTNESS:\n'
                arn_check = check_resource_arns_correct(canary, iam_client)

                if arn_check.get('correct'):
                    result += '✅ Resource ARNs: Correct\n'
                else:
                    result += f'❌ Resource ARNs: {arn_check.get("error", "Issues found")}\n'

                # Combine all IAM issues with enhanced categorization
                all_iam_issues = []
                if iam_analysis.get('issues_found'):
                    all_iam_issues.extend(
                        [f'IAM Policy: {issue}' for issue in iam_analysis['issues_found']]
                    )
                if not arn_check.get('correct') and arn_check.get('issues'):
                    all_iam_issues.extend(
                        [f'Resource ARN: {issue}' for issue in arn_check['issues']]
                    )

                if all_iam_issues:
                    result += f'\n🚨 ALL IAM ISSUES FOUND ({len(all_iam_issues)} total):\n'
                    for issue in all_iam_issues:
                        result += f'• {issue}\n'

                # Enhanced IAM recommendations with priority
                all_iam_recommendations = []
                if iam_analysis.get('recommendations'):
                    all_iam_recommendations.extend(
                        [f'Policy Fix: {rec}' for rec in iam_analysis['recommendations']]
                    )
                if not arn_check.get('correct'):
                    all_iam_recommendations.extend(
                        [
                            'PRIORITY: Review and correct S3 bucket ARN patterns in IAM policies',
                            'PRIORITY: Ensure bucket names match expected patterns (e.g., cw-syn-* for CloudWatch Synthetics)',
                            'Verify canary has access to the correct S3 bucket for artifacts storage',
                            'Check if bucket exists and is in the same region as the canary',
                        ]
                    )

                if all_iam_recommendations:
                    result += (
                        f'\n💡 ALL IAM RECOMMENDATIONS ({len(all_iam_recommendations)} total):\n'
                    )
                    for rec in all_iam_recommendations:
                        result += f'• {rec}\n'

            except Exception as iam_error:
                result += f'⚠️ IAM analysis failed: {str(iam_error)[:200]}\n\n'

        # History-based diagnosis for specific error patterns
        error_recommendations = []

        # 1. ENOSPC: no space left on device
        if any(
            re.search(pattern, selected_reason, re.IGNORECASE)
            for pattern in ['enospc', 'no space left on device']
        ):
            try:
                telemetry_data = await extract_disk_memory_usage_metrics(canary_name, region)
                if 'error' not in telemetry_data:
                    result += '\n🔍 DISK USAGE ROOT CAUSE ANALYSIS:\n'
                    result += f'• Storage: {telemetry_data.get("maxEphemeralStorageUsageInMb", 0):.1f} MB peak\n'
                    result += f'• Usage: {telemetry_data.get("maxEphemeralStorageUsagePercent", 0):.1f}% peak\n'
                else:
                    result += f'\n🔍 DISK USAGE ROOT CAUSE ANALYSIS:\n{telemetry_data["error"]}\n'
            except Exception as debug_error:
                result += f'\n⚠️ Could not generate disk usage debugging code: {str(debug_error)}\n'

        # 2. Protocol error (Target.activateTarget): Session closed / detached Frame
        #    — these browser-session errors are commonly memory-related.
        elif any(
            re.search(pattern, selected_reason, re.IGNORECASE)
            for pattern in [
                'protocol error',
                'target.activatetarget',
                'session closed',
                'detached frame',
                'session already detached',
            ]
        ):
            try:
                telemetry_data = await extract_disk_memory_usage_metrics(canary_name, region)
                if 'error' not in telemetry_data:
                    result += '\n🔍 MEMORY USAGE ROOT CAUSE ANALYSIS:\n'
                    result += f'• Memory: {telemetry_data.get("maxSyntheticsMemoryUsageInMB", 0):.1f} MB peak\n'
                else:
                    result += (
                        f'\n🔍 MEMORY USAGE ROOT CAUSE ANALYSIS:\n{telemetry_data["error"]}\n'
                    )
            except Exception as debug_error:
                result += f'\n⚠️ Could not collect memory usage metrics: {str(debug_error)}\n'

        # 3. Navigation timed out / Page.captureScreenshot timed out
        elif any(
            re.search(pattern, selected_reason, re.IGNORECASE)
            for pattern in [
                'navigation timeout',
                'navigation timed out',
                'ms exceeded',
                'page.capturescreenshot timed out',
                'protocoltimeout',
                'connection timed out',
            ]
        ):
            # Navigation timeout specific analysis using existing HAR data
            # (har_files/bucket_name were populated by the artifact section above).
            if har_files and bucket_name:
                try:
                    har_timeout_analysis = await analyze_har_file(
                        s3_client, bucket_name, har_files, is_failed_run=True
                    )

                    result += '\n🔍 HAR FILE ANALYSIS FOR NAVIGATION TIMEOUT:\n'
                    if har_timeout_analysis.get('failed_requests', 0) > 0:
                        result += (
                            f'• Failed HTTP requests: {har_timeout_analysis["failed_requests"]}\n'
                        )

                    if har_timeout_analysis.get('insights'):
                        for insight in har_timeout_analysis['insights'][:5]:
                            result += f'• {insight}\n'

                    # Additional timeout-specific analysis
                    result += f'• Total requests analyzed: {har_timeout_analysis.get("total_requests", 0)}\n'
                    result += (
                        f'• Analysis status: {har_timeout_analysis.get("status", "unknown")}\n'
                    )
                    result += '\n'
                except Exception as har_error:
                    result += f'\n⚠️ HAR analysis failed: {str(har_error)[:100]}\n'
            else:
                result += '\n🔍 NAVIGATION TIMEOUT DETECTED:\n'
                result += '• No HAR files available for detailed analysis\n'
                result += '• Timeout suggests page loading issues or UI changes\n'
                result += '• Check if target elements exist and page loads completely\n\n'

        # 4. Visual variation — a monitoring/baseline issue rather than an outage.
        elif re.search('visual variation', selected_reason, re.IGNORECASE):
            error_recommendations.extend(
                [
                    '🔧 VISUAL MONITORING ISSUE DETECTED:',
                    '• Website UI changed - not a technical failure',
                    '• Check if website legitimately updated (ads, banners, content)',
                    '• Update visual baseline with new reference screenshots',
                    '• Adjust visual difference threshold (increase from default)',
                    '• Consider excluding dynamic content areas from comparison',
                ]
            )

        if error_recommendations:
            result += '\n💡 PATTERN-BASED RECOMMENDATIONS:\n'
            for rec in error_recommendations:
                result += f'{rec}\n'
            result += '\n'

        # Add canary code if available, so the caller can inspect the script itself.
        try:
            code_analysis = await get_canary_code(canary, region)
            if 'error' not in code_analysis and code_analysis.get('code_content'):
                result += f'\ncanary code:\n{code_analysis["code_content"]}\n'
        except Exception as e:
            result += f'Note: Could not retrieve canary code: {str(e)}\n'

        result += '\n'
        return result

    # Top-level guard: MCP tools return an error string rather than raising.
    except Exception as e:
        return f'❌ Error in comprehensive failure analysis: {str(e)}'
1327
+
1328
+
795
1329
  # Register all imported tools with the MCP server
796
1330
  mcp.tool()(list_monitored_services)
797
1331
  mcp.tool()(get_service_detail)
@@ -802,6 +1336,7 @@ mcp.tool()(list_slos)
802
1336
  mcp.tool()(search_transaction_spans)
803
1337
  mcp.tool()(query_sampled_traces)
804
1338
  mcp.tool()(list_slis)
1339
+ mcp.tool()(analyze_canary_failures)
805
1340
 
806
1341
 
807
1342
  def main():