awslabs.cloudwatch-applicationsignals-mcp-server 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. awslabs/__init__.py +17 -0
  2. awslabs/cloudwatch_applicationsignals_mcp_server/__init__.py +17 -0
  3. awslabs/cloudwatch_applicationsignals_mcp_server/audit_presentation_utils.py +288 -0
  4. awslabs/cloudwatch_applicationsignals_mcp_server/audit_utils.py +912 -0
  5. awslabs/cloudwatch_applicationsignals_mcp_server/aws_clients.py +120 -0
  6. awslabs/cloudwatch_applicationsignals_mcp_server/canary_utils.py +910 -0
  7. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-dotnet-enablement.md +435 -0
  8. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-java-enablement.md +321 -0
  9. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-nodejs-enablement.md +420 -0
  10. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-python-enablement.md +598 -0
  11. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-dotnet-enablement.md +264 -0
  12. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-java-enablement.md +193 -0
  13. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-nodejs-enablement.md +198 -0
  14. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-python-enablement.md +236 -0
  15. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-dotnet-enablement.md +166 -0
  16. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-java-enablement.md +166 -0
  17. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-nodejs-enablement.md +166 -0
  18. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-python-enablement.md +169 -0
  19. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-dotnet-enablement.md +336 -0
  20. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-java-enablement.md +336 -0
  21. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-nodejs-enablement.md +336 -0
  22. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-python-enablement.md +336 -0
  23. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_tools.py +147 -0
  24. awslabs/cloudwatch_applicationsignals_mcp_server/server.py +1505 -0
  25. awslabs/cloudwatch_applicationsignals_mcp_server/service_audit_utils.py +231 -0
  26. awslabs/cloudwatch_applicationsignals_mcp_server/service_tools.py +659 -0
  27. awslabs/cloudwatch_applicationsignals_mcp_server/sli_report_client.py +333 -0
  28. awslabs/cloudwatch_applicationsignals_mcp_server/slo_tools.py +386 -0
  29. awslabs/cloudwatch_applicationsignals_mcp_server/trace_tools.py +784 -0
  30. awslabs/cloudwatch_applicationsignals_mcp_server/utils.py +172 -0
  31. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/METADATA +808 -0
  32. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/RECORD +36 -0
  33. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/WHEEL +4 -0
  34. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/entry_points.txt +2 -0
  35. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/LICENSE +174 -0
  36. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/NOTICE +2 -0
@@ -0,0 +1,910 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utility functions for CloudWatch Synthetics canary analysis and debugging."""
16
+
17
+ import asyncio
18
+ import gzip
19
+ import json
20
+ import os
21
+ import re
22
+ import tempfile
23
+ import zipfile
24
+ from .aws_clients import (
25
+ lambda_client,
26
+ logs_client,
27
+ synthetics_client,
28
+ )
29
+ from botocore.exceptions import ClientError
30
+ from datetime import datetime, timedelta
31
+ from loguru import logger
32
+
33
+
34
async def check_iam_exists_for_canary(canary: dict, iam_client) -> dict:
    """Verify that the canary's configured IAM execution role exists.

    Args:
        canary: Canary description dict as returned by the Synthetics API;
            only ``ExecutionRoleArn`` is consulted.
        iam_client: boto3 IAM client used for the ``get_role`` lookup.

    Returns:
        ``{'exists': True, 'role_name': ...}`` when the role is found,
        otherwise ``{'exists': False, 'error': ...}``.
    """
    role_arn = canary.get('ExecutionRoleArn', '')
    if not role_arn:
        return {'exists': False, 'error': 'No execution role configured'}

    # The role name is the last path component of the role ARN.
    role_name = role_arn.split('/')[-1]

    try:
        iam_client.get_role(RoleName=role_name)
    except ClientError as e:
        logger.warning(f'Failed to check IAM role {role_name}: {str(e)}')
        err = e.response.get('Error', {})
        if err.get('Code') == 'NoSuchEntity':
            return {'exists': False, 'error': f"Role '{role_name}' does not exist"}
        return {
            'exists': False,
            'error': f'Cannot check role: {err.get("Message", str(e))}',
        }
    return {'exists': True, 'role_name': role_name}
55
+
56
+
57
async def check_lambda_permissions(canary: dict, iam_client) -> dict:
    """Check if IAM role has proper Lambda execution permissions.

    Looks first for the AWS-managed basic/VPC Lambda execution policies; if
    neither is attached, falls back to scanning customer-managed policies
    for an Allow statement containing CloudWatch Logs write actions.

    Args:
        canary: Canary description dict; only ``ExecutionRoleArn`` is read.
        iam_client: boto3 IAM client.

    Returns:
        Dict with ``has_basic_execution``, ``has_managed_basic_execution``,
        ``has_vpc_permissions``, ``needs_vpc_check`` and
        ``attached_policies`` on success; on failure the dict carries an
        ``error`` key and all capability flags are False.
    """
    execution_role_arn = canary.get('ExecutionRoleArn', '')
    if not execution_role_arn:
        return {
            'has_basic_execution': False,
            'has_vpc_permissions': False,
            'needs_vpc_check': False,
            'error': 'No execution role configured',
        }

    # Role name is the last path component of the ARN.
    role_name = execution_role_arn.split('/')[-1]

    try:
        policies_response = iam_client.list_attached_role_policies(RoleName=role_name)
        attached_policies = policies_response['AttachedPolicies']

        has_basic_execution = False
        has_vpc_permissions = False

        lambda_basic_arn = 'arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole'
        lambda_vpc_arn = 'arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole'

        # Pass 1: AWS-managed policies.  The VPC policy is a superset of the
        # basic one, so it satisfies basic execution as well.
        for policy in attached_policies:
            if policy['PolicyArn'] == lambda_basic_arn:
                has_basic_execution = True
            elif policy['PolicyArn'] == lambda_vpc_arn:
                has_vpc_permissions = True
                has_basic_execution = True

        # Pass 2: only when no managed policy granted basic execution,
        # inspect customer-managed policy documents for equivalent Logs
        # write permissions.
        if not has_basic_execution:
            for policy in attached_policies:
                if not policy['PolicyArn'].startswith('arn:aws:iam::aws:'):
                    try:
                        policy_response = iam_client.get_policy(PolicyArn=policy['PolicyArn'])
                        policy_version = iam_client.get_policy_version(
                            PolicyArn=policy['PolicyArn'],
                            VersionId=policy_response['Policy']['DefaultVersionId'],
                        )

                        policy_doc = policy_version['PolicyVersion']['Document']

                        for statement in policy_doc.get('Statement', []):
                            # Action may be a single string or a list.
                            actions = statement.get('Action', [])
                            if isinstance(actions, str):
                                actions = [actions]

                            has_logs = any(
                                'logs:CreateLogGroup' in action
                                or 'logs:CreateLogStream' in action
                                or 'logs:PutLogEvents' in action
                                for action in actions
                            )
                            if has_logs and statement.get('Effect') == 'Allow':
                                has_basic_execution = True
                                break

                    except Exception as e:
                        # Best effort: an unreadable policy doesn't abort the scan.
                        logger.warning(f'Failed to parse policy document: {str(e)}')
                        continue

        return {
            'has_basic_execution': has_basic_execution,
            'has_managed_basic_execution': any(
                p['PolicyArn'] == lambda_basic_arn for p in attached_policies
            ),
            'has_vpc_permissions': has_vpc_permissions,
            'needs_vpc_check': not has_vpc_permissions,
            'attached_policies': [p['PolicyArn'] for p in attached_policies],
        }

    except Exception as e:
        return {
            'has_basic_execution': False,
            'has_vpc_permissions': False,
            'needs_vpc_check': False,
            'error': str(e),
        }
135
+
136
+
137
async def analyze_iam_role_and_policies(canary: dict, iam_client, region: str) -> dict:
    """Run IAM health checks for a canary and summarize issues.

    Combines the role-existence and Lambda-permission checks into a single
    report with human-readable check lines, detected issues, and remediation
    recommendations.

    Args:
        canary: Canary description dict.
        iam_client: boto3 IAM client.
        region: AWS region (accepted for interface parity; not used here).

    Returns:
        Dict with ``status``, ``checks``, ``issues_found`` and
        ``recommendations``.
    """
    report = {'status': 'analyzing', 'checks': {}, 'issues_found': [], 'recommendations': []}
    checks = report['checks']

    # Does the execution role exist at all?
    role_check = await check_iam_exists_for_canary(canary, iam_client)
    if role_check['exists']:
        checks['iam_exists'] = f'✅ IAM role `{role_check["role_name"]}` exists'
    else:
        checks['iam_exists'] = f'❌ IAM role does not exist: {role_check["error"]}'
        report['issues_found'].append(role_check['error'])

    # Does the role grant Lambda execution permissions?
    perm_check = await check_lambda_permissions(canary, iam_client)
    if 'error' in perm_check:
        checks['lambda_execution'] = f'❌ IAM role check failed: {perm_check["error"]}'
        report['issues_found'].append(f'Cannot verify IAM permissions: {perm_check["error"]}')
        report['recommendations'].append(
            "Verify the canary's execution role exists and has proper permissions"
        )
    elif perm_check.get('has_managed_basic_execution', False):
        checks['lambda_execution'] = '✅ Has Lambda basic execution permissions'
    elif perm_check.get('has_basic_execution', False):
        checks['lambda_execution'] = '✅ Has custom Lambda execution permissions (sufficient)'
    else:
        checks['lambda_execution'] = '❌ Missing Lambda basic execution permissions'
        report['issues_found'].append('IAM role lacks Lambda execution permissions')
        report['recommendations'].append(
            'Add Lambda execution permissions (logs:CreateLogGroup, logs:CreateLogStream, logs:PutLogEvents)'
        )

    # VPC permissions are only meaningful when the role was readable.
    if 'error' not in perm_check:
        if perm_check.get('has_vpc_permissions', False):
            checks['lambda_vpc'] = '✅ Has Lambda VPC permissions'
        elif perm_check.get('needs_vpc_check', False):
            checks['lambda_vpc'] = '⚠️ No VPC permissions (may be needed if Lambda is in VPC)'

    report['status'] = 'completed'
    return report
182
+
183
+
184
async def analyze_har_file(s3_client, bucket_name, har_files, is_failed_run=True) -> dict:
    """Analyze HAR files from canary runs.

    Downloads the first HAR artifact from S3 (handling ``.gz`` compression
    and the ``.har.html`` wrapper format), then summarizes failed HTTP
    requests, the slowest requests with a timing breakdown, and requests
    with suspiciously high blocking or server wait times.

    Args:
        s3_client: boto3 S3 client.
        bucket_name: Artifact bucket name.
        har_files: List of S3 object dicts (each with a ``Key``); only the
            first entry is analyzed.
        is_failed_run: When True, HTTP responses with status >= 400 are
            collected as failed requests.

    Returns:
        Dict with ``status`` and up to 10 ``insights`` lines; on success
        also ``total_requests`` and ``failed_requests`` counts.
    """
    har_analysis = {'status': 'no_har_files', 'insights': []}

    if not har_files:
        return har_analysis

    try:
        har_key = har_files[0]['Key']  # Fix: use 'Key' not 'key'
        har_obj = s3_client.get_object(Bucket=bucket_name, Key=har_key)
        har_content = har_obj['Body'].read()

        if har_key.endswith('.gz'):
            har_content = gzip.decompress(har_content)

        content_str = har_content.decode('utf-8')

        # Handle .har.html format
        if har_key.endswith('.har.html'):
            # Extract JSON from HTML wrapper - find matching braces
            start_match = re.search(r'var harOutput\s*=\s*({)', content_str)
            if start_match:
                json_start = start_match.start(1)
                brace_count = 0
                json_end = -1

                # Find matching closing brace by scanning forward and
                # balancing '{' / '}' counts from the opening brace.
                for i, char in enumerate(content_str[json_start:], json_start):
                    if char == '{':
                        brace_count += 1
                    elif char == '}':
                        brace_count -= 1
                        if brace_count == 0:
                            json_end = i + 1
                            break

                if json_end > 0:
                    content_str = content_str[json_start:json_end]
                else:
                    return {'status': 'error', 'insights': ['Could not find end of HAR JSON data']}
            else:
                return {
                    'status': 'error',
                    'insights': ['Could not find harOutput variable in HTML'],
                }

        har_data = json.loads(content_str)

        entries = har_data.get('log', {}).get('entries', [])
        if not entries:
            return {'status': 'empty_har', 'insights': ['HAR file contains no network entries']}

        insights = []
        failed_requests = []
        request_details = []

        for entry in entries:
            request = entry.get('request', {})
            response = entry.get('response', {})
            timings = entry.get('timings', {})

            url = request.get('url', 'unknown')
            status = response.get('status', 0)

            # Extract all timing components.  HAR uses negative values for
            # "not applicable" phases, so those are clamped to 0 here.
            blocked = timings.get('blocked', 0) if timings.get('blocked', 0) > 0 else 0
            dns = timings.get('dns', 0) if timings.get('dns', 0) > 0 else 0
            connect = timings.get('connect', 0) if timings.get('connect', 0) > 0 else 0
            send = timings.get('send', 0) if timings.get('send', 0) > 0 else 0
            wait = timings.get('wait', 0) if timings.get('wait', 0) > 0 else 0
            receive = timings.get('receive', 0) if timings.get('receive', 0) > 0 else 0
            ssl = timings.get('ssl', 0) if timings.get('ssl', 0) > 0 else 0

            # Sum only positive numeric phases (same clamping as above).
            total_time = sum(
                [v for v in timings.values() if isinstance(v, (int, float)) and v > 0]
            )

            if total_time > 0:
                request_details.append(
                    {
                        'url': url,
                        'status': status,
                        'total': total_time,
                        'blocked': blocked,
                        'dns': dns,
                        'connect': connect,
                        'ssl': ssl,
                        'send': send,
                        'wait': wait,
                        'receive': receive,
                    }
                )

            if is_failed_run and int(status) >= 400:
                failed_requests.append(
                    {
                        'url': url,
                        'status': status,
                        'statusText': response.get('statusText', ''),
                        'total': total_time,
                        'blocked': blocked,
                        'wait': wait,
                    }
                )

        # Sort by total time to find slowest requests
        request_details.sort(key=lambda x: x['total'], reverse=True)

        if failed_requests:
            insights.append(f'🚨 Found {len(failed_requests)} failed HTTP requests:')
            for req in failed_requests[:3]:
                insights.append(f' • {req["status"]} {req["statusText"]}: {req["url"][:100]}...')

        # Show top slowest requests with timing breakdown
        if request_details:
            insights.append('🐌 Top 5 slowest requests (timing breakdown):')
            for i, req in enumerate(request_details[:5]):
                insights.append(f' {i + 1}. {req["total"]:.0f}ms total - {req["url"][:80]}')

                # Show timing breakdown (only phases that actually occurred).
                breakdown = []
                if req['blocked'] > 0:
                    breakdown.append(f'Blocked: {req["blocked"]:.0f}ms')
                if req['dns'] > 0:
                    breakdown.append(f'DNS: {req["dns"]:.0f}ms')
                if req['connect'] > 0:
                    breakdown.append(f'Connect: {req["connect"]:.0f}ms')
                if req['ssl'] > 0:
                    breakdown.append(f'SSL: {req["ssl"]:.0f}ms')
                if req['send'] > 0:
                    breakdown.append(f'Send: {req["send"]:.0f}ms')
                if req['wait'] > 0:
                    breakdown.append(f'Wait: {req["wait"]:.0f}ms')
                if req['receive'] > 0:
                    breakdown.append(f'Receive: {req["receive"]:.0f}ms')

                if breakdown:
                    insights.append(f' {" | ".join(breakdown)}')

        # Identify specific issues: >500ms blocked suggests connection limits.
        blocking_issues = [r for r in request_details if r['blocked'] > 500]
        if blocking_issues:
            insights.append(
                f'🔒 {len(blocking_issues)} requests with high blocking time (connection limits):'
            )
            for req in blocking_issues[:3]:
                insights.append(f' • {req["blocked"]:.0f}ms blocked: {req["url"][:80]}')

        # >1s server wait points at a slow backend rather than the network.
        waiting_issues = [r for r in request_details if r['wait'] > 1000]
        if waiting_issues:
            insights.append(f'⏳ {len(waiting_issues)} requests with high server wait time:')
            for req in waiting_issues[:3]:
                insights.append(f' • {req["wait"]:.0f}ms waiting: {req["url"][:80]}')

        har_analysis = {
            'status': 'analyzed',
            'total_requests': len(entries),
            'failed_requests': len(failed_requests),
            'insights': insights[:10],
        }

    except Exception as e:
        har_analysis = {'status': 'error', 'insights': [f'HAR analysis failed: {str(e)[:200]}']}

    return har_analysis
349
+
350
+
351
async def analyze_screenshots(s3_client, bucket_name, screenshots, is_failed_run=True) -> dict:
    """Summarize screenshot artifacts captured during a canary run.

    Classifies screenshots by filename keywords (error/failure, loaded/
    success, timeout) and emits hints for failed runs.  Note that the S3
    client and bucket are accepted for interface symmetry with the other
    artifact analyzers but the screenshots are not downloaded.

    Args:
        s3_client: boto3 S3 client (unused here).
        bucket_name: Artifact bucket name (unused here).
        screenshots: List of S3 object dicts, each with a ``Key``.
        is_failed_run: When True, failure-oriented hints are included.

    Returns:
        Dict with ``status``, ``insights``, and on success the screenshot
        count and detected type names.
    """
    if not screenshots:
        return {'status': 'no_screenshots', 'insights': []}

    try:
        notes = []
        categorized = {}

        # Classify each screenshot by keywords in its basename.
        for shot in screenshots:
            basename = shot['Key'].split('/')[-1].lower()
            if 'error' in basename or 'failure' in basename:
                categorized['error'] = shot
            elif 'loaded' in basename or 'success' in basename:
                categorized['success'] = shot
            elif 'timeout' in basename:
                categorized['timeout'] = shot

        if is_failed_run:
            if 'error' in categorized:
                notes.append('📸 Error screenshot captured - check for visible error messages')
                notes.append(f' Screenshot: {categorized["error"]["Key"]}')

            if 'timeout' in categorized:
                notes.append(
                    '⏰ Timeout screenshot available - page may not have loaded completely'
                )

            if not categorized:
                notes.append(
                    '📸 Basic screenshots available - check for unexpected page content'
                )

        notes.append(f'📊 Total screenshots: {len(screenshots)}')

        if categorized:
            notes.append(f'📋 Screenshot types: {", ".join(list(categorized.keys()))}')

        return {
            'status': 'analyzed',
            'total_screenshots': len(screenshots),
            'screenshot_types': list(categorized.keys()),
            'insights': notes,
        }

    except Exception as e:
        return {
            'status': 'error',
            'insights': [f'Screenshot analysis failed: {str(e)[:200]}'],
        }
406
+
407
+
408
async def analyze_log_files(s3_client, bucket_name, logs, is_failed_run=True) -> dict:
    """Analyze log files from canary runs.

    Downloads up to three log artifacts from S3 (handling ``.gz``
    compression) and, for failed runs, scans each line for common error
    keywords, skipping plain INFO/DEBUG lines that contain no error terms.

    Args:
        s3_client: boto3 S3 client.
        bucket_name: Artifact bucket name.
        logs: List of S3 object dicts, each with a ``Key``.
        is_failed_run: When True, error-pattern scanning is performed.

    Returns:
        Dict with ``status``, ``insights`` and, on success, the file count
        and number of error patterns found.
    """
    log_analysis = {'status': 'no_logs', 'insights': []}

    if not logs:
        return log_analysis

    try:
        insights = []
        error_patterns = []

        for log_file in logs[:3]:  # Limit to 3 log files
            log_key = log_file['Key']

            try:
                log_obj = s3_client.get_object(Bucket=bucket_name, Key=log_key)
                log_content = log_obj['Body'].read()

                if log_key.endswith('.gz'):
                    log_content = gzip.decompress(log_content)

                # errors='ignore' tolerates partially binary/garbled logs.
                log_text = log_content.decode('utf-8', errors='ignore')

                if is_failed_run:
                    error_keywords = [
                        'ERROR',
                        'FAILED',
                        'Exception',
                        'timeout',
                        'refused',
                        'not found',
                        '404',
                        '500',
                        '502',
                        '503',
                        '504',
                        'DNS_PROBE',
                        'CONNECTION_REFUSED',
                        'SSL_ERROR',
                        'ERR_',
                    ]

                    found_errors = []
                    for line in log_text.split('\n'):
                        line_lower = line.lower()
                        # Skip routine INFO/DEBUG lines unless they also
                        # mention an error term.
                        if any(level in line for level in [' INFO:', ' DEBUG:']) and not any(
                            err in line_lower for err in ['error', 'failed', 'exception', 'err_']
                        ):
                            continue

                        # First matching keyword wins; truncate the line.
                        for keyword in error_keywords:
                            if keyword.lower() in line_lower:
                                found_errors.append(line.strip()[:150])
                                break

                    if found_errors:
                        error_patterns.extend(found_errors[:5])

            except Exception as log_error:
                # One unreadable log must not abort analysis of the rest.
                insights.append(f'⚠️ Could not read log {log_key}: {str(log_error)[:100]}')

        if error_patterns:
            insights.append(f'🚨 Found {len(error_patterns)} error patterns in logs:')
            for i, error in enumerate(error_patterns[:5], 1):
                insights.append(f' {i}. {error}')
        elif is_failed_run:
            insights.append('📋 No obvious error patterns found in log files')
            insights.append('💡 Check CloudWatch Logs for more detailed error information')

        insights.append(f'📊 Analyzed {min(len(logs), 3)} log files')

        log_analysis = {
            'status': 'analyzed',
            'total_log_files': len(logs),
            'error_patterns_found': len(error_patterns),
            'insights': insights,
        }

    except Exception as e:
        log_analysis = {'status': 'error', 'insights': [f'Log analysis failed: {str(e)[:200]}']}

    return log_analysis
490
+
491
+
492
def check_resource_arns_correct(canary: dict, iam_client) -> dict:
    """Check if all resource ARNs in IAM policies are correct.

    Verifies that the S3 bucket references in the role's customer-managed
    policies match the canary's actual artifact bucket.  Scanning stops at
    the first mismatch.

    Args:
        canary: Canary description dict; reads ``ExecutionRoleArn`` and
            ``ArtifactS3Location``.
        iam_client: boto3 IAM client.

    Returns:
        ``{'correct': bool, 'actual_bucket': ...}`` on success, or
        ``{'correct': False, 'error': ...}`` when prerequisites are missing
        or the lookup fails.
    """
    execution_role_arn = canary.get('ExecutionRoleArn', '')
    if not execution_role_arn:
        return {'correct': False, 'error': 'No execution role configured'}

    role_name = execution_role_arn.split('/')[-1]

    try:
        policies_response = iam_client.list_attached_role_policies(RoleName=role_name)
        attached_policies = policies_response['AttachedPolicies']

        canary_bucket = canary.get('ArtifactS3Location', '')

        # Normalize to an s3:// URI so bucket extraction below is uniform.
        if not canary_bucket.startswith('s3://'):
            if canary_bucket:
                canary_bucket = f's3://{canary_bucket}'
            else:
                return {'correct': False, 'error': 'No S3 artifact location configured'}

        actual_bucket_name = canary_bucket.replace('s3://', '').split('/')[0]
        has_mismatch = False

        # Only customer-managed policies are inspected; AWS-managed policy
        # documents are assumed correct.
        for policy in attached_policies:
            if not policy['PolicyArn'].startswith('arn:aws:iam::aws:'):
                try:
                    policy_response = iam_client.get_policy(PolicyArn=policy['PolicyArn'])
                    policy_version = iam_client.get_policy_version(
                        PolicyArn=policy['PolicyArn'],
                        VersionId=policy_response['Policy']['DefaultVersionId'],
                    )

                    policy_doc = policy_version['PolicyVersion']['Document']

                    for statement in policy_doc.get('Statement', []):
                        # Resource may be a single string or a list.
                        resources = statement.get('Resource', [])
                        if isinstance(resources, str):
                            resources = [resources]

                        for resource in resources:
                            if 's3:::' in resource:
                                # Bucket portion precedes any object path.
                                s3_part = resource.split('s3:::')[1]
                                bucket_pattern = s3_part.split('/')[0]

                                if not _matches_bucket_pattern(actual_bucket_name, bucket_pattern):
                                    has_mismatch = True
                                    break

                        # Propagate the break outward: first mismatch ends
                        # the whole scan.
                        if has_mismatch:
                            break

                    if has_mismatch:
                        break

                except ClientError as e:
                    error_code = e.response.get('Error', {}).get('Code', '')
                    # A missing/invalid policy counts as a mismatch.
                    if error_code in ['NoSuchEntity', 'InvalidPolicyDocument']:
                        has_mismatch = True
                        break
                except Exception as e:
                    logger.error(f'Error: {str(e)}')
                    continue

        return {'correct': not has_mismatch, 'actual_bucket': actual_bucket_name}

    except Exception as e:
        return {'correct': False, 'error': str(e)}
559
+
560
+
561
+ def _matches_bucket_pattern(actual_bucket: str, pattern: str) -> bool:
562
+ """Check if actual bucket matches the pattern (including wildcards)."""
563
+ if pattern == actual_bucket:
564
+ return True
565
+
566
+ if '*' in pattern:
567
+ regex_pattern = pattern.replace('*', '.*')
568
+ return bool(re.match(f'^{regex_pattern}$', actual_bucket))
569
+
570
+ return False
571
+
572
+
573
async def analyze_canary_logs_with_time_window(
    canary_name: str,
    failure_time,
    canary: dict,
    window_minutes: int = 3,
    region: str = 'us-east-1',
) -> dict:
    """Analyze canary Lambda logs within a time window centered on a failure.

    Args:
        canary_name: Canary name (used in messages only).
        failure_time: ``datetime`` or ISO-8601 string (trailing ``Z``
            accepted) of the failure moment.
        canary: Canary description; ``EngineArn`` locates the Lambda
            function whose log group is queried.
        window_minutes: Total width of the analysis window, centered on
            ``failure_time``.
        region: AWS region (kept for interface parity; the module-level
            logs client is used).

    Returns:
        Dict with ``status`` plus categorized ``error_events``,
        ``warning_events``, ``info_events`` (max 5 each) and ``insights``.
    """
    try:
        # Accept ISO strings (with trailing 'Z') as well as datetime objects.
        if isinstance(failure_time, str):
            failure_time = datetime.fromisoformat(failure_time.replace('Z', '+00:00'))

        # Center the window on the failure.  True division is used so an odd
        # window (the default of 3) is not truncated — `window_minutes // 2`
        # previously yielded only a 2-minute span while the insight below
        # claims the full `window_minutes`-minute window was analyzed.
        half_window = timedelta(minutes=window_minutes / 2)
        start_time = failure_time - half_window
        end_time = failure_time + half_window

        # CloudWatch Logs expects epoch milliseconds.
        start_timestamp = int(start_time.timestamp() * 1000)
        end_timestamp = int(end_time.timestamp() * 1000)

        # The canary executes as a Lambda function; derive its log group
        # from the function name embedded in EngineArn.
        engine_arn = canary.get('EngineArn', '')
        function_name = engine_arn.split(':function:')[1].split(':')[0]
        log_group_name = f'/aws/lambda/{function_name}'

        try:
            response = logs_client.filter_log_events(
                logGroupName=log_group_name,
                startTime=start_timestamp,
                endTime=end_timestamp,
                limit=10,
            )

            events = response.get('events', [])

            # Bucket events by severity using simple keyword matching on
            # the lower-cased message.
            error_events = []
            warning_events = []
            info_events = []

            for event in events:
                message = event.get('message', '').lower()
                if any(
                    keyword in message for keyword in ['error', 'failed', 'exception', 'timeout']
                ):
                    error_events.append(
                        {
                            'timestamp': datetime.fromtimestamp(event.get('timestamp', 0) / 1000),
                            'message': event.get('message', '')[:300],
                        }
                    )
                elif any(keyword in message for keyword in ['warn', 'warning']):
                    warning_events.append(
                        {
                            'timestamp': datetime.fromtimestamp(event.get('timestamp', 0) / 1000),
                            'message': event.get('message', '')[:300],
                        }
                    )
                else:
                    info_events.append(
                        {
                            'timestamp': datetime.fromtimestamp(event.get('timestamp', 0) / 1000),
                            'message': event.get('message', '')[:200],
                        }
                    )

            return {
                'status': 'success',
                'time_window': f'{start_time.isoformat()} to {end_time.isoformat()}',
                'total_events': len(events),
                'error_events': error_events[:5],  # Limit to top 5
                'warning_events': warning_events[:5],  # Limit to top 5
                'info_events': info_events[:5],  # Limit to top 5
                'insights': [
                    f'Found {len(error_events)} error events',
                    f'Found {len(warning_events)} warning events',
                    f'Analyzed {window_minutes}-minute window around failure',
                ],
            }

        except ClientError as log_error:
            error_response = log_error.response.get('Error', {})
            if error_response.get('Code') == 'ResourceNotFoundException':
                return {
                    'status': 'no_logs',
                    'insights': [f'No CloudWatch logs found for canary: {canary_name}'],
                }
            else:
                return {
                    'status': 'error',
                    'insights': [
                        f'CloudWatch logs access error: {error_response.get("Message", str(log_error))}'
                    ],
                }

    except Exception as e:
        return {'status': 'error', 'insights': [f'Log analysis failed: {str(e)[:200]}']}
671
+
672
+
673
async def extract_disk_memory_usage_metrics(canary_name: str, region: str = 'us-east-1') -> dict:
    """Extract peak disk and memory usage telemetry from the canary's logs.

    Runs a CloudWatch Logs Insights query over the last 24 hours of the
    canary's Lambda log group and returns the maxima of the ephemeral
    storage and Synthetics memory telemetry fields.

    Args:
        canary_name: Name of the canary to inspect.
        region: AWS region (kept for interface parity; the module-level
            clients are used).

    Returns:
        Dict with ``maxEphemeralStorageUsageInMb``,
        ``maxEphemeralStorageUsagePercent`` and
        ``maxSyntheticsMemoryUsageInMB`` on success, or ``{'error': ...}``.
    """
    import time

    try:
        # Locate the Lambda function backing the canary.
        canary_response = synthetics_client.get_canary(Name=canary_name)
        canary = canary_response['Canary']

        # Handle both EngineArn and the newer EngineConfigs shape.
        engine_arn = canary.get('EngineArn', '')
        if not engine_arn:
            engine_configs = canary.get('EngineConfigs', [])
            if not engine_configs:
                return {'error': 'No EngineArn or EngineConfigs found for canary'}
            engine_arn = engine_configs[0]['EngineArn']

        function_name = engine_arn.split(':function:')[1].split(':')[0]
        log_group_name = f'/aws/lambda/{function_name}'

        # Query the last 24 hours.  Epoch seconds are taken directly from
        # time.time(): the previous datetime.utcnow().timestamp() built a
        # naive datetime that .timestamp() interprets in *local* time,
        # shifting the query window by the host's UTC offset.
        end_epoch = int(time.time())
        start_epoch = end_epoch - 24 * 3600

        query = """
        fields @timestamp, message.Result.telemetry.maxEphemeralStorageUsageInMb, message.Result.telemetry.maxEphemeralStorageUsagePercent, message.Result.telemetry.maxSyntheticsMemoryUsageInMB
        | filter ispresent(message.Result.telemetry.maxEphemeralStorageUsageInMb)
        | sort @timestamp desc
        | limit 20
        """

        response = logs_client.start_query(
            logGroupName=log_group_name,
            startTime=start_epoch,
            endTime=end_epoch,
            queryString=query,
        )

        query_id = response['queryId']

        # Poll for completion with exponential backoff, capped at 30s total.
        max_wait = 30
        wait_time = 0
        delay = 1
        result = None
        while wait_time < max_wait:
            result = logs_client.get_query_results(queryId=query_id)
            if result['status'] == 'Complete':
                break
            await asyncio.sleep(delay)
            wait_time += delay
            delay = min(delay * 2, 8)

        if not result or not result.get('results'):
            return {'error': 'No telemetry data found in canary logs'}

        # Each row is [timestamp, storageMb, storagePct, memoryMb]; missing
        # values default to 0.
        telemetry_data = []
        for row in result['results']:
            if len(row) >= 4:
                telemetry_data.append(
                    {
                        'timestamp': row[0].get('value', ''),
                        'maxEphemeralStorageUsageInMb': float(row[1].get('value', 0))
                        if row[1].get('value')
                        else 0,
                        'maxEphemeralStorageUsagePercent': float(row[2].get('value', 0))
                        if row[2].get('value')
                        else 0,
                        'maxSyntheticsMemoryUsageInMB': float(row[3].get('value', 0))
                        if row[3].get('value')
                        else 0,
                    }
                )

        if not telemetry_data:
            return {'error': 'No valid telemetry metrics found'}

        # Report the peak of each metric across the sampled runs.
        return {
            'maxEphemeralStorageUsageInMb': max(
                t['maxEphemeralStorageUsageInMb'] for t in telemetry_data
            ),
            'maxEphemeralStorageUsagePercent': max(
                t['maxEphemeralStorageUsagePercent'] for t in telemetry_data
            ),
            'maxSyntheticsMemoryUsageInMB': max(
                t['maxSyntheticsMemoryUsageInMB'] for t in telemetry_data
            ),
        }

    except Exception as e:
        return {'error': f'Resource analysis failed: {str(e)[:200]}'}
761
+
762
+
763
async def get_canary_code(canary: dict, region: str = 'us-east-1') -> dict:
    """Extract and analyze canary code from Lambda layers.

    Tries, in order: the layer named by ``Code.SourceLocationArn``, any
    non-Synthetics custom layer attached to the function, and finally the
    function's own code package.  The first source that yields a ``.js`` or
    ``.py`` file wins; its content is returned with line numbers prefixed.

    Args:
        canary: Canary description dict.
        region: AWS region (accepted for interface parity; the module-level
            Lambda client is used).

    Returns:
        Dict with function configuration fields and ``code_content``
        (empty string if no source could be extracted), or
        ``{'error': ...}`` on total failure.
    """
    try:
        # Handle both EngineArn and the newer EngineConfigs shape.
        engine_arn = canary.get('EngineArn', '')
        if not engine_arn:
            engine_configs = canary.get('EngineConfigs', [])
            if not engine_configs:
                return {'error': 'No EngineArn or EngineConfigs found for canary'}
            engine_arn = engine_configs[0]['EngineArn']

        function_name = engine_arn.split(':function:')[1].split(':')[0]

        # Get function configuration
        function_response = lambda_client.get_function(FunctionName=function_name)
        config = function_response['Configuration']

        result = {
            'function_name': function_name,
            'memory_size': config['MemorySize'],
            'timeout': config['Timeout'],
            # 512 MB is used as the fallback when EphemeralStorage is absent.
            'ephemeral_storage': config.get('EphemeralStorage', {}).get('Size', 512),
            'layers_count': len(config.get('Layers', [])),
            'code_content': '',
        }

        # Source 1: the layer explicitly recorded as the canary's source.
        source_location_arn = canary.get('Code', {}).get('SourceLocationArn', '')
        if source_location_arn and ':layer:' in source_location_arn:
            try:
                layer_response = lambda_client.get_layer_version_by_arn(Arn=source_location_arn)
                if 'Location' in layer_response['Content']:
                    # NOTE(review): delete=False + manual os.unlink means the
                    # temp file is leaked if an exception fires before the
                    # unlink call — confirm whether cleanup via try/finally
                    # is wanted here.
                    with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp_file:
                        # NOTE(review): `requests` is a third-party runtime
                        # dependency imported lazily here — verify it is
                        # declared in the package's dependencies.
                        import requests

                        response = requests.get(layer_response['Content']['Location'], timeout=30)
                        tmp_file.write(response.content)
                        tmp_file.flush()

                        with zipfile.ZipFile(tmp_file.name, 'r') as zip_ref:
                            code_files = [
                                f for f in zip_ref.namelist() if f.endswith(('.js', '.py'))
                            ]

                            # Find the actual canary file using handler info
                            handler = canary.get('Code', {}).get('Handler', '')
                            if handler:
                                handler_path = handler.replace('.handler', '')
                                canary_file = next(
                                    (f for f in code_files if handler_path in f), None
                                )
                                if canary_file:
                                    with zip_ref.open(canary_file) as f:
                                        code_content = f.read().decode('utf-8')
                                        lines = code_content.split('\n')
                                        # Prefix each line with its 1-based number.
                                        result['code_content'] = '\n'.join(
                                            f'{i + 1}: {line}' for i, line in enumerate(lines)
                                        )
                        os.unlink(tmp_file.name)
            except Exception as e:
                logger.warning(
                    f'Failed to extract canary code from layer {source_location_arn}: {str(e)}'
                )

        # Source 2: try custom layers from function config if no code found yet.
        if not result['code_content']:
            custom_layers = [
                l for l in config.get('Layers', []) if ':layer:Synthetics' not in l['Arn']
            ]

            for layer in custom_layers:
                try:
                    layer_response = lambda_client.get_layer_version_by_arn(Arn=layer['Arn'])
                    if 'Location' in layer_response['Content']:
                        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp_file:
                            import requests

                            response = requests.get(
                                layer_response['Content']['Location'], timeout=30
                            )
                            tmp_file.write(response.content)
                            tmp_file.flush()

                            with zipfile.ZipFile(tmp_file.name, 'r') as zip_ref:
                                code_files = [
                                    f for f in zip_ref.namelist() if f.endswith(('.js', '.py'))
                                ]
                                if code_files:
                                    with zip_ref.open(code_files[0]) as f:
                                        code_content = f.read().decode('utf-8')
                                        lines = code_content.split('\n')
                                        result['code_content'] = '\n'.join(
                                            f'{i + 1}: {line}' for i, line in enumerate(lines)
                                        )
                                        # NOTE(review): this break exits the
                                        # layer loop before os.unlink below
                                        # runs, leaking the temp file on the
                                        # success path — confirm intent.
                                        break
                            os.unlink(tmp_file.name)
                except Exception as e:
                    logger.warning(
                        f'Failed to extract canary code from custom layer {layer["Arn"]}: {str(e)}'
                    )
                    continue

        # Source 3: if no code found in layers, try function code directly.
        if not result['code_content']:
            try:
                code_location = function_response['Code']['Location']
                with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp_file:
                    import requests

                    response = requests.get(code_location, timeout=30)
                    tmp_file.write(response.content)
                    tmp_file.flush()

                    with zipfile.ZipFile(tmp_file.name, 'r') as zip_ref:
                        code_files = [f for f in zip_ref.namelist() if f.endswith(('.js', '.py'))]
                        if code_files:
                            with zip_ref.open(code_files[0]) as f:
                                code_content = f.read().decode('utf-8')
                                lines = code_content.split('\n')
                                result['code_content'] = '\n'.join(
                                    f'{i + 1}: {line}' for i, line in enumerate(lines)
                                )
                    os.unlink(tmp_file.name)
            except Exception as e:
                result['code_content'] = f'Could not extract function code: {str(e)}'

        return result

    except Exception as e:
        return {'error': f'Canary code analysis failed: {str(e)}'}
891
+
892
+
893
async def get_canary_metrics_and_service_insights(canary_name: str, region: str) -> str:
    """Run the Application Signals audit API over the canary's last 15 minutes.

    Args:
        canary_name: Name of the canary to audit.
        region: AWS region passed through to the audit API.

    Returns:
        The audit report text, or an availability error message if the
        audit API (or its module) cannot be used.
    """
    import time

    try:
        # Imported lazily so a missing/broken audit module degrades to the
        # error string below instead of failing at module import time.
        from .audit_utils import execute_audit_api

        targets = [{'Type': 'canary', 'Data': {'Canary': {'CanaryName': canary_name}}}]
        audit_input = {
            'StartTime': int(time.time()) - 900,  # 15-minute lookback
            'EndTime': int(time.time()),
            'AuditTargets': targets,
            'Auditors': ['canary', 'operation_metric', 'trace'],
        }
        heading = f'Canary Analysis for {canary_name}\n'
        return await execute_audit_api(audit_input, region, heading)

    except Exception as e:
        return f'ListAuditFindings API unavailable: {str(e)}'