awslabs.eks-mcp-server 0.1.1__py3-none-any.whl

@@ -0,0 +1,670 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+
+ """CloudWatch handler for the EKS MCP Server."""
+
+ import datetime
+ import json
+ import time
+ from awslabs.eks_mcp_server.aws_helper import AwsHelper
+ from awslabs.eks_mcp_server.logging_helper import LogLevel, log_with_request_id
+ from awslabs.eks_mcp_server.models import CloudWatchLogsResponse, CloudWatchMetricsResponse
+ from mcp.server.fastmcp import Context
+ from mcp.types import TextContent
+ from pydantic import Field
+ from typing import Optional, Union
+
+
+ class CloudWatchHandler:
+     """Handler for CloudWatch operations in the EKS MCP Server.
+
+     This class provides tools for retrieving and analyzing CloudWatch logs and metrics
+     from EKS clusters, enabling effective monitoring and troubleshooting.
+     """
+
+     def __init__(self, mcp, allow_sensitive_data_access=False):
+         """Initialize the CloudWatch handler.
+
+         Args:
+             mcp: The MCP server instance
+             allow_sensitive_data_access: Whether to allow access to sensitive data (default: False)
+         """
+         self.mcp = mcp
+         self.allow_sensitive_data_access = allow_sensitive_data_access
+
+         # Register tools
+         self.mcp.tool(name='get_cloudwatch_logs')(self.get_cloudwatch_logs)
+         self.mcp.tool(name='get_cloudwatch_metrics')(self.get_cloudwatch_metrics)
+
+     def resolve_time_range(
+         self,
+         start_time: Optional[Union[str, datetime.datetime]] = None,
+         end_time: Optional[Union[str, datetime.datetime]] = None,
+         minutes: int = 15,
+     ) -> tuple:
+         """Resolve start and end times for CloudWatch queries.
+
+         This function is public for unit testing purposes.
+
+         Args:
+             start_time: Start time as string (ISO format) or datetime object
+             end_time: End time as string (ISO format) or datetime object
+             minutes: Number of minutes to look back if start_time is not provided
+
+         Returns:
+             Tuple of (start_datetime, end_datetime)
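+
+         Example (illustrative; assumes a handler instance named `handler`,
+         with hypothetical values):
+
+             >>> start_dt, end_dt = handler.resolve_time_range(
+             ...     start_time='2023-01-01T00:00:00',
+             ...     end_time='2023-01-01T01:00:00',
+             ... )
+             >>> (end_dt - start_dt).total_seconds()
+             3600.0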
+         """
+         # Handle end_time
+         if end_time is None:
+             end_dt = datetime.datetime.now()
+         elif isinstance(end_time, str):
+             end_dt = datetime.datetime.fromisoformat(end_time)
+         else:
+             end_dt = end_time
+
+         # Handle start_time
+         if start_time is None:
+             start_dt = end_dt - datetime.timedelta(minutes=minutes)
+         elif isinstance(start_time, str):
+             start_dt = datetime.datetime.fromisoformat(start_time)
+         else:
+             start_dt = start_time
+
+         return start_dt, end_dt
+
+     async def get_cloudwatch_logs(
+         self,
+         ctx: Context,
+         resource_type: str = Field(
+             ...,
+             description='Resource type to search logs for. Valid values: "pod", "node", "container". This determines how logs are filtered.',
+         ),
+         resource_name: str = Field(
+             ...,
+             description='Resource name to search for in log messages (e.g., pod name, node name, container name). Used to filter logs for the specific resource.',
+         ),
+         cluster_name: str = Field(
+             ...,
+             description='Name of the EKS cluster where the resource is located. Used to construct the CloudWatch log group name.',
+         ),
+         log_type: str = Field(
+             ...,
+             description="""Log type to query. Options:
+             - "application": Container/application logs
+             - "host": Node-level system logs
+             - "performance": Performance metrics logs
+             - "dataplane": EKS data plane component logs
+             - "control-plane": EKS control plane logs
+             - Or provide a custom CloudWatch log group name directly""",
+         ),
+         minutes: int = Field(
+             15,
+             description='Number of minutes to look back for logs. Default: 15. Ignored if start_time is provided. Use smaller values for recent issues, larger values for historical analysis.',
+         ),
+         start_time: Optional[str] = Field(
+             None,
+             description='Start time in ISO format (e.g., "2023-01-01T00:00:00Z"). If provided, overrides the minutes parameter. IMPORTANT: Use this for precise time ranges.',
+         ),
+         end_time: Optional[str] = Field(
+             None,
+             description='End time in ISO format (e.g., "2023-01-01T01:00:00Z"). If not provided, defaults to current time. IMPORTANT: Use with start_time for precise time ranges.',
+         ),
+         limit: int = Field(
+             50,
+             description='Maximum number of log entries to return. Use lower values (10-50) for faster queries, higher values (100-1000) for more comprehensive results. IMPORTANT: Higher values may impact performance.',
+         ),
+         filter_pattern: Optional[str] = Field(
+             None,
+             description='Additional CloudWatch Logs Insights command to apply, appended to the query as an extra stage (e.g., "filter @message like /ERROR/"). IMPORTANT: Use this to narrow down results for specific issues.',
+         ),
+         fields: Optional[str] = Field(
+             None,
+             description='Custom fields to include in the query results (defaults to "@timestamp, @message"). Use CloudWatch Logs Insights field syntax. IMPORTANT: Only specify if you need fields beyond the default timestamp and message.',
+         ),
+     ) -> CloudWatchLogsResponse:
+         """Get logs from CloudWatch for a specific resource.
+
+         This tool retrieves logs from CloudWatch for Kubernetes resources in an EKS cluster,
+         allowing you to analyze application behavior, troubleshoot issues, and monitor system
+         health. It supports filtering by resource type, time range, and content for troubleshooting
+         application errors, investigating security incidents, and analyzing startup configuration issues.
+
+         ## Requirements
+         - The server must be run with the `--allow-sensitive-data-access` flag
+         - The EKS cluster must have CloudWatch logging enabled
+         - The resource must exist in the specified cluster
+
+         ## Response Information
+         The response includes resource details (type, name, cluster), log group information,
+         time range queried, and formatted log entries with timestamps and messages.
+
+         ## Usage Tips
+         - Start with a small time range (15-30 minutes) and expand if needed
+         - Use filter_pattern to narrow down results (e.g., "filter @message like /ERROR/")
+         - For JSON logs, the tool automatically parses nested structures
+         - Combine with get_k8s_events for comprehensive troubleshooting
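+
+         ## Example
+         An illustrative call (all names hypothetical) for recent application errors:
+         resource_type='pod', resource_name='my-app-abc123', cluster_name='prod-cluster',
+         log_type='application', minutes=30, filter_pattern='filter @message like /ERROR/'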
+
+         Args:
+             ctx: MCP context
+             resource_type: Resource type (pod, node, container)
+             resource_name: Resource name to search for in log messages
+             cluster_name: Name of the EKS cluster
+             log_type: Log type (application, host, performance, dataplane, control-plane, or custom)
+             minutes: Number of minutes to look back
+             start_time: Start time in ISO format (overrides minutes)
+             end_time: End time in ISO format (defaults to now)
+             limit: Maximum number of log entries to return
+             filter_pattern: Additional CloudWatch Logs Insights command appended to the query
+             fields: Custom fields to include in the query results
+
+         Returns:
+             CloudWatchLogsResponse with log entries and resource information
+         """
+         try:
+             # Check if sensitive data access is allowed
+             if not self.allow_sensitive_data_access:
+                 error_message = (
+                     'Access to CloudWatch logs requires --allow-sensitive-data-access flag'
+                 )
+                 log_with_request_id(ctx, LogLevel.ERROR, error_message)
+                 return CloudWatchLogsResponse(
+                     isError=True,
+                     content=[TextContent(type='text', text=error_message)],
+                     resource_type=resource_type,
+                     resource_name=resource_name,
+                     cluster_name=cluster_name,
+                     log_type=log_type,
+                     log_group='',
+                     start_time='',
+                     end_time='',
+                     log_entries=[],
+                 )
+
+             start_dt, end_dt = self.resolve_time_range(start_time, end_time, minutes)
+
+             # Create CloudWatch Logs client
+             logs = AwsHelper.create_boto3_client('logs')
+
+             # Determine the log group based on log_type
+             known_types = {'application', 'host', 'performance', 'dataplane'}
+             if log_type in known_types:
+                 log_group = f'/aws/containerinsights/{cluster_name}/{log_type}'
+             elif log_type == 'control-plane':
+                 log_group = f'/aws/eks/{cluster_name}/cluster'
+             else:
+                 log_group = log_type  # Assume user passed full log group name
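+             # For example (illustrative names): log_type='application' with cluster
+             # 'prod-cluster' resolves to '/aws/containerinsights/prod-cluster/application',
+             # while log_type='control-plane' resolves to '/aws/eks/prod-cluster/cluster'.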
+
+             # Determine fields to include
+             query_fields = fields if fields else '@timestamp, @message'
+
+             # Construct the base query
+             query = f"""
+             fields {query_fields}
+             | filter @message like '{resource_name}'
+             """
+
+             # Add additional filter pattern if provided
+             if filter_pattern:
+                 query += f'\n| {filter_pattern}'
+
+             # Add sorting and limit
+             query += f'\n| sort @timestamp desc\n| limit {limit}'
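+             # For example (illustrative), with the default fields and limit the
+             # assembled query is equivalent to:
+             #   fields @timestamp, @message
+             #   | filter @message like 'my-app-abc123'
+             #   | sort @timestamp desc
+             #   | limit 50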
+
+             log_with_request_id(
+                 ctx,
+                 LogLevel.INFO,
+                 f'Starting CloudWatch Logs query for {resource_type} {resource_name} in cluster {cluster_name}',
+                 log_group=log_group,
+                 start_time=start_dt.isoformat(),
+                 end_time=end_dt.isoformat(),
+             )
+
+             # Start the query
+             start_query_response = logs.start_query(
+                 logGroupName=log_group,
+                 startTime=int(start_dt.timestamp()),
+                 endTime=int(end_dt.timestamp()),
+                 queryString=query,
+             )
+
+             query_id = start_query_response['queryId']
+
+             # Poll for results
+             query_response = self._poll_query_results(
+                 ctx, logs, query_id, resource_type, resource_name
+             )
+
+             # Process results
+             results = query_response['results']
+             log_entries = []
+
+             for result in results:
+                 entry = self._build_log_entry(result)
+                 log_entries.append(entry)
+
+             log_with_request_id(
+                 ctx,
+                 LogLevel.INFO,
+                 f'Retrieved {len(log_entries)} log entries for {resource_type} {resource_name}',
+             )
+
+             # Return the results
+             return CloudWatchLogsResponse(
+                 isError=False,
+                 content=[
+                     TextContent(
+                         type='text',
+                         text=f'Successfully retrieved {len(log_entries)} log entries for {resource_type} {resource_name} in cluster {cluster_name}',
+                     )
+                 ],
+                 resource_type=resource_type,
+                 resource_name=resource_name,
+                 cluster_name=cluster_name,
+                 log_type=log_type,
+                 log_group=log_group,
+                 start_time=start_dt.isoformat(),
+                 end_time=end_dt.isoformat(),
+                 log_entries=log_entries,
+             )
+
+         except Exception as e:
+             error_message = f'Failed to get logs for {resource_type} {resource_name}: {str(e)}'
+             log_with_request_id(ctx, LogLevel.ERROR, error_message)
+
+             return CloudWatchLogsResponse(
+                 isError=True,
+                 content=[TextContent(type='text', text=error_message)],
+                 resource_type=resource_type,
+                 resource_name=resource_name,
+                 cluster_name=cluster_name,
+                 log_type=log_type,
+                 log_group='',
+                 start_time='',
+                 end_time='',
+                 log_entries=[],
+             )
+
+     async def get_cloudwatch_metrics(
+         self,
+         ctx: Context,
+         resource_type: str = Field(
+             ...,
+             description='Resource type to retrieve metrics for. Valid values: "pod", "node", "container", "cluster", "service". Determines the CloudWatch dimensions.',
+         ),
+         resource_name: str = Field(
+             ...,
+             description='Name of the resource to retrieve metrics for (e.g., pod name, node name). Used as a dimension value in CloudWatch.',
+         ),
+         cluster_name: str = Field(
+             ...,
+             description='Name of the EKS cluster where the resource is located. Used as the ClusterName dimension in CloudWatch.',
+         ),
+         metric_name: str = Field(
+             ...,
+             description="""Metric name to retrieve. Common examples:
+             - cpu_usage_total: Total CPU usage
+             - memory_rss: Resident Set Size memory usage
+             - network_rx_bytes: Network bytes received
+             - network_tx_bytes: Network bytes transmitted""",
+         ),
+         namespace: str = Field(
+             ...,
+             description="""CloudWatch namespace where the metric is stored. Common values:
+             - "ContainerInsights": For container metrics
+             - "AWS/EC2": For EC2 instance metrics
+             - "AWS/EKS": For EKS control plane metrics""",
+         ),
+         k8s_namespace: str = Field(
+             'default',
+             description='Kubernetes namespace for the resource. Used as the Namespace dimension in CloudWatch. Default: "default"',
+         ),
+         minutes: int = Field(
+             15,
+             description='Number of minutes to look back for metrics. Default: 15. Ignored if start_time is provided. IMPORTANT: Choose a time range appropriate for the metric resolution.',
+         ),
+         start_time: Optional[str] = Field(
+             None,
+             description='Start time in ISO format (e.g., "2023-01-01T00:00:00Z"). If provided, overrides the minutes parameter. IMPORTANT: Use this for precise historical analysis.',
+         ),
+         end_time: Optional[str] = Field(
+             None,
+             description='End time in ISO format (e.g., "2023-01-01T01:00:00Z"). If not provided, defaults to current time. IMPORTANT: Use with start_time for precise time ranges.',
+         ),
+         limit: int = Field(
+             50,
+             description='Maximum number of data points to return. Higher values (100-1000) provide more granular data but may impact performance. IMPORTANT: Balance between granularity and performance.',
+         ),
+         period: int = Field(
+             60,
+             description='Period in seconds for the metric data points. Default: 60 (1 minute). Lower values (1-60) provide higher resolution but may be less available. IMPORTANT: Match to your monitoring needs.',
+         ),
+         stat: str = Field(
+             'Average',
+             description="""Statistic to use for the metric aggregation:
+             - Average: Mean value during the period
+             - Sum: Total value during the period
+             - Maximum: Highest value during the period
+             - Minimum: Lowest value during the period
+             - SampleCount: Number of samples during the period""",
+         ),
+         custom_dimensions: Optional[dict] = Field(
+             None,
+             description='Custom dimensions to use instead of the default ones. Provide as a dictionary of dimension name-value pairs. IMPORTANT: Only use this if you need to override the standard dimensions.',
+         ),
+     ) -> CloudWatchMetricsResponse:
+         """Get metrics from CloudWatch for a specific resource.
+
+         This tool retrieves metrics from CloudWatch for Kubernetes resources in an EKS cluster,
+         allowing you to monitor performance, resource utilization, and system health. It supports
+         various resource types and metrics with flexible time ranges and aggregation options for
+         monitoring CPU/memory usage, analyzing network traffic, and identifying performance bottlenecks.
+
+         ## Requirements
+         - The EKS cluster must have CloudWatch Container Insights enabled
+         - The resource must exist in the specified cluster
+         - The metric must be available in the specified namespace
+
+         ## Response Information
+         The response includes resource details (type, name, cluster), metric information (name, namespace),
+         time range queried, and data points with timestamps and values.
+
+         ## Usage Tips
+         - Use appropriate statistics for different metrics (e.g., Average for CPU, Maximum for memory spikes)
+         - Match the period to your analysis needs (smaller for detailed graphs, larger for trends)
+         - For rate metrics like network traffic, Sum is often more useful than Average
+         - Combine with get_cloudwatch_logs to correlate metrics with log events
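+
+         ## Example
+         An illustrative call (all names hypothetical) for pod CPU usage over the last hour:
+         resource_type='pod', resource_name='my-app-abc123', cluster_name='prod-cluster',
+         metric_name='cpu_usage_total', namespace='ContainerInsights', minutes=60, stat='Average'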
+
+         Args:
+             ctx: MCP context
+             resource_type: Resource type (pod, node, container, cluster, service)
+             resource_name: Resource name
+             cluster_name: Name of the EKS cluster
+             metric_name: Metric name (e.g., cpu_usage_total, memory_rss)
+             namespace: CloudWatch namespace
+             k8s_namespace: Kubernetes namespace for the resource
+             minutes: Number of minutes to look back
+             start_time: Start time in ISO format (overrides minutes)
+             end_time: End time in ISO format (defaults to now)
+             limit: Maximum number of data points to return
+             period: Period in seconds for the metric data points
+             stat: Statistic to use for the metric
+             custom_dimensions: Custom dimensions to use instead of defaults
+
+         Returns:
+             CloudWatchMetricsResponse with metric data points and resource information
+         """
+         try:
+             start_dt, end_dt = self.resolve_time_range(start_time, end_time, minutes)
+
+             # Create CloudWatch client
+             cloudwatch = AwsHelper.create_boto3_client('cloudwatch')
+
+             # Use custom dimensions if provided, otherwise determine based on resource_type
+             dimensions = {}
+
+             if isinstance(custom_dimensions, dict):
+                 # Use the provided custom dimensions directly
+                 dimensions = custom_dimensions
+             elif custom_dimensions is not None and not hasattr(custom_dimensions, 'default'):
+                 # Try to convert to dict if possible
+                 try:
+                     dimensions = dict(custom_dimensions)
+                 except (TypeError, ValueError):
+                     # If conversion fails, use default dimensions
+                     dimensions = {'ClusterName': cluster_name}
+             else:
+                 # Set default dimensions based on resource type
+                 dimensions['ClusterName'] = cluster_name
+                 dimensions['Namespace'] = k8s_namespace
+
+                 if resource_type == 'pod':
+                     dimensions['PodName'] = resource_name
+                 elif resource_type == 'node':
+                     dimensions['NodeName'] = resource_name
+                 elif resource_type == 'container':
+                     dimensions['ContainerName'] = resource_name
+                 elif resource_type == 'service':
+                     dimensions['Service'] = resource_name
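+             # For example (illustrative names), a pod query ends up with dimensions:
+             #   {'ClusterName': 'prod-cluster', 'Namespace': 'default', 'PodName': 'my-app-abc123'}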
+
+             log_with_request_id(
+                 ctx,
+                 LogLevel.INFO,
+                 f'Getting CloudWatch metrics for {resource_type} {resource_name} in cluster {cluster_name}',
+                 metric_name=metric_name,
+                 namespace=namespace,
+                 start_time=start_dt.isoformat(),
+                 end_time=end_dt.isoformat(),
+             )
+
+             # Create the metric data query
+             metric_data_query = {
+                 'Id': 'm1',
+                 'ReturnData': True,
+             }
+
+             # Convert dimensions to the format expected by CloudWatch
+             dimension_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]
+
+             # Create the metric definition
+             metric_def = {
+                 'Namespace': namespace,
+                 'MetricName': metric_name,
+                 'Dimensions': dimension_list,
+             }
+
+             # Create the metric stat with the appropriate statistics
+             # Handle the case where period/stat is a Field object
+             period_value = period if isinstance(period, int) else period.default
+             stat_value = stat if isinstance(stat, str) else stat.default
+
+             # Create the metric stat
+             metric_stat = {'Metric': metric_def, 'Period': period_value, 'Stat': stat_value}
+
+             # Add the metric stat to the query
+             metric_data_query['MetricStat'] = metric_stat
+
+             # Get metric data
+             response = cloudwatch.get_metric_data(
+                 MetricDataQueries=[metric_data_query],
+                 StartTime=start_dt,
+                 EndTime=end_dt,
+                 MaxDatapoints=limit,
+             )
+
+             # Process results
+             metric_data = response['MetricDataResults'][0]
+             timestamps = [ts.isoformat() for ts in metric_data.get('Timestamps', [])]
+             values = metric_data.get('Values', [])
+
+             # Create data points by pairing each timestamp with its value
+             data_points = [
+                 {'timestamp': ts, 'value': value} for ts, value in zip(timestamps, values)
+             ]
+
+             log_with_request_id(
+                 ctx,
+                 LogLevel.INFO,
+                 f'Retrieved {len(data_points)} metric data points for {resource_type} {resource_name}',
+             )
+
+             # Return the results
+             return CloudWatchMetricsResponse(
+                 isError=False,
+                 content=[
+                     TextContent(
+                         type='text',
+                         text=f'Successfully retrieved {len(data_points)} metric data points for {resource_type} {resource_name} in cluster {cluster_name}',
+                     )
+                 ],
+                 resource_type=resource_type,
+                 resource_name=resource_name,
+                 cluster_name=cluster_name,
+                 metric_name=metric_name,
+                 namespace=namespace,
+                 start_time=start_dt.isoformat(),
+                 end_time=end_dt.isoformat(),
+                 data_points=data_points,
+             )
+
+         except Exception as e:
+             error_message = f'Failed to get metrics for {resource_type} {resource_name}: {str(e)}'
+             log_with_request_id(ctx, LogLevel.ERROR, error_message)
+
+             return CloudWatchMetricsResponse(
+                 isError=True,
+                 content=[TextContent(type='text', text=error_message)],
+                 resource_type=resource_type,
+                 resource_name=resource_name,
+                 cluster_name=cluster_name,
+                 metric_name=metric_name,
+                 namespace=namespace,
+                 start_time='',
+                 end_time='',
+                 data_points=[],
+             )
+
+     def _poll_query_results(
+         self,
+         ctx,
+         logs_client,
+         query_id,
+         resource_type,
+         resource_name,
+         max_attempts=60,
+         initial_delay=1,
+     ):
+         """Poll for CloudWatch Logs query results with exponential backoff.
+
+         Args:
+             ctx: MCP context
+             logs_client: Boto3 CloudWatch Logs client
+             query_id: ID of the query to poll for
+             resource_type: Resource type for logging
+             resource_name: Resource name for logging
+             max_attempts: Maximum number of polling attempts before timing out
+             initial_delay: Initial delay between polling attempts in seconds
+
+         Returns:
+             Query response when complete
+
+         Raises:
+             TimeoutError: If the query does not complete within the maximum number of attempts
+         """
+         attempts = 0
+         delay = initial_delay
+
+         log_with_request_id(
+             ctx,
+             LogLevel.INFO,
+             f'Polling for CloudWatch Logs query results (query_id: {query_id})',
+         )
+
+         while attempts < max_attempts:
+             query_response = logs_client.get_query_results(queryId=query_id)
+             status = query_response.get('status')
+
+             if status == 'Complete':
+                 log_with_request_id(
+                     ctx,
+                     LogLevel.INFO,
+                     f'CloudWatch Logs query completed successfully after {attempts + 1} attempts',
+                 )
+                 return query_response
+             elif status == 'Failed':
+                 error_message = f'CloudWatch Logs query failed for {resource_type} {resource_name}'
+                 log_with_request_id(ctx, LogLevel.ERROR, error_message)
+                 raise Exception(error_message)
+             elif status == 'Cancelled':
+                 error_message = (
+                     f'CloudWatch Logs query was cancelled for {resource_type} {resource_name}'
+                 )
+                 log_with_request_id(ctx, LogLevel.ERROR, error_message)
+                 raise Exception(error_message)
+
+             # Log progress periodically
+             if attempts % 5 == 0:
+                 log_with_request_id(
+                     ctx,
+                     LogLevel.INFO,
+                     f'Waiting for CloudWatch Logs query to complete (attempt {attempts + 1}/{max_attempts})',
+                 )
+
+             # Sleep with exponential backoff (capped at 5 seconds)
+             time.sleep(min(delay, 5))
+             delay = min(delay * 1.5, 5)  # Exponential backoff with a cap
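+             # With initial_delay=1, successive sleeps are 1, 1.5, 2.25, 3.375, then 5s thereafter.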
+             attempts += 1
+
+         # If we've exhausted all attempts, raise a timeout error
+         error_message = f'CloudWatch Logs query timed out after {max_attempts} attempts for {resource_type} {resource_name}'
+         log_with_request_id(ctx, LogLevel.ERROR, error_message)
+         raise TimeoutError(error_message)
+
+     def _build_log_entry(self, result):
+         """Build a log entry from a CloudWatch Logs query result.
+
+         Args:
+             result: A single result from CloudWatch Logs query
+
+         Returns:
+             Formatted log entry dictionary
+         """
+         entry = {}
+         for field in result:
+             if field['field'] == '@timestamp':
+                 entry['timestamp'] = field['value']
+             elif field['field'] == '@message':
+                 message = field['value']
+
+                 # Clean up the message to make it more human-readable:
+                 # drop escaped newlines and unescape embedded quotes
+                 message = message.replace('\\n', '')
+                 message = message.replace('\\"', '"')
+
+                 # Try to parse JSON if the message appears to be JSON
+                 if message.startswith('{') and message.endswith('}'):
+                     try:
+                         parsed_json = json.loads(message)
+
+                         # Format any nested JSON structures
+                         parsed_json = self._format_nested_json(parsed_json)
+
+                         entry['message'] = parsed_json
+                     except json.JSONDecodeError:
+                         # If it's not valid JSON, just use the cleaned message
+                         entry['message'] = message
+                 else:
+                     # For non-JSON messages, use the cleaned message
+                     entry['message'] = message
+             else:
+                 entry[field['field']] = field['value']
+         return entry
+
+     def _format_nested_json(self, obj):
+         """Format nested JSON objects for better readability.
+
+         Args:
+             obj: The JSON object to format
+
+         Returns:
+             The formatted JSON object
+         """
+         if isinstance(obj, dict):
+             for key, value in obj.items():
+                 if isinstance(value, (dict, list)):
+                     obj[key] = self._format_nested_json(value)
+                 elif isinstance(value, str) and value.startswith('{') and value.endswith('}'):
+                     try:
+                         obj[key] = json.loads(value)
+                     except json.JSONDecodeError:
+                         pass
+         elif isinstance(obj, list):
+             for i, item in enumerate(obj):
+                 obj[i] = self._format_nested_json(item)
+         return obj
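+
+
+ # Illustrative usage sketch (not part of the original module). Assuming the
+ # FastMCP server from the mcp Python SDK, the handler registers its tools on
+ # construction:
+ #
+ #     from mcp.server.fastmcp import FastMCP
+ #
+ #     mcp = FastMCP('awslabs.eks-mcp-server')
+ #     CloudWatchHandler(mcp, allow_sensitive_data_access=True)
+ #     mcp.run()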