awslabs.cloudwatch-appsignals-mcp-server 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
@@ -14,10 +14,13 @@
 
 """CloudWatch Application Signals MCP Server - Core server implementation."""
 
+import asyncio
 import boto3
+import json
 import os
 import sys
 from . import __version__
+from .sli_report_client import AWSConfig, SLIReportClient
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from datetime import datetime, timedelta, timezone
@@ -25,6 +28,7 @@ from loguru import logger
 from mcp.server.fastmcp import FastMCP
 from pydantic import Field
 from time import perf_counter as timer
+from typing import Dict, Optional
 
 
 # Initialize FastMCP server
@@ -44,9 +48,12 @@ logger.debug(f'Using AWS region: {AWS_REGION}')
 try:
     config = Config(user_agent_extra=f'awslabs.cloudwatch-appsignals-mcp-server/{__version__}')
     logs_client = boto3.client('logs', region_name=AWS_REGION, config=config)
-    logger.debug('AWS CloudWatch Logs client initialized successfully')
+    appsignals_client = boto3.client('application-signals', region_name=AWS_REGION, config=config)
+    cloudwatch_client = boto3.client('cloudwatch', region_name=AWS_REGION, config=config)
+    xray_client = boto3.client('xray', region_name=AWS_REGION, config=config)
+    logger.debug('AWS clients initialized successfully')
 except Exception as e:
-    logger.error(f'Failed to initialize AWS CloudWatch Logs client: {str(e)}')
+    logger.error(f'Failed to initialize AWS clients: {str(e)}')
     raise
 
 
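The 0.1.3 refactor above replaces the single Logs client with four module-level clients sharing one `Config`. A minimal sketch of the pattern (the region and user-agent string are illustrative, not the package's values):

```
# Minimal sketch of the module-level client pattern; region and user-agent
# string are placeholders for illustration.
import boto3
from botocore.config import Config

_config = Config(user_agent_extra='my-tool/0.1')  # appended to boto3's default user agent

# Created once at import time and reused by every handler, instead of
# constructing a fresh client inside each call.
logs_client = boto3.client('logs', region_name='us-east-1', config=_config)
```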
@@ -83,16 +90,15 @@ async def list_monitored_services() -> str:
     logger.debug('Starting list_application_signals_services request')
 
     try:
-        appsignals = boto3.client('application-signals', region_name=AWS_REGION)
-        logger.debug('Application Signals client created')
-
         # Calculate time range (last 24 hours)
         end_time = datetime.now(timezone.utc)
         start_time = end_time - timedelta(hours=24)
 
         # Get all services
         logger.debug(f'Querying services for time range: {start_time} to {end_time}')
-        response = appsignals.list_services(StartTime=start_time, EndTime=end_time, MaxResults=100)
+        response = appsignals_client.list_services(
+            StartTime=start_time, EndTime=end_time, MaxResults=100
+        )
         services = response.get('ServiceSummaries', [])
         logger.debug(f'Retrieved {len(services)} services from Application Signals')
 
@@ -161,15 +167,12 @@ async def get_service_detail(
     logger.debug(f'Starting get_service_healthy_detail request for service: {service_name}')
 
     try:
-        appsignals = boto3.client('application-signals', region_name=AWS_REGION)
-        logger.debug('Application Signals client created')
-
         # Calculate time range (last 24 hours)
         end_time = datetime.now(timezone.utc)
         start_time = end_time - timedelta(hours=24)
 
         # First, get all services to find the one we want
-        services_response = appsignals.list_services(
+        services_response = appsignals_client.list_services(
             StartTime=start_time, EndTime=end_time, MaxResults=100
         )
 
@@ -187,7 +190,7 @@ async def get_service_detail(
 
         # Get detailed service information
         logger.debug(f'Getting detailed information for service: {service_name}')
-        service_response = appsignals.get_service(
+        service_response = appsignals_client.get_service(
             StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
         )
 
@@ -255,6 +258,1078 @@ async def get_service_detail(
         return f'Error: {str(e)}'
 
 
+@mcp.tool()
+async def query_service_metrics(
+    service_name: str = Field(
+        ..., description='Name of the service to get metrics for (case-sensitive)'
+    ),
+    metric_name: str = Field(
+        ...,
+        description='Specific metric name (e.g., Latency, Error, Fault). Leave empty to list available metrics',
+    ),
+    statistic: str = Field(
+        default='Average',
+        description='Standard statistic type (Average, Sum, Maximum, Minimum, SampleCount)',
+    ),
+    extended_statistic: str = Field(
+        default='p99', description='Extended statistic (p99, p95, p90, p50, etc)'
+    ),
+    hours: int = Field(
+        default=1, description='Number of hours to look back (default 1, max 168 for 1 week)'
+    ),
+) -> str:
+    """Get CloudWatch metrics for a specific Application Signals service.
+
+    Use this tool to:
+    - Analyze service performance (latency, throughput)
+    - Check error rates and reliability
+    - View trends over time
+    - Get both standard statistics (Average, Max) and percentiles (p99, p95)
+
+    Common metric names:
+    - 'Latency': Response time in milliseconds
+    - 'Error': Percentage of failed requests
+    - 'Fault': Percentage of server errors (5xx)
+
+    Returns:
+    - Summary statistics (latest, average, min, max)
+    - Recent data points with timestamps
+    - Both standard and percentile values when available
+
+    The tool automatically adjusts the granularity based on time range:
+    - Up to 3 hours: 1-minute resolution
+    - Up to 24 hours: 5-minute resolution
+    - Over 24 hours: 1-hour resolution
+    """
+    start_time_perf = timer()
+    logger.info(
+        f'Starting query_service_metrics request - service: {service_name}, metric: {metric_name}, hours: {hours}'
+    )
+
+    try:
+        # Calculate time range
+        end_time = datetime.now(timezone.utc)
+        start_time = end_time - timedelta(hours=hours)
+
+        # Get service details to find metrics
+        services_response = appsignals_client.list_services(
+            StartTime=start_time, EndTime=end_time, MaxResults=100
+        )
+
+        # Find the target service
+        target_service = None
+        for service in services_response.get('ServiceSummaries', []):
+            key_attrs = service.get('KeyAttributes', {})
+            if key_attrs.get('Name') == service_name:
+                target_service = service
+                break
+
+        if not target_service:
+            logger.warning(f"Service '{service_name}' not found in Application Signals")
+            return f"Service '{service_name}' not found in Application Signals."
+
+        # Get detailed service info for metric references
+        service_response = appsignals_client.get_service(
+            StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
+        )
+
+        metric_refs = service_response['Service'].get('MetricReferences', [])
+
+        if not metric_refs:
+            logger.warning(f"No metrics found for service '{service_name}'")
+            return f"No metrics found for service '{service_name}'."
+
+        # If no specific metric requested, show available metrics
+        if not metric_name:
+            result = f"Available metrics for service '{service_name}':\n\n"
+            for metric in metric_refs:
+                result += f'• {metric.get("MetricName", "Unknown")}\n'
+                result += f'  Namespace: {metric.get("Namespace", "Unknown")}\n'
+                result += f'  Type: {metric.get("MetricType", "Unknown")}\n'
+                result += '\n'
+            return result
+
+        # Find the specific metric
+        target_metric = None
+        for metric in metric_refs:
+            if metric.get('MetricName') == metric_name:
+                target_metric = metric
+                break
+
+        if not target_metric:
+            available = [m.get('MetricName', 'Unknown') for m in metric_refs]
+            return f"Metric '{metric_name}' not found for service '{service_name}'. Available: {', '.join(available)}"
+
+        # Calculate appropriate period based on time range
+        if hours <= 3:
+            period = 60  # 1 minute
+        elif hours <= 24:
+            period = 300  # 5 minutes
+        else:
+            period = 3600  # 1 hour
+
+        # Get both standard and extended statistics in a single call
+        response = cloudwatch_client.get_metric_statistics(
+            Namespace=target_metric['Namespace'],
+            MetricName=target_metric['MetricName'],
+            Dimensions=target_metric.get('Dimensions', []),
+            StartTime=start_time,
+            EndTime=end_time,
+            Period=period,
+            Statistics=[statistic],  # type: ignore
+            ExtendedStatistics=[extended_statistic],
+        )
+
+        datapoints = response.get('Datapoints', [])
+
+        if not datapoints:
+            logger.warning(
+                f"No data points found for metric '{metric_name}' on service '{service_name}' in the last {hours} hour(s)"
+            )
+            return f"No data points found for metric '{metric_name}' on service '{service_name}' in the last {hours} hour(s)."
+
+        # Sort by timestamp
+        datapoints.sort(key=lambda x: x.get('Timestamp', datetime.min))  # type: ignore
+
+        # Build response
+        result = f'Metrics for {service_name} - {metric_name}\n'
+        result += f'Time Range: Last {hours} hour(s)\n'
+        result += f'Period: {period} seconds\n\n'
+
+        # Calculate summary statistics for both standard and extended statistics
+        standard_values = [dp.get(statistic) for dp in datapoints if dp.get(statistic) is not None]
+        extended_values = [
+            dp.get(extended_statistic)
+            for dp in datapoints
+            if dp.get(extended_statistic) is not None
+        ]
+
+        result += 'Summary:\n'
+
+        if standard_values:
+            latest_standard = datapoints[-1].get(statistic)
+            avg_of_standard = sum(standard_values) / len(standard_values)  # type: ignore
+            max_standard = max(standard_values)  # type: ignore
+            min_standard = min(standard_values)  # type: ignore
+
+            result += f'{statistic} Statistics:\n'
+            result += f'• Latest: {latest_standard:.2f}\n'
+            result += f'• Average: {avg_of_standard:.2f}\n'
+            result += f'• Maximum: {max_standard:.2f}\n'
+            result += f'• Minimum: {min_standard:.2f}\n\n'
+
+        if extended_values:
+            latest_extended = datapoints[-1].get(extended_statistic)
+            avg_extended = sum(extended_values) / len(extended_values)  # type: ignore
+            max_extended = max(extended_values)  # type: ignore
+            min_extended = min(extended_values)  # type: ignore
+
+            result += f'{extended_statistic} Statistics:\n'
+            result += f'• Latest: {latest_extended:.2f}\n'
+            result += f'• Average: {avg_extended:.2f}\n'
+            result += f'• Maximum: {max_extended:.2f}\n'
+            result += f'• Minimum: {min_extended:.2f}\n\n'
+
+        result += f'• Data Points: {len(datapoints)}\n\n'
+
+        # Show recent values (last 10) with both metrics
+        result += 'Recent Values:\n'
+        for dp in datapoints[-10:]:
+            timestamp = dp.get('Timestamp', datetime.min).strftime('%m/%d %H:%M')  # type: ignore
+            unit = dp.get('Unit', '')
+
+            values_str = []
+            if dp.get(statistic) is not None:
+                values_str.append(f'{statistic}: {dp[statistic]:.2f}')
+            if dp.get(extended_statistic) is not None:
+                values_str.append(f'{extended_statistic}: {dp[extended_statistic]:.2f}')
+
+            result += f'• {timestamp}: {", ".join(values_str)} {unit}\n'
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(
+            f"query_service_metrics completed for '{service_name}/{metric_name}' in {elapsed_time:.3f}s"
+        )
+        return result
+
+    except ClientError as e:
+        error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
+        error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+        logger.error(
+            f"AWS ClientError in query_service_metrics for '{service_name}/{metric_name}': {error_code} - {error_msg}"
+        )
+        return f'AWS Error: {error_msg}'
+    except Exception as e:
+        logger.error(
+            f"Unexpected error in query_service_metrics for '{service_name}/{metric_name}': {str(e)}",
+            exc_info=True,
+        )
+        return f'Error: {str(e)}'
+
+
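The granularity tiers described in the docstring above map one-to-one onto CloudWatch periods. An illustrative restatement of that rule as a standalone helper (the function name is hypothetical, not part of the package):

```
# Illustrative restatement of the period-selection rule used above;
# choose_period is a hypothetical name for this example only.
def choose_period(hours: int) -> int:
    """Return the CloudWatch period, in seconds, for a look-back window."""
    if hours <= 3:
        return 60  # 1-minute resolution
    elif hours <= 24:
        return 300  # 5-minute resolution
    return 3600  # 1-hour resolution


assert choose_period(1) == 60
assert choose_period(24) == 300
assert choose_period(168) == 3600
```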
+def get_trace_summaries_paginated(
+    xray_client, start_time, end_time, filter_expression, max_traces: int = 100
+) -> list:
+    """Get trace summaries with pagination to avoid exceeding response size limits.
+
+    Args:
+        xray_client: Boto3 X-Ray client
+        start_time: Start time for trace query
+        end_time: End time for trace query
+        filter_expression: X-Ray filter expression
+        max_traces: Maximum number of traces to retrieve (default 100)
+
+    Returns:
+        List of trace summaries
+    """
+    all_traces = []
+    next_token = None
+    logger.debug(
+        f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
+    )
+
+    try:
+        while len(all_traces) < max_traces:
+            # Build request parameters
+            kwargs = {
+                'StartTime': start_time,
+                'EndTime': end_time,
+                'FilterExpression': filter_expression,
+                'Sampling': True,
+                'TimeRangeType': 'Service',
+            }
+
+            if next_token:
+                kwargs['NextToken'] = next_token
+
+            # Make request
+            response = xray_client.get_trace_summaries(**kwargs)
+
+            # Add traces from this page
+            traces = response.get('TraceSummaries', [])
+            all_traces.extend(traces)
+            logger.debug(
+                f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
+            )
+
+            # Check if we have more pages
+            next_token = response.get('NextToken')
+            if not next_token:
+                break
+
+            # If we've collected enough traces, stop
+            if len(all_traces) >= max_traces:
+                all_traces = all_traces[:max_traces]
+                break
+
+        logger.info(f'Successfully retrieved {len(all_traces)} traces')
+        return all_traces
+
+    except Exception as e:
+        # Return what we have so far if there's an error
+        logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
+        logger.info(f'Returning {len(all_traces)} traces retrieved before error')
+        return all_traces
+
+
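A hypothetical call of the pagination helper above; the region, service name, and filter expression are placeholders:

```
# Hypothetical usage of get_trace_summaries_paginated; region, service name,
# and filter expression are illustrative only.
from datetime import datetime, timedelta, timezone

import boto3

xray = boto3.client('xray', region_name='us-east-1')
end = datetime.now(timezone.utc)
start = end - timedelta(hours=3)

traces = get_trace_summaries_paginated(
    xray, start, end, 'service("checkout"){fault = true}', max_traces=50
)
print(f'Fetched {len(traces)} trace summaries')
```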
+@mcp.tool()
+async def get_slo(
+    slo_id: str = Field(..., description='The ARN or name of the SLO to retrieve'),
+) -> str:
+    """Get detailed information about a specific Service Level Objective (SLO).
+
+    Use this tool to:
+    - Get comprehensive SLO configuration details
+    - Understand what metrics the SLO monitors
+    - See threshold values and comparison operators
+    - Extract operation names and key attributes for trace queries
+    - Identify dependency configurations
+    - Review attainment goals and burn rate settings
+
+    Returns detailed information including:
+    - SLO name, description, and metadata
+    - Metric configuration (for period-based or request-based SLOs)
+    - Key attributes and operation names
+    - Metric type (LATENCY or AVAILABILITY)
+    - Threshold values and comparison operators
+    - Goal configuration (attainment percentage, time interval)
+    - Burn rate configurations
+
+    This tool is essential for:
+    - Understanding why an SLO was breached
+    - Getting the exact operation name to query traces
+    - Identifying the metrics and thresholds being monitored
+    - Planning remediation based on SLO configuration
+    """
+    start_time_perf = timer()
+    logger.info(f'Starting get_service_level_objective request for SLO: {slo_id}')
+
+    try:
+        response = appsignals_client.get_service_level_objective(Id=slo_id)
+        slo = response.get('Slo', {})
+
+        if not slo:
+            logger.warning(f'No SLO found with ID: {slo_id}')
+            return f'No SLO found with ID: {slo_id}'
+
+        result = 'Service Level Objective Details\n'
+        result += '=' * 50 + '\n\n'
+
+        # Basic info
+        result += f'Name: {slo.get("Name", "Unknown")}\n'
+        result += f'ARN: {slo.get("Arn", "Unknown")}\n'
+        if slo.get('Description'):
+            result += f'Description: {slo.get("Description", "")}\n'
+        result += f'Evaluation Type: {slo.get("EvaluationType", "Unknown")}\n'
+        result += f'Created: {slo.get("CreatedTime", "Unknown")}\n'
+        result += f'Last Updated: {slo.get("LastUpdatedTime", "Unknown")}\n\n'
+
+        # Goal configuration
+        goal = slo.get('Goal', {})
+        if goal:
+            result += 'Goal Configuration:\n'
+            result += f'• Attainment Goal: {goal.get("AttainmentGoal", 99)}%\n'
+            result += f'• Warning Threshold: {goal.get("WarningThreshold", 50)}%\n'
+
+            interval = goal.get('Interval', {})
+            if 'RollingInterval' in interval:
+                rolling = interval['RollingInterval']
+                result += f'• Interval: Rolling {rolling.get("Duration")} {rolling.get("DurationUnit")}\n'
+            elif 'CalendarInterval' in interval:
+                calendar = interval['CalendarInterval']
+                result += f'• Interval: Calendar {calendar.get("Duration")} {calendar.get("DurationUnit")} starting {calendar.get("StartTime")}\n'
+            result += '\n'
+
+        # Period-based SLI
+        if 'Sli' in slo:
+            sli = slo['Sli']
+            result += 'Period-Based SLI Configuration:\n'
+
+            sli_metric = sli.get('SliMetric', {})
+            if sli_metric:
+                # Key attributes - crucial for trace queries
+                key_attrs = sli_metric.get('KeyAttributes', {})
+                if key_attrs:
+                    result += '• Key Attributes:\n'
+                    for k, v in key_attrs.items():
+                        result += f'  - {k}: {v}\n'
+
+                # Operation name - essential for trace filtering
+                if sli_metric.get('OperationName'):
+                    result += f'• Operation Name: {sli_metric.get("OperationName", "")}\n'
+                    result += f'  (Use this in trace queries: annotation[aws.local.operation]="{sli_metric.get("OperationName", "")}")\n'
+
+                result += f'• Metric Type: {sli_metric.get("MetricType", "Unknown")}\n'
+
+                # MetricDataQueries - detailed metric configuration
+                metric_queries = sli_metric.get('MetricDataQueries', [])
+                if metric_queries:
+                    result += '• Metric Data Queries:\n'
+                    for query in metric_queries:
+                        query_id = query.get('Id', 'Unknown')
+                        result += f'  Query ID: {query_id}\n'
+
+                        # MetricStat details
+                        metric_stat = query.get('MetricStat', {})
+                        if metric_stat:
+                            metric = metric_stat.get('Metric', {})
+                            if metric:
+                                result += f'    Namespace: {metric.get("Namespace", "Unknown")}\n'
+                                result += (
+                                    f'    MetricName: {metric.get("MetricName", "Unknown")}\n'
+                                )
+
+                                # Dimensions - crucial for understanding what's being measured
+                                dimensions = metric.get('Dimensions', [])
+                                if dimensions:
+                                    result += '    Dimensions:\n'
+                                    for dim in dimensions:
+                                        result += f'      - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
+
+                            result += (
+                                f'    Period: {metric_stat.get("Period", "Unknown")} seconds\n'
+                            )
+                            result += f'    Stat: {metric_stat.get("Stat", "Unknown")}\n'
+                            if metric_stat.get('Unit'):
+                                result += f'    Unit: {metric_stat["Unit"]}\n'  # type: ignore
+
+                        # Expression if present
+                        if query.get('Expression'):
+                            result += f'    Expression: {query.get("Expression", "")}\n'
+
+                        result += f'    ReturnData: {query.get("ReturnData", True)}\n'
+
+                # Dependency config
+                dep_config = sli_metric.get('DependencyConfig', {})
+                if dep_config:
+                    result += '• Dependency Configuration:\n'
+                    dep_attrs = dep_config.get('DependencyKeyAttributes', {})
+                    if dep_attrs:
+                        result += '  Key Attributes:\n'
+                        for k, v in dep_attrs.items():
+                            result += f'    - {k}: {v}\n'
+                    if dep_config.get('DependencyOperationName'):
+                        result += (
+                            f'  - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
+                        )
+                        result += f'    (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
+
+            result += f'• Threshold: {sli.get("MetricThreshold", "Unknown")}\n'
+            result += f'• Comparison: {sli.get("ComparisonOperator", "Unknown")}\n\n'
+
+        # Request-based SLI
+        if 'RequestBasedSli' in slo:
+            rbs = slo['RequestBasedSli']
+            result += 'Request-Based SLI Configuration:\n'
+
+            rbs_metric = rbs.get('RequestBasedSliMetric', {})
+            if rbs_metric:
+                # Key attributes
+                key_attrs = rbs_metric.get('KeyAttributes', {})
+                if key_attrs:
+                    result += '• Key Attributes:\n'
+                    for k, v in key_attrs.items():
+                        result += f'  - {k}: {v}\n'
+
+                # Operation name
+                if rbs_metric.get('OperationName'):
+                    result += f'• Operation Name: {rbs_metric.get("OperationName", "")}\n'
+                    result += f'  (Use this in trace queries: annotation[aws.local.operation]="{rbs_metric.get("OperationName", "")}")\n'
+
+                result += f'• Metric Type: {rbs_metric.get("MetricType", "Unknown")}\n'
+
+                # MetricDataQueries - detailed metric configuration
+                metric_queries = rbs_metric.get('MetricDataQueries', [])
+                if metric_queries:
+                    result += '• Metric Data Queries:\n'
+                    for query in metric_queries:
+                        query_id = query.get('Id', 'Unknown')
+                        result += f'  Query ID: {query_id}\n'
+
+                        # MetricStat details
+                        metric_stat = query.get('MetricStat', {})
+                        if metric_stat:
+                            metric = metric_stat.get('Metric', {})
+                            if metric:
+                                result += f'    Namespace: {metric.get("Namespace", "Unknown")}\n'
+                                result += (
+                                    f'    MetricName: {metric.get("MetricName", "Unknown")}\n'
+                                )
+
+                                # Dimensions - crucial for understanding what's being measured
+                                dimensions = metric.get('Dimensions', [])
+                                if dimensions:
+                                    result += '    Dimensions:\n'
+                                    for dim in dimensions:
+                                        result += f'      - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
+
+                            result += (
+                                f'    Period: {metric_stat.get("Period", "Unknown")} seconds\n'
+                            )
+                            result += f'    Stat: {metric_stat.get("Stat", "Unknown")}\n'
+                            if metric_stat.get('Unit'):
+                                result += f'    Unit: {metric_stat["Unit"]}\n'  # type: ignore
+
+                        # Expression if present
+                        if query.get('Expression'):
+                            result += f'    Expression: {query.get("Expression", "")}\n'
+
+                        result += f'    ReturnData: {query.get("ReturnData", True)}\n'
+
+                # Dependency config
+                dep_config = rbs_metric.get('DependencyConfig', {})
+                if dep_config:
+                    result += '• Dependency Configuration:\n'
+                    dep_attrs = dep_config.get('DependencyKeyAttributes', {})
+                    if dep_attrs:
+                        result += '  Key Attributes:\n'
+                        for k, v in dep_attrs.items():
+                            result += f'    - {k}: {v}\n'
+                    if dep_config.get('DependencyOperationName'):
+                        result += (
+                            f'  - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
+                        )
+                        result += f'    (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
+
+            result += f'• Threshold: {rbs.get("MetricThreshold", "Unknown")}\n'
+            result += f'• Comparison: {rbs.get("ComparisonOperator", "Unknown")}\n\n'
+
+        # Burn rate configurations
+        burn_rates = slo.get('BurnRateConfigurations', [])
+        if burn_rates:
+            result += 'Burn Rate Configurations:\n'
+            for br in burn_rates:
+                result += f'• Look-back window: {br.get("LookBackWindowMinutes")} minutes\n'
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(f"get_service_level_objective completed for '{slo_id}' in {elapsed_time:.3f}s")
+        return result
+
+    except ClientError as e:
+        error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
+        error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+        logger.error(
+            f"AWS ClientError in get_service_level_objective for '{slo_id}': {error_code} - {error_msg}"
+        )
+        return f'AWS Error: {error_msg}'
+    except Exception as e:
+        logger.error(
+            f"Unexpected error in get_service_level_objective for '{slo_id}': {str(e)}",
+            exc_info=True,
+        )
+        return f'Error: {str(e)}'
+
+
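As the docstring notes, the operation names and key attributes returned by this tool feed directly into X-Ray trace filters. A hypothetical helper (not part of the package) showing how such a filter could be assembled from those values:

```
# Hypothetical helper assembling an X-Ray filter expression from SLO details;
# the service and operation values below are placeholders.
def build_fault_filter(service: str, operation: str) -> str:
    """Combine a service fault filter with a local-operation annotation."""
    return (
        f'service("{service}"){{fault = true}} '
        f'AND annotation[aws.local.operation]="{operation}"'
    )


print(build_fault_filter('orders-api', 'GET /owners/*/lastname'))
# service("orders-api"){fault = true} AND annotation[aws.local.operation]="GET /owners/*/lastname"
```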
+@mcp.tool()
+async def search_transaction_spans(
+    log_group_name: str = Field(
+        default='',
+        description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
+    ),
+    start_time: str = Field(
+        default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
+    ),
+    end_time: str = Field(
+        default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
+    ),
+    query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
+    limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
+    max_timeout: int = Field(
+        default=30, description='Maximum time in seconds to wait for query completion'
+    ),
+) -> Dict:
+    """Execute a CloudWatch Logs Insights query for transaction search (100% sampled trace data).
+
+    IMPORTANT: If log_group_name is not provided, 'aws/spans' is used as the default CloudWatch log group name.
+    The volume of returned logs can easily overwhelm the agent context window. Always include a limit in the query
+    (| limit 50) or use the limit parameter.
+
+    Usage:
+        The "aws/spans" log group stores OpenTelemetry span data with many attributes for all monitored services.
+        It provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
+        You can write CloudWatch Logs Insights queries to group and aggregate span attributes (e.g., sum, avg).
+
+        ```
+        FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
+        | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `total_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`, bin(1h)
+        | DISPLAY total_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
+        ```
+
+    Returns:
+    --------
+        A dictionary containing the final query results, including:
+        - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed)
+        - results: A list of the actual query results if the status is Complete
+        - statistics: Query performance statistics
+        - messages: Any informational messages about the query
+        - transaction_search_status: Information about transaction search availability
+    """
+    start_time_perf = timer()
+    logger.info(
+        f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
+    )
+    logger.debug(f'Query string: {query_string}')
+
+    # Check if transaction search is enabled
+    is_enabled, destination, status = check_transaction_search_enabled(AWS_REGION)
+
+    if not is_enabled:
+        logger.warning(
+            f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
+        )
+        return {
+            'status': 'Transaction Search Not Available',
+            'transaction_search_status': {
+                'enabled': False,
+                'destination': destination,
+                'status': status,
+            },
+            'message': (
+                '⚠️ Transaction Search is not enabled for this account. '
+                f'Current configuration: Destination={destination}, Status={status}. '
+                "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
+                'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
+                'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
+                'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
+            ),
+            'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
+        }
+
+    try:
+        # Use default log group if none provided (the parameter defaults to '')
+        if not log_group_name:
+            log_group_name = 'aws/spans'
+            logger.debug('Using default log group: aws/spans')
+
+        # Start query
+        kwargs = {
+            'startTime': int(datetime.fromisoformat(start_time).timestamp()),
+            'endTime': int(datetime.fromisoformat(end_time).timestamp()),
+            'queryString': query_string,
+            'logGroupNames': [log_group_name],
+            'limit': limit,
+        }
+
+        logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
+        start_response = logs_client.start_query(**remove_null_values(kwargs))
+        query_id = start_response['queryId']
+        logger.info(f'Started CloudWatch Logs query with ID: {query_id}')
+
+        # Poll for results, waiting up to max_timeout seconds
+        poll_start = timer()
+        while poll_start + max_timeout > timer():
+            response = logs_client.get_query_results(queryId=query_id)
+            status = response['status']
+
+            if status in {'Complete', 'Failed', 'Cancelled'}:
+                elapsed_time = timer() - start_time_perf
+                logger.info(
+                    f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
+                )
+
+                if status == 'Failed':
+                    logger.error(f'Query failed: {response.get("statistics", {})}')
+                elif status == 'Complete':
+                    logger.debug(f'Query returned {len(response.get("results", []))} results')
+
+                return {
+                    'queryId': query_id,
+                    'status': status,
+                    'statistics': response.get('statistics', {}),
+                    'results': [
+                        {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
+                        for line in response.get('results', [])
+                    ],
+                    'transaction_search_status': {
+                        'enabled': True,
+                        'destination': 'CloudWatchLogs',
+                        'status': 'ACTIVE',
+                        'message': '✅ Using 100% sampled trace data from Transaction Search',
+                    },
+                }
+
+            await asyncio.sleep(1)
+
+        elapsed_time = timer() - start_time_perf
+        msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to try again to retrieve query results.'
+        logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
+        return {
+            'queryId': query_id,
+            'status': 'Polling Timeout',
+            'message': msg,
+        }
+
+    except Exception as e:
+        logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
+        raise
+
+
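The polling logic above is the standard Logs Insights start/poll pattern. A minimal synchronous sketch, assuming a boto3 `logs` client, the default 'aws/spans' log group, and illustrative region, timestamps, and query:

```
# Minimal sketch of the start_query / get_query_results polling pattern;
# region, time range, and query string are illustrative.
import time

import boto3

logs = boto3.client('logs', region_name='us-east-1')
started = logs.start_query(
    logGroupNames=['aws/spans'],
    startTime=1745092800,  # epoch seconds (2025-04-19T20:00:00Z)
    endTime=1745096400,    # one hour later
    queryString='FILTER attributes.aws.local.service = "orders-api" | limit 50',
)
while True:
    resp = logs.get_query_results(queryId=started['queryId'])
    if resp['status'] in {'Complete', 'Failed', 'Cancelled'}:
        break
    time.sleep(1)
print(resp['status'], len(resp.get('results', [])))
```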
+@mcp.tool()
+async def list_slis(
+    hours: int = Field(
+        default=24,
+        description='Number of hours to look back (default 24, typically use 24 for daily checks)',
+    ),
+) -> str:
+    """Get SLI (Service Level Indicator) status and SLO compliance for all services.
+
+    Use this tool to:
+    - Check overall system health at a glance
+    - Identify services with breached SLOs (Service Level Objectives)
+    - See which specific SLOs are failing
+    - Prioritize which services need immediate attention
+    - Monitor SLO compliance trends
+
+    Returns a comprehensive report showing:
+    - Summary counts (total, healthy, breached, insufficient data)
+    - Detailed list of breached services with:
+      - Service name and environment
+      - Number and names of breached SLOs
+      - Specific SLO violations
+    - List of healthy services
+    - Services with insufficient data
+
+    This is the primary tool for health monitoring and should be used:
+    - At the start of each day
+    - During incident response
+    - For regular health checks
+    - When investigating "what is the root cause of breaching SLO" questions
+
+    Status meanings:
+    - OK: All SLOs are being met
+    - BREACHED: One or more SLOs are violated
+    - INSUFFICIENT_DATA: Not enough data to determine status
+
+    To investigate breached SLOs, follow these steps:
+    1. Call get_slo() with the SLO name to get the detailed SLI data, including metric statistics
+    2. Find the fault metrics from the SLI under the breached SLO
+    3. Build trace query filters using metric dimensions (Operation, RemoteOperation, etc.):
+       - For availability: `service("service-name"){fault = true} AND annotation[aws.local.operation]="operation-name"`
+       - For latency: `service("service-name") AND annotation[aws.local.operation]="operation-name" AND duration > threshold`
+    4. Query traces:
+       - If Transaction Search is enabled: use search_transaction_spans() for 100% trace visibility
+       - If not enabled: use query_sampled_traces() with X-Ray (only 5% sampled data - may miss issues)
+    5. The query time window should default to the last 3 hours if not specified; the maximum window length is 6 hours
+    6. Analyze the root causes from exception data in traces
+    7. Include findings in the report and give fix and mitigation suggestions
+    """
+    start_time_perf = timer()
+    logger.info(f'Starting get_sli_status request for last {hours} hours')
+
+    try:
+        # Calculate time range
+        end_time = datetime.now(timezone.utc)
+        start_time = end_time - timedelta(hours=hours)
+        logger.debug(f'Time range: {start_time} to {end_time}')
+
+        # Get all services
+        services_response = appsignals_client.list_services(
+            StartTime=start_time,  # type: ignore
+            EndTime=end_time,  # type: ignore
+            MaxResults=100,
+        )
+        services = services_response.get('ServiceSummaries', [])
+
+        if not services:
+            logger.warning('No services found in Application Signals')
+            return 'No services found in Application Signals.'
+
+        # Get SLI reports for each service
+        reports = []
+        logger.debug(f'Generating SLI reports for {len(services)} services')
+        for service in services:
+            service_name = service['KeyAttributes'].get('Name', 'Unknown')
+            try:
+                # Create custom config with the service's key attributes
+                config = AWSConfig(
+                    region=AWS_REGION,
+                    period_in_hours=hours,
+                    service_name=service_name,
+                    key_attributes=service['KeyAttributes'],
+                )
+
+                # Generate SLI report
+                client = SLIReportClient(config)
+                sli_report = client.generate_sli_report()
+
+                # Convert to expected format
+                report = {
+                    'BreachedSloCount': sli_report.breached_slo_count,
+                    'BreachedSloNames': sli_report.breached_slo_names,
+                    'EndTime': sli_report.end_time.timestamp(),
+                    'OkSloCount': sli_report.ok_slo_count,
+                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                    'SliStatus': 'BREACHED'
+                    if sli_report.sli_status == 'CRITICAL'
+                    else sli_report.sli_status,
+                    'StartTime': sli_report.start_time.timestamp(),
+                    'TotalSloCount': sli_report.total_slo_count,
+                }
+                reports.append(report)
+
+            except Exception as e:
+                # Log error but continue with other services
+                logger.error(
+                    f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
+                )
+                # Add a report with insufficient data status
+                report = {
+                    'BreachedSloCount': 0,
+                    'BreachedSloNames': [],
+                    'EndTime': end_time.timestamp(),
+                    'OkSloCount': 0,
+                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                    'SliStatus': 'INSUFFICIENT_DATA',
+                    'StartTime': start_time.timestamp(),
+                    'TotalSloCount': 0,
+                }
+                reports.append(report)
+
+        # Check transaction search status
+        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(
+            AWS_REGION
+        )
+
+        # Build response
+        result = f'SLI Status Report - Last {hours} hours\n'
+        result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'
+
+        # Add transaction search status
+        if is_tx_search_enabled:
+            result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
+        else:
+            result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
+            result += f'   Current config: Destination={tx_destination}, Status={tx_status}\n'
+            result += '   Enable Transaction Search for accurate root cause analysis\n\n'
+
+        # Count by status
+        status_counts = {
+            'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
+            'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
+            'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
+        }
+
+        result += 'Summary:\n'
+        result += f'• Total Services: {len(reports)}\n'
+        result += f'• Healthy (OK): {status_counts["OK"]}\n'
+        result += f'• Breached: {status_counts["BREACHED"]}\n'
+        result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'
+
+        # Group by status
+        if status_counts['BREACHED'] > 0:
+            result += '⚠️ BREACHED SERVICES:\n'
+            for report in reports:
+                if report['SliStatus'] == 'BREACHED':
+                    name = report['ReferenceId']['KeyAttributes']['Name']
+                    env = report['ReferenceId']['KeyAttributes']['Environment']
+                    breached_count = report['BreachedSloCount']
+                    total_count = report['TotalSloCount']
+                    breached_names = report['BreachedSloNames']
+
+                    result += f'\n• {name} ({env})\n'
+                    result += f'  SLOs: {breached_count}/{total_count} breached\n'
+                    if breached_names:
+                        result += '  Breached SLOs:\n'
+                        for slo_name in breached_names:
+                            result += f'    - {slo_name}\n'
+
+        if status_counts['OK'] > 0:
+            result += '\n✅ HEALTHY SERVICES:\n'
+            for report in reports:
+                if report['SliStatus'] == 'OK':
+                    name = report['ReferenceId']['KeyAttributes']['Name']
+                    env = report['ReferenceId']['KeyAttributes']['Environment']
+                    ok_count = report['OkSloCount']
+
+                    result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'
+
+        if status_counts['INSUFFICIENT_DATA'] > 0:
+            result += '\n❓ INSUFFICIENT DATA:\n'
+            for report in reports:
+                if report['SliStatus'] == 'INSUFFICIENT_DATA':
+                    name = report['ReferenceId']['KeyAttributes']['Name']
+                    env = report['ReferenceId']['KeyAttributes']['Environment']
+
+                    result += f'• {name} ({env})\n'
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(
+            f'get_sli_status completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
+        )
+        return result
+
+    except Exception as e:
+        logger.error(f'Error in get_sli_status: {str(e)}', exc_info=True)
+        return f'Error getting SLI status: {str(e)}'
+
+
+def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
+    """Internal function to check if AWS X-Ray Transaction Search is enabled.
+
+    Returns:
+        tuple: (is_enabled: bool, destination: str, status: str)
+    """
+    try:
+        response = xray_client.get_trace_segment_destination()
+
+        destination = response.get('Destination', 'Unknown')
+        status = response.get('Status', 'Unknown')
+
+        is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
+        logger.debug(
+            f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
+        )
+
+        return is_enabled, destination, status
+
+    except Exception as e:
+        logger.error(f'Error checking transaction search status: {str(e)}')
+        return False, 'Unknown', 'Error'
+
+
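Transaction Search counts as enabled only when both conditions hold (destination is CloudWatch Logs and the status is ACTIVE). A hypothetical call of the helper above; the values returned depend on the account's X-Ray settings:

```
# Hypothetical call of check_transaction_search_enabled; note the helper
# queries the module-level X-Ray client, so the region argument is advisory.
enabled, destination, status = check_transaction_search_enabled('us-east-1')
if not enabled:
    print(f'Transaction Search is off: destination={destination}, status={status}')
```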
+@mcp.tool()
+async def query_sampled_traces(
+    start_time: Optional[str] = Field(
+        default=None,
+        description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
+    ),
+    end_time: Optional[str] = Field(
+        default=None,
+        description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
+    ),
+    filter_expression: Optional[str] = Field(
+        default=None,
+        description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
+    ),
+    region: str = Field(default='us-east-1', description='AWS region (default: us-east-1)'),
+) -> str:
+    """Query AWS X-Ray traces (5% sampled data) to investigate errors and performance issues.
+
+    ⚠️ IMPORTANT: This tool uses X-Ray's 5% sampled trace data. For 100% trace visibility,
+    enable Transaction Search and use search_transaction_spans() instead.
+
+    Use this tool to:
+    - Find root causes of errors and faults (with 5% sampling limitations)
+    - Analyze request latency and identify bottlenecks
+    - Understand requests across multiple services with traces
+    - Debug timeout and dependency issues
+    - Understand service-to-service interactions
+    - Find customer impact from trace results, such as Users data or trace attributes (e.g., owner ID)
+
+    Common filter expressions:
+    - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
+    - 'service("service-name")': Filter by specific service
+    - 'duration > 5': Find slow requests (over 5 seconds)
+    - 'http.status = 500': Find specific HTTP status codes
+    - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
+    - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
+    - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'
+
+    IMPORTANT: When investigating SLO breaches, use annotation filters with the specific dimension values
+    from the breached metric (e.g., Operation, RemoteOperation) to find traces for that exact operation.
+
+    Returns JSON with trace summaries including:
+    - Trace ID for detailed investigation
+    - Duration and response time
+    - Error/fault/throttle status
+    - HTTP information (method, status, URL)
+    - Service interactions
+    - User information if available
+    - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)
+
+    Best practices:
+    - Start with recent time windows (last 1-3 hours)
+    - Use filter expressions to narrow down issues; prioritize querying Fault and Error traces
+    - Look for patterns in errors or very slow requests
+
+    Returns:
+        JSON string containing trace summaries with error status, duration, and service details
+    """
+    start_time_perf = timer()
+    logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')
+
+    try:
+        logger.debug('Using X-Ray client')
+
+        # Default to past 3 hours if times not provided
+        if not end_time:
+            end_datetime = datetime.now(timezone.utc)
+        else:
+            end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
+
+        if not start_time:
+            start_datetime = end_datetime - timedelta(hours=3)
+        else:
+            start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
+
+        # Validate time window to ensure it's not too large (max 6 hours)
+        time_diff = end_datetime - start_datetime
+        logger.debug(
+            f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
+        )
+        if time_diff > timedelta(hours=6):
+            logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
+            return json.dumps(
+                {
+                    'error': 'Time window too large. Maximum allowed is 6 hours.',
+                    'requested_hours': time_diff.total_seconds() / 3600,
+                },
+                indent=2,
+            )
+
+        # Use pagination helper with a reasonable limit
+        traces = get_trace_summaries_paginated(
+            xray_client,
+            start_datetime,
+            end_datetime,
+            filter_expression or '',
+            max_traces=100,  # Limit to prevent response size issues
+        )
+
+        # Convert response to JSON-serializable format
+        def convert_datetime(obj):
+            if isinstance(obj, datetime):
+                return obj.isoformat()
+            return obj
+
+        trace_summaries = []
+        for trace in traces:
+            # Create a simplified trace data structure to reduce size
+            trace_data = {
+                'Id': trace.get('Id'),
+                'Duration': trace.get('Duration'),
+                'ResponseTime': trace.get('ResponseTime'),
+                'HasError': trace.get('HasError'),
+                'HasFault': trace.get('HasFault'),
+                'HasThrottle': trace.get('HasThrottle'),
+                'Http': trace.get('Http', {}),
+            }
+
+            # Only include root causes if they exist (to save space)
+            if trace.get('ErrorRootCauses'):
+                trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[
+                    :3
+                ]  # Limit to first 3
+            if trace.get('FaultRootCauses'):
+                trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[
+                    :3
+                ]  # Limit to first 3
+            if trace.get('ResponseTimeRootCauses'):
+                trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[
+                    :3
+                ]  # Limit to first 3
+
+            # Include limited annotations for key operations
+            annotations = trace.get('Annotations', {})
+            if annotations:
+                # Only include operation-related annotations
+                filtered_annotations = {}
+                for key in ['aws.local.operation', 'aws.remote.operation']:
+                    if key in annotations:
+                        filtered_annotations[key] = annotations[key]
+                if filtered_annotations:
+                    trace_data['Annotations'] = filtered_annotations
+
+            # Include user info if available
+            if trace.get('Users'):
+                trace_data['Users'] = trace.get('Users', [])[:2]  # Limit to first 2 users
+
+            # Convert any datetime objects to ISO format strings
+            for key, value in trace_data.items():
+                trace_data[key] = convert_datetime(value)
+            trace_summaries.append(trace_data)
+
+        # Check transaction search status
+        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)
+
+        result_data = {
+            'TraceSummaries': trace_summaries,
+            'TraceCount': len(trace_summaries),
+            'Message': f'Retrieved {len(trace_summaries)} traces (limited to prevent size issues)',
+            'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
+            'TransactionSearchStatus': {
+                'enabled': is_tx_search_enabled,
+                'recommendation': (
+                    'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
+                    if is_tx_search_enabled
+                    else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
+                ),
+            },
+        }
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(
+            f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(trace_summaries)} traces'
+        )
+        return json.dumps(result_data, indent=2)
+
+    except Exception as e:
+        logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
+        return json.dumps({'error': str(e)}, indent=2)
+
+
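The tool above converts datetime fields by hand before `json.dumps`. As an aside (not the package's code), the same result can be had by giving `json.dumps` a `default` hook, which also covers datetimes nested deeper in the structure:

```
# Aside: serializing datetime-bearing trace summaries with json.dumps's
# default hook instead of per-field conversion; the sample data is illustrative.
import json
from datetime import datetime, timezone

sample = {'Id': '1-abc-def', 'Timestamp': datetime.now(timezone.utc)}
print(
    json.dumps(
        sample,
        default=lambda o: o.isoformat() if isinstance(o, datetime) else str(o),
        indent=2,
    )
)
```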
 def main():
     """Run the MCP server."""
     logger.debug('Starting CloudWatch AppSignals MCP server')