awslabs.cloudwatch-appsignals-mcp-server 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/cloudwatch_appsignals_mcp_server/server.py +1086 -11
- awslabs/cloudwatch_appsignals_mcp_server/sli_report_client.py +340 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.1.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.3.dist-info}/METADATA +87 -21
- awslabs_cloudwatch_appsignals_mcp_server-0.1.3.dist-info/RECORD +10 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.1.dist-info/RECORD +0 -9
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.1.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.3.dist-info}/WHEEL +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.1.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.3.dist-info}/entry_points.txt +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.1.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.1.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.3.dist-info}/licenses/NOTICE +0 -0
@@ -14,10 +14,13 @@
 
 """CloudWatch Application Signals MCP Server - Core server implementation."""
 
+import asyncio
 import boto3
+import json
 import os
 import sys
 from . import __version__
+from .sli_report_client import AWSConfig, SLIReportClient
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from datetime import datetime, timedelta, timezone
@@ -25,6 +28,7 @@ from loguru import logger
 from mcp.server.fastmcp import FastMCP
 from pydantic import Field
 from time import perf_counter as timer
+from typing import Dict, Optional
 
 
 # Initialize FastMCP server
@@ -44,9 +48,12 @@ logger.debug(f'Using AWS region: {AWS_REGION}')
 try:
     config = Config(user_agent_extra=f'awslabs.cloudwatch-appsignals-mcp-server/{__version__}')
     logs_client = boto3.client('logs', region_name=AWS_REGION, config=config)
-
+    appsignals_client = boto3.client('application-signals', region_name=AWS_REGION, config=config)
+    cloudwatch_client = boto3.client('cloudwatch', region_name=AWS_REGION, config=config)
+    xray_client = boto3.client('xray', region_name=AWS_REGION, config=config)
+    logger.debug('AWS clients initialized successfully')
 except Exception as e:
-    logger.error(f'Failed to initialize AWS
+    logger.error(f'Failed to initialize AWS clients: {str(e)}')
     raise
 
 
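The hunk above replaces per-request client construction with module-level boto3 clients that share one `Config`. A minimal sketch of the same pattern (the region fallback and user-agent string here are illustrative, not the package's exact code):

```python
import os

import boto3
from botocore.config import Config

AWS_REGION = os.environ.get('AWS_REGION', 'us-east-1')

# Tag every API call with a custom user-agent suffix and build each
# client once at import time, instead of on every tool invocation.
_config = Config(user_agent_extra='cloudwatch-appsignals-mcp-server/0.1.3')
appsignals_client = boto3.client('application-signals', region_name=AWS_REGION, config=_config)
cloudwatch_client = boto3.client('cloudwatch', region_name=AWS_REGION, config=_config)
xray_client = boto3.client('xray', region_name=AWS_REGION, config=_config)
```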
@@ -83,16 +90,15 @@ async def list_monitored_services() -> str:
     logger.debug('Starting list_application_signals_services request')
 
     try:
-        appsignals = boto3.client('application-signals', region_name=AWS_REGION)
-        logger.debug('Application Signals client created')
-
         # Calculate time range (last 24 hours)
         end_time = datetime.now(timezone.utc)
         start_time = end_time - timedelta(hours=24)
 
         # Get all services
         logger.debug(f'Querying services for time range: {start_time} to {end_time}')
-        response =
+        response = appsignals_client.list_services(
+            StartTime=start_time, EndTime=end_time, MaxResults=100
+        )
         services = response.get('ServiceSummaries', [])
         logger.debug(f'Retrieved {len(services)} services from Application Signals')
 
@@ -161,15 +167,12 @@ async def get_service_detail(
     logger.debug(f'Starting get_service_healthy_detail request for service: {service_name}')
 
     try:
-        appsignals = boto3.client('application-signals', region_name=AWS_REGION)
-        logger.debug('Application Signals client created')
-
         # Calculate time range (last 24 hours)
         end_time = datetime.now(timezone.utc)
         start_time = end_time - timedelta(hours=24)
 
         # First, get all services to find the one we want
-        services_response =
+        services_response = appsignals_client.list_services(
             StartTime=start_time, EndTime=end_time, MaxResults=100
         )
 
@@ -187,7 +190,7 @@ async def get_service_detail(
 
         # Get detailed service information
         logger.debug(f'Getting detailed information for service: {service_name}')
-        service_response =
+        service_response = appsignals_client.get_service(
             StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
         )
 
@@ -255,6 +258,1078 @@ async def get_service_detail(
         return f'Error: {str(e)}'
 
 
+@mcp.tool()
+async def query_service_metrics(
+    service_name: str = Field(
+        ..., description='Name of the service to get metrics for (case-sensitive)'
+    ),
+    metric_name: str = Field(
+        ...,
+        description='Specific metric name (e.g., Latency, Error, Fault). Leave empty to list available metrics',
+    ),
+    statistic: str = Field(
+        default='Average',
+        description='Standard statistic type (Average, Sum, Maximum, Minimum, SampleCount)',
+    ),
+    extended_statistic: str = Field(
+        default='p99', description='Extended statistic (p99, p95, p90, p50, etc)'
+    ),
+    hours: int = Field(
+        default=1, description='Number of hours to look back (default 1, max 168 for 1 week)'
+    ),
+) -> str:
+    """Get CloudWatch metrics for a specific Application Signals service.
+
+    Use this tool to:
+    - Analyze service performance (latency, throughput)
+    - Check error rates and reliability
+    - View trends over time
+    - Get both standard statistics (Average, Max) and percentiles (p99, p95)
+
+    Common metric names:
+    - 'Latency': Response time in milliseconds
+    - 'Error': Percentage of failed requests
+    - 'Fault': Percentage of server errors (5xx)
+
+    Returns:
+    - Summary statistics (latest, average, min, max)
+    - Recent data points with timestamps
+    - Both standard and percentile values when available
+
+    The tool automatically adjusts the granularity based on time range:
+    - Up to 3 hours: 1-minute resolution
+    - Up to 24 hours: 5-minute resolution
+    - Over 24 hours: 1-hour resolution
+    """
+    start_time_perf = timer()
+    logger.info(
+        f'Starting query_service_metrics request - service: {service_name}, metric: {metric_name}, hours: {hours}'
+    )
+
+    try:
+        # Calculate time range
+        end_time = datetime.now(timezone.utc)
+        start_time = end_time - timedelta(hours=hours)
+
+        # Get service details to find metrics
+        services_response = appsignals_client.list_services(
+            StartTime=start_time, EndTime=end_time, MaxResults=100
+        )
+
+        # Find the target service
+        target_service = None
+        for service in services_response.get('ServiceSummaries', []):
+            key_attrs = service.get('KeyAttributes', {})
+            if key_attrs.get('Name') == service_name:
+                target_service = service
+                break
+
+        if not target_service:
+            logger.warning(f"Service '{service_name}' not found in Application Signals")
+            return f"Service '{service_name}' not found in Application Signals."
+
+        # Get detailed service info for metric references
+        service_response = appsignals_client.get_service(
+            StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
+        )
+
+        metric_refs = service_response['Service'].get('MetricReferences', [])
+
+        if not metric_refs:
+            logger.warning(f"No metrics found for service '{service_name}'")
+            return f"No metrics found for service '{service_name}'."
+
+        # If no specific metric requested, show available metrics
+        if not metric_name:
+            result = f"Available metrics for service '{service_name}':\n\n"
+            for metric in metric_refs:
+                result += f'• {metric.get("MetricName", "Unknown")}\n'
+                result += f'  Namespace: {metric.get("Namespace", "Unknown")}\n'
+                result += f'  Type: {metric.get("MetricType", "Unknown")}\n'
+                result += '\n'
+            return result
+
+        # Find the specific metric
+        target_metric = None
+        for metric in metric_refs:
+            if metric.get('MetricName') == metric_name:
+                target_metric = metric
+                break
+
+        if not target_metric:
+            available = [m.get('MetricName', 'Unknown') for m in metric_refs]
+            return f"Metric '{metric_name}' not found for service '{service_name}'. Available: {', '.join(available)}"
+
+        # Calculate appropriate period based on time range
+        if hours <= 3:
+            period = 60  # 1 minute
+        elif hours <= 24:
+            period = 300  # 5 minutes
+        else:
+            period = 3600  # 1 hour
+
+        # Get both standard and extended statistics in a single call
+        response = cloudwatch_client.get_metric_statistics(
+            Namespace=target_metric['Namespace'],
+            MetricName=target_metric['MetricName'],
+            Dimensions=target_metric.get('Dimensions', []),
+            StartTime=start_time,
+            EndTime=end_time,
+            Period=period,
+            Statistics=[statistic],  # type: ignore
+            ExtendedStatistics=[extended_statistic],
+        )
+
+        datapoints = response.get('Datapoints', [])
+
+        if not datapoints:
+            logger.warning(
+                f"No data points found for metric '{metric_name}' on service '{service_name}' in the last {hours} hour(s)"
+            )
+            return f"No data points found for metric '{metric_name}' on service '{service_name}' in the last {hours} hour(s)."
+
+        # Sort by timestamp
+        datapoints.sort(key=lambda x: x.get('Timestamp', datetime.min))  # type: ignore
+
+        # Build response
+        result = f'Metrics for {service_name} - {metric_name}\n'
+        result += f'Time Range: Last {hours} hour(s)\n'
+        result += f'Period: {period} seconds\n\n'
+
+        # Calculate summary statistics for both standard and extended statistics
+        standard_values = [dp.get(statistic) for dp in datapoints if dp.get(statistic) is not None]
+        extended_values = [
+            dp.get(extended_statistic)
+            for dp in datapoints
+            if dp.get(extended_statistic) is not None
+        ]
+
+        result += 'Summary:\n'
+
+        if standard_values:
+            latest_standard = datapoints[-1].get(statistic)
+            avg_of_standard = sum(standard_values) / len(standard_values)  # type: ignore
+            max_standard = max(standard_values)  # type: ignore
+            min_standard = min(standard_values)  # type: ignore
+
+            result += f'{statistic} Statistics:\n'
+            result += f'• Latest: {latest_standard:.2f}\n'
+            result += f'• Average: {avg_of_standard:.2f}\n'
+            result += f'• Maximum: {max_standard:.2f}\n'
+            result += f'• Minimum: {min_standard:.2f}\n\n'
+
+        if extended_values:
+            latest_extended = datapoints[-1].get(extended_statistic)
+            avg_extended = sum(extended_values) / len(extended_values)  # type: ignore
+            max_extended = max(extended_values)  # type: ignore
+            min_extended = min(extended_values)  # type: ignore
+
+            result += f'{extended_statistic} Statistics:\n'
+            result += f'• Latest: {latest_extended:.2f}\n'
+            result += f'• Average: {avg_extended:.2f}\n'
+            result += f'• Maximum: {max_extended:.2f}\n'
+            result += f'• Minimum: {min_extended:.2f}\n\n'
+
+        result += f'• Data Points: {len(datapoints)}\n\n'
+
+        # Show recent values (last 10) with both metrics
+        result += 'Recent Values:\n'
+        for dp in datapoints[-10:]:
+            timestamp = dp.get('Timestamp', datetime.min).strftime('%m/%d %H:%M')  # type: ignore
+            unit = dp.get('Unit', '')
+
+            values_str = []
+            if dp.get(statistic) is not None:
+                values_str.append(f'{statistic}: {dp[statistic]:.2f}')
+            if dp.get(extended_statistic) is not None:
+                values_str.append(f'{extended_statistic}: {dp[extended_statistic]:.2f}')
+
+            result += f'• {timestamp}: {", ".join(values_str)} {unit}\n'
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(
+            f"query_service_metrics completed for '{service_name}/{metric_name}' in {elapsed_time:.3f}s"
+        )
+        return result
+
+    except ClientError as e:
+        error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
+        error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+        logger.error(
+            f"AWS ClientError in query_service_metrics for '{service_name}/{metric_name}': {error_code} - {error_msg}"
+        )
+        return f'AWS Error: {error_msg}'
+    except Exception as e:
+        logger.error(
+            f"Unexpected error in query_service_metrics for '{service_name}/{metric_name}': {str(e)}",
+            exc_info=True,
+        )
+        return f'Error: {str(e)}'
+
+
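The tool above resolves a service's metric references through Application Signals and then reads datapoints from CloudWatch with an adaptive period. A minimal standalone sketch of that datapoint query with the same granularity tiers (the namespace, metric name, and dimension are placeholders, not values the package guarantees):

```python
from datetime import datetime, timedelta, timezone

import boto3

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')

hours = 6
end = datetime.now(timezone.utc)
start = end - timedelta(hours=hours)

# Same tiers as the tool: finer periods for shorter windows.
period = 60 if hours <= 3 else 300 if hours <= 24 else 3600

# Percentiles are requested via ExtendedStatistics; plain aggregates
# (Average, Sum, ...) go through the Statistics parameter instead.
resp = cloudwatch.get_metric_statistics(
    Namespace='ApplicationSignals',                       # assumed namespace
    MetricName='Latency',                                 # assumed metric
    Dimensions=[{'Name': 'Service', 'Value': 'my-svc'}],  # placeholder dimension
    StartTime=start,
    EndTime=end,
    Period=period,
    ExtendedStatistics=['p99'],
)
for dp in sorted(resp['Datapoints'], key=lambda d: d['Timestamp']):
    print(dp['Timestamp'], dp['ExtendedStatistics']['p99'])
```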
+def get_trace_summaries_paginated(
+    xray_client, start_time, end_time, filter_expression, max_traces: int = 100
+) -> list:
+    """Get trace summaries with pagination to avoid exceeding response size limits.
+
+    Args:
+        xray_client: Boto3 X-Ray client
+        start_time: Start time for trace query
+        end_time: End time for trace query
+        filter_expression: X-Ray filter expression
+        max_traces: Maximum number of traces to retrieve (default 100)
+
+    Returns:
+        List of trace summaries
+    """
+    all_traces = []
+    next_token = None
+    logger.debug(
+        f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
+    )
+
+    try:
+        while len(all_traces) < max_traces:
+            # Build request parameters
+            kwargs = {
+                'StartTime': start_time,
+                'EndTime': end_time,
+                'FilterExpression': filter_expression,
+                'Sampling': True,
+                'TimeRangeType': 'Service',
+            }
+
+            if next_token:
+                kwargs['NextToken'] = next_token
+
+            # Make request
+            response = xray_client.get_trace_summaries(**kwargs)
+
+            # Add traces from this page
+            traces = response.get('TraceSummaries', [])
+            all_traces.extend(traces)
+            logger.debug(
+                f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
+            )
+
+            # Check if we have more pages
+            next_token = response.get('NextToken')
+            if not next_token:
+                break
+
+            # If we've collected enough traces, stop
+            if len(all_traces) >= max_traces:
+                all_traces = all_traces[:max_traces]
+                break
+
+        logger.info(f'Successfully retrieved {len(all_traces)} traces')
+        return all_traces
+
+    except Exception as e:
+        # Return what we have so far if there's an error
+        logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
+        logger.info(f'Returning {len(all_traces)} traces retrieved before error')
+        return all_traces
+
+
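A usage sketch for the pagination helper above, assuming the module-level clients initialized earlier (the service name, filter, and window are illustrative):

```python
from datetime import datetime, timedelta, timezone

import boto3

xray = boto3.client('xray', region_name='us-east-1')
end = datetime.now(timezone.utc)
start = end - timedelta(hours=3)

# Collect at most 50 fault traces; the helper follows NextToken pages
# internally and returns whatever it has collected if a page fails.
traces = get_trace_summaries_paginated(
    xray, start, end, 'service("my-service"){fault = true}', max_traces=50
)
print(f'{len(traces)} sampled fault traces')
```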
+@mcp.tool()
+async def get_slo(
+    slo_id: str = Field(..., description='The ARN or name of the SLO to retrieve'),
+) -> str:
+    """Get detailed information about a specific Service Level Objective (SLO).
+
+    Use this tool to:
+    - Get comprehensive SLO configuration details
+    - Understand what metrics the SLO monitors
+    - See threshold values and comparison operators
+    - Extract operation names and key attributes for trace queries
+    - Identify dependency configurations
+    - Review attainment goals and burn rate settings
+
+    Returns detailed information including:
+    - SLO name, description, and metadata
+    - Metric configuration (for period-based or request-based SLOs)
+    - Key attributes and operation names
+    - Metric type (LATENCY or AVAILABILITY)
+    - Threshold values and comparison operators
+    - Goal configuration (attainment percentage, time interval)
+    - Burn rate configurations
+
+    This tool is essential for:
+    - Understanding why an SLO was breached
+    - Getting the exact operation name to query traces
+    - Identifying the metrics and thresholds being monitored
+    - Planning remediation based on SLO configuration
+    """
+    start_time_perf = timer()
+    logger.info(f'Starting get_service_level_objective request for SLO: {slo_id}')
+
+    try:
+        response = appsignals_client.get_service_level_objective(Id=slo_id)
+        slo = response.get('Slo', {})
+
+        if not slo:
+            logger.warning(f'No SLO found with ID: {slo_id}')
+            return f'No SLO found with ID: {slo_id}'
+
+        result = 'Service Level Objective Details\n'
+        result += '=' * 50 + '\n\n'
+
+        # Basic info
+        result += f'Name: {slo.get("Name", "Unknown")}\n'
+        result += f'ARN: {slo.get("Arn", "Unknown")}\n'
+        if slo.get('Description'):
+            result += f'Description: {slo.get("Description", "")}\n'
+        result += f'Evaluation Type: {slo.get("EvaluationType", "Unknown")}\n'
+        result += f'Created: {slo.get("CreatedTime", "Unknown")}\n'
+        result += f'Last Updated: {slo.get("LastUpdatedTime", "Unknown")}\n\n'
+
+        # Goal configuration
+        goal = slo.get('Goal', {})
+        if goal:
+            result += 'Goal Configuration:\n'
+            result += f'• Attainment Goal: {goal.get("AttainmentGoal", 99)}%\n'
+            result += f'• Warning Threshold: {goal.get("WarningThreshold", 50)}%\n'
+
+            interval = goal.get('Interval', {})
+            if 'RollingInterval' in interval:
+                rolling = interval['RollingInterval']
+                result += f'• Interval: Rolling {rolling.get("Duration")} {rolling.get("DurationUnit")}\n'
+            elif 'CalendarInterval' in interval:
+                calendar = interval['CalendarInterval']
+                result += f'• Interval: Calendar {calendar.get("Duration")} {calendar.get("DurationUnit")} starting {calendar.get("StartTime")}\n'
+            result += '\n'
+
+        # Period-based SLI
+        if 'Sli' in slo:
+            sli = slo['Sli']
+            result += 'Period-Based SLI Configuration:\n'
+
+            sli_metric = sli.get('SliMetric', {})
+            if sli_metric:
+                # Key attributes - crucial for trace queries
+                key_attrs = sli_metric.get('KeyAttributes', {})
+                if key_attrs:
+                    result += '• Key Attributes:\n'
+                    for k, v in key_attrs.items():
+                        result += f'  - {k}: {v}\n'
+
+                # Operation name - essential for trace filtering
+                if sli_metric.get('OperationName'):
+                    result += f'• Operation Name: {sli_metric.get("OperationName", "")}\n'
+                    result += f'  (Use this in trace queries: annotation[aws.local.operation]="{sli_metric.get("OperationName", "")}")\n'
+
+                result += f'• Metric Type: {sli_metric.get("MetricType", "Unknown")}\n'
+
+                # MetricDataQueries - detailed metric configuration
+                metric_queries = sli_metric.get('MetricDataQueries', [])
+                if metric_queries:
+                    result += '• Metric Data Queries:\n'
+                    for query in metric_queries:
+                        query_id = query.get('Id', 'Unknown')
+                        result += f'  Query ID: {query_id}\n'
+
+                        # MetricStat details
+                        metric_stat = query.get('MetricStat', {})
+                        if metric_stat:
+                            metric = metric_stat.get('Metric', {})
+                            if metric:
+                                result += f'    Namespace: {metric.get("Namespace", "Unknown")}\n'
+                                result += (
+                                    f'    MetricName: {metric.get("MetricName", "Unknown")}\n'
+                                )
+
+                                # Dimensions - crucial for understanding what's being measured
+                                dimensions = metric.get('Dimensions', [])
+                                if dimensions:
+                                    result += '    Dimensions:\n'
+                                    for dim in dimensions:
+                                        result += f'      - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
+
+                            result += (
+                                f'    Period: {metric_stat.get("Period", "Unknown")} seconds\n'
+                            )
+                            result += f'    Stat: {metric_stat.get("Stat", "Unknown")}\n'
+                            if metric_stat.get('Unit'):
+                                result += f'    Unit: {metric_stat["Unit"]}\n'  # type: ignore
+
+                        # Expression if present
+                        if query.get('Expression'):
+                            result += f'    Expression: {query.get("Expression", "")}\n'
+
+                        result += f'    ReturnData: {query.get("ReturnData", True)}\n'
+
+                # Dependency config
+                dep_config = sli_metric.get('DependencyConfig', {})
+                if dep_config:
+                    result += '• Dependency Configuration:\n'
+                    dep_attrs = dep_config.get('DependencyKeyAttributes', {})
+                    if dep_attrs:
+                        result += '  Key Attributes:\n'
+                        for k, v in dep_attrs.items():
+                            result += f'    - {k}: {v}\n'
+                    if dep_config.get('DependencyOperationName'):
+                        result += (
+                            f'  - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
+                        )
+                        result += f'    (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
+
+            result += f'• Threshold: {sli.get("MetricThreshold", "Unknown")}\n'
+            result += f'• Comparison: {sli.get("ComparisonOperator", "Unknown")}\n\n'
+
+        # Request-based SLI
+        if 'RequestBasedSli' in slo:
+            rbs = slo['RequestBasedSli']
+            result += 'Request-Based SLI Configuration:\n'
+
+            rbs_metric = rbs.get('RequestBasedSliMetric', {})
+            if rbs_metric:
+                # Key attributes
+                key_attrs = rbs_metric.get('KeyAttributes', {})
+                if key_attrs:
+                    result += '• Key Attributes:\n'
+                    for k, v in key_attrs.items():
+                        result += f'  - {k}: {v}\n'
+
+                # Operation name
+                if rbs_metric.get('OperationName'):
+                    result += f'• Operation Name: {rbs_metric.get("OperationName", "")}\n'
+                    result += f'  (Use this in trace queries: annotation[aws.local.operation]="{rbs_metric.get("OperationName", "")}")\n'
+
+                result += f'• Metric Type: {rbs_metric.get("MetricType", "Unknown")}\n'
+
+                # MetricDataQueries - detailed metric configuration
+                metric_queries = rbs_metric.get('MetricDataQueries', [])
+                if metric_queries:
+                    result += '• Metric Data Queries:\n'
+                    for query in metric_queries:
+                        query_id = query.get('Id', 'Unknown')
+                        result += f'  Query ID: {query_id}\n'
+
+                        # MetricStat details
+                        metric_stat = query.get('MetricStat', {})
+                        if metric_stat:
+                            metric = metric_stat.get('Metric', {})
+                            if metric:
+                                result += f'    Namespace: {metric.get("Namespace", "Unknown")}\n'
+                                result += (
+                                    f'    MetricName: {metric.get("MetricName", "Unknown")}\n'
+                                )
+
+                                # Dimensions - crucial for understanding what's being measured
+                                dimensions = metric.get('Dimensions', [])
+                                if dimensions:
+                                    result += '    Dimensions:\n'
+                                    for dim in dimensions:
+                                        result += f'      - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
+
+                            result += (
+                                f'    Period: {metric_stat.get("Period", "Unknown")} seconds\n'
+                            )
+                            result += f'    Stat: {metric_stat.get("Stat", "Unknown")}\n'
+                            if metric_stat.get('Unit'):
+                                result += f'    Unit: {metric_stat["Unit"]}\n'  # type: ignore
+
+                        # Expression if present
+                        if query.get('Expression'):
+                            result += f'    Expression: {query.get("Expression", "")}\n'
+
+                        result += f'    ReturnData: {query.get("ReturnData", True)}\n'
+
+                # Dependency config
+                dep_config = rbs_metric.get('DependencyConfig', {})
+                if dep_config:
+                    result += '• Dependency Configuration:\n'
+                    dep_attrs = dep_config.get('DependencyKeyAttributes', {})
+                    if dep_attrs:
+                        result += '  Key Attributes:\n'
+                        for k, v in dep_attrs.items():
+                            result += f'    - {k}: {v}\n'
+                    if dep_config.get('DependencyOperationName'):
+                        result += (
+                            f'  - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
+                        )
+                        result += f'    (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
+
+            result += f'• Threshold: {rbs.get("MetricThreshold", "Unknown")}\n'
+            result += f'• Comparison: {rbs.get("ComparisonOperator", "Unknown")}\n\n'
+
+        # Burn rate configurations
+        burn_rates = slo.get('BurnRateConfigurations', [])
+        if burn_rates:
+            result += 'Burn Rate Configurations:\n'
+            for br in burn_rates:
+                result += f'• Look-back window: {br.get("LookBackWindowMinutes")} minutes\n'
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(f"get_service_level_objective completed for '{slo_id}' in {elapsed_time:.3f}s")
+        return result
+
+    except ClientError as e:
+        error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
+        error_code = e.response.get('Error', {}).get('Code', 'Unknown')
+        logger.error(
+            f"AWS ClientError in get_service_level_objective for '{slo_id}': {error_code} - {error_msg}"
+        )
+        return f'AWS Error: {error_msg}'
+    except Exception as e:
+        logger.error(
+            f"Unexpected error in get_service_level_objective for '{slo_id}': {str(e)}",
+            exc_info=True,
+        )
+        return f'Error: {str(e)}'
+
+
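The operation names and thresholds surfaced by get_slo() are meant to be dropped straight into X-Ray filter expressions, as its docstring suggests. A hedged sketch of that hand-off (the service, operation, and threshold are placeholders, not real SLO output):

```python
# Sketch: turn fields reported by get_slo() into the trace filters
# its docstring recommends. All values below are placeholders.
service_name = 'orders-service'
operation = 'POST /checkout'
latency_threshold_seconds = 2.0

availability_filter = (
    f'service("{service_name}"){{fault = true}} '
    f'AND annotation[aws.local.operation]="{operation}"'
)
latency_filter = (
    f'service("{service_name}") '
    f'AND annotation[aws.local.operation]="{operation}" '
    f'AND duration > {latency_threshold_seconds}'
)
print(availability_filter)
print(latency_filter)
```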
+@mcp.tool()
+async def search_transaction_spans(
+    log_group_name: str = Field(
+        default='',
+        description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
+    ),
+    start_time: str = Field(
+        default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
+    ),
+    end_time: str = Field(
+        default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
+    ),
+    query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
+    limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
+    max_timeout: int = Field(
+        default=30, description='Maximum time in seconds to wait for query completion'
+    ),
+) -> Dict:
+    """Executes a CloudWatch Logs Insights query for transaction search (100% sampled trace data).
+
+    IMPORTANT: If log_group_name is not provided, use 'aws/spans' as the default CloudWatch log group name.
+    The volume of returned logs can easily overwhelm the agent context window. Always include a limit in the query
+    (| limit 50) or use the limit parameter.
+
+    Usage:
+    The "aws/spans" log group stores OpenTelemetry span data with many attributes for all monitored services.
+    This provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
+    Users can write CloudWatch Logs Insights queries to group and aggregate attributes with sum, avg, etc.
+
+    ```
+    FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
+    | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `avg_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`, bin(1h)
+    | DISPLAY avg_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
+    ```
+
+    Returns:
+    --------
+    A dictionary containing the final query results, including:
+    - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed, etc.)
+    - results: A list of the actual query results if the status is Complete.
+    - statistics: Query performance statistics
+    - messages: Any informational messages about the query
+    - transaction_search_status: Information about transaction search availability
+    """
+    start_time_perf = timer()
+    logger.info(
+        f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
+    )
+    logger.debug(f'Query string: {query_string}')
+
+    # Check if transaction search is enabled
+    is_enabled, destination, status = check_transaction_search_enabled(AWS_REGION)
+
+    if not is_enabled:
+        logger.warning(
+            f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
+        )
+        return {
+            'status': 'Transaction Search Not Available',
+            'transaction_search_status': {
+                'enabled': False,
+                'destination': destination,
+                'status': status,
+            },
+            'message': (
+                '⚠️ Transaction Search is not enabled for this account. '
+                f'Current configuration: Destination={destination}, Status={status}. '
+                "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
+                'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
+                'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
+                'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
+            ),
+            'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
+        }
+
+    try:
+        # Use default log group if none provided
+        if log_group_name is None:
+            log_group_name = 'aws/spans'
+            logger.debug('Using default log group: aws/spans')
+
+        # Start query
+        kwargs = {
+            'startTime': int(datetime.fromisoformat(start_time).timestamp()),
+            'endTime': int(datetime.fromisoformat(end_time).timestamp()),
+            'queryString': query_string,
+            'logGroupNames': [log_group_name],
+            'limit': limit,
+        }
+
+        logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
+        start_response = logs_client.start_query(**remove_null_values(kwargs))
+        query_id = start_response['queryId']
+        logger.info(f'Started CloudWatch Logs query with ID: {query_id}')
+
+        # Seconds
+        poll_start = timer()
+        while poll_start + max_timeout > timer():
+            response = logs_client.get_query_results(queryId=query_id)
+            status = response['status']
+
+            if status in {'Complete', 'Failed', 'Cancelled'}:
+                elapsed_time = timer() - start_time_perf
+                logger.info(
+                    f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
+                )
+
+                if status == 'Failed':
+                    logger.error(f'Query failed: {response.get("statistics", {})}')
+                elif status == 'Complete':
+                    logger.debug(f'Query returned {len(response.get("results", []))} results')
+
+                return {
+                    'queryId': query_id,
+                    'status': status,
+                    'statistics': response.get('statistics', {}),
+                    'results': [
+                        {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
+                        for line in response.get('results', [])
+                    ],
+                    'transaction_search_status': {
+                        'enabled': True,
+                        'destination': 'CloudWatchLogs',
+                        'status': 'ACTIVE',
+                        'message': '✅ Using 100% sampled trace data from Transaction Search',
+                    },
+                }
+
+            await asyncio.sleep(1)
+
+        elapsed_time = timer() - start_time_perf
+        msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to try again to retrieve query results.'
+        logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
+        return {
+            'queryId': query_id,
+            'status': 'Polling Timeout',
+            'message': msg,
+        }
+
+    except Exception as e:
+        logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
+        raise
+
+
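A minimal standalone sketch of the same start-then-poll flow against the `aws/spans` log group (the service name and query are placeholders; only the boto3 Logs Insights calls used by the tool above are assumed):

```python
import time
from datetime import datetime, timedelta, timezone

import boto3

logs = boto3.client('logs', region_name='us-east-1')
end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)

# Keep result sets small, per the docstring's advice to always bound
# the query (`| limit 25`). The service name is a placeholder.
query = (
    'FILTER attributes.aws.local.service = "my-service"\n'
    '| STATS count(*) as spans by attributes.aws.local.operation\n'
    '| limit 25'
)

qid = logs.start_query(
    logGroupNames=['aws/spans'],
    startTime=int(start.timestamp()),
    endTime=int(end.timestamp()),
    queryString=query,
)['queryId']

# Poll until the query reaches a terminal state.
while True:
    resp = logs.get_query_results(queryId=qid)
    if resp['status'] in {'Complete', 'Failed', 'Cancelled'}:
        break
    time.sleep(1)
print(resp['status'], resp.get('results', []))
```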
+@mcp.tool()
+async def list_slis(
+    hours: int = Field(
+        default=24,
+        description='Number of hours to look back (default 24, typically use 24 for daily checks)',
+    ),
+) -> str:
+    """Get SLI (Service Level Indicator) status and SLO compliance for all services.
+
+    Use this tool to:
+    - Check overall system health at a glance
+    - Identify services with breached SLOs (Service Level Objectives)
+    - See which specific SLOs are failing
+    - Prioritize which services need immediate attention
+    - Monitor SLO compliance trends
+
+    Returns a comprehensive report showing:
+    - Summary counts (total, healthy, breached, insufficient data)
+    - Detailed list of breached services with:
+      - Service name and environment
+      - Number and names of breached SLOs
+      - Specific SLO violations
+    - List of healthy services
+    - Services with insufficient data
+
+    This is the primary tool for health monitoring and should be used:
+    - At the start of each day
+    - During incident response
+    - For regular health checks
+    - When investigating the root cause of an SLO breach
+
+    Status meanings:
+    - OK: All SLOs are being met
+    - BREACHED: One or more SLOs are violated
+    - INSUFFICIENT_DATA: Not enough data to determine status
+
+    To investigate breached SLOs, follow these steps:
+    1. Call get_service_level_objective() with the SLO name to get detailed SLI data, including metric statistics
+    2. Find the fault metrics from the SLI under the breached SLO
+    3. Build trace query filters using metric dimensions (Operation, RemoteOperation, etc.):
+       - For availability: `service("service-name"){fault = true} AND annotation[aws.local.operation]="operation-name"`
+       - For latency: `service("service-name") AND annotation[aws.local.operation]="operation-name" AND duration > threshold`
+    4. Query traces:
+       - If Transaction Search is enabled: use search_transaction_spans() for 100% trace visibility
+       - If not enabled: use query_sampled_traces() with X-Ray (only 5% sampled data - may miss issues)
+    5. The query time window should default to the last 3 hours if not specified; the maximum window length is 6 hours
+    6. Analyze the root causes from Exception data in traces
+    7. Include findings in the report and give fix and mitigation suggestions
+    """
+    start_time_perf = timer()
+    logger.info(f'Starting get_sli_status request for last {hours} hours')
+
+    try:
+        # Calculate time range
+        end_time = datetime.now(timezone.utc)
+        start_time = end_time - timedelta(hours=hours)
+        logger.debug(f'Time range: {start_time} to {end_time}')
+
+        # Get all services
+        services_response = appsignals_client.list_services(
+            StartTime=start_time,  # type: ignore
+            EndTime=end_time,  # type: ignore
+            MaxResults=100,
+        )
+        services = services_response.get('ServiceSummaries', [])
+
+        if not services:
+            logger.warning('No services found in Application Signals')
+            return 'No services found in Application Signals.'
+
+        # Get SLI reports for each service
+        reports = []
+        logger.debug(f'Generating SLI reports for {len(services)} services')
+        for service in services:
+            service_name = service['KeyAttributes'].get('Name', 'Unknown')
+            try:
+                # Create custom config with the service's key attributes
+                config = AWSConfig(
+                    region='us-east-1',
+                    period_in_hours=hours,
+                    service_name=service_name,
+                    key_attributes=service['KeyAttributes'],
+                )
+
+                # Generate SLI report
+                client = SLIReportClient(config)
+                sli_report = client.generate_sli_report()
+
+                # Convert to expected format
+                report = {
+                    'BreachedSloCount': sli_report.breached_slo_count,
+                    'BreachedSloNames': sli_report.breached_slo_names,
+                    'EndTime': sli_report.end_time.timestamp(),
+                    'OkSloCount': sli_report.ok_slo_count,
+                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                    'SliStatus': 'BREACHED'
+                    if sli_report.sli_status == 'CRITICAL'
+                    else sli_report.sli_status,
+                    'StartTime': sli_report.start_time.timestamp(),
+                    'TotalSloCount': sli_report.total_slo_count,
+                }
+                reports.append(report)
+
+            except Exception as e:
+                # Log error but continue with other services
+                logger.error(
+                    f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
+                )
+                # Add a report with insufficient data status
+                report = {
+                    'BreachedSloCount': 0,
+                    'BreachedSloNames': [],
+                    'EndTime': end_time.timestamp(),
+                    'OkSloCount': 0,
+                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                    'SliStatus': 'INSUFFICIENT_DATA',
+                    'StartTime': start_time.timestamp(),
+                    'TotalSloCount': 0,
+                }
+                reports.append(report)
+
+        # Check transaction search status
+        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(
+            AWS_REGION
+        )
+
+        # Build response
+        result = f'SLI Status Report - Last {hours} hours\n'
+        result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'
+
+        # Add transaction search status
+        if is_tx_search_enabled:
+            result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
+        else:
+            result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
+            result += f'   Current config: Destination={tx_destination}, Status={tx_status}\n'
+            result += '   Enable Transaction Search for accurate root cause analysis\n\n'
+
+        # Count by status
+        status_counts = {
+            'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
+            'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
+            'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
+        }
+
+        result += 'Summary:\n'
+        result += f'• Total Services: {len(reports)}\n'
+        result += f'• Healthy (OK): {status_counts["OK"]}\n'
+        result += f'• Breached: {status_counts["BREACHED"]}\n'
+        result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'
+
+        # Group by status
+        if status_counts['BREACHED'] > 0:
+            result += '⚠️ BREACHED SERVICES:\n'
+            for report in reports:
+                if report['SliStatus'] == 'BREACHED':
+                    name = report['ReferenceId']['KeyAttributes']['Name']
+                    env = report['ReferenceId']['KeyAttributes']['Environment']
+                    breached_count = report['BreachedSloCount']
+                    total_count = report['TotalSloCount']
+                    breached_names = report['BreachedSloNames']
+
+                    result += f'\n• {name} ({env})\n'
+                    result += f'  SLOs: {breached_count}/{total_count} breached\n'
+                    if breached_names:
+                        result += '  Breached SLOs:\n'
+                        for slo_name in breached_names:
+                            result += f'    - {slo_name}\n'
+
+        if status_counts['OK'] > 0:
+            result += '\n✅ HEALTHY SERVICES:\n'
+            for report in reports:
+                if report['SliStatus'] == 'OK':
+                    name = report['ReferenceId']['KeyAttributes']['Name']
+                    env = report['ReferenceId']['KeyAttributes']['Environment']
+                    ok_count = report['OkSloCount']
+
+                    result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'
+
+        if status_counts['INSUFFICIENT_DATA'] > 0:
+            result += '\n❓ INSUFFICIENT DATA:\n'
+            for report in reports:
+                if report['SliStatus'] == 'INSUFFICIENT_DATA':
+                    name = report['ReferenceId']['KeyAttributes']['Name']
+                    env = report['ReferenceId']['KeyAttributes']['Environment']
+
+                    result += f'• {name} ({env})\n'
+
+        # Remove the auto-investigation feature
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(
+            f'get_sli_status completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
+        )
+        return result
+
+    except Exception as e:
+        logger.error(f'Error in get_sli_status: {str(e)}', exc_info=True)
+        return f'Error getting SLI status: {str(e)}'
+
+
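A small sketch of working with the per-service report dicts that list_slis() assembles internally, collecting the breached SLO names to hand to get_slo() next (the data below is placeholder values in the same shape the tool builds):

```python
# Placeholder report in the shape list_slis() constructs above.
reports = [
    {
        'SliStatus': 'BREACHED',
        'BreachedSloNames': ['orders-latency-slo'],
        'ReferenceId': {'KeyAttributes': {'Name': 'orders-service', 'Environment': 'eks:demo'}},
    },
]

# Pair each breached SLO name with its service for follow-up queries.
to_investigate = [
    (r['ReferenceId']['KeyAttributes']['Name'], slo_name)
    for r in reports
    if r['SliStatus'] == 'BREACHED'
    for slo_name in r['BreachedSloNames']
]
print(to_investigate)  # [('orders-service', 'orders-latency-slo')]
```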
+def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
+    """Internal function to check if AWS X-Ray Transaction Search is enabled.
+
+    Returns:
+        tuple: (is_enabled: bool, destination: str, status: str)
+    """
+    try:
+        response = xray_client.get_trace_segment_destination()
+
+        destination = response.get('Destination', 'Unknown')
+        status = response.get('Status', 'Unknown')
+
+        is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
+        logger.debug(
+            f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
+        )
+
+        return is_enabled, destination, status
+
+    except Exception as e:
+        logger.error(f'Error checking transaction search status: {str(e)}')
+        return False, 'Unknown', 'Error'
+
+
+@mcp.tool()
+async def query_sampled_traces(
+    start_time: Optional[str] = Field(
+        default=None,
+        description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
+    ),
+    end_time: Optional[str] = Field(
+        default=None,
+        description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
+    ),
+    filter_expression: Optional[str] = Field(
+        default=None,
+        description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
+    ),
+    region: str = Field(default='us-east-1', description='AWS region (default: us-east-1)'),
+) -> str:
+    """Query AWS X-Ray traces (5% sampled data) to investigate errors and performance issues.
+
+    ⚠️ IMPORTANT: This tool uses X-Ray's 5% sampled trace data. For 100% trace visibility,
+    enable Transaction Search and use search_transaction_spans() instead.
+
+    Use this tool to:
+    - Find root causes of errors and faults (with 5% sampling limitations)
+    - Analyze request latency and identify bottlenecks
+    - Understand requests across multiple services with traces
+    - Debug timeout and dependency issues
+    - Understand service-to-service interactions
+    - Find customer impact from trace results, such as Users data or trace attributes such as owner id
+
+    Common filter expressions:
+    - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
+    - 'service("service-name")': Filter by specific service
+    - 'duration > 5': Find slow requests (over 5 seconds)
+    - 'http.status = 500': Find specific HTTP status codes
+    - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
+    - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
+    - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'
+
+    IMPORTANT: When investigating SLO breaches, use annotation filters with the specific dimension values
+    from the breached metric (e.g., Operation, RemoteOperation) to find traces for that exact operation.
+
+    Returns JSON with trace summaries including:
+    - Trace ID for detailed investigation
+    - Duration and response time
+    - Error/fault/throttle status
+    - HTTP information (method, status, URL)
+    - Service interactions
+    - User information if available
+    - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)
+
+    Best practices:
+    - Start with recent time windows (last 1-3 hours)
+    - Use filter expressions to narrow down issues; query Fault and Error traces first
+    - Look for patterns in errors or very slow requests
+
+    Returns:
+        JSON string containing trace summaries with error status, duration, and service details
+    """
+    start_time_perf = timer()
+    logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')
+
+    try:
+        logger.debug('Using X-Ray client')
+
+        # Default to past 3 hours if times not provided
+        if not end_time:
+            end_datetime = datetime.now(timezone.utc)
+        else:
+            end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
+
+        if not start_time:
+            start_datetime = end_datetime - timedelta(hours=3)
+        else:
+            start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
+
+        # Validate time window to ensure it's not too large (max 6 hours)
+        time_diff = end_datetime - start_datetime
+        logger.debug(
+            f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
+        )
+        if time_diff > timedelta(hours=6):
+            logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
+            return json.dumps(
+                {
+                    'error': 'Time window too large. Maximum allowed is 6 hours.',
+                    'requested_hours': time_diff.total_seconds() / 3600,
+                },
+                indent=2,
+            )
+
+        # Use pagination helper with a reasonable limit
+        traces = get_trace_summaries_paginated(
+            xray_client,
+            start_datetime,
+            end_datetime,
+            filter_expression or '',
+            max_traces=100,  # Limit to prevent response size issues
+        )
+
+        # Convert response to JSON-serializable format
+        def convert_datetime(obj):
+            if isinstance(obj, datetime):
+                return obj.isoformat()
+            return obj
+
+        trace_summaries = []
+        for trace in traces:
+            # Create a simplified trace data structure to reduce size
+            trace_data = {
+                'Id': trace.get('Id'),
+                'Duration': trace.get('Duration'),
+                'ResponseTime': trace.get('ResponseTime'),
+                'HasError': trace.get('HasError'),
+                'HasFault': trace.get('HasFault'),
+                'HasThrottle': trace.get('HasThrottle'),
+                'Http': trace.get('Http', {}),
+            }
+
+            # Only include root causes if they exist (to save space)
+            if trace.get('ErrorRootCauses'):
+                trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[:3]  # Limit to first 3
+            if trace.get('FaultRootCauses'):
+                trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[:3]  # Limit to first 3
+            if trace.get('ResponseTimeRootCauses'):
+                trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[:3]  # Limit to first 3
+
+            # Include limited annotations for key operations
+            annotations = trace.get('Annotations', {})
+            if annotations:
+                # Only include operation-related annotations
+                filtered_annotations = {}
+                for key in ['aws.local.operation', 'aws.remote.operation']:
+                    if key in annotations:
+                        filtered_annotations[key] = annotations[key]
+                if filtered_annotations:
+                    trace_data['Annotations'] = filtered_annotations
+
+            # Include user info if available
+            if trace.get('Users'):
+                trace_data['Users'] = trace.get('Users', [])[:2]  # Limit to first 2 users
+
+            # Convert any datetime objects to ISO format strings
+            for key, value in trace_data.items():
+                trace_data[key] = convert_datetime(value)
+            trace_summaries.append(trace_data)
+
+        # Check transaction search status
+        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)
+
+        result_data = {
+            'TraceSummaries': trace_summaries,
+            'TraceCount': len(trace_summaries),
+            'Message': f'Retrieved {len(trace_summaries)} traces (limited to prevent size issues)',
+            'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
+            'TransactionSearchStatus': {
+                'enabled': is_tx_search_enabled,
+                'recommendation': (
+                    'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
+                    if is_tx_search_enabled
+                    else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
+                ),
+            },
+        }
+
+        elapsed_time = timer() - start_time_perf
+        logger.info(
+            f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(trace_summaries)} traces'
+        )
+        return json.dumps(result_data, indent=2)
+
+    except Exception as e:
+        logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
+        return json.dumps({'error': str(e)}, indent=2)
+
+
 def main():
     """Run the MCP server."""
     logger.debug('Starting CloudWatch AppSignals MCP server')
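Taken together, the new tools implement a fallback strategy: prefer 100%-sampled Transaction Search data, and fall back to 5%-sampled X-Ray queries otherwise. A condensed sketch of that decision, mirroring check_transaction_search_enabled() above (region is a placeholder):

```python
import boto3

xray = boto3.client('xray', region_name='us-east-1')

# Transaction Search counts as "on" only when segments are delivered
# to CloudWatch Logs and that delivery is ACTIVE.
dest = xray.get_trace_segment_destination()
if dest.get('Destination') == 'CloudWatchLogs' and dest.get('Status') == 'ACTIVE':
    print('Use search_transaction_spans() against the aws/spans log group')
else:
    print('Fall back to query_sampled_traces() - 5% sampled data only')
```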