awslabs.cloudwatch-applicationsignals-mcp-server 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. awslabs/__init__.py +17 -0
  2. awslabs/cloudwatch_applicationsignals_mcp_server/__init__.py +17 -0
  3. awslabs/cloudwatch_applicationsignals_mcp_server/audit_presentation_utils.py +288 -0
  4. awslabs/cloudwatch_applicationsignals_mcp_server/audit_utils.py +912 -0
  5. awslabs/cloudwatch_applicationsignals_mcp_server/aws_clients.py +120 -0
  6. awslabs/cloudwatch_applicationsignals_mcp_server/canary_utils.py +910 -0
  7. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-dotnet-enablement.md +435 -0
  8. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-java-enablement.md +321 -0
  9. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-nodejs-enablement.md +420 -0
  10. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-python-enablement.md +598 -0
  11. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-dotnet-enablement.md +264 -0
  12. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-java-enablement.md +193 -0
  13. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-nodejs-enablement.md +198 -0
  14. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-python-enablement.md +236 -0
  15. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-dotnet-enablement.md +166 -0
  16. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-java-enablement.md +166 -0
  17. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-nodejs-enablement.md +166 -0
  18. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-python-enablement.md +169 -0
  19. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-dotnet-enablement.md +336 -0
  20. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-java-enablement.md +336 -0
  21. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-nodejs-enablement.md +336 -0
  22. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-python-enablement.md +336 -0
  23. awslabs/cloudwatch_applicationsignals_mcp_server/enablement_tools.py +147 -0
  24. awslabs/cloudwatch_applicationsignals_mcp_server/server.py +1505 -0
  25. awslabs/cloudwatch_applicationsignals_mcp_server/service_audit_utils.py +231 -0
  26. awslabs/cloudwatch_applicationsignals_mcp_server/service_tools.py +659 -0
  27. awslabs/cloudwatch_applicationsignals_mcp_server/sli_report_client.py +333 -0
  28. awslabs/cloudwatch_applicationsignals_mcp_server/slo_tools.py +386 -0
  29. awslabs/cloudwatch_applicationsignals_mcp_server/trace_tools.py +784 -0
  30. awslabs/cloudwatch_applicationsignals_mcp_server/utils.py +172 -0
  31. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/METADATA +808 -0
  32. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/RECORD +36 -0
  33. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/WHEEL +4 -0
  34. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/entry_points.txt +2 -0
  35. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/LICENSE +174 -0
  36. awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/NOTICE +2 -0
@@ -0,0 +1,784 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """CloudWatch Application Signals MCP Server - Trace and logging tools."""
+
+ import asyncio
+ import json
+ from .aws_clients import applicationsignals_client, logs_client, xray_client
+ from .sli_report_client import AWSConfig, SLIReportClient
+ from .utils import remove_null_values
+ from datetime import datetime, timedelta, timezone
+ from loguru import logger
+ from pydantic import Field
+ from time import perf_counter as timer
+ from typing import Dict, Optional
+
+
+ def get_trace_summaries_paginated(
+     xray_client, start_time, end_time, filter_expression, max_traces: int = 100
+ ) -> list:
+     """Get trace summaries with pagination to avoid exceeding response size limits.
+
+     Args:
+         xray_client: Boto3 X-Ray client
+         start_time: Start time for trace query
+         end_time: End time for trace query
+         filter_expression: X-Ray filter expression
+         max_traces: Maximum number of traces to retrieve (default 100)
+
+     Returns:
+         List of trace summaries
+     """
+     all_traces = []
+     next_token = None
+     logger.debug(
+         f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
+     )
+
+     try:
+         while len(all_traces) < max_traces:
+             # Build request parameters
+             kwargs = {
+                 'StartTime': start_time,
+                 'EndTime': end_time,
+                 'FilterExpression': filter_expression,
+                 'Sampling': True,
+                 'TimeRangeType': 'Service',
+             }
+
+             if next_token:
+                 kwargs['NextToken'] = next_token
+
+             # Make request
+             response = xray_client.get_trace_summaries(**kwargs)
+
+             # Add traces from this page
+             traces = response.get('TraceSummaries', [])
+             all_traces.extend(traces)
+             logger.debug(
+                 f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
+             )
+
+             # If we've collected enough traces, truncate to the limit and stop.
+             # This check runs before the pagination check so the cap is enforced
+             # even when the final page has no NextToken.
+             if len(all_traces) >= max_traces:
+                 all_traces = all_traces[:max_traces]
+                 break
+
+             # Check if we have more pages
+             next_token = response.get('NextToken')
+             if not next_token:
+                 break
+
+         logger.info(f'Successfully retrieved {len(all_traces)} traces')
+         return all_traces
+
+     except Exception as e:
+         # Return what we have so far if there's an error
+         logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
+         logger.info(f'Returning {len(all_traces)} traces retrieved before error')
+         return all_traces
+
+
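The paginator above is plain boto3 glue, so it can be exercised outside the MCP server. A minimal sketch, assuming default credentials, a placeholder region, and a placeholder service name (none of which come from the package):

```
import boto3
from datetime import datetime, timedelta, timezone

from awslabs.cloudwatch_applicationsignals_mcp_server.trace_tools import (
    get_trace_summaries_paginated,
)

# Hypothetical standalone setup; inside the package the client is
# provided by aws_clients.xray_client instead.
xray = boto3.client('xray', region_name='us-east-1')

end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)

# Pull at most 50 fault traces for a placeholder service.
traces = get_trace_summaries_paginated(
    xray,
    start,
    end,
    'service("my-service"){fault = true}',
    max_traces=50,
)
print(f'Retrieved {len(traces)} trace summaries')
```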
+ def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
+     """Internal function to check if AWS X-Ray Transaction Search is enabled.
+
+     Note: the region argument is currently unused; the check runs against the
+     module-level xray_client.
+
+     Returns:
+         tuple: (is_enabled: bool, destination: str, status: str)
+     """
+     try:
+         response = xray_client.get_trace_segment_destination()
+
+         destination = response.get('Destination', 'Unknown')
+         status = response.get('Status', 'Unknown')
+
+         is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
+         logger.debug(
+             f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
+         )
+
+         return is_enabled, destination, status
+
+     except Exception as e:
+         logger.error(f'Error checking transaction search status: {str(e)}')
+         return False, 'Unknown', 'Error'
+
+
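The check wraps a single X-Ray API call, so the same test can be run directly with boto3 to verify an account's Transaction Search configuration by hand. A sketch, assuming default credentials and a placeholder region:

```
import boto3

xray = boto3.client('xray', region_name='us-east-1')
resp = xray.get_trace_segment_destination()

# Transaction Search counts as enabled only when spans are routed to
# CloudWatch Logs and that routing is active.
enabled = resp.get('Destination') == 'CloudWatchLogs' and resp.get('Status') == 'ACTIVE'
print(
    'Transaction Search enabled:' if enabled else 'Transaction Search disabled:',
    resp.get('Destination'),
    resp.get('Status'),
)
```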
+ async def search_transaction_spans(
+     log_group_name: str = Field(
+         default='',
+         description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
+     ),
+     start_time: str = Field(
+         default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
+     ),
+     end_time: str = Field(
+         default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
+     ),
+     query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
+     limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
+     max_timeout: int = Field(
+         default=30, description='Maximum time in seconds to wait for query completion'
+     ),
+ ) -> Dict:
+     """Executes a CloudWatch Logs Insights query for transaction search (100% sampled trace data).
+
+     IMPORTANT: If log_group_name is not provided, use 'aws/spans' as the default CloudWatch log group name.
+     The volume of returned logs can easily overwhelm the agent context window. Always include a limit
+     in the query (| limit 50) or use the limit parameter.
+
+     Usage:
+         The "aws/spans" log group stores OpenTelemetry span data, with many attributes, for all monitored services.
+         This provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
+         Users can write CloudWatch Logs Insights queries to group and aggregate attributes with sum, avg, etc.
+         If source code is not accessible, consider querying with code-level attributes.
+         ⚠️ Use CORRECT attribute names: attributes.code.file.path, attributes.code.function.name, attributes.code.line.number
+
+         ```
+         FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
+         | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `total_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`, bin(1h)
+         | DISPLAY total_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
+         ```
+
+     Returns:
+         A dictionary containing the final query results, including:
+         - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed, etc.)
+         - results: A list of the actual query results if the status is Complete.
+         - statistics: Query performance statistics
+         - messages: Any informational messages about the query
+         - transaction_search_status: Information about transaction search availability
+     """
+     start_time_perf = timer()
+     logger.info(
+         f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
+     )
+     logger.debug(f'Query string: {query_string}')
+
+     # Check if transaction search is enabled
+     is_enabled, destination, status = check_transaction_search_enabled()
+
+     if not is_enabled:
+         logger.warning(
+             f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
+         )
+         return {
+             'status': 'Transaction Search Not Available',
+             'transaction_search_status': {
+                 'enabled': False,
+                 'destination': destination,
+                 'status': status,
+             },
+             'message': (
+                 '⚠️ Transaction Search is not enabled for this account. '
+                 f'Current configuration: Destination={destination}, Status={status}. '
+                 "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
+                 'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
+                 'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
+                 'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
+             ),
+             'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
+         }
+
+     try:
+         # Use default log group if none provided
+         if not log_group_name:
+             log_group_name = 'aws/spans'
+             logger.debug('Using default log group: aws/spans')
+
+         # Start query
+         kwargs = {
+             'startTime': int(datetime.fromisoformat(start_time).timestamp()),
+             'endTime': int(datetime.fromisoformat(end_time).timestamp()),
+             'queryString': query_string,
+             'logGroupNames': [log_group_name],
+             'limit': limit,
+         }
+
+         logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
+         start_response = logs_client.start_query(**remove_null_values(kwargs))
+         query_id = start_response['queryId']
+         logger.info(f'Started CloudWatch Logs query with ID: {query_id}')
+
+         # Poll until the query finishes or max_timeout (seconds) elapses
+         poll_start = timer()
+         while poll_start + max_timeout > timer():
+             response = logs_client.get_query_results(queryId=query_id)
+             status = response['status']
+
+             if status in {'Complete', 'Failed', 'Cancelled'}:
+                 elapsed_time = timer() - start_time_perf
+                 logger.info(
+                     f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
+                 )
+
+                 if status == 'Failed':
+                     logger.error(f'Query failed: {response.get("statistics", {})}')
+                 elif status == 'Complete':
+                     logger.debug(f'Query returned {len(response.get("results", []))} results')
+
+                 # Convert results to list of dictionaries
+                 results = [
+                     {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
+                     for line in response.get('results', [])
+                 ]
+
+                 # Check for code-level attributes following OpenTelemetry semantic conventions
+                 # Only supported attributes: code.file.path, code.function.name, code.line.number
+                 code_level_attribute_names = [
+                     'code.file.path',
+                     'code.function.name',
+                     'code.line.number',
+                 ]
+
+                 # Check with both prefixed and unprefixed versions
+                 code_level_attributes_set = set()
+                 for attr in code_level_attribute_names:
+                     code_level_attributes_set.add(attr)
+                     code_level_attributes_set.add(f'attributes.{attr}')
+
+                 # Check if code-level attributes are requested in the query
+                 query_lower = query_string.lower()
+                 requested_in_query = any(
+                     attr.lower() in query_lower or f'`{attr}`'.lower() in query_lower
+                     for attr in code_level_attributes_set
+                 )
+
+                 # Check if any code-level attributes are present in results
+                 detected_attributes = set()
+                 for result in results:
+                     for field_name in result.keys():
+                         if field_name in code_level_attributes_set:
+                             # Normalize attribute name (remove 'attributes.' prefix if present)
+                             normalized_name = field_name.replace('attributes.', '')
+                             detected_attributes.add(normalized_name)
+
+                 code_level_detected = len(detected_attributes) > 0
+
+                 # Build code-level attributes status
+                 code_level_status = {
+                     'detected': code_level_detected,
+                     'attributes_found': sorted(detected_attributes),
+                     'requested_in_query': requested_in_query,
+                 }
+
+                 if not code_level_detected:
+                     if requested_in_query:
+                         # Attributes were requested but not found - instrumentation not enabled
+                         code_level_status['message'] = (
+                             'Code-level attributes not available in span data. '
+                             'If source code is not accessible and code-level context is needed, '
+                             'enable code-level attributes by setting OTEL_AWS_EXPERIMENTAL_CODE_ATTRIBUTES=true. '
+                             'It is only supported in Python and requires the latest ADOT Python SDK.'
+                         )
+                         code_level_status['suggestion'] = (
+                             'Enable code-level attributes if source code is not accessible.'
+                         )
+                         logger.debug(
+                             'Code-level attributes requested in query but not found in data'
+                         )
+                 else:
+                     code_level_status['message'] = (
+                         f'✅ Code-Level Attributes Available: {", ".join(sorted(detected_attributes))}'
+                     )
+                     logger.debug(
+                         f'Code-level attributes detected - attributes: {", ".join(sorted(detected_attributes))}'
+                     )
+
+                 return {
+                     'queryId': query_id,
+                     'status': status,
+                     'statistics': response.get('statistics', {}),
+                     'results': results,
+                     'transaction_search_status': {
+                         'enabled': True,
+                         'destination': 'CloudWatchLogs',
+                         'status': 'ACTIVE',
+                         'message': '✅ Using 100% sampled trace data from Transaction Search',
+                     },
+                     'code_level_attributes_status': code_level_status,
+                 }
+
+             await asyncio.sleep(1)
+
+         elapsed_time = timer() - start_time_perf
+         msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to try again to retrieve query results.'
+         logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
+         return {
+             'queryId': query_id,
+             'status': 'Polling Timeout',
+             'message': msg,
+         }
+
+     except Exception as e:
+         logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
+         raise
+
+
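Because the tool's defaults are pydantic Field markers, a direct (non-MCP) call should pass every argument explicitly. A sketch of such a call, with placeholder service name, time range, and query; note the `| limit 25` guard the docstring asks for:

```
import asyncio

from awslabs.cloudwatch_applicationsignals_mcp_server.trace_tools import (
    search_transaction_spans,
)

result = asyncio.run(
    search_transaction_spans(
        log_group_name='aws/spans',
        start_time='2025-04-19T20:00:00+00:00',
        end_time='2025-04-19T21:00:00+00:00',
        query_string=(
            'FILTER attributes.aws.local.service = "my-service" '
            '| STATS count(*) as requests by `attributes.aws.local.operation` '
            '| limit 25'
        ),
        limit=25,
        max_timeout=30,
    )
)
print(result['status'], len(result.get('results', [])))
```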
+ async def query_sampled_traces(
+     start_time: Optional[str] = Field(
+         default=None,
+         description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
+     ),
+     end_time: Optional[str] = Field(
+         default=None,
+         description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
+     ),
+     filter_expression: Optional[str] = Field(
+         default=None,
+         description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
+     ),
+     region: Optional[str] = Field(
+         default=None, description='AWS region (defaults to AWS_REGION environment variable)'
+     ),
+ ) -> str:
+     """SECONDARY TRACE TOOL - Query AWS X-Ray traces (5% sampled data) for trace investigation.
+
+     ⚠️ **IMPORTANT: Consider using audit_slos() with auditors="all" instead for comprehensive root cause analysis**
+
+     **RECOMMENDED WORKFLOW FOR OPERATION DISCOVERY:**
+     1. **Use `get_service_detail(service_name)` FIRST** to discover operations from metric dimensions
+     2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
+     3. Only use this tool if you need specific trace filtering that other tools don't provide
+
+     **RECOMMENDED WORKFLOW FOR SLO BREACH INVESTIGATION:**
+     1. Use get_slo() to understand SLO configuration
+     2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
+     3. Only use this tool if you need specific trace filtering that audit_slos() doesn't provide
+
+     **WHY audit_slos() IS PREFERRED:**
+     - **Comprehensive analysis**: Combines traces, logs, metrics, and dependencies
+     - **Actionable recommendations**: Provides specific steps to resolve issues
+     - **Integrated findings**: Correlates multiple data sources for better insights
+     - **Much more effective** than individual trace analysis
+
+     **WHY get_service_detail() IS PREFERRED FOR OPERATION DISCOVERY:**
+     - **Direct operation discovery**: Operations are available in metric dimensions
+     - **More reliable**: Uses Application Signals service metadata instead of sampling
+     - **Comprehensive**: Shows all operations, not just those in sampled traces
+
+     ⚠️ **LIMITATIONS OF THIS TOOL:**
+     - Uses X-Ray's **5% sampled trace data** - may miss critical errors
+     - **Limited context** compared to comprehensive audit tools
+     - **No integrated analysis** with logs, metrics, or dependencies
+     - **May miss operations** due to sampling - use get_service_detail() for complete operation discovery
+     - For 100% trace visibility, enable Transaction Search and use search_transaction_spans()
+
+     **Use this tool only when:**
+     - You need specific X-Ray filter expressions not available in audit tools
+     - You're doing exploratory trace analysis outside of SLO breach investigation
+     - You need raw trace data for custom analysis
+     - **After using get_service_detail() for operation discovery**
+
+     **For operation discovery, use get_service_detail() instead:**
+     ```
+     get_service_detail(service_name='your-service-name')
+     ```
+
+     **For SLO breach root cause analysis, use audit_slos() instead:**
+     ```
+     audit_slos(
+         slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"your-slo-name"}}}]', auditors='all'
+     )
+     ```
+
+     Common filter expressions (if you must use this tool):
+     - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
+     - 'service("service-name")': Filter by specific service
+     - 'duration > 5': Find slow requests (over 5 seconds)
+     - 'http.status = 500': Find specific HTTP status codes
+     - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
+     - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
+     - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'
+
+     Returns JSON with trace summaries including:
+     - Trace ID for detailed investigation
+     - Duration and response time
+     - Error/fault/throttle status
+     - HTTP information (method, status, URL)
+     - Service interactions
+     - User information if available
+     - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)
+
+     **RECOMMENDATION: Use get_service_detail() for operation discovery and audit_slos() with auditors="all" for comprehensive root cause analysis instead of this tool.**
+
+     Returns:
+         JSON string containing trace summaries with error status, duration, and service details
+     """
+     start_time_perf = timer()
+
+     # Use AWS_REGION environment variable if region not provided
+     if not region:
+         from .aws_clients import AWS_REGION
+
+         region = AWS_REGION
+
+     logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')
+
+     try:
+         logger.debug('Using X-Ray client')
+
+         # Default to past 3 hours if times not provided
+         if not end_time:
+             end_datetime = datetime.now(timezone.utc)
+         else:
+             end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
+
+         if not start_time:
+             start_datetime = end_datetime - timedelta(hours=3)
+         else:
+             start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
+
+         # Validate time window to ensure it's not too large (max 6 hours)
+         time_diff = end_datetime - start_datetime
+         logger.debug(
+             f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
+         )
+         if time_diff > timedelta(hours=6):
+             logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
+             return json.dumps(
+                 {
+                     'error': 'Time window too large. Maximum allowed is 6 hours.',
+                     'requested_hours': time_diff.total_seconds() / 3600,
+                 },
+                 indent=2,
+             )
+
+         # Use pagination helper with a reasonable limit
+         traces = get_trace_summaries_paginated(
+             xray_client,
+             start_datetime,
+             end_datetime,
+             filter_expression or '',
+             max_traces=100,  # Limit to prevent response size issues
+         )
+
+         # Convert response to JSON-serializable format
+         def convert_datetime(obj):
+             if isinstance(obj, datetime):
+                 return obj.isoformat()
+             return obj
+
+         # Helper function to extract fault message from root causes for deduplication
+         def get_fault_message(trace_data):
+             """Extract fault message from a trace for deduplication.
+
+             Only checks FaultRootCauses (5xx server errors) since this is the primary
+             use case for root cause investigation. Traces without fault messages are
+             not deduplicated.
+             """
+             # Only check FaultRootCauses for deduplication
+             root_causes = trace_data.get('FaultRootCauses', [])
+             if root_causes:
+                 for cause in root_causes:
+                     services = cause.get('Services', [])
+                     for service in services:
+                         exceptions = service.get('Exceptions', [])
+                         if exceptions and exceptions[0].get('Message'):
+                             return exceptions[0].get('Message')
+             return None
+
+         # Build trace summaries (original format)
+         trace_summaries = []
+         for trace in traces:
+             # Create a simplified trace data structure to reduce size
+             trace_data = {
+                 'Id': trace.get('Id'),
+                 'Duration': trace.get('Duration'),
+                 'ResponseTime': trace.get('ResponseTime'),
+                 'HasError': trace.get('HasError'),
+                 'HasFault': trace.get('HasFault'),
+                 'HasThrottle': trace.get('HasThrottle'),
+                 'Http': trace.get('Http', {}),
+             }
+
+             # Only include root causes if they exist (to save space)
+             if trace.get('ErrorRootCauses'):
+                 trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[:3]
+             if trace.get('FaultRootCauses'):
+                 trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[:3]
+             if trace.get('ResponseTimeRootCauses'):
+                 trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[:3]
+
+             # Include limited annotations for key operations
+             annotations = trace.get('Annotations', {})
+             if annotations:
+                 # Only include operation-related annotations
+                 filtered_annotations = {}
+                 for key in ['aws.local.operation', 'aws.remote.operation']:
+                     if key in annotations:
+                         filtered_annotations[key] = annotations[key]
+                 if filtered_annotations:
+                     trace_data['Annotations'] = filtered_annotations
+
+             # Include user info if available
+             if trace.get('Users'):
+                 trace_data['Users'] = trace.get('Users', [])[:2]  # Limit to first 2 users
+
+             # Convert any datetime objects to ISO format strings
+             for key, value in trace_data.items():
+                 trace_data[key] = convert_datetime(value)
+
+             trace_summaries.append(trace_data)
+
+         # Deduplicate trace summaries by fault message
+         seen_faults = {}
+         deduped_trace_summaries = []
+
+         for trace_summary in trace_summaries:
+             # Check if this trace has an error
+             has_issues = (
+                 trace_summary.get('HasError')
+                 or trace_summary.get('HasFault')
+                 or trace_summary.get('HasThrottle')
+             )
+
+             if not has_issues:
+                 # Always include healthy traces
+                 deduped_trace_summaries.append(trace_summary)
+                 continue
+
+             # Extract fault message for deduplication (only checks FaultRootCauses)
+             fault_msg = get_fault_message(trace_summary)
+
+             if fault_msg and fault_msg in seen_faults:
+                 # Skip this trace - we already have one with the same fault message
+                 seen_faults[fault_msg]['count'] += 1
+                 logger.debug(
+                     f'Skipping duplicate trace {trace_summary.get("Id")} - fault message already seen: {fault_msg[:100]}...'
+                 )
+                 continue
+             else:
+                 # First time seeing this fault (or no fault message) - include it
+                 deduped_trace_summaries.append(trace_summary)
+                 if fault_msg:
+                     seen_faults[fault_msg] = {'count': 1}
+
+         # Check transaction search status
+         is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)
+
+         # Build response with original format but deduplicated traces
+         result_data = {
+             'TraceSummaries': deduped_trace_summaries,
+             'TraceCount': len(deduped_trace_summaries),
+             'Message': f'Retrieved {len(deduped_trace_summaries)} unique traces from {len(trace_summaries)} total (deduplicated by fault message)',
+             'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
+             'TransactionSearchStatus': {
+                 'enabled': is_tx_search_enabled,
+                 'recommendation': (
+                     'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
+                     if is_tx_search_enabled
+                     else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
+                 ),
+             },
+         }
+
+         # Add dedup stats if we actually deduped anything
+         if len(deduped_trace_summaries) < len(trace_summaries):
+             duplicates_removed = len(trace_summaries) - len(deduped_trace_summaries)
+             result_data['DeduplicationStats'] = {
+                 'OriginalTraceCount': len(trace_summaries),
+                 'DuplicatesRemoved': duplicates_removed,
+                 'UniqueFaultMessages': len(seen_faults),
+             }
+
+         elapsed_time = timer() - start_time_perf
+         logger.info(
+             f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(deduped_trace_summaries)} unique traces from {len(trace_summaries)} total'
+         )
+         return json.dumps(result_data, indent=2)
+
+     except Exception as e:
+         logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
+         return json.dumps({'error': str(e)}, indent=2)
+
+
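As with the previous tool, a direct call should spell out each argument rather than rely on the Field defaults. A sketch with a placeholder service name, letting the function fall back to the last 3 hours and the AWS_REGION environment variable:

```
import asyncio
import json

from awslabs.cloudwatch_applicationsignals_mcp_server.trace_tools import (
    query_sampled_traces,
)

raw = asyncio.run(
    query_sampled_traces(
        start_time=None,  # defaults to 3 hours before end_time
        end_time=None,  # defaults to now
        filter_expression='service("my-service"){fault = true}',
        region=None,  # falls back to the AWS_REGION environment variable
    )
)
summary = json.loads(raw)
print(summary.get('TraceCount'), '-', summary.get('Message', summary.get('error')))
```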
+ async def list_slis(
+     hours: int = Field(
+         default=24,
+         description='Number of hours to look back (default 24, typically use 24 for daily checks)',
+     ),
+ ) -> str:
+     """SPECIALIZED TOOL - Use audit_service_health() as the PRIMARY tool for service auditing.
+
+     **IMPORTANT: audit_service_health() is the PRIMARY and PREFERRED tool for all service auditing tasks.**
+
+     Only use this tool when audit_service_health() cannot handle your specific requirements, such as:
+     - Need for legacy SLI status report format specifically
+     - Integration with existing systems that expect this exact output format
+     - Simple SLI overview without comprehensive audit findings
+     - Basic health monitoring dashboard that doesn't need detailed analysis
+
+     **For ALL service auditing, health checks, and issue investigation, use audit_service_health() first.**
+
+     This tool provides a basic report showing:
+     - Summary counts (total, healthy, breached, insufficient data)
+     - Simple list of breached services with SLO names
+     - Basic healthy services list
+
+     Status meanings:
+     - OK: All SLOs are being met
+     - BREACHED: One or more SLOs are violated
+     - INSUFFICIENT_DATA: Not enough data to determine status
+
+     **Recommended workflow**:
+     1. Use audit_service_health() for comprehensive service auditing with actionable insights
+     2. Only use this tool if you specifically need the legacy SLI status report format
+     """
+     start_time_perf = timer()
+     logger.info(f'Starting list_slis request for last {hours} hours')
+
+     try:
+         # Calculate time range
+         end_time = datetime.now(timezone.utc)
+         start_time = end_time - timedelta(hours=hours)
+         logger.debug(f'Time range: {start_time} to {end_time}')
+
+         # Get all services
+         services_response = applicationsignals_client.list_services(
+             StartTime=start_time,  # type: ignore
+             EndTime=end_time,  # type: ignore
+             MaxResults=100,
+         )
+         services = services_response.get('ServiceSummaries', [])
+
+         if not services:
+             logger.warning('No services found in Application Signals')
+             return 'No services found in Application Signals.'
+
+         # Get SLI reports for each service
+         reports = []
+         logger.debug(f'Generating SLI reports for {len(services)} services')
+         for service in services:
+             service_name = service['KeyAttributes'].get('Name', 'Unknown')
+             try:
+                 # Create custom config with the service's key attributes
+                 # (region is currently fixed to us-east-1 for SLI report generation)
+                 config = AWSConfig(
+                     region='us-east-1',
+                     period_in_hours=hours,
+                     service_name=service_name,
+                     key_attributes=service['KeyAttributes'],
+                 )
+
+                 # Generate SLI report
+                 client = SLIReportClient(config)
+                 sli_report = client.generate_sli_report()
+
+                 # Convert to expected format
+                 report = {
+                     'BreachedSloCount': sli_report.breached_slo_count,
+                     'BreachedSloNames': sli_report.breached_slo_names,
+                     'EndTime': sli_report.end_time.timestamp(),
+                     'OkSloCount': sli_report.ok_slo_count,
+                     'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                     'SliStatus': 'BREACHED'
+                     if sli_report.sli_status == 'CRITICAL'
+                     else sli_report.sli_status,
+                     'StartTime': sli_report.start_time.timestamp(),
+                     'TotalSloCount': sli_report.total_slo_count,
+                 }
+                 reports.append(report)
+
+             except Exception as e:
+                 # Log error but continue with other services
+                 logger.error(
+                     f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
+                 )
+                 # Add a report with insufficient data status
+                 report = {
+                     'BreachedSloCount': 0,
+                     'BreachedSloNames': [],
+                     'EndTime': end_time.timestamp(),
+                     'OkSloCount': 0,
+                     'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                     'SliStatus': 'INSUFFICIENT_DATA',
+                     'StartTime': start_time.timestamp(),
+                     'TotalSloCount': 0,
+                 }
+                 reports.append(report)
+
+         # Check transaction search status
+         is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled()
+
+         # Build response
+         result = f'SLI Status Report - Last {hours} hours\n'
+         result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'
+
+         # Add transaction search status
+         if is_tx_search_enabled:
+             result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
+         else:
+             result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
+             result += f'   Current config: Destination={tx_destination}, Status={tx_status}\n'
+             result += '   Enable Transaction Search for accurate root cause analysis\n\n'
+
+         # Count by status
+         status_counts = {
+             'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
+             'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
+             'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
+         }
+
+         result += 'Summary:\n'
+         result += f'• Total Services: {len(reports)}\n'
+         result += f'• Healthy (OK): {status_counts["OK"]}\n'
+         result += f'• Breached: {status_counts["BREACHED"]}\n'
+         result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'
+
+         # Group by status
+         if status_counts['BREACHED'] > 0:
+             result += '⚠️ BREACHED SERVICES:\n'
+             for report in reports:
+                 if report['SliStatus'] == 'BREACHED':
+                     name = report['ReferenceId']['KeyAttributes'].get('Name', 'Unknown')
+                     env = report['ReferenceId']['KeyAttributes'].get('Environment', 'Unknown')
+                     breached_count = report['BreachedSloCount']
+                     total_count = report['TotalSloCount']
+                     breached_names = report['BreachedSloNames']
+
+                     result += f'\n• {name} ({env})\n'
+                     result += f'  SLOs: {breached_count}/{total_count} breached\n'
+                     if breached_names:
+                         result += '  Breached SLOs:\n'
+                         for slo_name in breached_names:
+                             result += f'    - {slo_name}\n'
+
+         if status_counts['OK'] > 0:
+             result += '\n✅ HEALTHY SERVICES:\n'
+             for report in reports:
+                 if report['SliStatus'] == 'OK':
+                     name = report['ReferenceId']['KeyAttributes'].get('Name', 'Unknown')
+                     env = report['ReferenceId']['KeyAttributes'].get('Environment', 'Unknown')
+                     ok_count = report['OkSloCount']
+
+                     result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'
+
+         if status_counts['INSUFFICIENT_DATA'] > 0:
+             result += '\n❓ INSUFFICIENT DATA:\n'
+             for report in reports:
+                 if report['SliStatus'] == 'INSUFFICIENT_DATA':
+                     name = report['ReferenceId']['KeyAttributes'].get('Name', 'Unknown')
+                     env = report['ReferenceId']['KeyAttributes'].get('Environment', 'Unknown')
+
+                     result += f'• {name} ({env})\n'
+
+         elapsed_time = timer() - start_time_perf
+         logger.info(
+             f'list_slis completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
+         )
+         return result
+
+     except Exception as e:
+         logger.error(f'Error in list_slis: {str(e)}', exc_info=True)
+         return f'Error getting SLI status: {str(e)}'
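For completeness, a sketch of driving the legacy report directly; the returned value is the preformatted text block built above:

```
import asyncio

from awslabs.cloudwatch_applicationsignals_mcp_server.trace_tools import list_slis

# Render the legacy SLI status report for the past 24 hours.
report = asyncio.run(list_slis(hours=24))
print(report)
```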