awslabs.cloudwatch-appsignals-mcp-server 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. awslabs/cloudwatch_appsignals_mcp_server/__init__.py +1 -1
  2. awslabs/cloudwatch_appsignals_mcp_server/audit_presentation_utils.py +231 -0
  3. awslabs/cloudwatch_appsignals_mcp_server/audit_utils.py +699 -0
  4. awslabs/cloudwatch_appsignals_mcp_server/aws_clients.py +88 -0
  5. awslabs/cloudwatch_appsignals_mcp_server/server.py +675 -1220
  6. awslabs/cloudwatch_appsignals_mcp_server/service_audit_utils.py +231 -0
  7. awslabs/cloudwatch_appsignals_mcp_server/service_tools.py +659 -0
  8. awslabs/cloudwatch_appsignals_mcp_server/sli_report_client.py +5 -12
  9. awslabs/cloudwatch_appsignals_mcp_server/slo_tools.py +386 -0
  10. awslabs/cloudwatch_appsignals_mcp_server/trace_tools.py +658 -0
  11. awslabs/cloudwatch_appsignals_mcp_server/utils.py +172 -0
  12. awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info/METADATA +636 -0
  13. awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info/RECORD +18 -0
  14. awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info/METADATA +0 -350
  15. awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info/RECORD +0 -10
  16. {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/WHEEL +0 -0
  17. {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/entry_points.txt +0 -0
  18. {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/licenses/LICENSE +0 -0
  19. {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/licenses/NOTICE +0 -0
awslabs/cloudwatch_appsignals_mcp_server/trace_tools.py (new file)
@@ -0,0 +1,658 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """CloudWatch Application Signals MCP Server - Trace and logging tools."""
+
+ import asyncio
+ import json
+ from .aws_clients import appsignals_client, logs_client, xray_client
+ from .sli_report_client import AWSConfig, SLIReportClient
+ from .utils import remove_null_values
+ from datetime import datetime, timedelta, timezone
+ from loguru import logger
+ from pydantic import Field
+ from time import perf_counter as timer
+ from typing import Dict, Optional
+
+
+ def get_trace_summaries_paginated(
+     xray_client, start_time, end_time, filter_expression, max_traces: int = 100
+ ) -> list:
+     """Get trace summaries with pagination to avoid exceeding response size limits.
+
+     Args:
+         xray_client: Boto3 X-Ray client
+         start_time: Start time for trace query
+         end_time: End time for trace query
+         filter_expression: X-Ray filter expression
+         max_traces: Maximum number of traces to retrieve (default 100)
+
+     Returns:
+         List of trace summaries
+     """
+     all_traces = []
+     next_token = None
+     logger.debug(
+         f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
+     )
+
+     try:
+         while len(all_traces) < max_traces:
+             # Build request parameters
+             kwargs = {
+                 'StartTime': start_time,
+                 'EndTime': end_time,
+                 'FilterExpression': filter_expression,
+                 'Sampling': True,
+                 'TimeRangeType': 'Service',
+             }
+
+             if next_token:
+                 kwargs['NextToken'] = next_token
+
+             # Make request
+             response = xray_client.get_trace_summaries(**kwargs)
+
+             # Add traces from this page
+             traces = response.get('TraceSummaries', [])
+             all_traces.extend(traces)
+             logger.debug(
+                 f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
+             )
+
+             # If we've collected enough traces, truncate and stop
+             # (checked before the pagination token so the cap holds on the last page too)
+             if len(all_traces) >= max_traces:
+                 all_traces = all_traces[:max_traces]
+                 break
+
+             # Check if we have more pages
+             next_token = response.get('NextToken')
+             if not next_token:
+                 break
+
+         logger.info(f'Successfully retrieved {len(all_traces)} traces')
+         return all_traces
+
+     except Exception as e:
+         # Return what we have so far if there's an error
+         logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
+         logger.info(f'Returning {len(all_traces)} traces retrieved before error')
+         return all_traces
+
+
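For orientation, a minimal sketch of driving this pagination helper directly. The boto3 client construction, region, service name, and time window below are illustrative assumptions, not part of the package:

```
import boto3
from datetime import datetime, timedelta, timezone

# Illustrative setup; the package builds its shared client in aws_clients.py
xray = boto3.client('xray', region_name='us-east-1')

end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)

# Collect up to 50 fault traces for one service, across as many pages as needed
traces = get_trace_summaries_paginated(
    xray, start, end, 'service("my-service"){fault = true}', max_traces=50
)
print(f'Retrieved {len(traces)} trace summaries')
```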
+ def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
+     """Internal function to check if AWS X-Ray Transaction Search is enabled.
+
+     Returns:
+         tuple: (is_enabled: bool, destination: str, status: str)
+     """
+     try:
+         response = xray_client.get_trace_segment_destination()
+
+         destination = response.get('Destination', 'Unknown')
+         status = response.get('Status', 'Unknown')
+
+         is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
+         logger.debug(
+             f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
+         )
+
+         return is_enabled, destination, status
+
+     except Exception as e:
+         logger.error(f'Error checking transaction search status: {str(e)}')
+         return False, 'Unknown', 'Error'
+
+
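The tuple this helper returns is what the tools below branch on; a small sketch of that decision, with print() standing in for the tools' actual behavior:

```
is_enabled, destination, status = check_transaction_search_enabled()

if is_enabled:
    # 100% sampled spans are flowing to the 'aws/spans' log group
    print('Use search_transaction_spans() for full trace visibility')
else:
    # Only X-Ray's 5% sampled traces are available
    print(f'Fall back to query_sampled_traces() (Destination={destination}, Status={status})')
```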
+ async def search_transaction_spans(
+     log_group_name: str = Field(
+         default='',
+         description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
+     ),
+     start_time: str = Field(
+         default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
+     ),
+     end_time: str = Field(
+         default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
+     ),
+     query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
+     limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
+     max_timeout: int = Field(
+         default=30, description='Maximum time in seconds to wait for query completion'
+     ),
+ ) -> Dict:
+     """Executes a CloudWatch Logs Insights query for transaction search (100% sampled trace data).
+
+     IMPORTANT: If log_group_name is not provided, 'aws/spans' is used as the default CloudWatch log group name.
+     The volume of returned logs can easily overwhelm the agent context window, so always include a limit,
+     either in the query itself (| limit 50) or via the limit parameter.
+
+     Usage:
+         The "aws/spans" log group stores OpenTelemetry span data, with many attributes, for all monitored services.
+         It provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
+         You can write CloudWatch Logs Insights queries that group and aggregate span attributes (sum, avg, etc.):
+
+         ```
+         FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
+         | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `total_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`, bin(1h)
+         | DISPLAY total_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
+         ```
+
+     Returns:
+         A dictionary containing the final query results, including:
+         - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed)
+         - results: A list of the actual query results if the status is Complete
+         - statistics: Query performance statistics
+         - messages: Any informational messages about the query
+         - transaction_search_status: Information about transaction search availability
+     """
+     start_time_perf = timer()
+     logger.info(
+         f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
+     )
+     logger.debug(f'Query string: {query_string}')
+
+     # Check if transaction search is enabled
+     is_enabled, destination, status = check_transaction_search_enabled()
+
+     if not is_enabled:
+         logger.warning(
+             f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
+         )
+         return {
+             'status': 'Transaction Search Not Available',
+             'transaction_search_status': {
+                 'enabled': False,
+                 'destination': destination,
+                 'status': status,
+             },
+             'message': (
+                 '⚠️ Transaction Search is not enabled for this account. '
+                 f'Current configuration: Destination={destination}, Status={status}. '
+                 "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
+                 'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
+                 'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
+                 'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
+             ),
+             'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
+         }
+
+     try:
+         # Use the default log group if none was provided
+         # (the parameter default is an empty string, so test truthiness, not None)
+         if not log_group_name:
+             log_group_name = 'aws/spans'
+             logger.debug('Using default log group: aws/spans')
+
+         # Start query
+         kwargs = {
+             'startTime': int(datetime.fromisoformat(start_time).timestamp()),
+             'endTime': int(datetime.fromisoformat(end_time).timestamp()),
+             'queryString': query_string,
+             'logGroupNames': [log_group_name],
+             'limit': limit,
+         }
+
+         logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
+         start_response = logs_client.start_query(**remove_null_values(kwargs))
+         query_id = start_response['queryId']
+         logger.info(f'Started CloudWatch Logs query with ID: {query_id}')
+
+         # Poll for completion for up to max_timeout seconds
+         poll_start = timer()
+         while poll_start + max_timeout > timer():
+             response = logs_client.get_query_results(queryId=query_id)
+             status = response['status']
+
+             if status in {'Complete', 'Failed', 'Cancelled'}:
+                 elapsed_time = timer() - start_time_perf
+                 logger.info(
+                     f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
+                 )
+
+                 if status == 'Failed':
+                     logger.error(f'Query failed: {response.get("statistics", {})}')
+                 elif status == 'Complete':
+                     logger.debug(f'Query returned {len(response.get("results", []))} results')
+
+                 return {
+                     'queryId': query_id,
+                     'status': status,
+                     'statistics': response.get('statistics', {}),
+                     'results': [
+                         {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
+                         for line in response.get('results', [])
+                     ],
+                     'transaction_search_status': {
+                         'enabled': True,
+                         'destination': 'CloudWatchLogs',
+                         'status': 'ACTIVE',
+                         'message': '✅ Using 100% sampled trace data from Transaction Search',
+                     },
+                 }
+
+             await asyncio.sleep(1)
+
+         elapsed_time = timer() - start_time_perf
+         msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to retrieve results later.'
+         logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
+         return {
+             'queryId': query_id,
+             'status': 'Polling Timeout',
+             'message': msg,
+         }
+
+     except Exception as e:
+         logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
+         raise
+
+
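A hedged invocation sketch: when registered as an MCP tool the framework resolves the Field defaults, but a direct call should pass every parameter explicitly, otherwise pydantic FieldInfo objects leak through as values. The times and query below are illustrative:

```
import asyncio

async def main() -> None:
    result = await search_transaction_spans(
        log_group_name='aws/spans',
        start_time='2025-04-19T20:00:00+00:00',
        end_time='2025-04-19T21:00:00+00:00',
        query_string=(
            'FILTER attributes.aws.local.service = "customers-service-java" '
            '| STATS count(*) by attributes.aws.local.operation '
            '| limit 50'
        ),
        limit=50,
        max_timeout=30,
    )
    print(result['status'], len(result.get('results', [])))

asyncio.run(main())
```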
+ async def query_sampled_traces(
+     start_time: Optional[str] = Field(
+         default=None,
+         description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
+     ),
+     end_time: Optional[str] = Field(
+         default=None,
+         description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
+     ),
+     filter_expression: Optional[str] = Field(
+         default=None,
+         description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
+     ),
+     region: Optional[str] = Field(
+         default=None, description='AWS region (defaults to AWS_REGION environment variable)'
+     ),
+ ) -> str:
+     """SECONDARY TRACE TOOL - Query AWS X-Ray traces (5% sampled data) for trace investigation.
+
+     ⚠️ **IMPORTANT: Consider using audit_slos() with auditors="all" instead for comprehensive root cause analysis**
+
+     **RECOMMENDED WORKFLOW FOR OPERATION DISCOVERY:**
+     1. **Use `get_service_detail(service_name)` FIRST** to discover operations from metric dimensions
+     2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
+     3. Only use this tool if you need specific trace filtering that other tools don't provide
+
+     **RECOMMENDED WORKFLOW FOR SLO BREACH INVESTIGATION:**
+     1. Use get_slo() to understand SLO configuration
+     2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
+     3. Only use this tool if you need specific trace filtering that audit_slos() doesn't provide
+
+     **WHY audit_slos() IS PREFERRED:**
+     - **Comprehensive analysis**: Combines traces, logs, metrics, and dependencies
+     - **Actionable recommendations**: Provides specific steps to resolve issues
+     - **Integrated findings**: Correlates multiple data sources for better insights
+     - **Much more effective** than individual trace analysis
+
+     **WHY get_service_detail() IS PREFERRED FOR OPERATION DISCOVERY:**
+     - **Direct operation discovery**: Operations are available in metric dimensions
+     - **More reliable**: Uses Application Signals service metadata instead of sampling
+     - **Comprehensive**: Shows all operations, not just those in sampled traces
+
+     ⚠️ **LIMITATIONS OF THIS TOOL:**
+     - Uses X-Ray's **5% sampled trace data** - may miss critical errors
+     - **Limited context** compared to comprehensive audit tools
+     - **No integrated analysis** with logs, metrics, or dependencies
+     - **May miss operations** due to sampling - use get_service_detail() for complete operation discovery
+     - For 100% trace visibility, enable Transaction Search and use search_transaction_spans()
+
+     **Use this tool only when:**
+     - You need specific X-Ray filter expressions not available in audit tools
+     - You're doing exploratory trace analysis outside of SLO breach investigation
+     - You need raw trace data for custom analysis
+     - **After using get_service_detail() for operation discovery**
+
+     **For operation discovery, use get_service_detail() instead:**
+     ```
+     get_service_detail(service_name='your-service-name')
+     ```
+
+     **For SLO breach root cause analysis, use audit_slos() instead:**
+     ```
+     audit_slos(
+         slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"your-slo-name"}}}]', auditors='all'
+     )
+     ```
+
+     Common filter expressions (if you must use this tool):
+     - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
+     - 'service("service-name")': Filter by specific service
+     - 'duration > 5': Find slow requests (over 5 seconds)
+     - 'http.status = 500': Find specific HTTP status codes
+     - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
+     - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
+     - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'
+
+     Returns JSON with trace summaries including:
+     - Trace ID for detailed investigation
+     - Duration and response time
+     - Error/fault/throttle status
+     - HTTP information (method, status, URL)
+     - Service interactions
+     - User information if available
+     - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)
+
+     **RECOMMENDATION: Use get_service_detail() for operation discovery and audit_slos() with auditors="all" for comprehensive root cause analysis instead of this tool.**
+
+     Returns:
+         JSON string containing trace summaries with error status, duration, and service details
+     """
+     start_time_perf = timer()
+
+     # Use AWS_REGION environment variable if region not provided
+     if not region:
+         from .aws_clients import AWS_REGION
+
+         region = AWS_REGION
+
+     logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')
+
+     try:
+         logger.debug('Using X-Ray client')
+
+         # Default to the past 3 hours if times are not provided
+         if not end_time:
+             end_datetime = datetime.now(timezone.utc)
+         else:
+             end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
+
+         if not start_time:
+             start_datetime = end_datetime - timedelta(hours=3)
+         else:
+             start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
+
+         # Validate the time window to ensure it's not too large (max 6 hours)
+         time_diff = end_datetime - start_datetime
+         logger.debug(
+             f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
+         )
+         if time_diff > timedelta(hours=6):
+             logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
+             return json.dumps(
+                 {
+                     'error': 'Time window too large. Maximum allowed is 6 hours.',
+                     'requested_hours': time_diff.total_seconds() / 3600,
+                 },
+                 indent=2,
+             )
+
+         # Use the pagination helper with a reasonable limit
+         traces = get_trace_summaries_paginated(
+             xray_client,
+             start_datetime,
+             end_datetime,
+             filter_expression or '',
+             max_traces=100,  # Limit to prevent response size issues
+         )
+
+         # Convert response to JSON-serializable format
+         def convert_datetime(obj):
+             if isinstance(obj, datetime):
+                 return obj.isoformat()
+             return obj
+
+         trace_summaries = []
+         for trace in traces:
+             # Create a simplified trace data structure to reduce size
+             trace_data = {
+                 'Id': trace.get('Id'),
+                 'Duration': trace.get('Duration'),
+                 'ResponseTime': trace.get('ResponseTime'),
+                 'HasError': trace.get('HasError'),
+                 'HasFault': trace.get('HasFault'),
+                 'HasThrottle': trace.get('HasThrottle'),
+                 'Http': trace.get('Http', {}),
+             }
+
+             # Only include root causes if they exist (to save space); limit each to the first 3
+             if trace.get('ErrorRootCauses'):
+                 trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[:3]
+             if trace.get('FaultRootCauses'):
+                 trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[:3]
+             if trace.get('ResponseTimeRootCauses'):
+                 trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[:3]
+
+             # Include limited annotations for key operations
+             annotations = trace.get('Annotations', {})
+             if annotations:
+                 # Only include operation-related annotations
+                 filtered_annotations = {}
+                 for key in ['aws.local.operation', 'aws.remote.operation']:
+                     if key in annotations:
+                         filtered_annotations[key] = annotations[key]
+                 if filtered_annotations:
+                     trace_data['Annotations'] = filtered_annotations
+
+             # Include user info if available
+             if trace.get('Users'):
+                 trace_data['Users'] = trace.get('Users', [])[:2]  # Limit to first 2 users
+
+             # Convert any datetime objects to ISO format strings
+             for key, value in trace_data.items():
+                 trace_data[key] = convert_datetime(value)
+             trace_summaries.append(trace_data)
+
+         # Check transaction search status
+         is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)
+
+         result_data = {
+             'TraceSummaries': trace_summaries,
+             'TraceCount': len(trace_summaries),
+             'Message': f'Retrieved {len(trace_summaries)} traces (limited to prevent size issues)',
+             'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
+             'TransactionSearchStatus': {
+                 'enabled': is_tx_search_enabled,
+                 'recommendation': (
+                     'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
+                     if is_tx_search_enabled
+                     else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
+                 ),
+             },
+         }
+
+         elapsed_time = timer() - start_time_perf
+         logger.info(
+             f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(trace_summaries)} traces'
+         )
+         return json.dumps(result_data, indent=2)
+
+     except Exception as e:
+         logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
+         return json.dumps({'error': str(e)}, indent=2)
+
+
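Likewise for the sampled-trace fallback, a direct-call sketch (all values illustrative; the tool returns a JSON string, so the caller parses it):

```
import asyncio
import json

async def main() -> None:
    raw = await query_sampled_traces(
        start_time='2025-04-19T18:00:00Z',
        end_time='2025-04-19T21:00:00Z',
        filter_expression='service("my-service"){fault = true}',
        region='us-east-1',
    )
    data = json.loads(raw)
    for trace in data.get('TraceSummaries', []):
        print(trace['Id'], 'fault' if trace.get('HasFault') else 'ok')

asyncio.run(main())
```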
+ async def list_slis(
+     hours: int = Field(
+         default=24,
+         description='Number of hours to look back (default 24, typically use 24 for daily checks)',
+     ),
+ ) -> str:
+     """SPECIALIZED TOOL - Use audit_service_health() as the PRIMARY tool for service auditing.
+
+     **IMPORTANT: audit_service_health() is the PRIMARY and PREFERRED tool for all service auditing tasks.**
+
+     Only use this tool when audit_service_health() cannot handle your specific requirements, such as:
+     - Need for the legacy SLI status report format specifically
+     - Integration with existing systems that expect this exact output format
+     - Simple SLI overview without comprehensive audit findings
+     - Basic health monitoring dashboard that doesn't need detailed analysis
+
+     **For ALL service auditing, health checks, and issue investigation, use audit_service_health() first.**
+
+     This tool provides a basic report showing:
+     - Summary counts (total, healthy, breached, insufficient data)
+     - Simple list of breached services with SLO names
+     - Basic healthy services list
+
+     Status meanings:
+     - OK: All SLOs are being met
+     - BREACHED: One or more SLOs are violated
+     - INSUFFICIENT_DATA: Not enough data to determine status
+
+     **Recommended workflow**:
+     1. Use audit_service_health() for comprehensive service auditing with actionable insights
+     2. Only use this tool if you specifically need the legacy SLI status report format
+     """
+     start_time_perf = timer()
+     logger.info(f'Starting list_slis request for last {hours} hours')
+
+     try:
+         # Calculate time range
+         end_time = datetime.now(timezone.utc)
+         start_time = end_time - timedelta(hours=hours)
+         logger.debug(f'Time range: {start_time} to {end_time}')
+
+         # Get all services
+         services_response = appsignals_client.list_services(
+             StartTime=start_time,  # type: ignore
+             EndTime=end_time,  # type: ignore
+             MaxResults=100,
+         )
+         services = services_response.get('ServiceSummaries', [])
+
+         if not services:
+             logger.warning('No services found in Application Signals')
+             return 'No services found in Application Signals.'
+
+         # Get SLI reports for each service
+         reports = []
+         logger.debug(f'Generating SLI reports for {len(services)} services')
+         for service in services:
+             service_name = service['KeyAttributes'].get('Name', 'Unknown')
+             try:
+                 # Create custom config with the service's key attributes
+                 config = AWSConfig(
+                     region='us-east-1',
+                     period_in_hours=hours,
+                     service_name=service_name,
+                     key_attributes=service['KeyAttributes'],
+                 )
+
+                 # Generate SLI report
+                 client = SLIReportClient(config)
+                 sli_report = client.generate_sli_report()
+
+                 # Convert to expected format
+                 report = {
+                     'BreachedSloCount': sli_report.breached_slo_count,
+                     'BreachedSloNames': sli_report.breached_slo_names,
+                     'EndTime': sli_report.end_time.timestamp(),
+                     'OkSloCount': sli_report.ok_slo_count,
+                     'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                     'SliStatus': 'BREACHED'
+                     if sli_report.sli_status == 'CRITICAL'
+                     else sli_report.sli_status,
+                     'StartTime': sli_report.start_time.timestamp(),
+                     'TotalSloCount': sli_report.total_slo_count,
+                 }
+                 reports.append(report)
+
+             except Exception as e:
+                 # Log the error but continue with other services
+                 logger.error(
+                     f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
+                 )
+                 # Add a report with insufficient data status
+                 report = {
+                     'BreachedSloCount': 0,
+                     'BreachedSloNames': [],
+                     'EndTime': end_time.timestamp(),
+                     'OkSloCount': 0,
+                     'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
+                     'SliStatus': 'INSUFFICIENT_DATA',
+                     'StartTime': start_time.timestamp(),
+                     'TotalSloCount': 0,
+                 }
+                 reports.append(report)
+
+         # Check transaction search status
+         is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled()
+
+         # Build response
+         result = f'SLI Status Report - Last {hours} hours\n'
+         result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'
+
+         # Add transaction search status
+         if is_tx_search_enabled:
+             result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
+         else:
+             result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
+             result += f'   Current config: Destination={tx_destination}, Status={tx_status}\n'
+             result += '   Enable Transaction Search for accurate root cause analysis\n\n'
+
+         # Count by status
+         status_counts = {
+             'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
+             'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
+             'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
+         }
+
+         result += 'Summary:\n'
+         result += f'• Total Services: {len(reports)}\n'
+         result += f'• Healthy (OK): {status_counts["OK"]}\n'
+         result += f'• Breached: {status_counts["BREACHED"]}\n'
+         result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'
+
+         # Group by status
+         if status_counts['BREACHED'] > 0:
+             result += '⚠️ BREACHED SERVICES:\n'
+             for report in reports:
+                 if report['SliStatus'] == 'BREACHED':
+                     name = report['ReferenceId']['KeyAttributes']['Name']
+                     env = report['ReferenceId']['KeyAttributes']['Environment']
+                     breached_count = report['BreachedSloCount']
+                     total_count = report['TotalSloCount']
+                     breached_names = report['BreachedSloNames']
+
+                     result += f'\n• {name} ({env})\n'
+                     result += f'  SLOs: {breached_count}/{total_count} breached\n'
+                     if breached_names:
+                         result += '  Breached SLOs:\n'
+                         for slo_name in breached_names:
+                             result += f'    - {slo_name}\n'
+
+         if status_counts['OK'] > 0:
+             result += '\n✅ HEALTHY SERVICES:\n'
+             for report in reports:
+                 if report['SliStatus'] == 'OK':
+                     name = report['ReferenceId']['KeyAttributes']['Name']
+                     env = report['ReferenceId']['KeyAttributes']['Environment']
+                     ok_count = report['OkSloCount']
+
+                     result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'
+
+         if status_counts['INSUFFICIENT_DATA'] > 0:
+             result += '\n❓ INSUFFICIENT DATA:\n'
+             for report in reports:
+                 if report['SliStatus'] == 'INSUFFICIENT_DATA':
+                     name = report['ReferenceId']['KeyAttributes']['Name']
+                     env = report['ReferenceId']['KeyAttributes']['Environment']
+
+                     result += f'• {name} ({env})\n'
+
+         elapsed_time = timer() - start_time_perf
+         logger.info(
+             f'list_slis completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
+         )
+         return result
+
+     except Exception as e:
+         logger.error(f'Error in list_slis: {str(e)}', exc_info=True)
+         return f'Error getting SLI status: {str(e)}'
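These coroutines are plain module-level functions; the slimmed-down server.py (reduced by about 1,220 lines in this release) is what actually exposes them as MCP tools. A wiring sketch under the assumption that the server uses FastMCP, as other awslabs MCP servers do; the import path and server name here are assumptions, not taken from this diff:

```
# Hypothetical registration sketch; the real wiring lives in server.py
from mcp.server.fastmcp import FastMCP

from awslabs.cloudwatch_appsignals_mcp_server.trace_tools import (
    list_slis,
    query_sampled_traces,
    search_transaction_spans,
)

mcp = FastMCP('cloudwatch-appsignals')

# tool() returns a decorator; applying it registers each coroutine,
# and the framework resolves the pydantic Field(...) parameter defaults
mcp.tool()(search_transaction_spans)
mcp.tool()(query_sampled_traces)
mcp.tool()(list_slis)

if __name__ == '__main__':
    mcp.run()
```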