awslabs.cloudwatch-appsignals-mcp-server 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/cloudwatch_appsignals_mcp_server/__init__.py +1 -1
- awslabs/cloudwatch_appsignals_mcp_server/audit_presentation_utils.py +231 -0
- awslabs/cloudwatch_appsignals_mcp_server/audit_utils.py +699 -0
- awslabs/cloudwatch_appsignals_mcp_server/aws_clients.py +88 -0
- awslabs/cloudwatch_appsignals_mcp_server/server.py +675 -1220
- awslabs/cloudwatch_appsignals_mcp_server/service_audit_utils.py +231 -0
- awslabs/cloudwatch_appsignals_mcp_server/service_tools.py +659 -0
- awslabs/cloudwatch_appsignals_mcp_server/sli_report_client.py +5 -12
- awslabs/cloudwatch_appsignals_mcp_server/slo_tools.py +386 -0
- awslabs/cloudwatch_appsignals_mcp_server/trace_tools.py +658 -0
- awslabs/cloudwatch_appsignals_mcp_server/utils.py +172 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.9.dist-info/METADATA +636 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.9.dist-info/RECORD +18 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info/METADATA +0 -350
- awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info/RECORD +0 -10
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.9.dist-info}/WHEEL +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.9.dist-info}/entry_points.txt +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.9.dist-info}/licenses/LICENSE +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.9.dist-info}/licenses/NOTICE +0 -0
awslabs/cloudwatch_appsignals_mcp_server/trace_tools.py (new file)

@@ -0,0 +1,658 @@

````python
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""CloudWatch Application Signals MCP Server - Trace and logging tools."""

import asyncio
import json
from .aws_clients import appsignals_client, logs_client, xray_client
from .sli_report_client import AWSConfig, SLIReportClient
from .utils import remove_null_values
from datetime import datetime, timedelta, timezone
from loguru import logger
from pydantic import Field
from time import perf_counter as timer
from typing import Dict, Optional


def get_trace_summaries_paginated(
    xray_client, start_time, end_time, filter_expression, max_traces: int = 100
) -> list:
    """Get trace summaries with pagination to avoid exceeding response size limits.

    Args:
        xray_client: Boto3 X-Ray client
        start_time: Start time for trace query
        end_time: End time for trace query
        filter_expression: X-Ray filter expression
        max_traces: Maximum number of traces to retrieve (default 100)

    Returns:
        List of trace summaries
    """
    all_traces = []
    next_token = None
    logger.debug(
        f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
    )

    try:
        while len(all_traces) < max_traces:
            # Build request parameters
            kwargs = {
                'StartTime': start_time,
                'EndTime': end_time,
                'FilterExpression': filter_expression,
                'Sampling': True,
                'TimeRangeType': 'Service',
            }

            if next_token:
                kwargs['NextToken'] = next_token

            # Make request
            response = xray_client.get_trace_summaries(**kwargs)

            # Add traces from this page
            traces = response.get('TraceSummaries', [])
            all_traces.extend(traces)
            logger.debug(
                f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
            )

            # If we've collected enough traces, truncate and stop.
            # (Truncate before checking NextToken so the final page cannot
            # push the result past max_traces.)
            if len(all_traces) >= max_traces:
                all_traces = all_traces[:max_traces]
                break

            # Check if we have more pages
            next_token = response.get('NextToken')
            if not next_token:
                break

        logger.info(f'Successfully retrieved {len(all_traces)} traces')
        return all_traces

    except Exception as e:
        # Return what we have so far if there's an error
        logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
        logger.info(f'Returning {len(all_traces)} traces retrieved before error')
        return all_traces
````
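Because the helper takes its X-Ray client as a parameter, it can be exercised on its own. A minimal usage sketch, not part of the package: it assumes configured AWS credentials, and `my-service` is a placeholder service name.

```python
# Hypothetical usage sketch; assumes boto3 credentials are configured and
# that get_trace_summaries_paginated (above) is importable.
import boto3
from datetime import datetime, timedelta, timezone

xray = boto3.client('xray', region_name='us-east-1')
end = datetime.now(timezone.utc)
start = end - timedelta(hours=1)

# Fetch up to 50 fault traces for one service via the paginated helper.
traces = get_trace_summaries_paginated(
    xray, start, end, 'service("my-service"){fault = true}', max_traces=50
)
print(f'{len(traces)} traces retrieved')
```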
````python
def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
    """Internal function to check if AWS X-Ray Transaction Search is enabled.

    Returns:
        tuple: (is_enabled: bool, destination: str, status: str)
    """
    try:
        response = xray_client.get_trace_segment_destination()

        destination = response.get('Destination', 'Unknown')
        status = response.get('Status', 'Unknown')

        is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
        logger.debug(
            f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
        )

        return is_enabled, destination, status

    except Exception as e:
        logger.error(f'Error checking transaction search status: {str(e)}')
        return False, 'Unknown', 'Error'
````
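For reference, the check above boils down to a single X-Ray API call. A standalone sketch of the same logic with a plain boto3 client (assumes configured credentials):

```python
# Standalone sketch (not part of the package): the same Transaction Search
# check against the X-Ray GetTraceSegmentDestination API.
import boto3

xray = boto3.client('xray', region_name='us-east-1')
resp = xray.get_trace_segment_destination()
enabled = resp.get('Destination') == 'CloudWatchLogs' and resp.get('Status') == 'ACTIVE'
print(f'Transaction Search enabled: {enabled}')
```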
````python
async def search_transaction_spans(
    log_group_name: str = Field(
        default='',
        description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
    ),
    start_time: str = Field(
        default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
    ),
    end_time: str = Field(
        default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
    ),
    query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
    limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
    max_timeout: int = Field(
        default=30, description='Maximum time in seconds to wait for query completion'
    ),
) -> Dict:
    """Executes a CloudWatch Logs Insights query for transaction search (100% sampled trace data).

    IMPORTANT: If log_group_name is not provided, use 'aws/spans' as the default CloudWatch log group name.
    The volume of returned logs can easily overwhelm the agent context window. Always include a limit
    in the query (| limit 50) or use the limit parameter.

    Usage:
        The "aws/spans" log group stores OpenTelemetry span data with many attributes for all monitored services.
        This provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
        Users can write CloudWatch Logs Insights queries to group and aggregate attributes (e.g., sum, avg).

    ```
    FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
    | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `total_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`, bin(1h)
    | DISPLAY total_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
    ```

    Returns:
    --------
    A dictionary containing the final query results, including:
    - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed)
    - results: A list of the actual query results if the status is Complete
    - statistics: Query performance statistics
    - messages: Any informational messages about the query
    - transaction_search_status: Information about transaction search availability
    """
    start_time_perf = timer()
    logger.info(
        f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
    )
    logger.debug(f'Query string: {query_string}')

    # Check if transaction search is enabled
    is_enabled, destination, status = check_transaction_search_enabled()

    if not is_enabled:
        logger.warning(
            f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
        )
        return {
            'status': 'Transaction Search Not Available',
            'transaction_search_status': {
                'enabled': False,
                'destination': destination,
                'status': status,
            },
            'message': (
                '⚠️ Transaction Search is not enabled for this account. '
                f'Current configuration: Destination={destination}, Status={status}. '
                "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
                'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
                'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
                'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
            ),
            'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
        }

    try:
        # Use default log group if none provided
        if not log_group_name:
            log_group_name = 'aws/spans'
            logger.debug('Using default log group: aws/spans')

        # Start query
        kwargs = {
            'startTime': int(datetime.fromisoformat(start_time).timestamp()),
            'endTime': int(datetime.fromisoformat(end_time).timestamp()),
            'queryString': query_string,
            'logGroupNames': [log_group_name],
            'limit': limit,
        }

        logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
        start_response = logs_client.start_query(**remove_null_values(kwargs))
        query_id = start_response['queryId']
        logger.info(f'Started CloudWatch Logs query with ID: {query_id}')

        # Poll for results until completion or until max_timeout seconds elapse
        poll_start = timer()
        while poll_start + max_timeout > timer():
            response = logs_client.get_query_results(queryId=query_id)
            status = response['status']

            if status in {'Complete', 'Failed', 'Cancelled'}:
                elapsed_time = timer() - start_time_perf
                logger.info(
                    f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
                )

                if status == 'Failed':
                    logger.error(f'Query failed: {response.get("statistics", {})}')
                elif status == 'Complete':
                    logger.debug(f'Query returned {len(response.get("results", []))} results')

                return {
                    'queryId': query_id,
                    'status': status,
                    'statistics': response.get('statistics', {}),
                    'results': [
                        {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
                        for line in response.get('results', [])
                    ],
                    'transaction_search_status': {
                        'enabled': True,
                        'destination': 'CloudWatchLogs',
                        'status': 'ACTIVE',
                        'message': '✅ Using 100% sampled trace data from Transaction Search',
                    },
                }

            await asyncio.sleep(1)

        elapsed_time = timer() - start_time_perf
        msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to try again to retrieve query results.'
        logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
        return {
            'queryId': query_id,
            'status': 'Polling Timeout',
            'message': msg,
        }

    except Exception as e:
        logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
        raise
````
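A hedged invocation sketch for this tool. In normal operation an MCP client calls it through the server; when calling the coroutine directly, every parameter should be passed explicitly because the declared defaults are pydantic `Field` markers that only the MCP framework resolves. The service name, time window, and query below are placeholders.

```python
# Hypothetical direct invocation of the tool coroutine (an MCP client would
# normally call it through the server). All parameters are passed explicitly
# because the declared defaults are pydantic Field markers.
import asyncio

result = asyncio.run(
    search_transaction_spans(
        log_group_name='aws/spans',
        start_time='2025-04-19T20:00:00+00:00',
        end_time='2025-04-19T21:00:00+00:00',
        query_string=(
            'FILTER attributes.aws.local.service = "my-service" '
            '| STATS count(*) as requests by attributes.aws.local.operation '
            '| SORT requests desc '
            '| LIMIT 20'
        ),
        limit=20,
        max_timeout=30,
    )
)
print(result['status'], len(result.get('results', [])))
```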
````python
async def query_sampled_traces(
    start_time: Optional[str] = Field(
        default=None,
        description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
    ),
    end_time: Optional[str] = Field(
        default=None,
        description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
    ),
    filter_expression: Optional[str] = Field(
        default=None,
        description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
    ),
    region: Optional[str] = Field(
        default=None, description='AWS region (defaults to AWS_REGION environment variable)'
    ),
) -> str:
    """SECONDARY TRACE TOOL - Query AWS X-Ray traces (5% sampled data) for trace investigation.

    ⚠️ **IMPORTANT: Consider using audit_slos() with auditors="all" instead for comprehensive root cause analysis**

    **RECOMMENDED WORKFLOW FOR OPERATION DISCOVERY:**
    1. **Use `get_service_detail(service_name)` FIRST** to discover operations from metric dimensions
    2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
    3. Only use this tool if you need specific trace filtering that other tools don't provide

    **RECOMMENDED WORKFLOW FOR SLO BREACH INVESTIGATION:**
    1. Use get_slo() to understand SLO configuration
    2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
    3. Only use this tool if you need specific trace filtering that audit_slos() doesn't provide

    **WHY audit_slos() IS PREFERRED:**
    - **Comprehensive analysis**: Combines traces, logs, metrics, and dependencies
    - **Actionable recommendations**: Provides specific steps to resolve issues
    - **Integrated findings**: Correlates multiple data sources for better insights
    - **Much more effective** than individual trace analysis

    **WHY get_service_detail() IS PREFERRED FOR OPERATION DISCOVERY:**
    - **Direct operation discovery**: Operations are available in metric dimensions
    - **More reliable**: Uses Application Signals service metadata instead of sampling
    - **Comprehensive**: Shows all operations, not just those in sampled traces

    ⚠️ **LIMITATIONS OF THIS TOOL:**
    - Uses X-Ray's **5% sampled trace data** - may miss critical errors
    - **Limited context** compared to comprehensive audit tools
    - **No integrated analysis** with logs, metrics, or dependencies
    - **May miss operations** due to sampling - use get_service_detail() for complete operation discovery
    - For 100% trace visibility, enable Transaction Search and use search_transaction_spans()

    **Use this tool only when:**
    - You need specific X-Ray filter expressions not available in audit tools
    - You're doing exploratory trace analysis outside of SLO breach investigation
    - You need raw trace data for custom analysis
    - **After using get_service_detail() for operation discovery**

    **For operation discovery, use get_service_detail() instead:**
    ```
    get_service_detail(service_name='your-service-name')
    ```

    **For SLO breach root cause analysis, use audit_slos() instead:**
    ```
    audit_slos(
        slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"your-slo-name"}}}]', auditors='all'
    )
    ```

    Common filter expressions (if you must use this tool):
    - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
    - 'service("service-name")': Filter by specific service
    - 'duration > 5': Find slow requests (over 5 seconds)
    - 'http.status = 500': Find specific HTTP status codes
    - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
    - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
    - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'

    Returns JSON with trace summaries including:
    - Trace ID for detailed investigation
    - Duration and response time
    - Error/fault/throttle status
    - HTTP information (method, status, URL)
    - Service interactions
    - User information if available
    - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)

    **RECOMMENDATION: Use get_service_detail() for operation discovery and audit_slos() with auditors="all" for comprehensive root cause analysis instead of this tool.**

    Returns:
        JSON string containing trace summaries with error status, duration, and service details
    """
    start_time_perf = timer()

    # Use AWS_REGION environment variable if region not provided
    if not region:
        from .aws_clients import AWS_REGION

        region = AWS_REGION

    logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')

    try:
        logger.debug('Using X-Ray client')

        # Default to past 3 hours if times not provided
        if not end_time:
            end_datetime = datetime.now(timezone.utc)
        else:
            end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))

        if not start_time:
            start_datetime = end_datetime - timedelta(hours=3)
        else:
            start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))

        # Validate time window to ensure it's not too large (max 6 hours)
        time_diff = end_datetime - start_datetime
        logger.debug(
            f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
        )
        if time_diff > timedelta(hours=6):
            logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
            return json.dumps(
                {
                    'error': 'Time window too large. Maximum allowed is 6 hours.',
                    'requested_hours': time_diff.total_seconds() / 3600,
                },
                indent=2,
            )

        # Use pagination helper with a reasonable limit
        traces = get_trace_summaries_paginated(
            xray_client,
            start_datetime,
            end_datetime,
            filter_expression or '',
            max_traces=100,  # Limit to prevent response size issues
        )

        # Convert response to JSON-serializable format
        def convert_datetime(obj):
            if isinstance(obj, datetime):
                return obj.isoformat()
            return obj

        trace_summaries = []
        for trace in traces:
            # Create a simplified trace data structure to reduce size
            trace_data = {
                'Id': trace.get('Id'),
                'Duration': trace.get('Duration'),
                'ResponseTime': trace.get('ResponseTime'),
                'HasError': trace.get('HasError'),
                'HasFault': trace.get('HasFault'),
                'HasThrottle': trace.get('HasThrottle'),
                'Http': trace.get('Http', {}),
            }

            # Only include root causes if they exist (to save space); limit each to first 3
            if trace.get('ErrorRootCauses'):
                trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[:3]
            if trace.get('FaultRootCauses'):
                trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[:3]
            if trace.get('ResponseTimeRootCauses'):
                trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[:3]

            # Include limited annotations for key operations
            annotations = trace.get('Annotations', {})
            if annotations:
                # Only include operation-related annotations
                filtered_annotations = {}
                for key in ['aws.local.operation', 'aws.remote.operation']:
                    if key in annotations:
                        filtered_annotations[key] = annotations[key]
                if filtered_annotations:
                    trace_data['Annotations'] = filtered_annotations

            # Include user info if available
            if trace.get('Users'):
                trace_data['Users'] = trace.get('Users', [])[:2]  # Limit to first 2 users

            # Convert any datetime objects to ISO format strings
            for key, value in trace_data.items():
                trace_data[key] = convert_datetime(value)
            trace_summaries.append(trace_data)

        # Check transaction search status
        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)

        result_data = {
            'TraceSummaries': trace_summaries,
            'TraceCount': len(trace_summaries),
            'Message': f'Retrieved {len(trace_summaries)} traces (limited to prevent size issues)',
            'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
            'TransactionSearchStatus': {
                'enabled': is_tx_search_enabled,
                'recommendation': (
                    'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
                    if is_tx_search_enabled
                    else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
                ),
            },
        }

        elapsed_time = timer() - start_time_perf
        logger.info(
            f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(trace_summaries)} traces'
        )
        return json.dumps(result_data, indent=2)

    except Exception as e:
        logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
        return json.dumps({'error': str(e)}, indent=2)
````
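A similar hedged sketch for this tool; the service name and time window are placeholders, and all parameters are again passed explicitly for the same `Field`-marker reason:

```python
# Hypothetical direct invocation; 'my-service' and the times are placeholders.
import asyncio
import json

raw = asyncio.run(
    query_sampled_traces(
        start_time='2024-01-01T00:00:00Z',
        end_time='2024-01-01T01:00:00Z',
        filter_expression='service("my-service"){fault = true}',
        region='us-east-1',
    )
)
payload = json.loads(raw)
print(payload.get('TraceCount'), payload.get('SamplingNote'))
```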
````python
async def list_slis(
    hours: int = Field(
        default=24,
        description='Number of hours to look back (default 24, typically use 24 for daily checks)',
    ),
) -> str:
    """SPECIALIZED TOOL - Use audit_service_health() as the PRIMARY tool for service auditing.

    **IMPORTANT: audit_service_health() is the PRIMARY and PREFERRED tool for all service auditing tasks.**

    Only use this tool when audit_service_health() cannot handle your specific requirements, such as:
    - Need for the legacy SLI status report format specifically
    - Integration with existing systems that expect this exact output format
    - Simple SLI overview without comprehensive audit findings
    - Basic health monitoring dashboard that doesn't need detailed analysis

    **For ALL service auditing, health checks, and issue investigation, use audit_service_health() first.**

    This tool provides a basic report showing:
    - Summary counts (total, healthy, breached, insufficient data)
    - Simple list of breached services with SLO names
    - Basic healthy services list

    Status meanings:
    - OK: All SLOs are being met
    - BREACHED: One or more SLOs are violated
    - INSUFFICIENT_DATA: Not enough data to determine status

    **Recommended workflow**:
    1. Use audit_service_health() for comprehensive service auditing with actionable insights
    2. Only use this tool if you specifically need the legacy SLI status report format
    """
    start_time_perf = timer()
    logger.info(f'Starting get_sli_status request for last {hours} hours')

    try:
        # Calculate time range
        end_time = datetime.now(timezone.utc)
        start_time = end_time - timedelta(hours=hours)
        logger.debug(f'Time range: {start_time} to {end_time}')

        # Get all services
        services_response = appsignals_client.list_services(
            StartTime=start_time,  # type: ignore
            EndTime=end_time,  # type: ignore
            MaxResults=100,
        )
        services = services_response.get('ServiceSummaries', [])

        if not services:
            logger.warning('No services found in Application Signals')
            return 'No services found in Application Signals.'

        # Get SLI reports for each service
        reports = []
        logger.debug(f'Generating SLI reports for {len(services)} services')
        for service in services:
            service_name = service['KeyAttributes'].get('Name', 'Unknown')
            try:
                # Create custom config with the service's key attributes
                config = AWSConfig(
                    region='us-east-1',
                    period_in_hours=hours,
                    service_name=service_name,
                    key_attributes=service['KeyAttributes'],
                )

                # Generate SLI report
                client = SLIReportClient(config)
                sli_report = client.generate_sli_report()

                # Convert to expected format
                report = {
                    'BreachedSloCount': sli_report.breached_slo_count,
                    'BreachedSloNames': sli_report.breached_slo_names,
                    'EndTime': sli_report.end_time.timestamp(),
                    'OkSloCount': sli_report.ok_slo_count,
                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
                    'SliStatus': 'BREACHED'
                    if sli_report.sli_status == 'CRITICAL'
                    else sli_report.sli_status,
                    'StartTime': sli_report.start_time.timestamp(),
                    'TotalSloCount': sli_report.total_slo_count,
                }
                reports.append(report)

            except Exception as e:
                # Log error but continue with other services
                logger.error(
                    f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
                )
                # Add a report with insufficient data status
                report = {
                    'BreachedSloCount': 0,
                    'BreachedSloNames': [],
                    'EndTime': end_time.timestamp(),
                    'OkSloCount': 0,
                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
                    'SliStatus': 'INSUFFICIENT_DATA',
                    'StartTime': start_time.timestamp(),
                    'TotalSloCount': 0,
                }
                reports.append(report)

        # Check transaction search status
        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled()

        # Build response
        result = f'SLI Status Report - Last {hours} hours\n'
        result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'

        # Add transaction search status
        if is_tx_search_enabled:
            result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
        else:
            result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
            result += f'   Current config: Destination={tx_destination}, Status={tx_status}\n'
            result += '   Enable Transaction Search for accurate root cause analysis\n\n'

        # Count by status
        status_counts = {
            'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
            'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
            'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
        }

        result += 'Summary:\n'
        result += f'• Total Services: {len(reports)}\n'
        result += f'• Healthy (OK): {status_counts["OK"]}\n'
        result += f'• Breached: {status_counts["BREACHED"]}\n'
        result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'

        # Group by status
        if status_counts['BREACHED'] > 0:
            result += '⚠️ BREACHED SERVICES:\n'
            for report in reports:
                if report['SliStatus'] == 'BREACHED':
                    name = report['ReferenceId']['KeyAttributes']['Name']
                    env = report['ReferenceId']['KeyAttributes']['Environment']
                    breached_count = report['BreachedSloCount']
                    total_count = report['TotalSloCount']
                    breached_names = report['BreachedSloNames']

                    result += f'\n• {name} ({env})\n'
                    result += f'  SLOs: {breached_count}/{total_count} breached\n'
                    if breached_names:
                        result += '  Breached SLOs:\n'
                        for slo_name in breached_names:
                            result += f'    - {slo_name}\n'

        if status_counts['OK'] > 0:
            result += '\n✅ HEALTHY SERVICES:\n'
            for report in reports:
                if report['SliStatus'] == 'OK':
                    name = report['ReferenceId']['KeyAttributes']['Name']
                    env = report['ReferenceId']['KeyAttributes']['Environment']
                    ok_count = report['OkSloCount']

                    result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'

        if status_counts['INSUFFICIENT_DATA'] > 0:
            result += '\n❓ INSUFFICIENT DATA:\n'
            for report in reports:
                if report['SliStatus'] == 'INSUFFICIENT_DATA':
                    name = report['ReferenceId']['KeyAttributes']['Name']
                    env = report['ReferenceId']['KeyAttributes']['Environment']

                    result += f'• {name} ({env})\n'

        elapsed_time = timer() - start_time_perf
        logger.info(
            f'get_sli_status completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
        )
        return result

    except Exception as e:
        logger.error(f'Error in get_sli_status: {str(e)}', exc_info=True)
        return f'Error getting SLI status: {str(e)}'
````
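And a final hedged sketch for the legacy SLI report tool, with the same direct-call caveat (`hours` is passed explicitly because the declared default is a pydantic `Field` marker):

```python
# Hypothetical direct invocation of the legacy SLI status report.
import asyncio

print(asyncio.run(list_slis(hours=24)))
```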