awslabs.cloudwatch-applicationsignals-mcp-server 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +17 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/__init__.py +17 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/audit_presentation_utils.py +288 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/audit_utils.py +912 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/aws_clients.py +120 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/canary_utils.py +910 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-dotnet-enablement.md +435 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-java-enablement.md +321 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-nodejs-enablement.md +420 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ec2/ec2-python-enablement.md +598 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-dotnet-enablement.md +264 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-java-enablement.md +193 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-nodejs-enablement.md +198 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/ecs/ecs-python-enablement.md +236 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-dotnet-enablement.md +166 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-java-enablement.md +166 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-nodejs-enablement.md +166 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/eks/eks-python-enablement.md +169 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-dotnet-enablement.md +336 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-java-enablement.md +336 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-nodejs-enablement.md +336 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_guides/templates/lambda/lambda-python-enablement.md +336 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/enablement_tools.py +147 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/server.py +1505 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/service_audit_utils.py +231 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/service_tools.py +659 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/sli_report_client.py +333 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/slo_tools.py +386 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/trace_tools.py +784 -0
- awslabs/cloudwatch_applicationsignals_mcp_server/utils.py +172 -0
- awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/METADATA +808 -0
- awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/RECORD +36 -0
- awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/WHEEL +4 -0
- awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/entry_points.txt +2 -0
- awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/LICENSE +174 -0
- awslabs_cloudwatch_applicationsignals_mcp_server-0.1.21.dist-info/licenses/NOTICE +2 -0
awslabs/cloudwatch_applicationsignals_mcp_server/trace_tools.py
@@ -0,0 +1,784 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""CloudWatch Application Signals MCP Server - Trace and logging tools."""

import asyncio
import json
from .aws_clients import applicationsignals_client, logs_client, xray_client
from .sli_report_client import AWSConfig, SLIReportClient
from .utils import remove_null_values
from datetime import datetime, timedelta, timezone
from loguru import logger
from pydantic import Field
from time import perf_counter as timer
from typing import Dict, Optional
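
# Note: remove_null_values() comes from .utils, which this hunk does not show.
# Judging from its use with logs_client.start_query() below (where 'limit' may
# be None), it presumably drops None-valued keys -- an illustrative sketch only,
# not the packaged implementation:
#
#   def remove_null_values(d: dict) -> dict:
#       """Return a copy of d without None-valued entries."""
#       return {k: v for k, v in d.items() if v is not None}
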
def get_trace_summaries_paginated(
    xray_client, start_time, end_time, filter_expression, max_traces: int = 100
) -> list:
    """Get trace summaries with pagination to avoid exceeding response size limits.

    Args:
        xray_client: Boto3 X-Ray client
        start_time: Start time for trace query
        end_time: End time for trace query
        filter_expression: X-Ray filter expression
        max_traces: Maximum number of traces to retrieve (default 100)

    Returns:
        List of trace summaries
    """
    all_traces = []
    next_token = None
    logger.debug(
        f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
    )

    try:
        while len(all_traces) < max_traces:
            # Build request parameters
            kwargs = {
                'StartTime': start_time,
                'EndTime': end_time,
                'FilterExpression': filter_expression,
                'Sampling': True,
                'TimeRangeType': 'Service',
            }

            if next_token:
                kwargs['NextToken'] = next_token

            # Make request
            response = xray_client.get_trace_summaries(**kwargs)

            # Add traces from this page
            traces = response.get('TraceSummaries', [])
            all_traces.extend(traces)
            logger.debug(
                f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
            )

            # If we've collected enough traces, trim to the cap and stop.
            # (Trim before the NextToken check so the cap also applies when the
            # final page overshoots max_traces.)
            if len(all_traces) >= max_traces:
                all_traces = all_traces[:max_traces]
                break

            # Check if we have more pages
            next_token = response.get('NextToken')
            if not next_token:
                break

        logger.info(f'Successfully retrieved {len(all_traces)} traces')
        return all_traces

    except Exception as e:
        # Return what we have so far if there's an error
        logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
        logger.info(f'Returning {len(all_traces)} traces retrieved before error')
        return all_traces
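
# A minimal, hypothetical call to the pagination helper above, assuming default
# AWS credentials; the service name "checkout" is illustrative, not part of this
# package:
#
#   import boto3
#   from datetime import datetime, timedelta, timezone
#
#   xray = boto3.client('xray')
#   end = datetime.now(timezone.utc)
#   start = end - timedelta(hours=1)
#   faults = get_trace_summaries_paginated(
#       xray, start, end, 'service("checkout"){fault = true}', max_traces=50
#   )
#   print(len(faults), 'fault traces')
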
def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
    """Internal function to check if AWS X-Ray Transaction Search is enabled.

    Returns:
        tuple: (is_enabled: bool, destination: str, status: str)
    """
    # Note: the 'region' parameter is currently unused; the module-level
    # xray_client (from .aws_clients) determines the region.
    try:
        response = xray_client.get_trace_segment_destination()

        destination = response.get('Destination', 'Unknown')
        status = response.get('Status', 'Unknown')

        is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
        logger.debug(
            f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
        )

        return is_enabled, destination, status

    except Exception as e:
        logger.error(f'Error checking transaction search status: {str(e)}')
        return False, 'Unknown', 'Error'
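
# Illustrative use of the check above (printed values hypothetical); callers in
# this module unpack the returned tuple the same way:
#
#   enabled, destination, status = check_transaction_search_enabled()
#   if not enabled:
#       print(f'Transaction Search off: destination={destination}, status={status}')
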
async def search_transaction_spans(
    log_group_name: str = Field(
        default='',
        description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
    ),
    start_time: str = Field(
        default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
    ),
    end_time: str = Field(
        default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
    ),
    query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
    limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
    max_timeout: int = Field(
        default=30, description='Maximum time in seconds to wait for query completion'
    ),
) -> Dict:
    """Execute a CloudWatch Logs Insights query for transaction search (100% sampled trace data).

    IMPORTANT: If log_group_name is not provided, use 'aws/spans' as the default CloudWatch log group name.
    The volume of returned logs can easily overwhelm the agent context window. Always bound the result set,
    either in the query itself (| limit 50) or with the limit parameter.

    Usage:
    The "aws/spans" log group stores OpenTelemetry span data, with many attributes, for all monitored services.
    It provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
    Users can write CloudWatch Logs Insights queries that group and aggregate attributes (sum, avg, and so on).
    If source code is not accessible, consider querying with code-level attributes.
    ⚠️ Use CORRECT attribute names: attributes.code.file.path, attributes.code.function.name, attributes.code.line.number

    ```
    FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
    | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `total_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`, bin(1h)
    | DISPLAY total_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
    ```

    Returns:
        A dictionary containing the final query results, including:
        - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed, Cancelled)
        - results: A list of the actual query results if the status is Complete
        - statistics: Query performance statistics
        - messages: Any informational messages about the query
        - transaction_search_status: Information about transaction search availability
    """
    start_time_perf = timer()
    logger.info(
        f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
    )
    logger.debug(f'Query string: {query_string}')

    # Check if transaction search is enabled
    is_enabled, destination, status = check_transaction_search_enabled()

    if not is_enabled:
        logger.warning(
            f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
        )
        return {
            'status': 'Transaction Search Not Available',
            'transaction_search_status': {
                'enabled': False,
                'destination': destination,
                'status': status,
            },
            'message': (
                '⚠️ Transaction Search is not enabled for this account. '
                f'Current configuration: Destination={destination}, Status={status}. '
                "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
                'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
                'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
                'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
            ),
            'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
        }

    try:
        # Use default log group if none provided
        if not log_group_name:
            log_group_name = 'aws/spans'
            logger.debug('Using default log group: aws/spans')

        # Start query
        kwargs = {
            'startTime': int(datetime.fromisoformat(start_time).timestamp()),
            'endTime': int(datetime.fromisoformat(end_time).timestamp()),
            'queryString': query_string,
            'logGroupNames': [log_group_name],
            'limit': limit,
        }

        logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
        start_response = logs_client.start_query(**remove_null_values(kwargs))
        query_id = start_response['queryId']
        logger.info(f'Started CloudWatch Logs query with ID: {query_id}')

        # Poll once per second until the query reaches a terminal state or
        # max_timeout (seconds) elapses.
        poll_start = timer()
        while poll_start + max_timeout > timer():
            response = logs_client.get_query_results(queryId=query_id)
            status = response['status']

            if status in {'Complete', 'Failed', 'Cancelled'}:
                elapsed_time = timer() - start_time_perf
                logger.info(
                    f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
                )

                if status == 'Failed':
                    logger.error(f'Query failed: {response.get("statistics", {})}')
                elif status == 'Complete':
                    logger.debug(f'Query returned {len(response.get("results", []))} results')

                # Convert results to list of dictionaries
                results = [
                    {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
                    for line in response.get('results', [])
                ]

                # Check for code-level attributes following OpenTelemetry semantic conventions.
                # Only supported attributes: code.file.path, code.function.name, code.line.number
                code_level_attribute_names = [
                    'code.file.path',
                    'code.function.name',
                    'code.line.number',
                ]

                # Check both prefixed and unprefixed versions
                code_level_attributes_set = set()
                for attr in code_level_attribute_names:
                    code_level_attributes_set.add(attr)
                    code_level_attributes_set.add(f'attributes.{attr}')

                # Check if code-level attributes are requested in the query
                query_lower = query_string.lower()
                requested_in_query = any(
                    attr.lower() in query_lower or f'`{attr}`'.lower() in query_lower
                    for attr in code_level_attributes_set
                )

                # Check if any code-level attributes are present in results
                detected_attributes = set()
                for result in results:
                    for field_name in result.keys():
                        if field_name in code_level_attributes_set:
                            # Normalize attribute name (remove 'attributes.' prefix if present)
                            normalized_name = field_name.replace('attributes.', '')
                            detected_attributes.add(normalized_name)

                code_level_detected = len(detected_attributes) > 0

                # Build code-level attributes status
                code_level_status = {
                    'detected': code_level_detected,
                    'attributes_found': sorted(detected_attributes),
                    'requested_in_query': requested_in_query,
                }

                if not code_level_detected:
                    if requested_in_query:
                        # Attributes were requested but not found - instrumentation not enabled
                        code_level_status['message'] = (
                            'Code-level attributes not available in span data. '
                            'If source code is not accessible and code-level context is needed, '
                            'enable code-level attributes by setting OTEL_AWS_EXPERIMENTAL_CODE_ATTRIBUTES=true. '
                            'It is only supported in Python and requires the latest ADOT Python SDK.'
                        )
                        code_level_status['suggestion'] = (
                            'Enable code-level attributes if source code is not accessible.'
                        )
                        logger.debug(
                            'Code-level attributes requested in query but not found in data'
                        )
                else:
                    code_level_status['message'] = (
                        f'✅ Code-Level Attributes Available: {", ".join(sorted(detected_attributes))}'
                    )
                    logger.debug(
                        f'Code-level attributes detected - attributes: {", ".join(sorted(detected_attributes))}'
                    )

                return {
                    'queryId': query_id,
                    'status': status,
                    'statistics': response.get('statistics', {}),
                    'results': results,
                    'transaction_search_status': {
                        'enabled': True,
                        'destination': 'CloudWatchLogs',
                        'status': 'ACTIVE',
                        'message': '✅ Using 100% sampled trace data from Transaction Search',
                    },
                    'code_level_attributes_status': code_level_status,
                }

            await asyncio.sleep(1)

        elapsed_time = timer() - start_time_perf
        msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to retrieve the results later.'
        logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
        return {
            'queryId': query_id,
            'status': 'Polling Timeout',
            'message': msg,
        }

    except Exception as e:
        logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
        raise
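
# A hedged sketch of invoking the tool above directly (it is normally dispatched
# by the MCP server, which resolves the Field() defaults). When calling it as a
# plain function, pass every argument explicitly; the service name and time
# range here are hypothetical:
#
#   import asyncio
#
#   result = asyncio.run(
#       search_transaction_spans(
#           log_group_name='aws/spans',
#           start_time='2025-04-19T20:00:00+00:00',
#           end_time='2025-04-19T21:00:00+00:00',
#           query_string='FILTER attributes.aws.local.service = "customers-service-java" '
#                        '| STATS count(*) by `attributes.aws.local.operation` '
#                        '| limit 50',
#           limit=50,
#           max_timeout=30,
#       )
#   )
#   print(result['status'])
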
async def query_sampled_traces(
    start_time: Optional[str] = Field(
        default=None,
        description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
    ),
    end_time: Optional[str] = Field(
        default=None,
        description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
    ),
    filter_expression: Optional[str] = Field(
        default=None,
        description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
    ),
    region: Optional[str] = Field(
        default=None, description='AWS region (defaults to AWS_REGION environment variable)'
    ),
) -> str:
    """SECONDARY TRACE TOOL - Query AWS X-Ray traces (5% sampled data) for trace investigation.

    ⚠️ **IMPORTANT: Consider using audit_slos() with auditors="all" instead for comprehensive root cause analysis**

    **RECOMMENDED WORKFLOW FOR OPERATION DISCOVERY:**
    1. **Use `get_service_detail(service_name)` FIRST** to discover operations from metric dimensions
    2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
    3. Only use this tool if you need specific trace filtering that other tools don't provide

    **RECOMMENDED WORKFLOW FOR SLO BREACH INVESTIGATION:**
    1. Use get_slo() to understand the SLO configuration
    2. **Use audit_slos() with auditors="all"** for comprehensive root cause analysis (PREFERRED)
    3. Only use this tool if you need specific trace filtering that audit_slos() doesn't provide

    **WHY audit_slos() IS PREFERRED:**
    - **Comprehensive analysis**: Combines traces, logs, metrics, and dependencies
    - **Actionable recommendations**: Provides specific steps to resolve issues
    - **Integrated findings**: Correlates multiple data sources for better insights
    - **Much more effective** than individual trace analysis

    **WHY get_service_detail() IS PREFERRED FOR OPERATION DISCOVERY:**
    - **Direct operation discovery**: Operations are available in metric dimensions
    - **More reliable**: Uses Application Signals service metadata instead of sampling
    - **Comprehensive**: Shows all operations, not just those in sampled traces

    ⚠️ **LIMITATIONS OF THIS TOOL:**
    - Uses X-Ray's **5% sampled trace data** - may miss critical errors
    - **Limited context** compared to comprehensive audit tools
    - **No integrated analysis** with logs, metrics, or dependencies
    - **May miss operations** due to sampling - use get_service_detail() for complete operation discovery
    - For 100% trace visibility, enable Transaction Search and use search_transaction_spans()

    **Use this tool only when:**
    - You need specific X-Ray filter expressions not available in audit tools
    - You're doing exploratory trace analysis outside of SLO breach investigation
    - You need raw trace data for custom analysis
    - **After using get_service_detail() for operation discovery**

    **For operation discovery, use get_service_detail() instead:**
    ```
    get_service_detail(service_name='your-service-name')
    ```

    **For SLO breach root cause analysis, use audit_slos() instead:**
    ```
    audit_slos(
        slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"your-slo-name"}}}]', auditors='all'
    )
    ```

    Common filter expressions (if you must use this tool):
    - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
    - 'service("service-name")': Filter by specific service
    - 'duration > 5': Find slow requests (over 5 seconds)
    - 'http.status = 500': Find specific HTTP status codes
    - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
    - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
    - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'

    Returns JSON with trace summaries including:
    - Trace ID for detailed investigation
    - Duration and response time
    - Error/fault/throttle status
    - HTTP information (method, status, URL)
    - Service interactions
    - User information if available
    - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)

    **RECOMMENDATION: Use get_service_detail() for operation discovery and audit_slos() with auditors="all" for comprehensive root cause analysis instead of this tool.**

    Returns:
        JSON string containing trace summaries with error status, duration, and service details
    """
    start_time_perf = timer()

    # Use AWS_REGION environment variable if region not provided
    if not region:
        from .aws_clients import AWS_REGION

        region = AWS_REGION

    logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')

    try:
        logger.debug('Using X-Ray client')

        # Default to the past 3 hours if times are not provided
        if not end_time:
            end_datetime = datetime.now(timezone.utc)
        else:
            end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))

        if not start_time:
            start_datetime = end_datetime - timedelta(hours=3)
        else:
            start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))

        # Validate time window to ensure it's not too large (max 6 hours)
        time_diff = end_datetime - start_datetime
        logger.debug(
            f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
        )
        if time_diff > timedelta(hours=6):
            logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
            return json.dumps(
                {
                    'error': 'Time window too large. Maximum allowed is 6 hours.',
                    'requested_hours': time_diff.total_seconds() / 3600,
                },
                indent=2,
            )

        # Use the pagination helper with a reasonable limit
        traces = get_trace_summaries_paginated(
            xray_client,
            start_datetime,
            end_datetime,
            filter_expression or '',
            max_traces=100,  # Limit to prevent response size issues
        )

        # Convert response values to a JSON-serializable format
        def convert_datetime(obj):
            if isinstance(obj, datetime):
                return obj.isoformat()
            return obj

        # Helper function to extract a fault message from root causes for deduplication
        def get_fault_message(trace_data):
            """Extract fault message from a trace for deduplication.

            Only checks FaultRootCauses (5xx server errors) since this is the primary
            use case for root cause investigation. Traces without fault messages are
            not deduplicated.
            """
            # Only check FaultRootCauses for deduplication. In the GetTraceSummaries
            # response, exception details sit on each service's EntityPath entries,
            # so fall back to the service itself only when EntityPath is absent.
            for cause in trace_data.get('FaultRootCauses', []):
                for service in cause.get('Services', []):
                    for entity in service.get('EntityPath', []) or [service]:
                        exceptions = entity.get('Exceptions', [])
                        if exceptions and exceptions[0].get('Message'):
                            return exceptions[0].get('Message')
            return None
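
        # For reference, the shape walked above nests roughly like this in the
        # X-Ray GetTraceSummaries response (abridged; field values illustrative):
        #
        #   {'FaultRootCauses': [
        #       {'Services': [
        #           {'Name': 'orders-service',
        #            'EntityPath': [
        #                {'Name': 'orders-service',
        #                 'Exceptions': [{'Name': 'SQLException',
        #                                 'Message': 'connection refused'}]}
        #            ]}
        #       ]}
        #   ]}
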
        # Build trace summaries (original format)
        trace_summaries = []
        for trace in traces:
            # Create a simplified trace data structure to reduce size
            trace_data = {
                'Id': trace.get('Id'),
                'Duration': trace.get('Duration'),
                'ResponseTime': trace.get('ResponseTime'),
                'HasError': trace.get('HasError'),
                'HasFault': trace.get('HasFault'),
                'HasThrottle': trace.get('HasThrottle'),
                'Http': trace.get('Http', {}),
            }

            # Only include root causes if they exist (to save space)
            if trace.get('ErrorRootCauses'):
                trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[:3]
            if trace.get('FaultRootCauses'):
                trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[:3]
            if trace.get('ResponseTimeRootCauses'):
                trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[:3]

            # Include limited annotations for key operations
            annotations = trace.get('Annotations', {})
            if annotations:
                # Only include operation-related annotations
                filtered_annotations = {}
                for key in ['aws.local.operation', 'aws.remote.operation']:
                    if key in annotations:
                        filtered_annotations[key] = annotations[key]
                if filtered_annotations:
                    trace_data['Annotations'] = filtered_annotations

            # Include user info if available
            if trace.get('Users'):
                trace_data['Users'] = trace.get('Users', [])[:2]  # Limit to first 2 users

            # Convert any datetime objects to ISO format strings
            for key, value in trace_data.items():
                trace_data[key] = convert_datetime(value)

            trace_summaries.append(trace_data)

        # Deduplicate trace summaries by fault message
        seen_faults = {}
        deduped_trace_summaries = []

        for trace_summary in trace_summaries:
            # Check if this trace has an error
            has_issues = (
                trace_summary.get('HasError')
                or trace_summary.get('HasFault')
                or trace_summary.get('HasThrottle')
            )

            if not has_issues:
                # Always include healthy traces
                deduped_trace_summaries.append(trace_summary)
                continue

            # Extract fault message for deduplication (only checks FaultRootCauses)
            fault_msg = get_fault_message(trace_summary)

            if fault_msg and fault_msg in seen_faults:
                # Skip this trace - we already have one with the same fault message
                seen_faults[fault_msg]['count'] += 1
                logger.debug(
                    f'Skipping duplicate trace {trace_summary.get("Id")} - fault message already seen: {fault_msg[:100]}...'
                )
                continue
            else:
                # First time seeing this fault (or no fault message) - include it
                deduped_trace_summaries.append(trace_summary)
                if fault_msg:
                    seen_faults[fault_msg] = {'count': 1}

        # Check transaction search status
        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)

        # Build response with the original format but deduplicated traces
        result_data = {
            'TraceSummaries': deduped_trace_summaries,
            'TraceCount': len(deduped_trace_summaries),
            'Message': f'Retrieved {len(deduped_trace_summaries)} unique traces from {len(trace_summaries)} total (deduplicated by fault message)',
            'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
            'TransactionSearchStatus': {
                'enabled': is_tx_search_enabled,
                'recommendation': (
                    'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
                    if is_tx_search_enabled
                    else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
                ),
            },
        }

        # Add dedup stats if we actually deduped anything
        if len(deduped_trace_summaries) < len(trace_summaries):
            duplicates_removed = len(trace_summaries) - len(deduped_trace_summaries)
            result_data['DeduplicationStats'] = {
                'OriginalTraceCount': len(trace_summaries),
                'DuplicatesRemoved': duplicates_removed,
                'UniqueFaultMessages': len(seen_faults),
            }

        elapsed_time = timer() - start_time_perf
        logger.info(
            f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(deduped_trace_summaries)} unique traces from {len(trace_summaries)} total'
        )
        return json.dumps(result_data, indent=2)

    except Exception as e:
        logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
        return json.dumps({'error': str(e)}, indent=2)
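
# Hypothetical direct invocation of query_sampled_traces() above (it is normally
# dispatched by the MCP server); the service name is illustrative:
#
#   import asyncio
#   import json
#
#   raw = asyncio.run(
#       query_sampled_traces(
#           start_time='2025-04-19T18:00:00Z',
#           end_time='2025-04-19T21:00:00Z',
#           filter_expression='service("customers-service-java"){fault = true}',
#           region='us-east-1',
#       )
#   )
#   print(json.loads(raw)['TraceCount'])
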
async def list_slis(
    hours: int = Field(
        default=24,
        description='Number of hours to look back (default 24, typically use 24 for daily checks)',
    ),
) -> str:
    """SPECIALIZED TOOL - Use audit_service_health() as the PRIMARY tool for service auditing.

    **IMPORTANT: audit_service_health() is the PRIMARY and PREFERRED tool for all service auditing tasks.**

    Only use this tool when audit_service_health() cannot handle your specific requirements, such as:
    - Need for the legacy SLI status report format specifically
    - Integration with existing systems that expect this exact output format
    - Simple SLI overview without comprehensive audit findings
    - Basic health monitoring dashboard that doesn't need detailed analysis

    **For ALL service auditing, health checks, and issue investigation, use audit_service_health() first.**

    This tool provides a basic report showing:
    - Summary counts (total, healthy, breached, insufficient data)
    - Simple list of breached services with SLO names
    - Basic healthy services list

    Status meanings:
    - OK: All SLOs are being met
    - BREACHED: One or more SLOs are violated
    - INSUFFICIENT_DATA: Not enough data to determine status

    **Recommended workflow**:
    1. Use audit_service_health() for comprehensive service auditing with actionable insights
    2. Only use this tool if you specifically need the legacy SLI status report format
    """
    start_time_perf = timer()
    logger.info(f'Starting get_sli_status request for last {hours} hours')

    try:
        # Calculate time range
        end_time = datetime.now(timezone.utc)
        start_time = end_time - timedelta(hours=hours)
        logger.debug(f'Time range: {start_time} to {end_time}')

        # Get all services
        services_response = applicationsignals_client.list_services(
            StartTime=start_time,  # type: ignore
            EndTime=end_time,  # type: ignore
            MaxResults=100,
        )
        services = services_response.get('ServiceSummaries', [])

        if not services:
            logger.warning('No services found in Application Signals')
            return 'No services found in Application Signals.'

        # Get SLI reports for each service
        reports = []
        logger.debug(f'Generating SLI reports for {len(services)} services')
        for service in services:
            service_name = service['KeyAttributes'].get('Name', 'Unknown')
            try:
                # Create a custom config with the service's key attributes.
                # Note: the region is hardcoded to us-east-1 here.
                config = AWSConfig(
                    region='us-east-1',
                    period_in_hours=hours,
                    service_name=service_name,
                    key_attributes=service['KeyAttributes'],
                )

                # Generate SLI report
                client = SLIReportClient(config)
                sli_report = client.generate_sli_report()

                # Convert to the expected format
                report = {
                    'BreachedSloCount': sli_report.breached_slo_count,
                    'BreachedSloNames': sli_report.breached_slo_names,
                    'EndTime': sli_report.end_time.timestamp(),
                    'OkSloCount': sli_report.ok_slo_count,
                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
                    'SliStatus': 'BREACHED'
                    if sli_report.sli_status == 'CRITICAL'
                    else sli_report.sli_status,
                    'StartTime': sli_report.start_time.timestamp(),
                    'TotalSloCount': sli_report.total_slo_count,
                }
                reports.append(report)

            except Exception as e:
                # Log the error but continue with other services
                logger.error(
                    f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
                )
                # Add a report with insufficient data status
                report = {
                    'BreachedSloCount': 0,
                    'BreachedSloNames': [],
                    'EndTime': end_time.timestamp(),
                    'OkSloCount': 0,
                    'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
                    'SliStatus': 'INSUFFICIENT_DATA',
                    'StartTime': start_time.timestamp(),
                    'TotalSloCount': 0,
                }
                reports.append(report)

        # Check transaction search status
        is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled()

        # Build response
        result = f'SLI Status Report - Last {hours} hours\n'
        result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'

        # Add transaction search status
        if is_tx_search_enabled:
            result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
        else:
            result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
            result += f' Current config: Destination={tx_destination}, Status={tx_status}\n'
            result += ' Enable Transaction Search for accurate root cause analysis\n\n'

        # Count by status
        status_counts = {
            'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
            'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
            'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
        }

        result += 'Summary:\n'
        result += f'• Total Services: {len(reports)}\n'
        result += f'• Healthy (OK): {status_counts["OK"]}\n'
        result += f'• Breached: {status_counts["BREACHED"]}\n'
        result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'

        # Group by status
        if status_counts['BREACHED'] > 0:
            result += '⚠️ BREACHED SERVICES:\n'
            for report in reports:
                if report['SliStatus'] == 'BREACHED':
                    name = report['ReferenceId']['KeyAttributes']['Name']
                    env = report['ReferenceId']['KeyAttributes']['Environment']
                    breached_count = report['BreachedSloCount']
                    total_count = report['TotalSloCount']
                    breached_names = report['BreachedSloNames']

                    result += f'\n• {name} ({env})\n'
                    result += f' SLOs: {breached_count}/{total_count} breached\n'
                    if breached_names:
                        result += ' Breached SLOs:\n'
                        for slo_name in breached_names:
                            result += f' - {slo_name}\n'

        if status_counts['OK'] > 0:
            result += '\n✅ HEALTHY SERVICES:\n'
            for report in reports:
                if report['SliStatus'] == 'OK':
                    name = report['ReferenceId']['KeyAttributes']['Name']
                    env = report['ReferenceId']['KeyAttributes']['Environment']
                    ok_count = report['OkSloCount']

                    result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'

        if status_counts['INSUFFICIENT_DATA'] > 0:
            result += '\n❓ INSUFFICIENT DATA:\n'
            for report in reports:
                if report['SliStatus'] == 'INSUFFICIENT_DATA':
                    name = report['ReferenceId']['KeyAttributes']['Name']
                    env = report['ReferenceId']['KeyAttributes']['Environment']

                    result += f'• {name} ({env})\n'

        elapsed_time = timer() - start_time_perf
        logger.info(
            f'get_sli_status completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
        )
        return result

    except Exception as e:
        logger.error(f'Error in get_sli_status: {str(e)}', exc_info=True)
        return f'Error getting SLI status: {str(e)}'