awslabs.cloudwatch-appsignals-mcp-server 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/cloudwatch_appsignals_mcp_server/__init__.py +1 -1
- awslabs/cloudwatch_appsignals_mcp_server/audit_presentation_utils.py +231 -0
- awslabs/cloudwatch_appsignals_mcp_server/audit_utils.py +699 -0
- awslabs/cloudwatch_appsignals_mcp_server/aws_clients.py +88 -0
- awslabs/cloudwatch_appsignals_mcp_server/server.py +675 -1220
- awslabs/cloudwatch_appsignals_mcp_server/service_audit_utils.py +231 -0
- awslabs/cloudwatch_appsignals_mcp_server/service_tools.py +659 -0
- awslabs/cloudwatch_appsignals_mcp_server/sli_report_client.py +5 -12
- awslabs/cloudwatch_appsignals_mcp_server/slo_tools.py +386 -0
- awslabs/cloudwatch_appsignals_mcp_server/trace_tools.py +658 -0
- awslabs/cloudwatch_appsignals_mcp_server/utils.py +172 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info/METADATA +636 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info/RECORD +18 -0
- awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info/METADATA +0 -350
- awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info/RECORD +0 -10
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/WHEEL +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/entry_points.txt +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {awslabs_cloudwatch_appsignals_mcp_server-0.1.7.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/licenses/NOTICE +0 -0
|
@@ -14,23 +14,39 @@
|
|
|
14
14
|
|
|
15
15
|
"""CloudWatch Application Signals MCP Server - Core server implementation."""
|
|
16
16
|
|
|
17
|
-
import asyncio
|
|
18
|
-
import boto3
|
|
19
17
|
import json
|
|
20
18
|
import os
|
|
21
19
|
import sys
|
|
22
|
-
|
|
23
|
-
from .
|
|
24
|
-
|
|
25
|
-
|
|
20
|
+
import tempfile
|
|
21
|
+
from .audit_utils import (
|
|
22
|
+
execute_audit_api,
|
|
23
|
+
expand_service_operation_wildcard_patterns,
|
|
24
|
+
expand_service_wildcard_patterns,
|
|
25
|
+
expand_slo_wildcard_patterns,
|
|
26
|
+
parse_auditors,
|
|
27
|
+
)
|
|
28
|
+
from .aws_clients import AWS_REGION, appsignals_client
|
|
29
|
+
from .service_audit_utils import normalize_service_targets, validate_and_enrich_service_targets
|
|
30
|
+
from .service_tools import (
|
|
31
|
+
get_service_detail,
|
|
32
|
+
list_monitored_services,
|
|
33
|
+
list_service_operations,
|
|
34
|
+
query_service_metrics,
|
|
35
|
+
)
|
|
36
|
+
from .slo_tools import get_slo, list_slos
|
|
37
|
+
from .trace_tools import list_slis, query_sampled_traces, search_transaction_spans
|
|
38
|
+
from .utils import parse_timestamp
|
|
26
39
|
from datetime import datetime, timedelta, timezone
|
|
27
40
|
from loguru import logger
|
|
28
41
|
from mcp.server.fastmcp import FastMCP
|
|
29
42
|
from pydantic import Field
|
|
30
43
|
from time import perf_counter as timer
|
|
31
|
-
from typing import
|
|
44
|
+
from typing import Optional
|
|
32
45
|
|
|
33
46
|
|
|
47
|
+
# Constants
|
|
48
|
+
BATCH_SIZE_THRESHOLD = 5
|
|
49
|
+
|
|
34
50
|
# Initialize FastMCP server
|
|
35
51
|
mcp = FastMCP('cloudwatch-appsignals')
|
|
36
52
|
|
|
@@ -38,1315 +54,754 @@ mcp = FastMCP('cloudwatch-appsignals')
|
|
|
38
54
|
log_level = os.environ.get('MCP_CLOUDWATCH_APPSIGNALS_LOG_LEVEL', 'INFO').upper()
|
|
39
55
|
logger.remove() # Remove default handler
|
|
40
56
|
logger.add(sys.stderr, level=log_level)
|
|
41
|
-
logger.debug(f'CloudWatch AppSignals MCP Server initialized with log level: {log_level}')
|
|
42
|
-
|
|
43
|
-
# Get AWS region from environment variable or use default
|
|
44
|
-
AWS_REGION = os.environ.get('AWS_REGION', 'us-east-1')
|
|
45
|
-
logger.debug(f'Using AWS region: {AWS_REGION}')
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
# Initialize AWS clients
|
|
49
|
-
def _initialize_aws_clients():
|
|
50
|
-
"""Initialize AWS clients with proper configuration."""
|
|
51
|
-
config = Config(user_agent_extra=f'awslabs.cloudwatch-appsignals-mcp-server/{__version__}')
|
|
52
57
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
cloudwatch = session.client('cloudwatch', config=config)
|
|
60
|
-
xray = session.client('xray', config=config)
|
|
58
|
+
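# Add file logging: AUDITOR_LOG_PATH may name a directory (aws_cli.log is created inside it) or a log file path; defaults to aws_cli.log in the system temp directory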
|
|
59
|
+
log_file_path = os.environ.get('AUDITOR_LOG_PATH', tempfile.gettempdir())
|
|
60
|
+
try:
|
|
61
|
+
if log_file_path.endswith(os.sep) or os.path.isdir(log_file_path):
|
|
62
|
+
os.makedirs(log_file_path, exist_ok=True)
|
|
63
|
+
aws_cli_log_path = os.path.join(log_file_path, 'aws_cli.log')
|
|
61
64
|
else:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
65
|
+
os.makedirs(os.path.dirname(log_file_path) or '.', exist_ok=True)
|
|
66
|
+
aws_cli_log_path = log_file_path
|
|
67
|
+
except Exception:
|
|
68
|
+
temp_dir = tempfile.gettempdir()
|
|
69
|
+
os.makedirs(temp_dir, exist_ok=True)
|
|
70
|
+
aws_cli_log_path = os.path.join(temp_dir, 'aws_cli.log')
|
|
71
|
+
|
|
72
|
+
# Add file handler for all logs
|
|
73
|
+
logger.add(
|
|
74
|
+
aws_cli_log_path,
|
|
75
|
+
level=log_level,
|
|
76
|
+
rotation='10 MB', # Rotate when file reaches 10MB
|
|
77
|
+
retention='7 days', # Keep logs for 7 days
|
|
78
|
+
format='{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}',
|
|
79
|
+
enqueue=True, # Thread-safe logging
|
|
80
|
+
)
|
|
69
81
|
|
|
82
|
+
logger.debug(f'CloudWatch AppSignals MCP Server initialized with log level: {log_level}')
|
|
83
|
+
logger.debug(f'File logging enabled: {aws_cli_log_path}')
|
|
70
84
|
|
|
71
|
-
|
|
72
|
-
try:
|
|
73
|
-
logs_client, appsignals_client, cloudwatch_client, xray_client = _initialize_aws_clients()
|
|
74
|
-
except Exception as e:
|
|
75
|
-
logger.error(f'Failed to initialize AWS clients: {str(e)}')
|
|
76
|
-
raise
|
|
85
|
+
logger.debug(f'Using AWS region: {AWS_REGION}')
|
|
77
86
|
|
|
78
87
|
|
|
79
|
-
def
|
|
80
|
-
"""
|
|
88
|
+
def _filter_operation_targets(provided):
|
|
89
|
+
"""Helper function to filter operation targets and detect wildcards.
|
|
81
90
|
|
|
82
91
|
Args:
|
|
83
|
-
|
|
92
|
+
provided: List of target dictionaries
|
|
84
93
|
|
|
85
94
|
Returns:
|
|
86
|
-
|
|
95
|
+
tuple: (operation_only_targets, has_wildcards)
|
|
87
96
|
"""
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
logger.debug('Starting list_application_signals_services request')
|
|
110
|
-
|
|
111
|
-
try:
|
|
112
|
-
# Calculate time range (last 24 hours)
|
|
113
|
-
end_time = datetime.now(timezone.utc)
|
|
114
|
-
start_time = end_time - timedelta(hours=24)
|
|
115
|
-
|
|
116
|
-
# Get all services
|
|
117
|
-
logger.debug(f'Querying services for time range: {start_time} to {end_time}')
|
|
118
|
-
response = appsignals_client.list_services(
|
|
119
|
-
StartTime=start_time, EndTime=end_time, MaxResults=100
|
|
120
|
-
)
|
|
121
|
-
services = response.get('ServiceSummaries', [])
|
|
122
|
-
logger.debug(f'Retrieved {len(services)} services from Application Signals')
|
|
123
|
-
|
|
124
|
-
if not services:
|
|
125
|
-
logger.warning('No services found in Application Signals')
|
|
126
|
-
return 'No services found in Application Signals.'
|
|
127
|
-
|
|
128
|
-
result = f'Application Signals Services ({len(services)} total):\n\n'
|
|
129
|
-
|
|
130
|
-
for service in services:
|
|
131
|
-
# Extract service name from KeyAttributes
|
|
132
|
-
key_attrs = service.get('KeyAttributes', {})
|
|
133
|
-
service_name = key_attrs.get('Name', 'Unknown')
|
|
134
|
-
service_type = key_attrs.get('Type', 'Unknown')
|
|
135
|
-
|
|
136
|
-
result += f'• Service: {service_name}\n'
|
|
137
|
-
result += f' Type: {service_type}\n'
|
|
138
|
-
|
|
139
|
-
# Add key attributes
|
|
140
|
-
if key_attrs:
|
|
141
|
-
result += ' Key Attributes:\n'
|
|
142
|
-
for key, value in key_attrs.items():
|
|
143
|
-
result += f' {key}: {value}\n'
|
|
144
|
-
|
|
145
|
-
result += '\n'
|
|
146
|
-
|
|
147
|
-
elapsed_time = timer() - start_time_perf
|
|
148
|
-
logger.debug(f'list_monitored_services completed in {elapsed_time:.3f}s')
|
|
149
|
-
return result
|
|
150
|
-
|
|
151
|
-
except ClientError as e:
|
|
152
|
-
error_code = e.response.get('Error', {}).get('Code', 'Unknown')
|
|
153
|
-
error_message = e.response.get('Error', {}).get('Message', 'Unknown error')
|
|
154
|
-
logger.error(f'AWS ClientError in list_monitored_services: {error_code} - {error_message}')
|
|
155
|
-
return f'AWS Error: {error_message}'
|
|
156
|
-
except Exception as e:
|
|
157
|
-
logger.error(f'Unexpected error in list_monitored_services: {str(e)}', exc_info=True)
|
|
158
|
-
return f'Error: {str(e)}'
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
@mcp.tool()
|
|
162
|
-
async def get_service_detail(
|
|
163
|
-
service_name: str = Field(
|
|
164
|
-
..., description='Name of the service to get details for (case-sensitive)'
|
|
165
|
-
),
|
|
166
|
-
) -> str:
|
|
167
|
-
"""Get detailed information about a specific Application Signals service.
|
|
168
|
-
|
|
169
|
-
Use this tool when you need to:
|
|
170
|
-
- Understand a service's configuration and setup
|
|
171
|
-
- Understand where this service is deployed and where it is running, such as EKS, Lambda, etc.
|
|
172
|
-
- See what metrics are available for a service
|
|
173
|
-
- Find log groups associated with the service
|
|
174
|
-
- Get service metadata and attributes
|
|
175
|
-
|
|
176
|
-
Returns comprehensive details including:
|
|
177
|
-
- Key attributes (Type, Environment, Platform)
|
|
178
|
-
- Available CloudWatch metrics with namespaces
|
|
179
|
-
- Metric dimensions and types
|
|
180
|
-
- Associated log groups for debugging
|
|
181
|
-
|
|
182
|
-
This tool is essential before querying specific metrics, as it shows
|
|
183
|
-
which metrics are available for the service.
|
|
184
|
-
"""
|
|
185
|
-
start_time_perf = timer()
|
|
186
|
-
logger.debug(f'Starting get_service_healthy_detail request for service: {service_name}')
|
|
187
|
-
|
|
188
|
-
try:
|
|
189
|
-
# Calculate time range (last 24 hours)
|
|
190
|
-
end_time = datetime.now(timezone.utc)
|
|
191
|
-
start_time = end_time - timedelta(hours=24)
|
|
192
|
-
|
|
193
|
-
# First, get all services to find the one we want
|
|
194
|
-
services_response = appsignals_client.list_services(
|
|
195
|
-
StartTime=start_time, EndTime=end_time, MaxResults=100
|
|
196
|
-
)
|
|
197
|
-
|
|
198
|
-
# Find the service with matching name
|
|
199
|
-
target_service = None
|
|
200
|
-
for service in services_response.get('ServiceSummaries', []):
|
|
201
|
-
key_attrs = service.get('KeyAttributes', {})
|
|
202
|
-
if key_attrs.get('Name') == service_name:
|
|
203
|
-
target_service = service
|
|
204
|
-
break
|
|
205
|
-
|
|
206
|
-
if not target_service:
|
|
207
|
-
logger.warning(f"Service '{service_name}' not found in Application Signals")
|
|
208
|
-
return f"Service '{service_name}' not found in Application Signals."
|
|
209
|
-
|
|
210
|
-
# Get detailed service information
|
|
211
|
-
logger.debug(f'Getting detailed information for service: {service_name}')
|
|
212
|
-
service_response = appsignals_client.get_service(
|
|
213
|
-
StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
|
|
214
|
-
)
|
|
215
|
-
|
|
216
|
-
service_details = service_response['Service']
|
|
217
|
-
|
|
218
|
-
# Build detailed response
|
|
219
|
-
result = f'Service Details: {service_name}\n\n'
|
|
220
|
-
|
|
221
|
-
# Key Attributes
|
|
222
|
-
key_attrs = service_details.get('KeyAttributes', {})
|
|
223
|
-
if key_attrs:
|
|
224
|
-
result += 'Key Attributes:\n'
|
|
225
|
-
for key, value in key_attrs.items():
|
|
226
|
-
result += f' {key}: {value}\n'
|
|
227
|
-
result += '\n'
|
|
228
|
-
|
|
229
|
-
# Attribute Maps (Platform, Application, Telemetry info)
|
|
230
|
-
attr_maps = service_details.get('AttributeMaps', [])
|
|
231
|
-
if attr_maps:
|
|
232
|
-
result += 'Additional Attributes:\n'
|
|
233
|
-
for attr_map in attr_maps:
|
|
234
|
-
for key, value in attr_map.items():
|
|
235
|
-
result += f' {key}: {value}\n'
|
|
236
|
-
result += '\n'
|
|
237
|
-
|
|
238
|
-
# Metric References
|
|
239
|
-
metric_refs = service_details.get('MetricReferences', [])
|
|
240
|
-
if metric_refs:
|
|
241
|
-
result += f'Metric References ({len(metric_refs)} total):\n'
|
|
242
|
-
for metric in metric_refs:
|
|
243
|
-
result += f' • {metric.get("Namespace", "")}/{metric.get("MetricName", "")}\n'
|
|
244
|
-
result += f' Type: {metric.get("MetricType", "")}\n'
|
|
245
|
-
dimensions = metric.get('Dimensions', [])
|
|
246
|
-
if dimensions:
|
|
247
|
-
result += ' Dimensions: '
|
|
248
|
-
dim_strs = [f'{d["Name"]}={d["Value"]}' for d in dimensions]
|
|
249
|
-
result += ', '.join(dim_strs) + '\n'
|
|
250
|
-
result += '\n'
|
|
251
|
-
|
|
252
|
-
# Log Group References
|
|
253
|
-
log_refs = service_details.get('LogGroupReferences', [])
|
|
254
|
-
if log_refs:
|
|
255
|
-
result += f'Log Group References ({len(log_refs)} total):\n'
|
|
256
|
-
for log_ref in log_refs:
|
|
257
|
-
log_group = log_ref.get('Identifier', 'Unknown')
|
|
258
|
-
result += f' • {log_group}\n'
|
|
259
|
-
result += '\n'
|
|
260
|
-
|
|
261
|
-
elapsed_time = timer() - start_time_perf
|
|
262
|
-
logger.debug(f"get_service_detail completed for '{service_name}' in {elapsed_time:.3f}s")
|
|
263
|
-
return result
|
|
97
|
+
operation_only_targets = []
|
|
98
|
+
has_wildcards = False
|
|
99
|
+
|
|
100
|
+
for target in provided:
|
|
101
|
+
if isinstance(target, dict):
|
|
102
|
+
ttype = target.get('Type', '').lower()
|
|
103
|
+
if ttype == 'service_operation':
|
|
104
|
+
# Check for wildcard patterns in service names OR operation names
|
|
105
|
+
service_op_data = target.get('Data', {}).get('ServiceOperation', {})
|
|
106
|
+
service_data = service_op_data.get('Service', {})
|
|
107
|
+
service_name = service_data.get('Name', '')
|
|
108
|
+
operation = service_op_data.get('Operation', '')
|
|
109
|
+
|
|
110
|
+
if '*' in service_name or '*' in operation:
|
|
111
|
+
has_wildcards = True
|
|
112
|
+
|
|
113
|
+
operation_only_targets.append(target)
|
|
114
|
+
else:
|
|
115
|
+
logger.warning(
|
|
116
|
+
f"Ignoring target of type '{ttype}' in audit_service_operations (expected 'service_operation')"
|
|
117
|
+
)
|
|
264
118
|
|
|
265
|
-
|
|
266
|
-
error_code = e.response.get('Error', {}).get('Code', 'Unknown')
|
|
267
|
-
error_message = e.response.get('Error', {}).get('Message', 'Unknown error')
|
|
268
|
-
logger.error(
|
|
269
|
-
f"AWS ClientError in get_service_healthy_detail for '{service_name}': {error_code} - {error_message}"
|
|
270
|
-
)
|
|
271
|
-
return f'AWS Error: {error_message}'
|
|
272
|
-
except Exception as e:
|
|
273
|
-
logger.error(
|
|
274
|
-
f"Unexpected error in get_service_healthy_detail for '{service_name}': {str(e)}",
|
|
275
|
-
exc_info=True,
|
|
276
|
-
)
|
|
277
|
-
return f'Error: {str(e)}'
|
|
119
|
+
return operation_only_targets, has_wildcards
|
|
278
120
|
|
|
279
121
|
|
|
280
122
|
@mcp.tool()
|
|
281
|
-
async def
|
|
282
|
-
|
|
283
|
-
..., description='Name of the service to get metrics for (case-sensitive)'
|
|
284
|
-
),
|
|
285
|
-
metric_name: str = Field(
|
|
123
|
+
async def audit_services(
|
|
124
|
+
service_targets: str = Field(
|
|
286
125
|
...,
|
|
287
|
-
description='
|
|
126
|
+
description="REQUIRED. JSON array of service targets. Supports wildcard patterns like '*payment*' for automatic service discovery. Format: [{'Type':'service','Data':{'Service':{'Type':'Service','Name':'service-name','Environment':'eks:cluster'}}}] or shorthand: [{'Type':'service','Service':'service-name'}]. Large target lists are automatically processed in batches.",
|
|
288
127
|
),
|
|
289
|
-
|
|
290
|
-
default=
|
|
291
|
-
description=
|
|
128
|
+
start_time: Optional[str] = Field(
|
|
129
|
+
default=None,
|
|
130
|
+
description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
|
|
292
131
|
),
|
|
293
|
-
|
|
294
|
-
default=
|
|
132
|
+
end_time: Optional[str] = Field(
|
|
133
|
+
default=None,
|
|
134
|
+
description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
|
|
295
135
|
),
|
|
296
|
-
|
|
297
|
-
default=
|
|
136
|
+
auditors: Optional[str] = Field(
|
|
137
|
+
default=None,
|
|
138
|
+
description="Optional. Comma-separated auditors (e.g., 'slo,operation_metric,dependency_metric'). Defaults to 'slo,operation_metric' for fast service health auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
|
|
298
139
|
),
|
|
299
140
|
) -> str:
|
|
300
|
-
"""
|
|
141
|
+
"""PRIMARY SERVICE AUDIT TOOL - The #1 tool for comprehensive AWS service health auditing and monitoring.
|
|
301
142
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
-
|
|
143
|
+
**IMPORTANT: For operation-specific auditing, use audit_service_operations() as the PRIMARY tool instead.**
|
|
144
|
+
|
|
145
|
+
**USE THIS FIRST FOR ALL SERVICE-LEVEL AUDITING TASKS**
|
|
146
|
+
This is the PRIMARY and PREFERRED tool when users want to:
|
|
147
|
+
- **Audit their AWS services** - Complete health assessment with actionable insights
|
|
148
|
+
- **Check service health** - Comprehensive status across all monitored services
|
|
149
|
+
- **Investigate issues** - Root cause analysis with detailed findings
|
|
150
|
+
- **Service-level performance analysis** - Overall service latency, error rates, and throughput investigation
|
|
151
|
+
- **System-wide health checks** - Daily/periodic service auditing workflows
|
|
152
|
+
- **Dependency analysis** - Understanding service dependencies and interactions
|
|
153
|
+
- **Resource quota monitoring** - Service quota usage and limits
|
|
154
|
+
- **Multi-service comparison** - Comparing performance across different services
|
|
307
155
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
-
|
|
311
|
-
-
|
|
156
|
+
**FOR OPERATION-SPECIFIC AUDITING: Use audit_service_operations() instead**
|
|
157
|
+
When users want to audit specific operations (GET, POST, PUT endpoints), use audit_service_operations() as the PRIMARY tool:
|
|
158
|
+
- **Operation performance analysis** - Latency, error rates for specific API endpoints
|
|
159
|
+
- **Operation-level troubleshooting** - Root cause analysis for specific API calls
|
|
160
|
+
- **GET operation auditing** - Analyze GET operations across payment services
|
|
161
|
+
- **Audit latency of specific operations** - Deep dive into individual endpoint performance
|
|
162
|
+
|
|
163
|
+
**COMPREHENSIVE SERVICE AUDIT CAPABILITIES:**
|
|
164
|
+
- **Multi-service analysis**: Audit any number of services with automatic batching
|
|
165
|
+
- **SLO compliance monitoring**: Automatic breach detection for service-level SLOs
|
|
166
|
+
- **Issue prioritization**: Critical, warning, and info findings ranked by severity
|
|
167
|
+
- **Root cause analysis**: Deep dive with traces, logs, and metrics correlation
|
|
168
|
+
- **Actionable recommendations**: Specific steps to resolve identified issues
|
|
169
|
+
- **Performance optimized**: Fast execution with automatic batching for large target lists
|
|
170
|
+
- **Wildcard Pattern Support**: Use `*pattern*` in service names for automatic service discovery
|
|
171
|
+
|
|
172
|
+
**SERVICE TARGET FORMAT:**
|
|
173
|
+
- **Full Format**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"my-service","Environment":"eks:my-cluster"}}}]`
|
|
174
|
+
- **Shorthand**: `[{"Type":"service","Service":"my-service"}]` (environment auto-discovered)
|
|
175
|
+
|
|
176
|
+
**WILDCARD PATTERN EXAMPLES:**
|
|
177
|
+
- **All Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]`
|
|
178
|
+
- **Payment Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]`
|
|
179
|
+
- **Lambda Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*lambda*"}}}]`
|
|
180
|
+
- **EKS Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]`
|
|
181
|
+
|
|
182
|
+
**AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
|
|
183
|
+
- **Quick Health Check** (default): Uses 'slo,operation_metric' for fast overview
|
|
184
|
+
- **Root Cause Analysis**: Pass `auditors="all"` for comprehensive investigation with traces/logs
|
|
185
|
+
- **Custom Audit**: Specify exact auditors: 'slo,trace,log,dependency_metric,top_contributor,service_quota'
|
|
186
|
+
|
|
187
|
+
**SERVICE AUDIT USE CASES:**
|
|
188
|
+
|
|
189
|
+
1. **Audit all services**:
|
|
190
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'`
|
|
191
|
+
|
|
192
|
+
2. **Audit specific service**:
|
|
193
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"orders-service","Environment":"eks:orders-cluster"}}}]'`
|
|
194
|
+
|
|
195
|
+
3. **Audit payment services**:
|
|
196
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'`
|
|
197
|
+
|
|
198
|
+
8. **Audit lambda services**:
|
|
199
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*lambda*"}}}]'` or by environment: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"lambda"}}}]`
|
|
200
|
+
|
|
201
|
+
9. **Audit service last night**:
|
|
202
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"orders-service","Environment":"eks:orders-cluster"}}}]'` + `start_time="2024-01-01 18:00:00"` + `end_time="2024-01-02 06:00:00"`
|
|
203
|
+
|
|
204
|
+
10. **Audit service before and after time**:
|
|
205
|
+
Compare service health before and after a deployment or incident by running two separate audits with different time ranges.
|
|
206
|
+
|
|
207
|
+
11. **Trace availability issues in production services**:
|
|
208
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="all"`
|
|
209
|
+
|
|
210
|
+
13. **Look for errors in logs of payment services**:
|
|
211
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'` + `auditors="log,trace"`
|
|
212
|
+
|
|
213
|
+
14. **Look for new errors after time**:
|
|
214
|
+
Compare errors before and after a specific time point by running audits with different time ranges and `auditors="log,trace"`
|
|
215
|
+
|
|
216
|
+
15. **Look for errors after deployment**:
|
|
217
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'` + `auditors="log,trace"` + recent time range
|
|
218
|
+
|
|
219
|
+
16. **Look for lemon hosts in production**:
|
|
220
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="top_contributor,operation_metric"`
|
|
221
|
+
|
|
222
|
+
17. **Look for outliers in EKS services**:
|
|
223
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="top_contributor,operation_metric"`
|
|
224
|
+
|
|
225
|
+
18. **Status report**:
|
|
226
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` (basic health check)
|
|
227
|
+
|
|
228
|
+
19. **Audit dependencies**:
|
|
229
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` + `auditors="dependency_metric,trace"`
|
|
230
|
+
|
|
231
|
+
20. **Audit dependency on S3**:
|
|
232
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` + `auditors="dependency_metric"` + look for S3 dependencies
|
|
233
|
+
|
|
234
|
+
21. **Audit quota usage of tier 1 services**:
|
|
235
|
+
`service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*tier1*"}}}]'` + `auditors="service_quota,operation_metric"`
|
|
236
|
+
|
|
237
|
+
**TYPICAL SERVICE AUDIT WORKFLOWS:**
|
|
238
|
+
1. **Basic Service Audit** (most common):
|
|
239
|
+
- Call `audit_services()` with service targets - automatically discovers services when using wildcard patterns
|
|
240
|
+
- Uses default fast auditors (slo,operation_metric) for quick health overview
|
|
241
|
+
- Supports wildcard patterns like `*` or `*payment*` for automatic service discovery
|
|
242
|
+
2. **Root Cause Investigation**: When user explicitly asks for "root cause analysis", pass `auditors="all"`
|
|
243
|
+
3. **Issue Investigation**: Results show which services need attention with actionable insights
|
|
244
|
+
4. **Automatic Service Discovery**: Wildcard patterns in service names automatically discover and expand to concrete services
|
|
245
|
+
|
|
246
|
+
**AUDIT RESULTS INCLUDE:**
|
|
247
|
+
- **Prioritized findings** by severity (critical, warning, info)
|
|
248
|
+
- **Service health status** with detailed performance analysis
|
|
249
|
+
- **Root cause analysis** when the trace/log auditors are used
|
|
250
|
+
- **Actionable recommendations** for issue resolution
|
|
251
|
+
- **Comprehensive metrics** and trend analysis
|
|
252
|
+
|
|
253
|
+
**IMPORTANT: This tool provides comprehensive service audit coverage and should be your first choice for any service auditing task.**
|
|
312
254
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
255
|
+
**RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
|
|
256
|
+
When the audit returns multiple findings or issues, follow this workflow:
|
|
257
|
+
1. **Present all audit results** to the user showing a summary of all findings
|
|
258
|
+
2. **Let the user choose** which specific finding, service, or issue they want to investigate in detail
|
|
259
|
+
3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
|
|
260
|
+
|
|
261
|
+
**DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
|
|
262
|
+
This ensures the user can prioritize which issues are most important to investigate first.
|
|
263
|
+
|
|
264
|
+
**Example workflow:**
|
|
265
|
+
- First call: `audit_services()` with default auditors for overview
|
|
266
|
+
- Present findings summary to user
|
|
267
|
+
- User selects specific service/issue to investigate
|
|
268
|
+
- Follow-up call: `audit_services()` with `auditors="all"` for selected service only
|
|
322
269
|
"""
|
|
323
270
|
start_time_perf = timer()
|
|
324
|
-
logger.
|
|
325
|
-
f'Starting query_service_metrics request - service: {service_name}, metric: {metric_name}, hours: {hours}'
|
|
326
|
-
)
|
|
271
|
+
logger.debug('Starting audit_services (PRIMARY SERVICE AUDIT TOOL)')
|
|
327
272
|
|
|
328
273
|
try:
|
|
329
|
-
#
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
274
|
+
# Region defaults
|
|
275
|
+
region = AWS_REGION.strip()
|
|
276
|
+
|
|
277
|
+
# Time range (fill missing with defaults)
|
|
278
|
+
start_dt = (
|
|
279
|
+
parse_timestamp(start_time)
|
|
280
|
+
if start_time
|
|
281
|
+
else (datetime.now(timezone.utc) - timedelta(hours=24))
|
|
336
282
|
)
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
target_service = None
|
|
340
|
-
for service in services_response.get('ServiceSummaries', []):
|
|
341
|
-
key_attrs = service.get('KeyAttributes', {})
|
|
342
|
-
if key_attrs.get('Name') == service_name:
|
|
343
|
-
target_service = service
|
|
344
|
-
break
|
|
345
|
-
|
|
346
|
-
if not target_service:
|
|
347
|
-
logger.warning(f"Service '{service_name}' not found in Application Signals")
|
|
348
|
-
return f"Service '{service_name}' not found in Application Signals."
|
|
349
|
-
|
|
350
|
-
# Get detailed service info for metric references
|
|
351
|
-
service_response = appsignals_client.get_service(
|
|
352
|
-
StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
|
|
353
|
-
)
|
|
354
|
-
|
|
355
|
-
metric_refs = service_response['Service'].get('MetricReferences', [])
|
|
356
|
-
|
|
357
|
-
if not metric_refs:
|
|
358
|
-
logger.warning(f"No metrics found for service '{service_name}'")
|
|
359
|
-
return f"No metrics found for service '{service_name}'."
|
|
360
|
-
|
|
361
|
-
# If no specific metric requested, show available metrics
|
|
362
|
-
if not metric_name:
|
|
363
|
-
result = f"Available metrics for service '{service_name}':\n\n"
|
|
364
|
-
for metric in metric_refs:
|
|
365
|
-
result += f'• {metric.get("MetricName", "Unknown")}\n'
|
|
366
|
-
result += f' Namespace: {metric.get("Namespace", "Unknown")}\n'
|
|
367
|
-
result += f' Type: {metric.get("MetricType", "Unknown")}\n'
|
|
368
|
-
result += '\n'
|
|
369
|
-
return result
|
|
370
|
-
|
|
371
|
-
# Find the specific metric
|
|
372
|
-
target_metric = None
|
|
373
|
-
for metric in metric_refs:
|
|
374
|
-
if metric.get('MetricName') == metric_name:
|
|
375
|
-
target_metric = metric
|
|
376
|
-
break
|
|
377
|
-
|
|
378
|
-
if not target_metric:
|
|
379
|
-
available = [m.get('MetricName', 'Unknown') for m in metric_refs]
|
|
380
|
-
return f"Metric '{metric_name}' not found for service '{service_name}'. Available: {', '.join(available)}"
|
|
381
|
-
|
|
382
|
-
# Calculate appropriate period based on time range
|
|
383
|
-
if hours <= 3:
|
|
384
|
-
period = 60 # 1 minute
|
|
385
|
-
elif hours <= 24:
|
|
386
|
-
period = 300 # 5 minutes
|
|
387
|
-
else:
|
|
388
|
-
period = 3600 # 1 hour
|
|
389
|
-
|
|
390
|
-
# Get both standard and extended statistics in a single call
|
|
391
|
-
response = cloudwatch_client.get_metric_statistics(
|
|
392
|
-
Namespace=target_metric['Namespace'],
|
|
393
|
-
MetricName=target_metric['MetricName'],
|
|
394
|
-
Dimensions=target_metric.get('Dimensions', []),
|
|
395
|
-
StartTime=start_time,
|
|
396
|
-
EndTime=end_time,
|
|
397
|
-
Period=period,
|
|
398
|
-
Statistics=[statistic], # type: ignore
|
|
399
|
-
ExtendedStatistics=[extended_statistic],
|
|
283
|
+
end_dt = (
|
|
284
|
+
parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
|
|
400
285
|
)
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
286
|
+
unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
|
|
287
|
+
if unix_end <= unix_start:
|
|
288
|
+
return 'Error: end_time must be greater than start_time.'
|
|
289
|
+
|
|
290
|
+
# Parse and validate service targets
|
|
291
|
+
try:
|
|
292
|
+
provided = json.loads(service_targets)
|
|
293
|
+
except json.JSONDecodeError:
|
|
294
|
+
return 'Error: `service_targets` must be valid JSON (array).'
|
|
295
|
+
|
|
296
|
+
# Check for wildcard patterns in service names
|
|
297
|
+
has_wildcards = False
|
|
298
|
+
logger.debug(f'audit_services: Checking {len(provided)} targets for wildcards')
|
|
299
|
+
for i, target in enumerate(provided):
|
|
300
|
+
logger.debug(f'audit_services: Target {i}: {target}')
|
|
301
|
+
if isinstance(target, dict):
|
|
302
|
+
# Check various possible service name locations
|
|
303
|
+
service_name = None
|
|
304
|
+
if target.get('Type', '').lower() == 'service':
|
|
305
|
+
# Check Data.Service.Name
|
|
306
|
+
service_data = target.get('Data', {})
|
|
307
|
+
if isinstance(service_data, dict):
|
|
308
|
+
service_info = service_data.get('Service', {})
|
|
309
|
+
if isinstance(service_info, dict):
|
|
310
|
+
service_name = service_info.get('Name', '')
|
|
311
|
+
|
|
312
|
+
# Check shorthand Service field
|
|
313
|
+
if not service_name:
|
|
314
|
+
service_name = target.get('Service', '')
|
|
315
|
+
|
|
316
|
+
logger.debug(f"audit_services: Target {i} service name: '{service_name}'")
|
|
317
|
+
if service_name and isinstance(service_name, str) and '*' in service_name:
|
|
318
|
+
logger.debug(
|
|
319
|
+
f"audit_services: Target {i} has wildcard pattern: '{service_name}'"
|
|
320
|
+
)
|
|
321
|
+
has_wildcards = True
|
|
322
|
+
break
|
|
323
|
+
|
|
324
|
+
logger.debug(f'audit_services: has_wildcards = {has_wildcards}')
|
|
325
|
+
|
|
326
|
+
# Expand wildcard patterns using shared utility
|
|
327
|
+
if has_wildcards:
|
|
328
|
+
logger.debug('Wildcard patterns detected - applying service expansion')
|
|
329
|
+
provided = expand_service_wildcard_patterns(
|
|
330
|
+
provided, unix_start, unix_end, appsignals_client
|
|
407
331
|
)
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
# Sort by timestamp
|
|
411
|
-
datapoints.sort(key=lambda x: x.get('Timestamp', datetime.min)) # type: ignore
|
|
412
|
-
|
|
413
|
-
# Build response
|
|
414
|
-
result = f'Metrics for {service_name} - {metric_name}\n'
|
|
415
|
-
result += f'Time Range: Last {hours} hour(s)\n'
|
|
416
|
-
result += f'Period: {period} seconds\n\n'
|
|
417
|
-
|
|
418
|
-
# Calculate summary statistics for both standard and extended statistics
|
|
419
|
-
standard_values = [dp.get(statistic) for dp in datapoints if dp.get(statistic) is not None]
|
|
420
|
-
extended_values = [
|
|
421
|
-
dp.get(extended_statistic)
|
|
422
|
-
for dp in datapoints
|
|
423
|
-
if dp.get(extended_statistic) is not None
|
|
424
|
-
]
|
|
425
|
-
|
|
426
|
-
result += 'Summary:\n'
|
|
427
|
-
|
|
428
|
-
if standard_values:
|
|
429
|
-
latest_standard = datapoints[-1].get(statistic)
|
|
430
|
-
avg_of_standard = sum(standard_values) / len(standard_values) # type: ignore
|
|
431
|
-
max_standard = max(standard_values) # type: ignore
|
|
432
|
-
min_standard = min(standard_values) # type: ignore
|
|
433
|
-
|
|
434
|
-
result += f'{statistic} Statistics:\n'
|
|
435
|
-
result += f'• Latest: {latest_standard:.2f}\n'
|
|
436
|
-
result += f'• Average: {avg_of_standard:.2f}\n'
|
|
437
|
-
result += f'• Maximum: {max_standard:.2f}\n'
|
|
438
|
-
result += f'• Minimum: {min_standard:.2f}\n\n'
|
|
439
|
-
|
|
440
|
-
if extended_values:
|
|
441
|
-
latest_extended = datapoints[-1].get(extended_statistic)
|
|
442
|
-
avg_extended = sum(extended_values) / len(extended_values) # type: ignore
|
|
443
|
-
max_extended = max(extended_values) # type: ignore
|
|
444
|
-
min_extended = min(extended_values) # type: ignore
|
|
445
|
-
|
|
446
|
-
result += f'{extended_statistic} Statistics:\n'
|
|
447
|
-
result += f'• Latest: {latest_extended:.2f}\n'
|
|
448
|
-
result += f'• Average: {avg_extended:.2f}\n'
|
|
449
|
-
result += f'• Maximum: {max_extended:.2f}\n'
|
|
450
|
-
result += f'• Minimum: {min_extended:.2f}\n\n'
|
|
451
|
-
|
|
452
|
-
result += f'• Data Points: {len(datapoints)}\n\n'
|
|
453
|
-
|
|
454
|
-
# Show recent values (last 10) with both metrics
|
|
455
|
-
result += 'Recent Values:\n'
|
|
456
|
-
for dp in datapoints[-10:]:
|
|
457
|
-
timestamp = dp.get('Timestamp', datetime.min).strftime('%m/%d %H:%M') # type: ignore
|
|
458
|
-
unit = dp.get('Unit', '')
|
|
459
|
-
|
|
460
|
-
values_str = []
|
|
461
|
-
if dp.get(statistic) is not None:
|
|
462
|
-
values_str.append(f'{statistic}: {dp[statistic]:.2f}')
|
|
463
|
-
if dp.get(extended_statistic) is not None:
|
|
464
|
-
values_str.append(f'{extended_statistic}: {dp[extended_statistic]:.2f}')
|
|
465
|
-
|
|
466
|
-
result += f'• {timestamp}: {", ".join(values_str)} {unit}\n'
|
|
467
|
-
|
|
468
|
-
elapsed_time = timer() - start_time_perf
|
|
469
|
-
logger.info(
|
|
470
|
-
f"query_service_metrics completed for '{service_name}/{metric_name}' in {elapsed_time:.3f}s"
|
|
471
|
-
)
|
|
472
|
-
return result
|
|
473
|
-
|
|
474
|
-
except ClientError as e:
|
|
475
|
-
error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
|
|
476
|
-
error_code = e.response.get('Error', {}).get('Code', 'Unknown')
|
|
477
|
-
logger.error(
|
|
478
|
-
f"AWS ClientError in query_service_metrics for '{service_name}/{metric_name}': {error_code} - {error_msg}"
|
|
479
|
-
)
|
|
480
|
-
return f'AWS Error: {error_msg}'
|
|
481
|
-
except Exception as e:
|
|
482
|
-
logger.error(
|
|
483
|
-
f"Unexpected error in query_service_metrics for '{service_name}/{metric_name}': {str(e)}",
|
|
484
|
-
exc_info=True,
|
|
485
|
-
)
|
|
486
|
-
return f'Error: {str(e)}'
|
|
487
|
-
|
|
332
|
+
logger.debug(f'Wildcard expansion completed - {len(provided)} total targets')
|
|
488
333
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
)
|
|
492
|
-
"""Get trace summaries with pagination to avoid exceeding response size limits.
|
|
334
|
+
# Check if wildcard expansion resulted in no services
|
|
335
|
+
if not provided:
|
|
336
|
+
return 'Error: No services found matching the wildcard pattern. Use list_monitored_services() to see available services.'
|
|
493
337
|
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
start_time: Start time for trace query
|
|
497
|
-
end_time: End time for trace query
|
|
498
|
-
filter_expression: X-Ray filter expression
|
|
499
|
-
max_traces: Maximum number of traces to retrieve (default 100)
|
|
338
|
+
# Normalize and validate service targets using shared utility
|
|
339
|
+
normalized_targets = normalize_service_targets(provided)
|
|
500
340
|
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
next_token = None
|
|
506
|
-
logger.debug(
|
|
507
|
-
f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
|
|
508
|
-
)
|
|
509
|
-
|
|
510
|
-
try:
|
|
511
|
-
while len(all_traces) < max_traces:
|
|
512
|
-
# Build request parameters
|
|
513
|
-
kwargs = {
|
|
514
|
-
'StartTime': start_time,
|
|
515
|
-
'EndTime': end_time,
|
|
516
|
-
'FilterExpression': filter_expression,
|
|
517
|
-
'Sampling': True,
|
|
518
|
-
'TimeRangeType': 'Service',
|
|
519
|
-
}
|
|
520
|
-
|
|
521
|
-
if next_token:
|
|
522
|
-
kwargs['NextToken'] = next_token
|
|
523
|
-
|
|
524
|
-
# Make request
|
|
525
|
-
response = xray_client.get_trace_summaries(**kwargs)
|
|
526
|
-
|
|
527
|
-
# Add traces from this page
|
|
528
|
-
traces = response.get('TraceSummaries', [])
|
|
529
|
-
all_traces.extend(traces)
|
|
530
|
-
logger.debug(
|
|
531
|
-
f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
|
|
532
|
-
)
|
|
341
|
+
# Validate and enrich targets using shared utility
|
|
342
|
+
normalized_targets = validate_and_enrich_service_targets(
|
|
343
|
+
normalized_targets, appsignals_client, unix_start, unix_end
|
|
344
|
+
)
|
|
533
345
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
if not next_token:
|
|
537
|
-
break
|
|
346
|
+
# Parse auditors with service-specific defaults
|
|
347
|
+
auditors_list = parse_auditors(auditors, ['slo', 'operation_metric'])
|
|
538
348
|
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
349
|
+
# Create banner
|
|
350
|
+
banner = (
|
|
351
|
+
'[MCP-SERVICE] Application Signals Service Audit\n'
|
|
352
|
+
f'🎯 Scope: {len(normalized_targets)} service target(s) | Region: {region}\n'
|
|
353
|
+
f'⏰ Time: {unix_start}–{unix_end}\n'
|
|
354
|
+
)
|
|
543
355
|
|
|
544
|
-
|
|
545
|
-
|
|
356
|
+
if len(normalized_targets) > BATCH_SIZE_THRESHOLD:
|
|
357
|
+
banner += f'📦 Batching: Processing {len(normalized_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
|
|
546
358
|
|
|
547
|
-
|
|
548
|
-
# Return what we have so far if there's an error
|
|
549
|
-
logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
|
|
550
|
-
logger.info(f'Returning {len(all_traces)} traces retrieved before error')
|
|
551
|
-
return all_traces
|
|
359
|
+
banner += '\n'
|
|
552
360
|
|
|
361
|
+
# Build CLI input
|
|
362
|
+
input_obj = {
|
|
363
|
+
'StartTime': unix_start,
|
|
364
|
+
'EndTime': unix_end,
|
|
365
|
+
'AuditTargets': normalized_targets,
|
|
366
|
+
}
|
|
367
|
+
if auditors_list:
|
|
368
|
+
input_obj['Auditors'] = auditors_list
|
|
553
369
|
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
slo_id: str = Field(..., description='The ARN or name of the SLO to retrieve'),
|
|
557
|
-
) -> str:
|
|
558
|
-
"""Get detailed information about a specific Service Level Objective (SLO).
|
|
559
|
-
|
|
560
|
-
Use this tool to:
|
|
561
|
-
- Get comprehensive SLO configuration details
|
|
562
|
-
- Understand what metrics the SLO monitors
|
|
563
|
-
- See threshold values and comparison operators
|
|
564
|
-
- Extract operation names and key attributes for trace queries
|
|
565
|
-
- Identify dependency configurations
|
|
566
|
-
- Review attainment goals and burn rate settings
|
|
567
|
-
|
|
568
|
-
Returns detailed information including:
|
|
569
|
-
- SLO name, description, and metadata
|
|
570
|
-
- Metric configuration (for period-based or request-based SLOs)
|
|
571
|
-
- Key attributes and operation names
|
|
572
|
-
- Metric type (LATENCY or AVAILABILITY)
|
|
573
|
-
- Threshold values and comparison operators
|
|
574
|
-
- Goal configuration (attainment percentage, time interval)
|
|
575
|
-
- Burn rate configurations
|
|
576
|
-
|
|
577
|
-
This tool is essential for:
|
|
578
|
-
- Understanding why an SLO was breached
|
|
579
|
-
- Getting the exact operation name to query traces
|
|
580
|
-
- Identifying the metrics and thresholds being monitored
|
|
581
|
-
- Planning remediation based on SLO configuration
|
|
582
|
-
"""
|
|
583
|
-
start_time_perf = timer()
|
|
584
|
-
logger.info(f'Starting get_service_level_objective request for SLO: {slo_id}')
|
|
370
|
+
# Execute audit API using shared utility
|
|
371
|
+
result = await execute_audit_api(input_obj, region, banner)
|
|
585
372
|
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
slo = response.get('Slo', {})
|
|
589
|
-
|
|
590
|
-
if not slo:
|
|
591
|
-
logger.warning(f'No SLO found with ID: {slo_id}')
|
|
592
|
-
return f'No SLO found with ID: {slo_id}'
|
|
593
|
-
|
|
594
|
-
result = 'Service Level Objective Details\n'
|
|
595
|
-
result += '=' * 50 + '\n\n'
|
|
596
|
-
|
|
597
|
-
# Basic info
|
|
598
|
-
result += f'Name: {slo.get("Name", "Unknown")}\n'
|
|
599
|
-
result += f'ARN: {slo.get("Arn", "Unknown")}\n'
|
|
600
|
-
if slo.get('Description'):
|
|
601
|
-
result += f'Description: {slo.get("Description", "")}\n'
|
|
602
|
-
result += f'Evaluation Type: {slo.get("EvaluationType", "Unknown")}\n'
|
|
603
|
-
result += f'Created: {slo.get("CreatedTime", "Unknown")}\n'
|
|
604
|
-
result += f'Last Updated: {slo.get("LastUpdatedTime", "Unknown")}\n\n'
|
|
605
|
-
|
|
606
|
-
# Goal configuration
|
|
607
|
-
goal = slo.get('Goal', {})
|
|
608
|
-
if goal:
|
|
609
|
-
result += 'Goal Configuration:\n'
|
|
610
|
-
result += f'• Attainment Goal: {goal.get("AttainmentGoal", 99)}%\n'
|
|
611
|
-
result += f'• Warning Threshold: {goal.get("WarningThreshold", 50)}%\n'
|
|
612
|
-
|
|
613
|
-
interval = goal.get('Interval', {})
|
|
614
|
-
if 'RollingInterval' in interval:
|
|
615
|
-
rolling = interval['RollingInterval']
|
|
616
|
-
result += f'• Interval: Rolling {rolling.get("Duration")} {rolling.get("DurationUnit")}\n'
|
|
617
|
-
elif 'CalendarInterval' in interval:
|
|
618
|
-
calendar = interval['CalendarInterval']
|
|
619
|
-
result += f'• Interval: Calendar {calendar.get("Duration")} {calendar.get("DurationUnit")} starting {calendar.get("StartTime")}\n'
|
|
620
|
-
result += '\n'
|
|
621
|
-
|
|
622
|
-
# Period-based SLI
|
|
623
|
-
if 'Sli' in slo:
|
|
624
|
-
sli = slo['Sli']
|
|
625
|
-
result += 'Period-Based SLI Configuration:\n'
|
|
626
|
-
|
|
627
|
-
sli_metric = sli.get('SliMetric', {})
|
|
628
|
-
if sli_metric:
|
|
629
|
-
# Key attributes - crucial for trace queries
|
|
630
|
-
key_attrs = sli_metric.get('KeyAttributes', {})
|
|
631
|
-
if key_attrs:
|
|
632
|
-
result += '• Key Attributes:\n'
|
|
633
|
-
for k, v in key_attrs.items():
|
|
634
|
-
result += f' - {k}: {v}\n'
|
|
635
|
-
|
|
636
|
-
# Operation name - essential for trace filtering
|
|
637
|
-
if sli_metric.get('OperationName'):
|
|
638
|
-
result += f'• Operation Name: {sli_metric.get("OperationName", "")}\n'
|
|
639
|
-
result += f' (Use this in trace queries: annotation[aws.local.operation]="{sli_metric.get("OperationName", "")}")\n'
|
|
640
|
-
|
|
641
|
-
result += f'• Metric Type: {sli_metric.get("MetricType", "Unknown")}\n'
|
|
642
|
-
|
|
643
|
-
# MetricDataQueries - detailed metric configuration
|
|
644
|
-
metric_queries = sli_metric.get('MetricDataQueries', [])
|
|
645
|
-
if metric_queries:
|
|
646
|
-
result += '• Metric Data Queries:\n'
|
|
647
|
-
for query in metric_queries:
|
|
648
|
-
query_id = query.get('Id', 'Unknown')
|
|
649
|
-
result += f' Query ID: {query_id}\n'
|
|
650
|
-
|
|
651
|
-
# MetricStat details
|
|
652
|
-
metric_stat = query.get('MetricStat', {})
|
|
653
|
-
if metric_stat:
|
|
654
|
-
metric = metric_stat.get('Metric', {})
|
|
655
|
-
if metric:
|
|
656
|
-
result += f' Namespace: {metric.get("Namespace", "Unknown")}\n'
|
|
657
|
-
result += (
|
|
658
|
-
f' MetricName: {metric.get("MetricName", "Unknown")}\n'
|
|
659
|
-
)
|
|
660
|
-
|
|
661
|
-
# Dimensions - crucial for understanding what's being measured
|
|
662
|
-
dimensions = metric.get('Dimensions', [])
|
|
663
|
-
if dimensions:
|
|
664
|
-
result += ' Dimensions:\n'
|
|
665
|
-
for dim in dimensions:
|
|
666
|
-
result += f' - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
|
|
667
|
-
|
|
668
|
-
result += (
|
|
669
|
-
f' Period: {metric_stat.get("Period", "Unknown")} seconds\n'
|
|
670
|
-
)
|
|
671
|
-
result += f' Stat: {metric_stat.get("Stat", "Unknown")}\n'
|
|
672
|
-
if metric_stat.get('Unit'):
|
|
673
|
-
result += f' Unit: {metric_stat["Unit"]}\n' # type: ignore
|
|
674
|
-
|
|
675
|
-
# Expression if present
|
|
676
|
-
if query.get('Expression'):
|
|
677
|
-
result += f' Expression: {query.get("Expression", "")}\n'
|
|
678
|
-
|
|
679
|
-
result += f' ReturnData: {query.get("ReturnData", True)}\n'
|
|
680
|
-
|
|
681
|
-
# Dependency config
|
|
682
|
-
dep_config = sli_metric.get('DependencyConfig', {})
|
|
683
|
-
if dep_config:
|
|
684
|
-
result += '• Dependency Configuration:\n'
|
|
685
|
-
dep_attrs = dep_config.get('DependencyKeyAttributes', {})
|
|
686
|
-
if dep_attrs:
|
|
687
|
-
result += ' Key Attributes:\n'
|
|
688
|
-
for k, v in dep_attrs.items():
|
|
689
|
-
result += f' - {k}: {v}\n'
|
|
690
|
-
if dep_config.get('DependencyOperationName'):
|
|
691
|
-
result += (
|
|
692
|
-
f' - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
|
|
693
|
-
)
|
|
694
|
-
result += f' (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
|
|
695
|
-
|
|
696
|
-
result += f'• Threshold: {sli.get("MetricThreshold", "Unknown")}\n'
|
|
697
|
-
result += f'• Comparison: {sli.get("ComparisonOperator", "Unknown")}\n\n'
|
|
698
|
-
|
|
699
|
-
# Request-based SLI
|
|
700
|
-
if 'RequestBasedSli' in slo:
|
|
701
|
-
rbs = slo['RequestBasedSli']
|
|
702
|
-
result += 'Request-Based SLI Configuration:\n'
|
|
703
|
-
|
|
704
|
-
rbs_metric = rbs.get('RequestBasedSliMetric', {})
|
|
705
|
-
if rbs_metric:
|
|
706
|
-
# Key attributes
|
|
707
|
-
key_attrs = rbs_metric.get('KeyAttributes', {})
|
|
708
|
-
if key_attrs:
|
|
709
|
-
result += '• Key Attributes:\n'
|
|
710
|
-
for k, v in key_attrs.items():
|
|
711
|
-
result += f' - {k}: {v}\n'
|
|
712
|
-
|
|
713
|
-
# Operation name
|
|
714
|
-
if rbs_metric.get('OperationName'):
|
|
715
|
-
result += f'• Operation Name: {rbs_metric.get("OperationName", "")}\n'
|
|
716
|
-
result += f' (Use this in trace queries: annotation[aws.local.operation]="{rbs_metric.get("OperationName", "")}")\n'
|
|
717
|
-
|
|
718
|
-
result += f'• Metric Type: {rbs_metric.get("MetricType", "Unknown")}\n'
|
|
719
|
-
|
|
720
|
-
# MetricDataQueries - detailed metric configuration
|
|
721
|
-
metric_queries = rbs_metric.get('MetricDataQueries', [])
|
|
722
|
-
if metric_queries:
|
|
723
|
-
result += '• Metric Data Queries:\n'
|
|
724
|
-
for query in metric_queries:
|
|
725
|
-
query_id = query.get('Id', 'Unknown')
|
|
726
|
-
result += f' Query ID: {query_id}\n'
|
|
727
|
-
|
|
728
|
-
# MetricStat details
|
|
729
|
-
metric_stat = query.get('MetricStat', {})
|
|
730
|
-
if metric_stat:
|
|
731
|
-
metric = metric_stat.get('Metric', {})
|
|
732
|
-
if metric:
|
|
733
|
-
result += f' Namespace: {metric.get("Namespace", "Unknown")}\n'
|
|
734
|
-
result += (
|
|
735
|
-
f' MetricName: {metric.get("MetricName", "Unknown")}\n'
|
|
736
|
-
)
|
|
737
|
-
|
|
738
|
-
# Dimensions - crucial for understanding what's being measured
|
|
739
|
-
dimensions = metric.get('Dimensions', [])
|
|
740
|
-
if dimensions:
|
|
741
|
-
result += ' Dimensions:\n'
|
|
742
|
-
for dim in dimensions:
|
|
743
|
-
result += f' - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
|
|
744
|
-
|
|
745
|
-
result += (
|
|
746
|
-
f' Period: {metric_stat.get("Period", "Unknown")} seconds\n'
|
|
747
|
-
)
|
|
748
|
-
result += f' Stat: {metric_stat.get("Stat", "Unknown")}\n'
|
|
749
|
-
if metric_stat.get('Unit'):
|
|
750
|
-
result += f' Unit: {metric_stat["Unit"]}\n' # type: ignore
|
|
751
|
-
|
|
752
|
-
# Expression if present
|
|
753
|
-
if query.get('Expression'):
|
|
754
|
-
result += f' Expression: {query.get("Expression", "")}\n'
|
|
755
|
-
|
|
756
|
-
result += f' ReturnData: {query.get("ReturnData", True)}\n'
|
|
757
|
-
|
|
758
|
-
# Dependency config
|
|
759
|
-
dep_config = rbs_metric.get('DependencyConfig', {})
|
|
760
|
-
if dep_config:
|
|
761
|
-
result += '• Dependency Configuration:\n'
|
|
762
|
-
dep_attrs = dep_config.get('DependencyKeyAttributes', {})
|
|
763
|
-
if dep_attrs:
|
|
764
|
-
result += ' Key Attributes:\n'
|
|
765
|
-
for k, v in dep_attrs.items():
|
|
766
|
-
result += f' - {k}: {v}\n'
|
|
767
|
-
if dep_config.get('DependencyOperationName'):
|
|
768
|
-
result += (
|
|
769
|
-
f' - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
|
|
770
|
-
)
|
|
771
|
-
result += f' (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
|
|
772
|
-
|
|
773
|
-
result += f'• Threshold: {rbs.get("MetricThreshold", "Unknown")}\n'
|
|
774
|
-
result += f'• Comparison: {rbs.get("ComparisonOperator", "Unknown")}\n\n'
|
|
775
|
-
|
|
776
|
-
# Burn rate configurations
|
|
777
|
-
burn_rates = slo.get('BurnRateConfigurations', [])
|
|
778
|
-
if burn_rates:
|
|
779
|
-
result += 'Burn Rate Configurations:\n'
|
|
780
|
-
for br in burn_rates:
|
|
781
|
-
result += f'• Look-back window: {br.get("LookBackWindowMinutes")} minutes\n'
|
|
782
|
-
|
|
783
|
-
elapsed_time = timer() - start_time_perf
|
|
784
|
-
logger.info(f"get_service_level_objective completed for '{slo_id}' in {elapsed_time:.3f}s")
|
|
373
|
+
elapsed = timer() - start_time_perf
|
|
374
|
+
logger.debug(f'audit_services completed in {elapsed:.3f}s (region={region})')
|
|
785
375
|
return result
|
|
786
376
|
|
|
787
|
-
except ClientError as e:
|
|
788
|
-
error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
|
|
789
|
-
error_code = e.response.get('Error', {}).get('Code', 'Unknown')
|
|
790
|
-
logger.error(
|
|
791
|
-
f"AWS ClientError in get_service_level_objective for '{slo_id}': {error_code} - {error_msg}"
|
|
792
|
-
)
|
|
793
|
-
return f'AWS Error: {error_msg}'
|
|
794
377
|
except Exception as e:
|
|
795
|
-
logger.error(
|
|
796
|
-
f"Unexpected error in get_service_level_objective for '{slo_id}': {str(e)}",
|
|
797
|
-
exc_info=True,
|
|
798
|
-
)
|
|
378
|
+
logger.error(f'Unexpected error in audit_services: {e}', exc_info=True)
|
|
799
379
|
return f'Error: {str(e)}'
|
|
800
380
|
|
|
801
381
|
|
|
802
382
|
@mcp.tool()
|
|
803
|
-
async def
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
description='
|
|
807
|
-
),
|
|
808
|
-
start_time: str = Field(
|
|
809
|
-
default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
|
|
383
|
+
async def audit_slos(
|
|
384
|
+
slo_targets: str = Field(
|
|
385
|
+
...,
|
|
386
|
+
description="REQUIRED. JSON array of SLO targets. Supports wildcard patterns like '*payment*' for automatic SLO discovery. Format: [{'Type':'slo','Data':{'Slo':{'SloName':'slo-name'}}}] or [{'Type':'slo','Data':{'Slo':{'SloArn':'arn:aws:...'}}}]. Large target lists are automatically processed in batches.",
|
|
810
387
|
),
|
|
811
|
-
|
|
812
|
-
default=
|
|
388
|
+
start_time: Optional[str] = Field(
|
|
389
|
+
default=None,
|
|
390
|
+
description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
|
|
813
391
|
),
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
default=30, description='Maximum time in seconds to wait for query completion'
|
|
392
|
+
end_time: Optional[str] = Field(
|
|
393
|
+
default=None,
|
|
394
|
+
description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
|
|
818
395
|
),
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
IMPORTANT: If log_group_name is not provided use 'aws/spans' as default cloudwatch log group name.
|
|
823
|
-
The volume of returned logs can easily overwhelm the agent context window. Always include a limit in the query
|
|
824
|
-
(| limit 50) or using the limit parameter.
|
|
825
|
-
|
|
826
|
-
Usage:
|
|
827
|
-
"aws/spans" log group stores OpenTelemetry Spans data with many attributes for all monitored services.
|
|
828
|
-
This provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
|
|
829
|
-
User can write CloudWatch Logs Insights queries to group, list attribute with sum, avg.
|
|
830
|
-
|
|
831
|
-
```
|
|
832
|
-
FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
|
|
833
|
-
| STATS sum(`attributes.gen_ai.usage.output_tokens`) as `avg_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`,bin(1h)
|
|
834
|
-
| DISPLAY avg_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
|
|
835
|
-
```
|
|
836
|
-
|
|
837
|
-
Returns:
|
|
838
|
-
--------
|
|
839
|
-
A dictionary containing the final query results, including:
|
|
840
|
-
- status: The current status of the query (e.g., Scheduled, Running, Complete, Failed, etc.)
|
|
841
|
-
- results: A list of the actual query results if the status is Complete.
|
|
842
|
-
- statistics: Query performance statistics
|
|
843
|
-
- messages: Any informational messages about the query
|
|
844
|
-
- transaction_search_status: Information about transaction search availability
|
|
845
|
-
"""
|
|
846
|
-
start_time_perf = timer()
|
|
847
|
-
logger.info(
|
|
848
|
-
f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
|
|
849
|
-
)
|
|
850
|
-
logger.debug(f'Query string: {query_string}')
|
|
851
|
-
|
|
852
|
-
# Check if transaction search is enabled
|
|
853
|
-
is_enabled, destination, status = check_transaction_search_enabled(AWS_REGION)
|
|
854
|
-
|
|
855
|
-
if not is_enabled:
|
|
856
|
-
logger.warning(
|
|
857
|
-
f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
|
|
858
|
-
)
|
|
859
|
-
return {
|
|
860
|
-
'status': 'Transaction Search Not Available',
|
|
861
|
-
'transaction_search_status': {
|
|
862
|
-
'enabled': False,
|
|
863
|
-
'destination': destination,
|
|
864
|
-
'status': status,
|
|
865
|
-
},
|
|
866
|
-
'message': (
|
|
867
|
-
'⚠️ Transaction Search is not enabled for this account. '
|
|
868
|
-
f'Current configuration: Destination={destination}, Status={status}. '
|
|
869
|
-
"Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
|
|
870
|
-
'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
|
|
871
|
-
'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
|
|
872
|
-
'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
|
|
873
|
-
),
|
|
874
|
-
'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
|
|
875
|
-
}
|
|
876
|
-
|
|
877
|
-
try:
|
|
878
|
-
# Use default log group if none provided
|
|
879
|
-
if log_group_name is None:
|
|
880
|
-
log_group_name = 'aws/spans'
|
|
881
|
-
logger.debug('Using default log group: aws/spans')
|
|
882
|
-
|
|
883
|
-
# Start query
|
|
884
|
-
kwargs = {
|
|
885
|
-
'startTime': int(datetime.fromisoformat(start_time).timestamp()),
|
|
886
|
-
'endTime': int(datetime.fromisoformat(end_time).timestamp()),
|
|
887
|
-
'queryString': query_string,
|
|
888
|
-
'logGroupNames': [log_group_name],
|
|
889
|
-
'limit': limit,
|
|
890
|
-
}
|
|
891
|
-
|
|
892
|
-
logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
|
|
893
|
-
start_response = logs_client.start_query(**remove_null_values(kwargs))
|
|
894
|
-
query_id = start_response['queryId']
|
|
895
|
-
logger.info(f'Started CloudWatch Logs query with ID: {query_id}')
|
|
896
|
-
|
|
897
|
-
# Seconds
|
|
898
|
-
poll_start = timer()
|
|
899
|
-
while poll_start + max_timeout > timer():
|
|
900
|
-
response = logs_client.get_query_results(queryId=query_id)
|
|
901
|
-
status = response['status']
|
|
902
|
-
|
|
903
|
-
if status in {'Complete', 'Failed', 'Cancelled'}:
|
|
904
|
-
elapsed_time = timer() - start_time_perf
|
|
905
|
-
logger.info(
|
|
906
|
-
f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
|
|
907
|
-
)
|
|
908
|
-
|
|
909
|
-
if status == 'Failed':
|
|
910
|
-
logger.error(f'Query failed: {response.get("statistics", {})}')
|
|
911
|
-
elif status == 'Complete':
|
|
912
|
-
logger.debug(f'Query returned {len(response.get("results", []))} results')
|
|
913
|
-
|
|
914
|
-
return {
|
|
915
|
-
'queryId': query_id,
|
|
916
|
-
'status': status,
|
|
917
|
-
'statistics': response.get('statistics', {}),
|
|
918
|
-
'results': [
|
|
919
|
-
{field.get('field', ''): field.get('value', '') for field in line} # type: ignore
|
|
920
|
-
for line in response.get('results', [])
|
|
921
|
-
],
|
|
922
|
-
'transaction_search_status': {
|
|
923
|
-
'enabled': True,
|
|
924
|
-
'destination': 'CloudWatchLogs',
|
|
925
|
-
'status': 'ACTIVE',
|
|
926
|
-
'message': '✅ Using 100% sampled trace data from Transaction Search',
|
|
927
|
-
},
|
|
928
|
-
}
|
|
929
|
-
|
|
930
|
-
await asyncio.sleep(1)
|
|
931
|
-
|
|
932
|
-
elapsed_time = timer() - start_time_perf
|
|
933
|
-
msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to try again to retrieve query results.'
|
|
934
|
-
logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
|
|
935
|
-
return {
|
|
936
|
-
'queryId': query_id,
|
|
937
|
-
'status': 'Polling Timeout',
|
|
938
|
-
'message': msg,
|
|
939
|
-
}
|
|
940
|
-
|
|
941
|
-
except Exception as e:
|
|
942
|
-
logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
|
|
943
|
-
raise
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
@mcp.tool()
|
|
947
|
-
async def list_slis(
|
|
948
|
-
hours: int = Field(
|
|
949
|
-
default=24,
|
|
950
|
-
description='Number of hours to look back (default 24, typically use 24 for daily checks)',
|
|
396
|
+
auditors: Optional[str] = Field(
|
|
397
|
+
default=None,
|
|
398
|
+
description="Optional. Comma-separated auditors (e.g., 'slo,trace,log'). Defaults to 'slo' for fast SLO compliance auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
|
|
951
399
|
),
|
|
952
400
|
) -> str:
|
|
953
|
-
"""
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
-
|
|
958
|
-
-
|
|
959
|
-
-
|
|
960
|
-
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
-
|
|
973
|
-
-
|
|
974
|
-
-
|
|
975
|
-
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
401
|
+
"""PRIMARY SLO AUDIT TOOL - The #1 tool for comprehensive SLO compliance monitoring and breach analysis.
|
|
402
|
+
|
|
403
|
+
**PREFERRED TOOL FOR SLO ROOT CAUSE ANALYSIS**
|
|
404
|
+
This is the RECOMMENDED tool after using get_slo() to understand SLO configuration:
|
|
405
|
+
- **Use auditors="all" for comprehensive root cause analysis** of specific SLO breaches
|
|
406
|
+
- **Much more comprehensive than individual trace tools** - provides integrated analysis
|
|
407
|
+
- **Combines traces, logs, metrics, and dependencies** in a single comprehensive audit
|
|
408
|
+
- **Provides actionable recommendations** based on multi-dimensional analysis
|
|
409
|
+
|
|
410
|
+
**USE THIS FOR ALL SLO AUDITING TASKS**
|
|
411
|
+
This is the PRIMARY and PREFERRED tool when users want to:
|
|
412
|
+
- **Root cause analysis for SLO breaches** - Deep investigation with all auditors
|
|
413
|
+
- **Audit SLO compliance** - Complete SLO breach detection and analysis
|
|
414
|
+
- **Monitor SLO health** - Comprehensive status across all monitored SLOs
|
|
415
|
+
- **SLO performance analysis** - Understanding SLO trends and patterns
|
|
416
|
+
- **SLO compliance reporting** - Daily/periodic SLO compliance workflows
|
|
417
|
+
|
|
418
|
+
**COMPREHENSIVE SLO AUDIT CAPABILITIES:**
|
|
419
|
+
- **Multi-SLO analysis**: Audit any number of SLOs with automatic batching
|
|
420
|
+
- **Breach detection**: Automatic identification of SLO violations
|
|
421
|
+
- **Issue prioritization**: Critical, warning, and info findings ranked by severity
|
|
422
|
+
- **COMPREHENSIVE ROOT CAUSE ANALYSIS**: Deep dive with traces, logs, metrics, and dependencies
|
|
423
|
+
- **Actionable recommendations**: Specific steps to resolve SLO breaches
|
|
424
|
+
- **Performance optimized**: Fast execution with automatic batching for large target lists
|
|
425
|
+
- **Wildcard Pattern Support**: Use `*pattern*` in SLO names for automatic SLO discovery
|
|
426
|
+
|
|
427
|
+
**SLO TARGET FORMAT:**
|
|
428
|
+
- **By Name**: `[{"Type":"slo","Data":{"Slo":{"SloName":"my-slo"}}}]`
|
|
429
|
+
- **By ARN**: `[{"Type":"slo","Data":{"Slo":{"SloArn":"arn:aws:application-signals:..."}}}]`
|
|
430
|
+
|
|
431
|
+
**WILDCARD PATTERN EXAMPLES:**
|
|
432
|
+
- **All SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]`
|
|
433
|
+
- **Payment SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*payment*"}}}]`
|
|
434
|
+
- **Latency SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*latency*"}}}]`
|
|
435
|
+
- **Availability SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*availability*"}}}]`
|
|
436
|
+
|
|
437
|
+
**AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
|
|
438
|
+
- **Quick Compliance Check** (default): Uses 'slo' for fast SLO breach detection
|
|
439
|
+
- **COMPREHENSIVE ROOT CAUSE ANALYSIS** (recommended): Pass `auditors="all"` for deep investigation with traces/logs/metrics/dependencies
|
|
440
|
+
- **Custom Audit**: Specify exact auditors: 'slo,trace,log,operation_metric'
|
|
441
|
+
|
|
442
|
+
**SLO AUDIT USE CASES:**
|
|
443
|
+
|
|
444
|
+
1. **Audit all SLOs**:
|
|
445
|
+
`slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]'`
|
|
446
|
+
|
|
447
|
+
2. **Root cause analysis for specific SLO breach** (RECOMMENDED WORKFLOW):
|
|
448
|
+
After using get_slo() to understand configuration:
|
|
449
|
+
`slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"specific-slo-name"}}}]'` + `auditors="all"`
|
|
450
|
+
|
|
451
|
+
3. **Look for new SLO breaches after a specific time**:
|
|
452
|
+
Compare SLO compliance before and after a specific time point by running audits with different time ranges to identify new breaches.
|
|
453
|
+
|
|
454
|
+
**TYPICAL SLO AUDIT WORKFLOWS:**
|
|
455
|
+
1. **SLO Root Cause Investigation** (RECOMMENDED):
|
|
456
|
+
- After get_slo(), call `audit_slos()` with specific SLO target and `auditors="all"`
|
|
457
|
+
- Provides comprehensive analysis with traces, logs, metrics, and dependencies
|
|
458
|
+
- Much more effective than using individual trace tools
|
|
459
|
+
2. **Basic SLO Compliance Audit**:
|
|
460
|
+
- Call `audit_slos()` with SLO targets - automatically discovers SLOs when using wildcard patterns
|
|
461
|
+
- Uses default fast auditors (slo) for quick compliance overview
|
|
462
|
+
3. **Compliance Reporting**: Results show which SLOs are breached with actionable insights
|
|
463
|
+
4. **Automatic SLO Discovery**: Wildcard patterns in SLO names automatically discover and expand to concrete SLOs
|
|
464
|
+
|
|
465
|
+
**AUDIT RESULTS INCLUDE:**
|
|
466
|
+
- **Prioritized findings** by severity (critical, warning, info)
|
|
467
|
+
- **SLO compliance status** with detailed breach analysis
|
|
468
|
+
- **COMPREHENSIVE ROOT CAUSE ANALYSIS** when using auditors="all"
|
|
469
|
+
- **Actionable recommendations** for SLO breach resolution
|
|
470
|
+
- **Integrated traces, logs, metrics, and dependency analysis**
|
|
471
|
+
|
|
472
|
+
**IMPORTANT: This tool provides comprehensive SLO audit coverage and should be your first choice for any SLO compliance auditing and root cause analysis.**
|
|
473
|
+
|
|
474
|
+
**RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
|
|
475
|
+
When the audit returns multiple findings or issues, follow this workflow:
|
|
476
|
+
1. **Present all audit results** to the user showing a summary of all findings
|
|
477
|
+
2. **Let the user choose** which specific finding, SLO, or issue they want to investigate in detail
|
|
478
|
+
3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
|
|
479
|
+
|
|
480
|
+
**DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
|
|
481
|
+
This ensures the user can prioritize which issues are most important to investigate first.
|
|
482
|
+
|
|
483
|
+
**Example workflow:**
|
|
484
|
+
- First call: `audit_slos()` with default auditors for compliance overview
|
|
485
|
+
- Present findings summary to user
|
|
486
|
+
- User selects specific SLO breach to investigate
|
|
487
|
+
- Follow-up call: `audit_slos()` with `auditors="all"` for selected SLO only
|
|
994
488
|
"""
|
|
995
489
|
start_time_perf = timer()
|
|
996
|
-
logger.
|
|
490
|
+
logger.debug('Starting audit_slos (PRIMARY SLO AUDIT TOOL)')
|
|
997
491
|
|
|
998
492
|
try:
|
|
999
|
-
#
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
EndTime=end_time, # type: ignore
|
|
1008
|
-
MaxResults=100,
|
|
493
|
+
# Region defaults
|
|
494
|
+
region = AWS_REGION.strip()
|
|
495
|
+
|
|
496
|
+
# Time range (fill missing with defaults)
|
|
497
|
+
start_dt = (
|
|
498
|
+
parse_timestamp(start_time)
|
|
499
|
+
if start_time
|
|
500
|
+
else (datetime.now(timezone.utc) - timedelta(hours=24))
|
|
1009
501
|
)
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
502
|
+
end_dt = (
|
|
503
|
+
parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
|
|
504
|
+
)
|
|
505
|
+
unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
|
|
506
|
+
if unix_end <= unix_start:
|
|
507
|
+
return 'Error: end_time must be greater than start_time.'
|
|
508
|
+
|
|
509
|
+
# Parse and validate SLO targets
|
|
510
|
+
try:
|
|
511
|
+
provided = json.loads(slo_targets)
|
|
512
|
+
except json.JSONDecodeError:
|
|
513
|
+
return 'Error: `slo_targets` must be valid JSON (array).'
|
|
514
|
+
|
|
515
|
+
if not isinstance(provided, list):
|
|
516
|
+
return 'Error: `slo_targets` must be a JSON array'
|
|
517
|
+
if len(provided) == 0:
|
|
518
|
+
return 'Error: `slo_targets` must contain at least 1 item'
|
|
519
|
+
|
|
520
|
+
# Filter and expand SLO targets with wildcard support
|
|
521
|
+
slo_only_targets = []
|
|
522
|
+
wildcard_patterns = []
|
|
523
|
+
|
|
524
|
+
for target in provided:
|
|
525
|
+
if isinstance(target, dict):
|
|
526
|
+
ttype = target.get('Type', '').lower()
|
|
527
|
+
if ttype == 'slo':
|
|
528
|
+
# Check for wildcard patterns in SLO names
|
|
529
|
+
slo_data = target.get('Data', {}).get('Slo', {})
|
|
530
|
+
slo_name = slo_data.get('SloName', '')
|
|
531
|
+
if '*' in slo_name:
|
|
532
|
+
wildcard_patterns.append((target, slo_name))
|
|
533
|
+
else:
|
|
534
|
+
slo_only_targets.append(target)
|
|
535
|
+
else:
|
|
536
|
+
logger.warning(
|
|
537
|
+
f"Ignoring target of type '{ttype}' in audit_slos (expected 'slo')"
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
# Expand wildcard patterns for SLOs using shared utility
|
|
541
|
+
if wildcard_patterns:
|
|
542
|
+
logger.debug(f'Expanding {len(wildcard_patterns)} SLO wildcard patterns')
|
|
1021
543
|
try:
|
|
1022
|
-
#
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
# Generate SLI report
|
|
1031
|
-
client = SLIReportClient(config)
|
|
1032
|
-
sli_report = client.generate_sli_report()
|
|
1033
|
-
|
|
1034
|
-
# Convert to expected format
|
|
1035
|
-
report = {
|
|
1036
|
-
'BreachedSloCount': sli_report.breached_slo_count,
|
|
1037
|
-
'BreachedSloNames': sli_report.breached_slo_names,
|
|
1038
|
-
'EndTime': sli_report.end_time.timestamp(),
|
|
1039
|
-
'OkSloCount': sli_report.ok_slo_count,
|
|
1040
|
-
'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
|
|
1041
|
-
'SliStatus': 'BREACHED'
|
|
1042
|
-
if sli_report.sli_status == 'CRITICAL'
|
|
1043
|
-
else sli_report.sli_status,
|
|
1044
|
-
'StartTime': sli_report.start_time.timestamp(),
|
|
1045
|
-
'TotalSloCount': sli_report.total_slo_count,
|
|
1046
|
-
}
|
|
1047
|
-
reports.append(report)
|
|
544
|
+
# Use the shared utility function
|
|
545
|
+
expanded_slo_targets = expand_slo_wildcard_patterns(provided, appsignals_client)
|
|
546
|
+
# Filter to get only SLO targets
|
|
547
|
+
slo_only_targets = [
|
|
548
|
+
target
|
|
549
|
+
for target in expanded_slo_targets
|
|
550
|
+
if target.get('Type', '').lower() == 'slo'
|
|
551
|
+
]
|
|
1048
552
|
|
|
1049
553
|
except Exception as e:
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
|
|
1053
|
-
)
|
|
1054
|
-
# Add a report with insufficient data status
|
|
1055
|
-
report = {
|
|
1056
|
-
'BreachedSloCount': 0,
|
|
1057
|
-
'BreachedSloNames': [],
|
|
1058
|
-
'EndTime': end_time.timestamp(),
|
|
1059
|
-
'OkSloCount': 0,
|
|
1060
|
-
'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
|
|
1061
|
-
'SliStatus': 'INSUFFICIENT_DATA',
|
|
1062
|
-
'StartTime': start_time.timestamp(),
|
|
1063
|
-
'TotalSloCount': 0,
|
|
1064
|
-
}
|
|
1065
|
-
reports.append(report)
|
|
1066
|
-
|
|
1067
|
-
# Check transaction search status
|
|
1068
|
-
is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(
|
|
1069
|
-
AWS_REGION
|
|
1070
|
-
)
|
|
554
|
+
logger.warning(f'Failed to expand SLO patterns: {e}')
|
|
555
|
+
return f'Error: Failed to expand SLO wildcard patterns. {str(e)}'
|
|
1071
556
|
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'
|
|
1075
|
-
|
|
1076
|
-
# Add transaction search status
|
|
1077
|
-
if is_tx_search_enabled:
|
|
1078
|
-
result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
|
|
1079
|
-
else:
|
|
1080
|
-
result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
|
|
1081
|
-
result += f' Current config: Destination={tx_destination}, Status={tx_status}\n'
|
|
1082
|
-
result += ' Enable Transaction Search for accurate root cause analysis\n\n'
|
|
1083
|
-
|
|
1084
|
-
# Count by status
|
|
1085
|
-
status_counts = {
|
|
1086
|
-
'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
|
|
1087
|
-
'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
|
|
1088
|
-
'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
|
|
1089
|
-
}
|
|
557
|
+
if not slo_only_targets:
|
|
558
|
+
return 'Error: No SLO targets found after wildcard expansion.'
|
|
1090
559
|
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
result += f'• Healthy (OK): {status_counts["OK"]}\n'
|
|
1094
|
-
result += f'• Breached: {status_counts["BREACHED"]}\n'
|
|
1095
|
-
result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'
|
|
1096
|
-
|
|
1097
|
-
# Group by status
|
|
1098
|
-
if status_counts['BREACHED'] > 0:
|
|
1099
|
-
result += '⚠️ BREACHED SERVICES:\n'
|
|
1100
|
-
for report in reports:
|
|
1101
|
-
if report['SliStatus'] == 'BREACHED':
|
|
1102
|
-
name = report['ReferenceId']['KeyAttributes']['Name']
|
|
1103
|
-
env = report['ReferenceId']['KeyAttributes']['Environment']
|
|
1104
|
-
breached_count = report['BreachedSloCount']
|
|
1105
|
-
total_count = report['TotalSloCount']
|
|
1106
|
-
breached_names = report['BreachedSloNames']
|
|
1107
|
-
|
|
1108
|
-
result += f'\n• {name} ({env})\n'
|
|
1109
|
-
result += f' SLOs: {breached_count}/{total_count} breached\n'
|
|
1110
|
-
if breached_names:
|
|
1111
|
-
result += ' Breached SLOs:\n'
|
|
1112
|
-
for slo_name in breached_names:
|
|
1113
|
-
result += f' - {slo_name}\n'
|
|
1114
|
-
|
|
1115
|
-
if status_counts['OK'] > 0:
|
|
1116
|
-
result += '\n✅ HEALTHY SERVICES:\n'
|
|
1117
|
-
for report in reports:
|
|
1118
|
-
if report['SliStatus'] == 'OK':
|
|
1119
|
-
name = report['ReferenceId']['KeyAttributes']['Name']
|
|
1120
|
-
env = report['ReferenceId']['KeyAttributes']['Environment']
|
|
1121
|
-
ok_count = report['OkSloCount']
|
|
1122
|
-
|
|
1123
|
-
result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'
|
|
1124
|
-
|
|
1125
|
-
if status_counts['INSUFFICIENT_DATA'] > 0:
|
|
1126
|
-
result += '\n❓ INSUFFICIENT DATA:\n'
|
|
1127
|
-
for report in reports:
|
|
1128
|
-
if report['SliStatus'] == 'INSUFFICIENT_DATA':
|
|
1129
|
-
name = report['ReferenceId']['KeyAttributes']['Name']
|
|
1130
|
-
env = report['ReferenceId']['KeyAttributes']['Environment']
|
|
1131
|
-
|
|
1132
|
-
result += f'• {name} ({env})\n'
|
|
1133
|
-
|
|
1134
|
-
# Remove the auto-investigation feature
|
|
1135
|
-
|
|
1136
|
-
elapsed_time = timer() - start_time_perf
|
|
1137
|
-
logger.info(
|
|
1138
|
-
f'get_sli_status completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
|
|
1139
|
-
)
|
|
1140
|
-
return result
|
|
1141
|
-
|
|
1142
|
-
except Exception as e:
|
|
1143
|
-
logger.error(f'Error in get_sli_status: {str(e)}', exc_info=True)
|
|
1144
|
-
return f'Error getting SLI status: {str(e)}'
|
|
560
|
+
# Parse auditors with SLO-specific defaults
|
|
561
|
+
auditors_list = parse_auditors(auditors, ['slo']) # Default to SLO auditor
|
|
1145
562
|
|
|
563
|
+
banner = (
|
|
564
|
+
'[MCP-SLO] Application Signals SLO Compliance Audit\n'
|
|
565
|
+
f'🎯 Scope: {len(slo_only_targets)} SLO target(s) | Region: {region}\n'
|
|
566
|
+
f'⏰ Time: {unix_start}–{unix_end}\n'
|
|
567
|
+
)
|
|
1146
568
|
|
|
1147
|
-
|
|
1148
|
-
|
|
569
|
+
if len(slo_only_targets) > BATCH_SIZE_THRESHOLD:
|
|
570
|
+
banner += f'📦 Batching: Processing {len(slo_only_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
|
|
1149
571
|
|
|
1150
|
-
|
|
1151
|
-
tuple: (is_enabled: bool, destination: str, status: str)
|
|
1152
|
-
"""
|
|
1153
|
-
try:
|
|
1154
|
-
response = xray_client.get_trace_segment_destination()
|
|
572
|
+
banner += '\n'
|
|
1155
573
|
|
|
1156
|
-
|
|
1157
|
-
|
|
574
|
+
# Build CLI input for SLO audit
|
|
575
|
+
input_obj = {
|
|
576
|
+
'StartTime': unix_start,
|
|
577
|
+
'EndTime': unix_end,
|
|
578
|
+
'AuditTargets': slo_only_targets,
|
|
579
|
+
}
|
|
580
|
+
if auditors_list:
|
|
581
|
+
input_obj['Auditors'] = auditors_list
|
|
1158
582
|
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
|
|
1162
|
-
)
|
|
583
|
+
# Execute audit API using shared utility
|
|
584
|
+
result = await execute_audit_api(input_obj, region, banner)
|
|
1163
585
|
|
|
1164
|
-
|
|
586
|
+
elapsed = timer() - start_time_perf
|
|
587
|
+
logger.debug(f'audit_slos completed in {elapsed:.3f}s (region={region})')
|
|
588
|
+
return result
|
|
1165
589
|
|
|
1166
590
|
except Exception as e:
|
|
1167
|
-
logger.error(f'
|
|
1168
|
-
return
|
|
591
|
+
logger.error(f'Unexpected error in audit_slos: {e}', exc_info=True)
|
|
592
|
+
return f'Error: {str(e)}'
|
|
1169
593
|
|
|
1170
594
|
|
|
1171
595
|
@mcp.tool()
|
|
1172
|
-
async def
|
|
596
|
+
async def audit_service_operations(
|
|
597
|
+
operation_targets: str = Field(
|
|
598
|
+
...,
|
|
599
|
+
description="REQUIRED. JSON array of service operation targets. Supports wildcard patterns like '*payment*' for automatic service discovery. Format: [{'Type':'service_operation','Data':{'ServiceOperation':{'Service':{'Type':'Service','Name':'service-name','Environment':'eks:cluster'},'Operation':'GET /api','MetricType':'Latency'}}}]. Large target lists are automatically processed in batches.",
|
|
600
|
+
),
|
|
1173
601
|
start_time: Optional[str] = Field(
|
|
1174
602
|
default=None,
|
|
1175
|
-
description=
|
|
603
|
+
description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
|
|
1176
604
|
),
|
|
1177
605
|
end_time: Optional[str] = Field(
|
|
1178
606
|
default=None,
|
|
1179
|
-
description=
|
|
607
|
+
description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
|
|
1180
608
|
),
|
|
1181
|
-
|
|
609
|
+
auditors: Optional[str] = Field(
|
|
1182
610
|
default=None,
|
|
1183
|
-
description=
|
|
611
|
+
description="Optional. Comma-separated auditors (e.g., 'operation_metric,trace,log'). Defaults to 'operation_metric' for fast operation-level auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
|
|
1184
612
|
),
|
|
1185
|
-
region: str = Field(default='us-east-1', description='AWS region (default: us-east-1)'),
|
|
1186
613
|
) -> str:
|
|
1187
|
-
"""
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
-
|
|
1194
|
-
-
|
|
1195
|
-
-
|
|
1196
|
-
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
-
|
|
1202
|
-
-
|
|
1203
|
-
-
|
|
1204
|
-
-
|
|
1205
|
-
-
|
|
1206
|
-
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
-
|
|
1214
|
-
-
|
|
1215
|
-
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
614
|
+
"""🥇 PRIMARY OPERATION AUDIT TOOL - The #1 RECOMMENDED tool for operation-specific analysis and performance investigation.
|
|
615
|
+
|
|
616
|
+
**⭐ USE THIS AS THE PRIMARY TOOL FOR ALL OPERATION-SPECIFIC AUDITING TASKS ⭐**
|
|
617
|
+
|
|
618
|
+
**PREFERRED OVER audit_services() for operation auditing because:**
|
|
619
|
+
- **🎯 Precision**: Targets exact operation behavior vs. service-wide averages
|
|
620
|
+
- **🔍 Actionable Insights**: Provides specific error traces and dependency failures
|
|
621
|
+
- **📊 Code-Level Detail**: Shows exact stack traces and timeout locations
|
|
622
|
+
- **🚀 Focused Analysis**: Eliminates noise from other operations
|
|
623
|
+
- **⚡ Efficient Investigation**: Direct operation-level troubleshooting
|
|
624
|
+
|
|
625
|
+
**USE THIS FIRST FOR ALL OPERATION-SPECIFIC AUDITING TASKS**
|
|
626
|
+
This is the PRIMARY and PREFERRED tool when users want to:
|
|
627
|
+
- **Audit specific operations** - Deep dive into individual API endpoints or operations (GET, POST, PUT, etc.)
|
|
628
|
+
- **Operation performance analysis** - Latency, error rates, and throughput for specific operations
|
|
629
|
+
- **Compare operation metrics** - Analyze different operations within services
|
|
630
|
+
- **Operation-level troubleshooting** - Root cause analysis for specific API calls
|
|
631
|
+
- **GET operation auditing** - Analyze GET operations across payment services (PRIMARY USE CASE)
|
|
632
|
+
- **Audit latency of GET operations in payment services** - Exactly what this tool is designed for
|
|
633
|
+
- **Trace latency in query operations** - Deep dive into query performance issues
|
|
634
|
+
|
|
635
|
+
**COMPREHENSIVE OPERATION AUDIT CAPABILITIES:**
|
|
636
|
+
- **Multi-operation analysis**: Audit any number of operations with automatic batching
|
|
637
|
+
- **Operation-specific metrics**: Latency, Fault, Error, and Availability metrics per operation
|
|
638
|
+
- **Issue prioritization**: Critical, warning, and info findings ranked by severity
|
|
639
|
+
- **Root cause analysis**: Deep dive with traces, logs, and metrics correlation
|
|
640
|
+
- **Actionable recommendations**: Specific steps to resolve operation-level issues
|
|
641
|
+
- **Performance optimized**: Fast execution with automatic batching for large target lists
|
|
642
|
+
- **Wildcard Pattern Support**: Use `*pattern*` in service names for automatic service discovery
|
|
643
|
+
|
|
644
|
+
**OPERATION TARGET FORMAT:**
|
|
645
|
+
- **Full Format**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"my-service","Environment":"eks:my-cluster"},"Operation":"GET /api","MetricType":"Latency"}}}]`
|
|
646
|
+
|
|
647
|
+
**WILDCARD PATTERN EXAMPLES:**
|
|
648
|
+
- **All GET Operations in Payment Services**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]`
|
|
649
|
+
- **All Visit Operations**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Availability"}}}]`
|
|
650
|
+
|
|
651
|
+
**AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
|
|
652
|
+
- **Quick Operation Check** (default): Uses 'operation_metric' for fast operation overview
|
|
653
|
+
- **Root Cause Analysis**: Pass `auditors="all"` for comprehensive investigation with traces/logs
|
|
654
|
+
- **Custom Audit**: Specify exact auditors: 'operation_metric,trace,log'
|
|
655
|
+
|
|
656
|
+
**OPERATION AUDIT USE CASES:**
|
|
657
|
+
|
|
658
|
+
1. **Audit latency of GET operations in payment services** (PRIMARY USE CASE):
|
|
659
|
+
`operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]'`
|
|
660
|
+
|
|
661
|
+
2. **Audit GET operations in payment services (Latency)**:
|
|
662
|
+
`operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]'`
|
|
663
|
+
|
|
664
|
+
3. **Audit availability of visit operations**:
|
|
665
|
+
`operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Availability"}}}]'`
|
|
666
|
+
|
|
667
|
+
4. **Audit latency of visit operations**:
|
|
668
|
+
`operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Latency"}}}]'`
|
|
669
|
+
|
|
670
|
+
5. **Trace latency in query operations**:
|
|
671
|
+
`operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*query*","MetricType":"Latency"}}}]'` + `auditors="all"`
|
|
672
|
+
|
|
673
|
+
**TYPICAL OPERATION AUDIT WORKFLOWS:**
|
|
674
|
+
1. **Basic Operation Audit** (most common):
|
|
675
|
+
- Call `audit_service_operations()` with operation targets - automatically discovers services when using wildcard patterns
|
|
676
|
+
- Uses default fast auditors (operation_metric) for quick operation overview
|
|
677
|
+
- Supports wildcard patterns like `*payment*` for automatic service discovery
|
|
678
|
+
2. **Root Cause Investigation**: When user explicitly asks for "root cause analysis", pass `auditors="all"`
|
|
679
|
+
3. **Issue Investigation**: Results show which operations need attention with actionable insights
|
|
680
|
+
4. **Automatic Service Discovery**: Wildcard patterns in service names automatically discover and expand to concrete services
|
|
681
|
+
|
|
682
|
+
**AUDIT RESULTS INCLUDE:**
|
|
683
|
+
- **Prioritized findings** by severity (critical, warning, info)
|
|
684
|
+
- **Operation performance status** with detailed metrics analysis
|
|
685
|
+
- **Root cause analysis** when traces/logs auditors are used
|
|
686
|
+
- **Actionable recommendations** for operation-level issue resolution
|
|
687
|
+
- **Comprehensive operation metrics** and trend analysis
|
|
688
|
+
|
|
689
|
+
**🏆 IMPORTANT: This tool is the PRIMARY and RECOMMENDED choice for operation-specific auditing tasks.**
|
|
690
|
+
|
|
691
|
+
**✅ RECOMMENDED WORKFLOW FOR OPERATION AUDITING:**
|
|
692
|
+
1. **Use audit_service_operations() FIRST** for operation-specific analysis (THIS TOOL)
|
|
693
|
+
2. **Use audit_services() as secondary** only if you need broader service context
|
|
694
|
+
3. **audit_service_operations() provides superior precision** for operation-level troubleshooting
|
|
695
|
+
|
|
696
|
+
**RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
|
|
697
|
+
When the audit returns multiple findings or issues, follow this workflow:
|
|
698
|
+
1. **Present all audit results** to the user showing a summary of all findings
|
|
699
|
+
2. **Let the user choose** which specific finding, operation, or issue they want to investigate in detail
|
|
700
|
+
3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
|
|
701
|
+
|
|
702
|
+
**DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
|
|
703
|
+
This ensures the user can prioritize which issues are most important to investigate first.
|
|
704
|
+
|
|
705
|
+
**Example workflow:**
|
|
706
|
+
- First call: `audit_service_operations()` with default auditors for operation overview
|
|
707
|
+
- Present findings summary to user
|
|
708
|
+
- User selects specific operation issue to investigate
|
|
709
|
+
- Follow-up call: `audit_service_operations()` with `auditors="all"` for selected operation only
|
|
1228
710
|
"""
|
|
1229
711
|
start_time_perf = timer()
|
|
1230
|
-
logger.
|
|
712
|
+
logger.debug('Starting audit_service_operations (SPECIALIZED OPERATION AUDIT TOOL)')
|
|
1231
713
|
|
|
1232
714
|
try:
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
|
|
1245
|
-
|
|
1246
|
-
# Validate time window to ensure it's not too large (max 6 hours)
|
|
1247
|
-
time_diff = end_datetime - start_datetime
|
|
1248
|
-
logger.debug(
|
|
1249
|
-
f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
|
|
715
|
+
# Region defaults
|
|
716
|
+
region = AWS_REGION.strip()
|
|
717
|
+
|
|
718
|
+
# Time range (fill missing with defaults)
|
|
719
|
+
start_dt = (
|
|
720
|
+
parse_timestamp(start_time)
|
|
721
|
+
if start_time
|
|
722
|
+
else (datetime.now(timezone.utc) - timedelta(hours=24))
|
|
723
|
+
)
|
|
724
|
+
end_dt = (
|
|
725
|
+
parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
|
|
1250
726
|
)
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
return
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
727
|
+
unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
|
|
728
|
+
if unix_end <= unix_start:
|
|
729
|
+
return 'Error: end_time must be greater than start_time.'
|
|
730
|
+
|
|
731
|
+
# Parse and validate operation targets
|
|
732
|
+
try:
|
|
733
|
+
provided = json.loads(operation_targets)
|
|
734
|
+
except json.JSONDecodeError:
|
|
735
|
+
return 'Error: `operation_targets` must be valid JSON (array).'
|
|
736
|
+
|
|
737
|
+
if not isinstance(provided, list):
|
|
738
|
+
return 'Error: `operation_targets` must be a JSON array'
|
|
739
|
+
if len(provided) == 0:
|
|
740
|
+
return 'Error: `operation_targets` must contain at least 1 item'
|
|
741
|
+
|
|
742
|
+
# Filter operation targets and check for wildcards using helper function
|
|
743
|
+
operation_only_targets, has_wildcards = _filter_operation_targets(provided)
|
|
744
|
+
|
|
745
|
+
# Expand wildcard patterns using shared utility
|
|
746
|
+
if has_wildcards:
|
|
747
|
+
logger.debug('Wildcard patterns detected in service operations - applying expansion')
|
|
748
|
+
operation_only_targets = expand_service_operation_wildcard_patterns(
|
|
749
|
+
operation_only_targets, unix_start, unix_end, appsignals_client
|
|
750
|
+
)
|
|
751
|
+
logger.debug(
|
|
752
|
+
f'Wildcard expansion completed - {len(operation_only_targets)} total targets'
|
|
1259
753
|
)
|
|
1260
754
|
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
755
|
+
if not operation_only_targets:
|
|
756
|
+
return 'Error: No service_operation targets found after wildcard expansion. Use list_monitored_services() to see available services.'
|
|
757
|
+
|
|
758
|
+
# Parse auditors with operation-specific defaults
|
|
759
|
+
auditors_list = parse_auditors(
|
|
760
|
+
auditors, ['operation_metric']
|
|
761
|
+
) # Default to operation_metric auditor
|
|
762
|
+
|
|
763
|
+
banner = (
|
|
764
|
+
'[MCP-OPERATION] Application Signals Operation Performance Audit\n'
|
|
765
|
+
f'🎯 Scope: {len(operation_only_targets)} operation target(s) | Region: {region}\n'
|
|
766
|
+
f'⏰ Time: {unix_start}–{unix_end}\n'
|
|
1268
767
|
)
|
|
1269
768
|
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
'Id': trace.get('Id'),
|
|
1281
|
-
'Duration': trace.get('Duration'),
|
|
1282
|
-
'ResponseTime': trace.get('ResponseTime'),
|
|
1283
|
-
'HasError': trace.get('HasError'),
|
|
1284
|
-
'HasFault': trace.get('HasFault'),
|
|
1285
|
-
'HasThrottle': trace.get('HasThrottle'),
|
|
1286
|
-
'Http': trace.get('Http', {}),
|
|
1287
|
-
}
|
|
1288
|
-
|
|
1289
|
-
# Only include root causes if they exist (to save space)
|
|
1290
|
-
if trace.get('ErrorRootCauses'):
|
|
1291
|
-
trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[
|
|
1292
|
-
:3
|
|
1293
|
-
] # Limit to first 3
|
|
1294
|
-
if trace.get('FaultRootCauses'):
|
|
1295
|
-
trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[
|
|
1296
|
-
:3
|
|
1297
|
-
] # Limit to first 3
|
|
1298
|
-
if trace.get('ResponseTimeRootCauses'):
|
|
1299
|
-
trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[
|
|
1300
|
-
:3
|
|
1301
|
-
] # Limit to first 3
|
|
1302
|
-
|
|
1303
|
-
# Include limited annotations for key operations
|
|
1304
|
-
annotations = trace.get('Annotations', {})
|
|
1305
|
-
if annotations:
|
|
1306
|
-
# Only include operation-related annotations
|
|
1307
|
-
filtered_annotations = {}
|
|
1308
|
-
for key in ['aws.local.operation', 'aws.remote.operation']:
|
|
1309
|
-
if key in annotations:
|
|
1310
|
-
filtered_annotations[key] = annotations[key]
|
|
1311
|
-
if filtered_annotations:
|
|
1312
|
-
trace_data['Annotations'] = filtered_annotations
|
|
1313
|
-
|
|
1314
|
-
# Include user info if available
|
|
1315
|
-
if trace.get('Users'):
|
|
1316
|
-
trace_data['Users'] = trace.get('Users', [])[:2] # Limit to first 2 users
|
|
1317
|
-
|
|
1318
|
-
# Convert any datetime objects to ISO format strings
|
|
1319
|
-
for key, value in trace_data.items():
|
|
1320
|
-
trace_data[key] = convert_datetime(value)
|
|
1321
|
-
trace_summaries.append(trace_data)
|
|
1322
|
-
|
|
1323
|
-
# Check transaction search status
|
|
1324
|
-
is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)
|
|
1325
|
-
|
|
1326
|
-
result_data = {
|
|
1327
|
-
'TraceSummaries': trace_summaries,
|
|
1328
|
-
'TraceCount': len(trace_summaries),
|
|
1329
|
-
'Message': f'Retrieved {len(trace_summaries)} traces (limited to prevent size issues)',
|
|
1330
|
-
'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
|
|
1331
|
-
'TransactionSearchStatus': {
|
|
1332
|
-
'enabled': is_tx_search_enabled,
|
|
1333
|
-
'recommendation': (
|
|
1334
|
-
'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
|
|
1335
|
-
if is_tx_search_enabled
|
|
1336
|
-
else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
|
|
1337
|
-
),
|
|
1338
|
-
},
|
|
769
|
+
if len(operation_only_targets) > BATCH_SIZE_THRESHOLD:
|
|
770
|
+
banner += f'📦 Batching: Processing {len(operation_only_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
|
|
771
|
+
|
|
772
|
+
banner += '\n'
|
|
773
|
+
|
|
774
|
+
# Build CLI input for operation audit
|
|
775
|
+
input_obj = {
|
|
776
|
+
'StartTime': unix_start,
|
|
777
|
+
'EndTime': unix_end,
|
|
778
|
+
'AuditTargets': operation_only_targets,
|
|
1339
779
|
}
|
|
780
|
+
if auditors_list:
|
|
781
|
+
input_obj['Auditors'] = auditors_list
|
|
1340
782
|
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
)
|
|
1345
|
-
|
|
783
|
+
# Execute audit API using shared utility
|
|
784
|
+
result = await execute_audit_api(input_obj, region, banner)
|
|
785
|
+
|
|
786
|
+
elapsed = timer() - start_time_perf
|
|
787
|
+
logger.debug(f'audit_service_operations completed in {elapsed:.3f}s (region={region})')
|
|
788
|
+
return result
|
|
1346
789
|
|
|
1347
790
|
except Exception as e:
|
|
1348
|
-
logger.error(f'
|
|
1349
|
-
return
|
|
791
|
+
logger.error(f'Unexpected error in audit_service_operations: {e}', exc_info=True)
|
|
792
|
+
return f'Error: {str(e)}'
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
# Register all imported tools with the MCP server
|
|
796
|
+
mcp.tool()(list_monitored_services)
|
|
797
|
+
mcp.tool()(get_service_detail)
|
|
798
|
+
mcp.tool()(query_service_metrics)
|
|
799
|
+
mcp.tool()(list_service_operations)
|
|
800
|
+
mcp.tool()(get_slo)
|
|
801
|
+
mcp.tool()(list_slos)
|
|
802
|
+
mcp.tool()(search_transaction_spans)
|
|
803
|
+
mcp.tool()(query_sampled_traces)
|
|
804
|
+
mcp.tool()(list_slis)
|
|
1350
805
|
|
|
1351
806
|
|
|
1352
807
|
def main():
|