awslabs.cloudwatch-appsignals-mcp-server 0.1.5__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. awslabs/cloudwatch_appsignals_mcp_server/__init__.py +1 -1
  2. awslabs/cloudwatch_appsignals_mcp_server/audit_presentation_utils.py +231 -0
  3. awslabs/cloudwatch_appsignals_mcp_server/audit_utils.py +699 -0
  4. awslabs/cloudwatch_appsignals_mcp_server/aws_clients.py +88 -0
  5. awslabs/cloudwatch_appsignals_mcp_server/server.py +675 -1220
  6. awslabs/cloudwatch_appsignals_mcp_server/service_audit_utils.py +231 -0
  7. awslabs/cloudwatch_appsignals_mcp_server/service_tools.py +659 -0
  8. awslabs/cloudwatch_appsignals_mcp_server/sli_report_client.py +5 -12
  9. awslabs/cloudwatch_appsignals_mcp_server/slo_tools.py +386 -0
  10. awslabs/cloudwatch_appsignals_mcp_server/trace_tools.py +658 -0
  11. awslabs/cloudwatch_appsignals_mcp_server/utils.py +172 -0
  12. awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info/METADATA +636 -0
  13. awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info/RECORD +18 -0
  14. awslabs_cloudwatch_appsignals_mcp_server-0.1.5.dist-info/METADATA +0 -321
  15. awslabs_cloudwatch_appsignals_mcp_server-0.1.5.dist-info/RECORD +0 -10
  16. {awslabs_cloudwatch_appsignals_mcp_server-0.1.5.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/WHEEL +0 -0
  17. {awslabs_cloudwatch_appsignals_mcp_server-0.1.5.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/entry_points.txt +0 -0
  18. {awslabs_cloudwatch_appsignals_mcp_server-0.1.5.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/licenses/LICENSE +0 -0
  19. {awslabs_cloudwatch_appsignals_mcp_server-0.1.5.dist-info → awslabs_cloudwatch_appsignals_mcp_server-0.1.8.dist-info}/licenses/NOTICE +0 -0
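Version 0.1.8 splits the monolithic `server.py` into tool modules (`service_tools`, `slo_tools`, `trace_tools`, `aws_clients`, `audit_utils`) and replaces several per-service tools with batch audit tools (`audit_services`, `audit_slos`). Before reading the diff, here is a hypothetical caller-side sketch (not part of the package) of the `service_targets` argument the new `audit_services` tool expects, in the full and wildcard forms its docstring documents below:

```python
# Hypothetical caller-side sketch; the service names and environment are placeholders.
import json

service_targets = json.dumps([
    # Full format: explicit name plus environment
    {'Type': 'service',
     'Data': {'Service': {'Type': 'Service',
                          'Name': 'orders-service',
                          'Environment': 'eks:orders-cluster'}}},
    # Wildcard pattern: expanded to matching services at audit time
    {'Type': 'service',
     'Data': {'Service': {'Type': 'Service', 'Name': '*payment*'}}},
])
# audit_services(service_targets=service_targets)  # defaults to auditors 'slo,operation_metric'
```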
@@ -14,23 +14,39 @@

  """CloudWatch Application Signals MCP Server - Core server implementation."""

- import asyncio
- import boto3
  import json
  import os
  import sys
- from . import __version__
- from .sli_report_client import AWSConfig, SLIReportClient
- from botocore.config import Config
- from botocore.exceptions import ClientError
+ import tempfile
+ from .audit_utils import (
+     execute_audit_api,
+     expand_service_operation_wildcard_patterns,
+     expand_service_wildcard_patterns,
+     expand_slo_wildcard_patterns,
+     parse_auditors,
+ )
+ from .aws_clients import AWS_REGION, appsignals_client
+ from .service_audit_utils import normalize_service_targets, validate_and_enrich_service_targets
+ from .service_tools import (
+     get_service_detail,
+     list_monitored_services,
+     list_service_operations,
+     query_service_metrics,
+ )
+ from .slo_tools import get_slo, list_slos
+ from .trace_tools import list_slis, query_sampled_traces, search_transaction_spans
+ from .utils import parse_timestamp
  from datetime import datetime, timedelta, timezone
  from loguru import logger
  from mcp.server.fastmcp import FastMCP
  from pydantic import Field
  from time import perf_counter as timer
- from typing import Dict, Optional
+ from typing import Optional


+ # Constants
+ BATCH_SIZE_THRESHOLD = 5
+
  # Initialize FastMCP server
  mcp = FastMCP('cloudwatch-appsignals')

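The `BATCH_SIZE_THRESHOLD` constant introduced above drives the automatic batching that the new tool docstrings advertise for large target lists. The batching itself lives in `audit_utils.execute_audit_api`, which this diff does not show; a minimal sketch of the assumed behavior:

```python
# Illustrative sketch only; the real batching is implemented in
# awslabs/cloudwatch_appsignals_mcp_server/audit_utils.py (not shown in this diff).
BATCH_SIZE_THRESHOLD = 5  # value added in the hunk above

def chunk_targets(targets: list, size: int = BATCH_SIZE_THRESHOLD):
    """Yield successive batches of audit targets (assumed behavior)."""
    for i in range(0, len(targets), size):
        yield targets[i:i + size]

# 12 targets -> three batches of 5, 5, and 2, each audited in turn
```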
@@ -38,1315 +54,754 @@ mcp = FastMCP('cloudwatch-appsignals')
  log_level = os.environ.get('MCP_CLOUDWATCH_APPSIGNALS_LOG_LEVEL', 'INFO').upper()
  logger.remove()  # Remove default handler
  logger.add(sys.stderr, level=log_level)
- logger.debug(f'CloudWatch AppSignals MCP Server initialized with log level: {log_level}')
-
- # Get AWS region from environment variable or use default
- AWS_REGION = os.environ.get('AWS_REGION', 'us-east-1')
- logger.debug(f'Using AWS region: {AWS_REGION}')
-
-
- # Initialize AWS clients
- def _initialize_aws_clients():
-     """Initialize AWS clients with proper configuration."""
-     config = Config(user_agent_extra=f'awslabs.cloudwatch-appsignals-mcp-server/{__version__}')

-     # Check for AWS_PROFILE environment variable
-     if aws_profile := os.environ.get('AWS_PROFILE'):
-         logger.debug(f'Using AWS profile: {aws_profile}')
-         session = boto3.Session(profile_name=aws_profile, region_name=AWS_REGION)
-         logs = session.client('logs', config=config)
-         appsignals = session.client('application-signals', config=config)
-         cloudwatch = session.client('cloudwatch', config=config)
-         xray = session.client('xray', config=config)
+ # Add file logging to aws_cli.log
+ log_file_path = os.environ.get('AUDITOR_LOG_PATH', tempfile.gettempdir())
+ try:
+     if log_file_path.endswith(os.sep) or os.path.isdir(log_file_path):
+         os.makedirs(log_file_path, exist_ok=True)
+         aws_cli_log_path = os.path.join(log_file_path, 'aws_cli.log')
      else:
-         logs = boto3.client('logs', region_name=AWS_REGION, config=config)
-         appsignals = boto3.client('application-signals', region_name=AWS_REGION, config=config)
-         cloudwatch = boto3.client('cloudwatch', region_name=AWS_REGION, config=config)
-         xray = boto3.client('xray', region_name=AWS_REGION, config=config)
-
-     logger.debug('AWS clients initialized successfully')
-     return logs, appsignals, cloudwatch, xray
+         os.makedirs(os.path.dirname(log_file_path) or '.', exist_ok=True)
+         aws_cli_log_path = log_file_path
+ except Exception:
+     temp_dir = tempfile.gettempdir()
+     os.makedirs(temp_dir, exist_ok=True)
+     aws_cli_log_path = os.path.join(temp_dir, 'aws_cli.log')
+
+ # Add file handler for all logs
+ logger.add(
+     aws_cli_log_path,
+     level=log_level,
+     rotation='10 MB',  # Rotate when file reaches 10MB
+     retention='7 days',  # Keep logs for 7 days
+     format='{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} - {message}',
+     enqueue=True,  # Thread-safe logging
+ )

+ logger.debug(f'CloudWatch AppSignals MCP Server initialized with log level: {log_level}')
+ logger.debug(f'File logging enabled: {aws_cli_log_path}')

- # Initialize clients at module level
- try:
-     logs_client, appsignals_client, cloudwatch_client, xray_client = _initialize_aws_clients()
- except Exception as e:
-     logger.error(f'Failed to initialize AWS clients: {str(e)}')
-     raise
+ logger.debug(f'Using AWS region: {AWS_REGION}')


- def remove_null_values(data: dict) -> dict:
-     """Remove keys with None values from a dictionary.
+ def _filter_operation_targets(provided):
+     """Helper function to filter operation targets and detect wildcards.

      Args:
-         data: Dictionary to clean
+         provided: List of target dictionaries

      Returns:
-         Dictionary with None values removed
+         tuple: (operation_only_targets, has_wildcards)
      """
-     return {k: v for k, v in data.items() if v is not None}
-
-
- @mcp.tool()
- async def list_monitored_services() -> str:
-     """List all services monitored by AWS Application Signals.
-
-     Use this tool to:
-     - Get an overview of all monitored services
-     - See service names, types, and key attributes
-     - Identify which services are being tracked
-     - Count total number of services in your environment
-
-     Returns a formatted list showing:
-     - Service name and type
-     - Key attributes (Environment, Platform, etc.)
-     - Total count of services
-
-     This is typically the first tool to use when starting monitoring or investigation.
-     """
-     start_time_perf = timer()
-     logger.debug('Starting list_application_signals_services request')
-
-     try:
-         # Calculate time range (last 24 hours)
-         end_time = datetime.now(timezone.utc)
-         start_time = end_time - timedelta(hours=24)
-
-         # Get all services
-         logger.debug(f'Querying services for time range: {start_time} to {end_time}')
-         response = appsignals_client.list_services(
-             StartTime=start_time, EndTime=end_time, MaxResults=100
-         )
-         services = response.get('ServiceSummaries', [])
-         logger.debug(f'Retrieved {len(services)} services from Application Signals')
-
-         if not services:
-             logger.warning('No services found in Application Signals')
-             return 'No services found in Application Signals.'
-
-         result = f'Application Signals Services ({len(services)} total):\n\n'
-
-         for service in services:
-             # Extract service name from KeyAttributes
-             key_attrs = service.get('KeyAttributes', {})
-             service_name = key_attrs.get('Name', 'Unknown')
-             service_type = key_attrs.get('Type', 'Unknown')
-
-             result += f'• Service: {service_name}\n'
-             result += f'  Type: {service_type}\n'
-
-             # Add key attributes
-             if key_attrs:
-                 result += '  Key Attributes:\n'
-                 for key, value in key_attrs.items():
-                     result += f'    {key}: {value}\n'
-
-             result += '\n'
-
-         elapsed_time = timer() - start_time_perf
-         logger.debug(f'list_monitored_services completed in {elapsed_time:.3f}s')
-         return result
-
-     except ClientError as e:
-         error_code = e.response.get('Error', {}).get('Code', 'Unknown')
-         error_message = e.response.get('Error', {}).get('Message', 'Unknown error')
-         logger.error(f'AWS ClientError in list_monitored_services: {error_code} - {error_message}')
-         return f'AWS Error: {error_message}'
-     except Exception as e:
-         logger.error(f'Unexpected error in list_monitored_services: {str(e)}', exc_info=True)
-         return f'Error: {str(e)}'
-
-
- @mcp.tool()
- async def get_service_detail(
-     service_name: str = Field(
-         ..., description='Name of the service to get details for (case-sensitive)'
-     ),
- ) -> str:
-     """Get detailed information about a specific Application Signals service.
-
-     Use this tool when you need to:
-     - Understand a service's configuration and setup
-     - Understand where this servive is deployed and where it is running such as EKS, Lambda, etc.
-     - See what metrics are available for a service
-     - Find log groups associated with the service
-     - Get service metadata and attributes
-
-     Returns comprehensive details including:
-     - Key attributes (Type, Environment, Platform)
-     - Available CloudWatch metrics with namespaces
-     - Metric dimensions and types
-     - Associated log groups for debugging
-
-     This tool is essential before querying specific metrics, as it shows
-     which metrics are available for the service.
-     """
-     start_time_perf = timer()
-     logger.debug(f'Starting get_service_healthy_detail request for service: {service_name}')
-
-     try:
-         # Calculate time range (last 24 hours)
-         end_time = datetime.now(timezone.utc)
-         start_time = end_time - timedelta(hours=24)
-
-         # First, get all services to find the one we want
-         services_response = appsignals_client.list_services(
-             StartTime=start_time, EndTime=end_time, MaxResults=100
-         )
-
-         # Find the service with matching name
-         target_service = None
-         for service in services_response.get('ServiceSummaries', []):
-             key_attrs = service.get('KeyAttributes', {})
-             if key_attrs.get('Name') == service_name:
-                 target_service = service
-                 break
-
-         if not target_service:
-             logger.warning(f"Service '{service_name}' not found in Application Signals")
-             return f"Service '{service_name}' not found in Application Signals."
-
-         # Get detailed service information
-         logger.debug(f'Getting detailed information for service: {service_name}')
-         service_response = appsignals_client.get_service(
-             StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
-         )
-
-         service_details = service_response['Service']
-
-         # Build detailed response
-         result = f'Service Details: {service_name}\n\n'
-
-         # Key Attributes
-         key_attrs = service_details.get('KeyAttributes', {})
-         if key_attrs:
-             result += 'Key Attributes:\n'
-             for key, value in key_attrs.items():
-                 result += f'  {key}: {value}\n'
-             result += '\n'
-
-         # Attribute Maps (Platform, Application, Telemetry info)
-         attr_maps = service_details.get('AttributeMaps', [])
-         if attr_maps:
-             result += 'Additional Attributes:\n'
-             for attr_map in attr_maps:
-                 for key, value in attr_map.items():
-                     result += f'  {key}: {value}\n'
-             result += '\n'
-
-         # Metric References
-         metric_refs = service_details.get('MetricReferences', [])
-         if metric_refs:
-             result += f'Metric References ({len(metric_refs)} total):\n'
-             for metric in metric_refs:
-                 result += f'  • {metric.get("Namespace", "")}/{metric.get("MetricName", "")}\n'
-                 result += f'    Type: {metric.get("MetricType", "")}\n'
-                 dimensions = metric.get('Dimensions', [])
-                 if dimensions:
-                     result += '    Dimensions: '
-                     dim_strs = [f'{d["Name"]}={d["Value"]}' for d in dimensions]
-                     result += ', '.join(dim_strs) + '\n'
-                 result += '\n'
-
-         # Log Group References
-         log_refs = service_details.get('LogGroupReferences', [])
-         if log_refs:
-             result += f'Log Group References ({len(log_refs)} total):\n'
-             for log_ref in log_refs:
-                 log_group = log_ref.get('Identifier', 'Unknown')
-                 result += f'  • {log_group}\n'
-             result += '\n'
-
-         elapsed_time = timer() - start_time_perf
-         logger.debug(f"get_service_detail completed for '{service_name}' in {elapsed_time:.3f}s")
-         return result
+     operation_only_targets = []
+     has_wildcards = False
+
+     for target in provided:
+         if isinstance(target, dict):
+             ttype = target.get('Type', '').lower()
+             if ttype == 'service_operation':
+                 # Check for wildcard patterns in service names OR operation names
+                 service_op_data = target.get('Data', {}).get('ServiceOperation', {})
+                 service_data = service_op_data.get('Service', {})
+                 service_name = service_data.get('Name', '')
+                 operation = service_op_data.get('Operation', '')
+
+                 if '*' in service_name or '*' in operation:
+                     has_wildcards = True
+
+                 operation_only_targets.append(target)
+             else:
+                 logger.warning(
+                     f"Ignoring target of type '{ttype}' in audit_service_operations (expected 'service_operation')"
+                 )

-     except ClientError as e:
-         error_code = e.response.get('Error', {}).get('Code', 'Unknown')
-         error_message = e.response.get('Error', {}).get('Message', 'Unknown error')
-         logger.error(
-             f"AWS ClientError in get_service_healthy_detail for '{service_name}': {error_code} - {error_message}"
-         )
-         return f'AWS Error: {error_message}'
-     except Exception as e:
-         logger.error(
-             f"Unexpected error in get_service_healthy_detail for '{service_name}': {str(e)}",
-             exc_info=True,
-         )
-         return f'Error: {str(e)}'
+     return operation_only_targets, has_wildcards


  @mcp.tool()
- async def query_service_metrics(
-     service_name: str = Field(
-         ..., description='Name of the service to get metrics for (case-sensitive)'
-     ),
-     metric_name: str = Field(
+ async def audit_services(
+     service_targets: str = Field(
          ...,
-         description='Specific metric name (e.g., Latency, Error, Fault). Leave empty to list available metrics',
+         description="REQUIRED. JSON array of service targets. Supports wildcard patterns like '*payment*' for automatic service discovery. Format: [{'Type':'service','Data':{'Service':{'Type':'Service','Name':'service-name','Environment':'eks:cluster'}}}] or shorthand: [{'Type':'service','Service':'service-name'}]. Large target lists are automatically processed in batches.",
      ),
-     statistic: str = Field(
-         default='Average',
-         description='Standard statistic type (Average, Sum, Maximum, Minimum, SampleCount)',
+     start_time: Optional[str] = Field(
+         default=None,
+         description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
      ),
-     extended_statistic: str = Field(
-         default='p99', description='Extended statistic (p99, p95, p90, p50, etc)'
+     end_time: Optional[str] = Field(
+         default=None,
+         description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
      ),
-     hours: int = Field(
-         default=1, description='Number of hours to look back (default 1, max 168 for 1 week)'
+     auditors: Optional[str] = Field(
+         default=None,
+         description="Optional. Comma-separated auditors (e.g., 'slo,operation_metric,dependency_metric'). Defaults to 'slo,operation_metric' for fast service health auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
      ),
  ) -> str:
-     """Get CloudWatch metrics for a specific Application Signals service.
+     """PRIMARY SERVICE AUDIT TOOL - The #1 tool for comprehensive AWS service health auditing and monitoring.

-     Use this tool to:
-     - Analyze service performance (latency, throughput)
-     - Check error rates and reliability
-     - View trends over time
-     - Get both standard statistics (Average, Max) and percentiles (p99, p95)
+     **IMPORTANT: For operation-specific auditing, use audit_service_operations() as the PRIMARY tool instead.**
+
+     **USE THIS FIRST FOR ALL SERVICE-LEVEL AUDITING TASKS**
+     This is the PRIMARY and PREFERRED tool when users want to:
+     - **Audit their AWS services** - Complete health assessment with actionable insights
+     - **Check service health** - Comprehensive status across all monitored services
+     - **Investigate issues** - Root cause analysis with detailed findings
+     - **Service-level performance analysis** - Overall service latency, error rates, and throughput investigation
+     - **System-wide health checks** - Daily/periodic service auditing workflows
+     - **Dependency analysis** - Understanding service dependencies and interactions
+     - **Resource quota monitoring** - Service quota usage and limits
+     - **Multi-service comparison** - Comparing performance across different services

-     Common metric names:
-     - 'Latency': Response time in milliseconds
-     - 'Error': Percentage of failed requests
-     - 'Fault': Percentage of server errors (5xx)
+     **FOR OPERATION-SPECIFIC AUDITING: Use audit_service_operations() instead**
+     When users want to audit specific operations (GET, POST, PUT endpoints), use audit_service_operations() as the PRIMARY tool:
+     - **Operation performance analysis** - Latency, error rates for specific API endpoints
+     - **Operation-level troubleshooting** - Root cause analysis for specific API calls
+     - **GET operation auditing** - Analyze GET operations across payment services
+     - **Audit latency of specific operations** - Deep dive into individual endpoint performance
+
+     **COMPREHENSIVE SERVICE AUDIT CAPABILITIES:**
+     - **Multi-service analysis**: Audit any number of services with automatic batching
+     - **SLO compliance monitoring**: Automatic breach detection for service-level SLOs
+     - **Issue prioritization**: Critical, warning, and info findings ranked by severity
+     - **Root cause analysis**: Deep dive with traces, logs, and metrics correlation
+     - **Actionable recommendations**: Specific steps to resolve identified issues
+     - **Performance optimized**: Fast execution with automatic batching for large target lists
+     - **Wildcard Pattern Support**: Use `*pattern*` in service names for automatic service discovery
+
+     **SERVICE TARGET FORMAT:**
+     - **Full Format**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"my-service","Environment":"eks:my-cluster"}}}]`
+     - **Shorthand**: `[{"Type":"service","Service":"my-service"}]` (environment auto-discovered)
+
+     **WILDCARD PATTERN EXAMPLES:**
+     - **All Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]`
+     - **Payment Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]`
+     - **Lambda Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*lambda*"}}}]`
+     - **EKS Services**: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]`
+
+     **AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
+     - **Quick Health Check** (default): Uses 'slo,operation_metric' for fast overview
+     - **Root Cause Analysis**: Pass `auditors="all"` for comprehensive investigation with traces/logs
+     - **Custom Audit**: Specify exact auditors: 'slo,trace,log,dependency_metric,top_contributor,service_quota'
+
+     **SERVICE AUDIT USE CASES:**
+
+     1. **Audit all services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'`
+
+     2. **Audit specific service**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"orders-service","Environment":"eks:orders-cluster"}}}]'`
+
+     3. **Audit payment services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'`
+
+     8. **Audit lambda services**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*lambda*"}}}]'` or by environment: `[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"lambda"}}}]`
+
+     9. **Audit service last night**:
+        `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"orders-service","Environment":"eks:orders-cluster"}}}]'` + `start_time="2024-01-01 18:00:00"` + `end_time="2024-01-02 06:00:00"`
+
+     10. **Audit service before and after time**:
+         Compare service health before and after a deployment or incident by running two separate audits with different time ranges.
+
+     11. **Trace availability issues in production services**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="all"`
+
+     13. **Look for errors in logs of payment services**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'` + `auditors="log,trace"`
+
+     14. **Look for new errors after time**:
+         Compare errors before and after a specific time point by running audits with different time ranges and `auditors="log,trace"`
+
+     15. **Look for errors after deployment**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*payment*"}}}]'` + `auditors="log,trace"` + recent time range
+
+     16. **Look for lemon hosts in production**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="top_contributor,operation_metric"`
+
+     17. **Look for outliers in EKS services**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*","Environment":"eks:*"}}}]'` + `auditors="top_contributor,operation_metric"`
+
+     18. **Status report**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` (basic health check)
+
+     19. **Audit dependencies**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` + `auditors="dependency_metric,trace"`
+
+     20. **Audit dependency on S3**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*"}}}]'` + `auditors="dependency_metric"` + look for S3 dependencies
+
+     21. **Audit quota usage of tier 1 services**:
+         `service_targets='[{"Type":"service","Data":{"Service":{"Type":"Service","Name":"*tier1*"}}}]'` + `auditors="service_quota,operation_metric"`
+
+     **TYPICAL SERVICE AUDIT WORKFLOWS:**
+     1. **Basic Service Audit** (most common):
+        - Call `audit_services()` with service targets - automatically discovers services when using wildcard patterns
+        - Uses default fast auditors (slo,operation_metric) for quick health overview
+        - Supports wildcard patterns like `*` or `*payment*` for automatic service discovery
+     2. **Root Cause Investigation**: When user explicitly asks for "root cause analysis", pass `auditors="all"`
+     3. **Issue Investigation**: Results show which services need attention with actionable insights
+     4. **Automatic Service Discovery**: Wildcard patterns in service names automatically discover and expand to concrete services
+
+     **AUDIT RESULTS INCLUDE:**
+     - **Prioritized findings** by severity (critical, warning, info)
+     - **Service health status** with detailed performance analysis
+     - **Root cause analysis** when traces/logs auditors are used
+     - **Actionable recommendations** for issue resolution
+     - **Comprehensive metrics** and trend analysis
+
+     **IMPORTANT: This tool provides comprehensive service audit coverage and should be your first choice for any service auditing task.**

-     Returns:
-     - Summary statistics (latest, average, min, max)
-     - Recent data points with timestamps
-     - Both standard and percentile values when available
-
-     The tool automatically adjusts the granularity based on time range:
-     - Up to 3 hours: 1-minute resolution
-     - Up to 24 hours: 5-minute resolution
-     - Over 24 hours: 1-hour resolution
+     **RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
+     When the audit returns multiple findings or issues, follow this workflow:
+     1. **Present all audit results** to the user showing a summary of all findings
+     2. **Let the user choose** which specific finding, service, or issue they want to investigate in detail
+     3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
+
+     **DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
+     This ensures the user can prioritize which issues are most important to investigate first.
+
+     **Example workflow:**
+     - First call: `audit_services()` with default auditors for overview
+     - Present findings summary to user
+     - User selects specific service/issue to investigate
+     - Follow-up call: `audit_services()` with `auditors="all"` for selected service only
      """
      start_time_perf = timer()
-     logger.info(
-         f'Starting query_service_metrics request - service: {service_name}, metric: {metric_name}, hours: {hours}'
-     )
+     logger.debug('Starting audit_services (PRIMARY SERVICE AUDIT TOOL)')

      try:
-         # Calculate time range
-         end_time = datetime.now(timezone.utc)
-         start_time = end_time - timedelta(hours=hours)
-
-         # Get service details to find metrics
-         services_response = appsignals_client.list_services(
-             StartTime=start_time, EndTime=end_time, MaxResults=100
+         # Region defaults
+         region = AWS_REGION.strip()
+
+         # Time range (fill missing with defaults)
+         start_dt = (
+             parse_timestamp(start_time)
+             if start_time
+             else (datetime.now(timezone.utc) - timedelta(hours=24))
          )
-
-         # Find the target service
-         target_service = None
-         for service in services_response.get('ServiceSummaries', []):
-             key_attrs = service.get('KeyAttributes', {})
-             if key_attrs.get('Name') == service_name:
-                 target_service = service
-                 break
-
-         if not target_service:
-             logger.warning(f"Service '{service_name}' not found in Application Signals")
-             return f"Service '{service_name}' not found in Application Signals."
-
-         # Get detailed service info for metric references
-         service_response = appsignals_client.get_service(
-             StartTime=start_time, EndTime=end_time, KeyAttributes=target_service['KeyAttributes']
-         )
-
-         metric_refs = service_response['Service'].get('MetricReferences', [])
-
-         if not metric_refs:
-             logger.warning(f"No metrics found for service '{service_name}'")
-             return f"No metrics found for service '{service_name}'."
-
-         # If no specific metric requested, show available metrics
-         if not metric_name:
-             result = f"Available metrics for service '{service_name}':\n\n"
-             for metric in metric_refs:
-                 result += f'• {metric.get("MetricName", "Unknown")}\n'
-                 result += f'  Namespace: {metric.get("Namespace", "Unknown")}\n'
-                 result += f'  Type: {metric.get("MetricType", "Unknown")}\n'
-                 result += '\n'
-             return result
-
-         # Find the specific metric
-         target_metric = None
-         for metric in metric_refs:
-             if metric.get('MetricName') == metric_name:
-                 target_metric = metric
-                 break
-
-         if not target_metric:
-             available = [m.get('MetricName', 'Unknown') for m in metric_refs]
-             return f"Metric '{metric_name}' not found for service '{service_name}'. Available: {', '.join(available)}"
-
-         # Calculate appropriate period based on time range
-         if hours <= 3:
-             period = 60  # 1 minute
-         elif hours <= 24:
-             period = 300  # 5 minutes
-         else:
-             period = 3600  # 1 hour
-
-         # Get both standard and extended statistics in a single call
-         response = cloudwatch_client.get_metric_statistics(
-             Namespace=target_metric['Namespace'],
-             MetricName=target_metric['MetricName'],
-             Dimensions=target_metric.get('Dimensions', []),
-             StartTime=start_time,
-             EndTime=end_time,
-             Period=period,
-             Statistics=[statistic],  # type: ignore
-             ExtendedStatistics=[extended_statistic],
+         end_dt = (
+             parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
          )
-
-         datapoints = response.get('Datapoints', [])
-
-         if not datapoints:
-             logger.warning(
-                 f"No data points found for metric '{metric_name}' on service '{service_name}' in the last {hours} hour(s)"
+         unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
+         if unix_end <= unix_start:
+             return 'Error: end_time must be greater than start_time.'
+
+         # Parse and validate service targets
+         try:
+             provided = json.loads(service_targets)
+         except json.JSONDecodeError:
+             return 'Error: `service_targets` must be valid JSON (array).'
+
+         # Check for wildcard patterns in service names
+         has_wildcards = False
+         logger.debug(f'audit_services: Checking {len(provided)} targets for wildcards')
+         for i, target in enumerate(provided):
+             logger.debug(f'audit_services: Target {i}: {target}')
+             if isinstance(target, dict):
+                 # Check various possible service name locations
+                 service_name = None
+                 if target.get('Type', '').lower() == 'service':
+                     # Check Data.Service.Name
+                     service_data = target.get('Data', {})
+                     if isinstance(service_data, dict):
+                         service_info = service_data.get('Service', {})
+                         if isinstance(service_info, dict):
+                             service_name = service_info.get('Name', '')
+
+                 # Check shorthand Service field
+                 if not service_name:
+                     service_name = target.get('Service', '')
+
+                 logger.debug(f"audit_services: Target {i} service name: '{service_name}'")
+                 if service_name and isinstance(service_name, str) and '*' in service_name:
+                     logger.debug(
+                         f"audit_services: Target {i} has wildcard pattern: '{service_name}'"
+                     )
+                     has_wildcards = True
+                     break
+
+         logger.debug(f'audit_services: has_wildcards = {has_wildcards}')
+
+         # Expand wildcard patterns using shared utility
+         if has_wildcards:
+             logger.debug('Wildcard patterns detected - applying service expansion')
+             provided = expand_service_wildcard_patterns(
+                 provided, unix_start, unix_end, appsignals_client
              )
-             return f"No data points found for metric '{metric_name}' on service '{service_name}' in the last {hours} hour(s)."
-
-         # Sort by timestamp
-         datapoints.sort(key=lambda x: x.get('Timestamp', datetime.min))  # type: ignore
-
-         # Build response
-         result = f'Metrics for {service_name} - {metric_name}\n'
-         result += f'Time Range: Last {hours} hour(s)\n'
-         result += f'Period: {period} seconds\n\n'
-
-         # Calculate summary statistics for both standard and extended statistics
-         standard_values = [dp.get(statistic) for dp in datapoints if dp.get(statistic) is not None]
-         extended_values = [
-             dp.get(extended_statistic)
-             for dp in datapoints
-             if dp.get(extended_statistic) is not None
-         ]
-
-         result += 'Summary:\n'
-
-         if standard_values:
-             latest_standard = datapoints[-1].get(statistic)
-             avg_of_standard = sum(standard_values) / len(standard_values)  # type: ignore
-             max_standard = max(standard_values)  # type: ignore
-             min_standard = min(standard_values)  # type: ignore
-
-             result += f'{statistic} Statistics:\n'
-             result += f'• Latest: {latest_standard:.2f}\n'
-             result += f'• Average: {avg_of_standard:.2f}\n'
-             result += f'• Maximum: {max_standard:.2f}\n'
-             result += f'• Minimum: {min_standard:.2f}\n\n'
-
-         if extended_values:
-             latest_extended = datapoints[-1].get(extended_statistic)
-             avg_extended = sum(extended_values) / len(extended_values)  # type: ignore
-             max_extended = max(extended_values)  # type: ignore
-             min_extended = min(extended_values)  # type: ignore
-
-             result += f'{extended_statistic} Statistics:\n'
-             result += f'• Latest: {latest_extended:.2f}\n'
-             result += f'• Average: {avg_extended:.2f}\n'
-             result += f'• Maximum: {max_extended:.2f}\n'
-             result += f'• Minimum: {min_extended:.2f}\n\n'
-
-         result += f'• Data Points: {len(datapoints)}\n\n'
-
-         # Show recent values (last 10) with both metrics
-         result += 'Recent Values:\n'
-         for dp in datapoints[-10:]:
-             timestamp = dp.get('Timestamp', datetime.min).strftime('%m/%d %H:%M')  # type: ignore
-             unit = dp.get('Unit', '')
-
-             values_str = []
-             if dp.get(statistic) is not None:
-                 values_str.append(f'{statistic}: {dp[statistic]:.2f}')
-             if dp.get(extended_statistic) is not None:
-                 values_str.append(f'{extended_statistic}: {dp[extended_statistic]:.2f}')
-
-             result += f'• {timestamp}: {", ".join(values_str)} {unit}\n'
-
-         elapsed_time = timer() - start_time_perf
-         logger.info(
-             f"query_service_metrics completed for '{service_name}/{metric_name}' in {elapsed_time:.3f}s"
-         )
-         return result
-
-     except ClientError as e:
-         error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
-         error_code = e.response.get('Error', {}).get('Code', 'Unknown')
-         logger.error(
-             f"AWS ClientError in query_service_metrics for '{service_name}/{metric_name}': {error_code} - {error_msg}"
-         )
-         return f'AWS Error: {error_msg}'
-     except Exception as e:
-         logger.error(
-             f"Unexpected error in query_service_metrics for '{service_name}/{metric_name}': {str(e)}",
-             exc_info=True,
-         )
-         return f'Error: {str(e)}'
-
+             logger.debug(f'Wildcard expansion completed - {len(provided)} total targets')

- def get_trace_summaries_paginated(
-     xray_client, start_time, end_time, filter_expression, max_traces: int = 100
- ) -> list:
-     """Get trace summaries with pagination to avoid exceeding response size limits.
+         # Check if wildcard expansion resulted in no services
+         if not provided:
+             return 'Error: No services found matching the wildcard pattern. Use list_monitored_services() to see available services.'

-     Args:
-         xray_client: Boto3 X-Ray client
-         start_time: Start time for trace query
-         end_time: End time for trace query
-         filter_expression: X-Ray filter expression
-         max_traces: Maximum number of traces to retrieve (default 100)
+         # Normalize and validate service targets using shared utility
+         normalized_targets = normalize_service_targets(provided)

-     Returns:
-         List of trace summaries
-     """
-     all_traces = []
-     next_token = None
-     logger.debug(
-         f'Starting paginated trace retrieval - filter: {filter_expression}, max_traces: {max_traces}'
-     )
-
-     try:
-         while len(all_traces) < max_traces:
-             # Build request parameters
-             kwargs = {
-                 'StartTime': start_time,
-                 'EndTime': end_time,
-                 'FilterExpression': filter_expression,
-                 'Sampling': True,
-                 'TimeRangeType': 'Service',
-             }
-
-             if next_token:
-                 kwargs['NextToken'] = next_token
-
-             # Make request
-             response = xray_client.get_trace_summaries(**kwargs)
-
-             # Add traces from this page
-             traces = response.get('TraceSummaries', [])
-             all_traces.extend(traces)
-             logger.debug(
-                 f'Retrieved {len(traces)} traces in this page, total so far: {len(all_traces)}'
-             )
+         # Validate and enrich targets using shared utility
+         normalized_targets = validate_and_enrich_service_targets(
+             normalized_targets, appsignals_client, unix_start, unix_end
+         )

-             # Check if we have more pages
-             next_token = response.get('NextToken')
-             if not next_token:
-                 break
+         # Parse auditors with service-specific defaults
+         auditors_list = parse_auditors(auditors, ['slo', 'operation_metric'])

-             # If we've collected enough traces, stop
-             if len(all_traces) >= max_traces:
-                 all_traces = all_traces[:max_traces]
-                 break
+         # Create banner
+         banner = (
+             '[MCP-SERVICE] Application Signals Service Audit\n'
+             f'🎯 Scope: {len(normalized_targets)} service target(s) | Region: {region}\n'
+             f'⏰ Time: {unix_start}–{unix_end}\n'
+         )

-         logger.info(f'Successfully retrieved {len(all_traces)} traces')
-         return all_traces
+         if len(normalized_targets) > BATCH_SIZE_THRESHOLD:
+             banner += f'📦 Batching: Processing {len(normalized_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'

-     except Exception as e:
-         # Return what we have so far if there's an error
-         logger.error(f'Error during paginated trace retrieval: {str(e)}', exc_info=True)
-         logger.info(f'Returning {len(all_traces)} traces retrieved before error')
-         return all_traces
+         banner += '\n'

+         # Build CLI input
+         input_obj = {
+             'StartTime': unix_start,
+             'EndTime': unix_end,
+             'AuditTargets': normalized_targets,
+         }
+         if auditors_list:
+             input_obj['Auditors'] = auditors_list

- @mcp.tool()
- async def get_slo(
-     slo_id: str = Field(..., description='The ARN or name of the SLO to retrieve'),
- ) -> str:
-     """Get detailed information about a specific Service Level Objective (SLO).
-
-     Use this tool to:
-     - Get comprehensive SLO configuration details
-     - Understand what metrics the SLO monitors
-     - See threshold values and comparison operators
-     - Extract operation names and key attributes for trace queries
-     - Identify dependency configurations
-     - Review attainment goals and burn rate settings
-
-     Returns detailed information including:
-     - SLO name, description, and metadata
-     - Metric configuration (for period-based or request-based SLOs)
-     - Key attributes and operation names
-     - Metric type (LATENCY or AVAILABILITY)
-     - Threshold values and comparison operators
-     - Goal configuration (attainment percentage, time interval)
-     - Burn rate configurations
-
-     This tool is essential for:
-     - Understanding why an SLO was breached
-     - Getting the exact operation name to query traces
-     - Identifying the metrics and thresholds being monitored
-     - Planning remediation based on SLO configuration
-     """
-     start_time_perf = timer()
-     logger.info(f'Starting get_service_level_objective request for SLO: {slo_id}')
+         # Execute audit API using shared utility
+         result = await execute_audit_api(input_obj, region, banner)

-     try:
-         response = appsignals_client.get_service_level_objective(Id=slo_id)
-         slo = response.get('Slo', {})
-
-         if not slo:
-             logger.warning(f'No SLO found with ID: {slo_id}')
-             return f'No SLO found with ID: {slo_id}'
-
-         result = 'Service Level Objective Details\n'
-         result += '=' * 50 + '\n\n'
-
-         # Basic info
-         result += f'Name: {slo.get("Name", "Unknown")}\n'
-         result += f'ARN: {slo.get("Arn", "Unknown")}\n'
-         if slo.get('Description'):
-             result += f'Description: {slo.get("Description", "")}\n'
-         result += f'Evaluation Type: {slo.get("EvaluationType", "Unknown")}\n'
-         result += f'Created: {slo.get("CreatedTime", "Unknown")}\n'
-         result += f'Last Updated: {slo.get("LastUpdatedTime", "Unknown")}\n\n'
-
-         # Goal configuration
-         goal = slo.get('Goal', {})
-         if goal:
-             result += 'Goal Configuration:\n'
-             result += f'• Attainment Goal: {goal.get("AttainmentGoal", 99)}%\n'
-             result += f'• Warning Threshold: {goal.get("WarningThreshold", 50)}%\n'
-
-             interval = goal.get('Interval', {})
-             if 'RollingInterval' in interval:
-                 rolling = interval['RollingInterval']
-                 result += f'• Interval: Rolling {rolling.get("Duration")} {rolling.get("DurationUnit")}\n'
-             elif 'CalendarInterval' in interval:
-                 calendar = interval['CalendarInterval']
-                 result += f'• Interval: Calendar {calendar.get("Duration")} {calendar.get("DurationUnit")} starting {calendar.get("StartTime")}\n'
-             result += '\n'
-
-         # Period-based SLI
-         if 'Sli' in slo:
-             sli = slo['Sli']
-             result += 'Period-Based SLI Configuration:\n'
-
-             sli_metric = sli.get('SliMetric', {})
-             if sli_metric:
-                 # Key attributes - crucial for trace queries
-                 key_attrs = sli_metric.get('KeyAttributes', {})
-                 if key_attrs:
-                     result += '• Key Attributes:\n'
-                     for k, v in key_attrs.items():
-                         result += f'  - {k}: {v}\n'
-
-                 # Operation name - essential for trace filtering
-                 if sli_metric.get('OperationName'):
-                     result += f'• Operation Name: {sli_metric.get("OperationName", "")}\n'
-                     result += f'  (Use this in trace queries: annotation[aws.local.operation]="{sli_metric.get("OperationName", "")}")\n'
-
-                 result += f'• Metric Type: {sli_metric.get("MetricType", "Unknown")}\n'
-
-                 # MetricDataQueries - detailed metric configuration
-                 metric_queries = sli_metric.get('MetricDataQueries', [])
-                 if metric_queries:
-                     result += '• Metric Data Queries:\n'
-                     for query in metric_queries:
-                         query_id = query.get('Id', 'Unknown')
-                         result += f'  Query ID: {query_id}\n'
-
-                         # MetricStat details
-                         metric_stat = query.get('MetricStat', {})
-                         if metric_stat:
-                             metric = metric_stat.get('Metric', {})
-                             if metric:
-                                 result += f'    Namespace: {metric.get("Namespace", "Unknown")}\n'
-                                 result += (
-                                     f'    MetricName: {metric.get("MetricName", "Unknown")}\n'
-                                 )
-
-                                 # Dimensions - crucial for understanding what's being measured
-                                 dimensions = metric.get('Dimensions', [])
-                                 if dimensions:
-                                     result += '    Dimensions:\n'
-                                     for dim in dimensions:
-                                         result += f'      - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
-
-                             result += (
-                                 f'    Period: {metric_stat.get("Period", "Unknown")} seconds\n'
-                             )
-                             result += f'    Stat: {metric_stat.get("Stat", "Unknown")}\n'
-                             if metric_stat.get('Unit'):
-                                 result += f'    Unit: {metric_stat["Unit"]}\n'  # type: ignore
-
-                         # Expression if present
-                         if query.get('Expression'):
-                             result += f'    Expression: {query.get("Expression", "")}\n'
-
-                         result += f'    ReturnData: {query.get("ReturnData", True)}\n'
-
-                 # Dependency config
-                 dep_config = sli_metric.get('DependencyConfig', {})
-                 if dep_config:
-                     result += '• Dependency Configuration:\n'
-                     dep_attrs = dep_config.get('DependencyKeyAttributes', {})
-                     if dep_attrs:
-                         result += '  Key Attributes:\n'
-                         for k, v in dep_attrs.items():
-                             result += f'    - {k}: {v}\n'
-                     if dep_config.get('DependencyOperationName'):
-                         result += (
-                             f'  - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
-                         )
-                         result += f'    (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
-
-             result += f'• Threshold: {sli.get("MetricThreshold", "Unknown")}\n'
-             result += f'• Comparison: {sli.get("ComparisonOperator", "Unknown")}\n\n'
-
-         # Request-based SLI
-         if 'RequestBasedSli' in slo:
-             rbs = slo['RequestBasedSli']
-             result += 'Request-Based SLI Configuration:\n'
-
-             rbs_metric = rbs.get('RequestBasedSliMetric', {})
-             if rbs_metric:
-                 # Key attributes
-                 key_attrs = rbs_metric.get('KeyAttributes', {})
-                 if key_attrs:
-                     result += '• Key Attributes:\n'
-                     for k, v in key_attrs.items():
-                         result += f'  - {k}: {v}\n'
-
-                 # Operation name
-                 if rbs_metric.get('OperationName'):
-                     result += f'• Operation Name: {rbs_metric.get("OperationName", "")}\n'
-                     result += f'  (Use this in trace queries: annotation[aws.local.operation]="{rbs_metric.get("OperationName", "")}")\n'
-
-                 result += f'• Metric Type: {rbs_metric.get("MetricType", "Unknown")}\n'
-
-                 # MetricDataQueries - detailed metric configuration
-                 metric_queries = rbs_metric.get('MetricDataQueries', [])
-                 if metric_queries:
-                     result += '• Metric Data Queries:\n'
-                     for query in metric_queries:
-                         query_id = query.get('Id', 'Unknown')
-                         result += f'  Query ID: {query_id}\n'
-
-                         # MetricStat details
-                         metric_stat = query.get('MetricStat', {})
-                         if metric_stat:
-                             metric = metric_stat.get('Metric', {})
-                             if metric:
-                                 result += f'    Namespace: {metric.get("Namespace", "Unknown")}\n'
-                                 result += (
-                                     f'    MetricName: {metric.get("MetricName", "Unknown")}\n'
-                                 )
-
-                                 # Dimensions - crucial for understanding what's being measured
-                                 dimensions = metric.get('Dimensions', [])
-                                 if dimensions:
-                                     result += '    Dimensions:\n'
-                                     for dim in dimensions:
-                                         result += f'      - {dim.get("Name", "Unknown")}: {dim.get("Value", "Unknown")}\n'
-
-                             result += (
-                                 f'    Period: {metric_stat.get("Period", "Unknown")} seconds\n'
-                             )
-                             result += f'    Stat: {metric_stat.get("Stat", "Unknown")}\n'
-                             if metric_stat.get('Unit'):
-                                 result += f'    Unit: {metric_stat["Unit"]}\n'  # type: ignore
-
-                         # Expression if present
-                         if query.get('Expression'):
-                             result += f'    Expression: {query.get("Expression", "")}\n'
-
-                         result += f'    ReturnData: {query.get("ReturnData", True)}\n'
-
-                 # Dependency config
-                 dep_config = rbs_metric.get('DependencyConfig', {})
-                 if dep_config:
-                     result += '• Dependency Configuration:\n'
-                     dep_attrs = dep_config.get('DependencyKeyAttributes', {})
-                     if dep_attrs:
-                         result += '  Key Attributes:\n'
-                         for k, v in dep_attrs.items():
-                             result += f'    - {k}: {v}\n'
-                     if dep_config.get('DependencyOperationName'):
-                         result += (
-                             f'  - Dependency Operation: {dep_config["DependencyOperationName"]}\n'
-                         )
-                         result += f'    (Use in traces: annotation[aws.remote.operation]="{dep_config["DependencyOperationName"]}")\n'
-
-             result += f'• Threshold: {rbs.get("MetricThreshold", "Unknown")}\n'
-             result += f'• Comparison: {rbs.get("ComparisonOperator", "Unknown")}\n\n'
-
-         # Burn rate configurations
-         burn_rates = slo.get('BurnRateConfigurations', [])
-         if burn_rates:
-             result += 'Burn Rate Configurations:\n'
-             for br in burn_rates:
-                 result += f'• Look-back window: {br.get("LookBackWindowMinutes")} minutes\n'
-
-         elapsed_time = timer() - start_time_perf
-         logger.info(f"get_service_level_objective completed for '{slo_id}' in {elapsed_time:.3f}s")
+         elapsed = timer() - start_time_perf
+         logger.debug(f'audit_services completed in {elapsed:.3f}s (region={region})')
          return result

-     except ClientError as e:
-         error_msg = e.response.get('Error', {}).get('Message', 'Unknown error')
-         error_code = e.response.get('Error', {}).get('Code', 'Unknown')
-         logger.error(
-             f"AWS ClientError in get_service_level_objective for '{slo_id}': {error_code} - {error_msg}"
-         )
-         return f'AWS Error: {error_msg}'
      except Exception as e:
-         logger.error(
-             f"Unexpected error in get_service_level_objective for '{slo_id}': {str(e)}",
-             exc_info=True,
-         )
+         logger.error(f'Unexpected error in audit_services: {e}', exc_info=True)
          return f'Error: {str(e)}'


  @mcp.tool()
- async def search_transaction_spans(
-     log_group_name: str = Field(
-         default='',
-         description='CloudWatch log group name (defaults to "aws/spans" if not provided)',
-     ),
-     start_time: str = Field(
-         default='', description='Start time in ISO 8601 format (e.g., "2025-04-19T20:00:00+00:00")'
+ async def audit_slos(
+     slo_targets: str = Field(
+         ...,
+         description="REQUIRED. JSON array of SLO targets. Supports wildcard patterns like '*payment*' for automatic SLO discovery. Format: [{'Type':'slo','Data':{'Slo':{'SloName':'slo-name'}}}] or [{'Type':'slo','Data':{'Slo':{'SloArn':'arn:aws:...'}}}]. Large target lists are automatically processed in batches.",
      ),
-     end_time: str = Field(
-         default='', description='End time in ISO 8601 format (e.g., "2025-04-19T21:00:00+00:00")'
+     start_time: Optional[str] = Field(
+         default=None,
+         description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
      ),
-     query_string: str = Field(default='', description='CloudWatch Logs Insights query string'),
-     limit: Optional[int] = Field(default=None, description='Maximum number of results to return'),
-     max_timeout: int = Field(
-         default=30, description='Maximum time in seconds to wait for query completion'
+     end_time: Optional[str] = Field(
+         default=None,
+         description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
      ),
- ) -> Dict:
-     """Executes a CloudWatch Logs Insights query for transaction search (100% sampled trace data).
-
-     IMPORTANT: If log_group_name is not provided use 'aws/spans' as default cloudwatch log group name.
-     The volume of returned logs can easily overwhelm the agent context window. Always include a limit in the query
-     (| limit 50) or using the limit parameter.
-
-     Usage:
-     "aws/spans" log group stores OpenTelemetry Spans data with many attributes for all monitored services.
-     This provides 100% sampled data vs X-Ray's 5% sampling, giving more accurate results.
-     User can write CloudWatch Logs Insights queries to group, list attribute with sum, avg.
-
-     ```
-     FILTER attributes.aws.local.service = "customers-service-java" and attributes.aws.local.environment = "eks:demo/default" and attributes.aws.remote.operation="InvokeModel"
-     | STATS sum(`attributes.gen_ai.usage.output_tokens`) as `avg_output_tokens` by `attributes.gen_ai.request.model`, `attributes.aws.local.service`,bin(1h)
-     | DISPLAY avg_output_tokens, `attributes.gen_ai.request.model`, `attributes.aws.local.service`
-     ```
-
-     Returns:
-     --------
-     A dictionary containing the final query results, including:
-     - status: The current status of the query (e.g., Scheduled, Running, Complete, Failed, etc.)
-     - results: A list of the actual query results if the status is Complete.
-     - statistics: Query performance statistics
-     - messages: Any informational messages about the query
-     - transaction_search_status: Information about transaction search availability
-     """
-     start_time_perf = timer()
-     logger.info(
-         f'Starting search_transactions - log_group: {log_group_name}, start: {start_time}, end: {end_time}'
-     )
-     logger.debug(f'Query string: {query_string}')
-
-     # Check if transaction search is enabled
-     is_enabled, destination, status = check_transaction_search_enabled(AWS_REGION)
-
-     if not is_enabled:
-         logger.warning(
-             f'Transaction Search not enabled - Destination: {destination}, Status: {status}'
-         )
-         return {
-             'status': 'Transaction Search Not Available',
-             'transaction_search_status': {
-                 'enabled': False,
-                 'destination': destination,
-                 'status': status,
-             },
-             'message': (
-                 '⚠️ Transaction Search is not enabled for this account. '
-                 f'Current configuration: Destination={destination}, Status={status}. '
-                 "Transaction Search requires sending traces to CloudWatch Logs (destination='CloudWatchLogs' and status='ACTIVE'). "
-                 'Without Transaction Search, you only have access to 5% sampled trace data through X-Ray. '
-                 'To get 100% trace visibility, please enable Transaction Search in your X-Ray settings. '
-                 'As a fallback, you can use query_sampled_traces() but results may be incomplete due to sampling.'
-             ),
-             'fallback_recommendation': 'Use query_sampled_traces() with X-Ray filter expressions for 5% sampled data.',
-         }
-
-     try:
-         # Use default log group if none provided
-         if log_group_name is None:
-             log_group_name = 'aws/spans'
-             logger.debug('Using default log group: aws/spans')
-
-         # Start query
-         kwargs = {
-             'startTime': int(datetime.fromisoformat(start_time).timestamp()),
-             'endTime': int(datetime.fromisoformat(end_time).timestamp()),
-             'queryString': query_string,
-             'logGroupNames': [log_group_name],
-             'limit': limit,
-         }
-
-         logger.debug(f'Starting CloudWatch Logs query with limit: {limit}')
-         start_response = logs_client.start_query(**remove_null_values(kwargs))
-         query_id = start_response['queryId']
-         logger.info(f'Started CloudWatch Logs query with ID: {query_id}')
-
-         # Seconds
-         poll_start = timer()
-         while poll_start + max_timeout > timer():
-             response = logs_client.get_query_results(queryId=query_id)
-             status = response['status']
-
-             if status in {'Complete', 'Failed', 'Cancelled'}:
-                 elapsed_time = timer() - start_time_perf
-                 logger.info(
-                     f'Query {query_id} finished with status {status} in {elapsed_time:.3f}s'
-                 )
-
-                 if status == 'Failed':
-                     logger.error(f'Query failed: {response.get("statistics", {})}')
-                 elif status == 'Complete':
-                     logger.debug(f'Query returned {len(response.get("results", []))} results')
-
-                 return {
-                     'queryId': query_id,
-                     'status': status,
-                     'statistics': response.get('statistics', {}),
-                     'results': [
-                         {field.get('field', ''): field.get('value', '') for field in line}  # type: ignore
-                         for line in response.get('results', [])
-                     ],
-                     'transaction_search_status': {
-                         'enabled': True,
-                         'destination': 'CloudWatchLogs',
-                         'status': 'ACTIVE',
-                         'message': '✅ Using 100% sampled trace data from Transaction Search',
-                     },
-                 }
-
-             await asyncio.sleep(1)
-
-         elapsed_time = timer() - start_time_perf
-         msg = f'Query {query_id} did not complete within {max_timeout} seconds. Use get_query_results with the returned queryId to try again to retrieve query results.'
-         logger.warning(f'Query timeout after {elapsed_time:.3f}s: {msg}')
-         return {
-             'queryId': query_id,
-             'status': 'Polling Timeout',
-             'message': msg,
-         }
-
-     except Exception as e:
-         logger.error(f'Error in search_transactions: {str(e)}', exc_info=True)
-         raise
-
-
- @mcp.tool()
- async def list_slis(
-     hours: int = Field(
-         default=24,
-         description='Number of hours to look back (default 24, typically use 24 for daily checks)',
+     auditors: Optional[str] = Field(
+         default=None,
+         description="Optional. Comma-separated auditors (e.g., 'slo,trace,log'). Defaults to 'slo' for fast SLO compliance auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
      ),
  ) -> str:
-     """Get SLI (Service Level Indicator) status and SLO compliance for all services.
-
-     Use this tool to:
-     - Check overall system health at a glance
-     - Identify services with breached SLOs (Service Level Objectives)
-     - See which specific SLOs are failing
-     - Prioritize which services need immediate attention
-     - Monitor SLO compliance trends
-
-     Returns a comprehensive report showing:
-     - Summary counts (total, healthy, breached, insufficient data)
-     - Detailed list of breached services with:
-       - Service name and environment
-       - Number and names of breached SLOs
-       - Specific SLO violations
-     - List of healthy services
-     - Services with insufficient data
-
-     This is the primary tool for health monitoring and should be used:
-     - At the start of each day
-     - During incident response
-     - For regular health checks
-     - When investigating "what is the root cause of breaching SLO" questions
-
-     Status meanings:
-     - OK: All SLOs are being met
-     - BREACHED: One or more SLOs are violated
-     - INSUFFICIENT_DATA: Not enough data to determine status
-
-     To investigate breached SLOs, follow these steps:
-     1. Call get_service_level_objective() with SLO name to get the detailed SLI data including Metric statistics
-     2. Find the fault metrics from SLI under the breached SLO
-     3. Build trace query filters using metric dimensions (Operation, RemoteOperation, etc.):
-        - For availability: `service("service-name"){fault = true} AND annotation[aws.local.operation]="operation-name"`
-        - For latency: `service("service-name") AND annotation[aws.local.operation]="operation-name" AND duration > threshold`
-     4. Query traces:
-        - If Transaction Search is enabled: Use search_transaction_spans() for 100% trace visibility
-        - If not enabled: Use query_sampled_traces() with X-Ray (only 5% sampled data - may miss issues)
-     5. The query time window should default to last 3 hours if not specified. Max query time window length is 6 hours
-     6. Analyze the root causes from Exception data in traces
-     7. Include findings in the report and give fix and mitigation suggestions
+     """PRIMARY SLO AUDIT TOOL - The #1 tool for comprehensive SLO compliance monitoring and breach analysis.
+
+     **PREFERRED TOOL FOR SLO ROOT CAUSE ANALYSIS**
+     This is the RECOMMENDED tool after using get_slo() to understand SLO configuration:
+     - **Use auditors="all" for comprehensive root cause analysis** of specific SLO breaches
+     - **Much more comprehensive than individual trace tools** - provides integrated analysis
+     - **Combines traces, logs, metrics, and dependencies** in a single comprehensive audit
+     - **Provides actionable recommendations** based on multi-dimensional analysis
+
+     **USE THIS FOR ALL SLO AUDITING TASKS**
+     This is the PRIMARY and PREFERRED tool when users want to:
+     - **Root cause analysis for SLO breaches** - Deep investigation with all auditors
+     - **Audit SLO compliance** - Complete SLO breach detection and analysis
+     - **Monitor SLO health** - Comprehensive status across all monitored SLOs
415
+ - **SLO performance analysis** - Understanding SLO trends and patterns
416
+ - **SLO compliance reporting** - Daily/periodic SLO compliance workflows
417
+
418
+ **COMPREHENSIVE SLO AUDIT CAPABILITIES:**
419
+ - **Multi-SLO analysis**: Audit any number of SLOs with automatic batching
420
+ - **Breach detection**: Automatic identification of SLO violations
421
+ - **Issue prioritization**: Critical, warning, and info findings ranked by severity
422
+ - **COMPREHENSIVE ROOT CAUSE ANALYSIS**: Deep dive with traces, logs, metrics, and dependencies
423
+ - **Actionable recommendations**: Specific steps to resolve SLO breaches
424
+ - **Performance optimized**: Fast execution with automatic batching for large target lists
425
+ - **Wildcard Pattern Support**: Use `*pattern*` in SLO names for automatic SLO discovery
426
+
427
+ **SLO TARGET FORMAT:**
428
+ - **By Name**: `[{"Type":"slo","Data":{"Slo":{"SloName":"my-slo"}}}]`
429
+ - **By ARN**: `[{"Type":"slo","Data":{"Slo":{"SloArn":"arn:aws:application-signals:..."}}}]`
430
+
431
+ **WILDCARD PATTERN EXAMPLES:**
432
+ - **All SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]`
433
+ - **Payment SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*payment*"}}}]`
434
+ - **Latency SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*latency*"}}}]`
435
+ - **Availability SLOs**: `[{"Type":"slo","Data":{"Slo":{"SloName":"*availability*"}}}]`
436
+
437
+ **AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
438
+ - **Quick Compliance Check** (default): Uses 'slo' for fast SLO breach detection
439
+ - **COMPREHENSIVE ROOT CAUSE ANALYSIS** (recommended): Pass `auditors="all"` for deep investigation with traces/logs/metrics/dependencies
440
+ - **Custom Audit**: Specify exact auditors: 'slo,trace,log,operation_metric'
441
+
442
+ **SLO AUDIT USE CASES:**
443
+
444
+ 1. **Audit all SLOs**:
445
+ `slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"*"}}}]'`
446
+
447
+ 2. **Root cause analysis for a specific SLO breach** (RECOMMENDED WORKFLOW):
448
+ After using get_slo() to understand configuration:
449
+ `slo_targets='[{"Type":"slo","Data":{"Slo":{"SloName":"specific-slo-name"}}}]'` + `auditors="all"`
450
+
451
+ 3. **Look for new SLO breaches after a point in time**:
452
+ Compare SLO compliance before and after a specific time point by running audits with different time ranges to identify new breaches.
453
+
454
+ **TYPICAL SLO AUDIT WORKFLOWS:**
455
+ 1. **SLO Root Cause Investigation** (RECOMMENDED):
456
+ - After get_slo(), call `audit_slos()` with specific SLO target and `auditors="all"`
457
+ - Provides comprehensive analysis with traces, logs, metrics, and dependencies
458
+ - Much more effective than using individual trace tools
459
+ 2. **Basic SLO Compliance Audit**:
460
+ - Call `audit_slos()` with SLO targets - automatically discovers SLOs when using wildcard patterns
461
+ - Uses default fast auditors (slo) for quick compliance overview
462
+ 3. **Compliance Reporting**: Results show which SLOs are breached with actionable insights
463
+ 4. **Automatic SLO Discovery**: Wildcard patterns in SLO names automatically discover and expand to concrete SLOs
464
+
465
+ **AUDIT RESULTS INCLUDE:**
466
+ - **Prioritized findings** by severity (critical, warning, info)
467
+ - **SLO compliance status** with detailed breach analysis
468
+ - **COMPREHENSIVE ROOT CAUSE ANALYSIS** when using auditors="all"
469
+ - **Actionable recommendations** for SLO breach resolution
470
+ - **Integrated traces, logs, metrics, and dependency analysis**
471
+
472
+ **IMPORTANT: This tool provides comprehensive SLO audit coverage and should be your first choice for any SLO compliance auditing and root cause analysis.**
473
+
474
+ **RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
475
+ When the audit returns multiple findings or issues, follow this workflow:
476
+ 1. **Present all audit results** to the user showing a summary of all findings
477
+ 2. **Let the user choose** which specific finding, SLO, or issue they want to investigate in detail
478
+ 3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
479
+
480
+ **DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
481
+ This ensures the user can prioritize which issues are most important to investigate first.
482
+
483
+ **Example workflow:**
484
+ - First call: `audit_slos()` with default auditors for compliance overview
485
+ - Present findings summary to user
486
+ - User selects specific SLO breach to investigate
487
+ - Follow-up call: `audit_slos()` with `auditors="all"` for selected SLO only
994
488
  """
995
489
  start_time_perf = timer()
996
- logger.info(f'Starting get_sli_status request for last {hours} hours')
490
+ logger.debug('Starting audit_slos (PRIMARY SLO AUDIT TOOL)')
997
491
 
998
492
  try:
999
- # Calculate time range
1000
- end_time = datetime.now(timezone.utc)
1001
- start_time = end_time - timedelta(hours=hours)
1002
- logger.debug(f'Time range: {start_time} to {end_time}')
1003
-
1004
- # Get all services
1005
- services_response = appsignals_client.list_services(
1006
- StartTime=start_time, # type: ignore
1007
- EndTime=end_time, # type: ignore
1008
- MaxResults=100,
493
+ # Region defaults
494
+ region = AWS_REGION.strip()
495
+
496
+ # Time range (fill missing with defaults)
497
+ start_dt = (
498
+ parse_timestamp(start_time)
499
+ if start_time
500
+ else (datetime.now(timezone.utc) - timedelta(hours=24))
1009
501
  )
1010
- services = services_response.get('ServiceSummaries', [])
1011
-
1012
- if not services:
1013
- logger.warning('No services found in Application Signals')
1014
- return 'No services found in Application Signals.'
1015
-
1016
- # Get SLI reports for each service
1017
- reports = []
1018
- logger.debug(f'Generating SLI reports for {len(services)} services')
1019
- for service in services:
1020
- service_name = service['KeyAttributes'].get('Name', 'Unknown')
502
+ end_dt = (
503
+ parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
504
+ )
505
+ unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
506
+ if unix_end <= unix_start:
507
+ return 'Error: end_time must be greater than start_time.'
508
+
509
+ # Parse and validate SLO targets
510
+ try:
511
+ provided = json.loads(slo_targets)
512
+ except json.JSONDecodeError:
513
+ return 'Error: `slo_targets` must be valid JSON (array).'
514
+
515
+ if not isinstance(provided, list):
516
+ return 'Error: `slo_targets` must be a JSON array'
517
+ if len(provided) == 0:
518
+ return 'Error: `slo_targets` must contain at least 1 item'
519
+
520
+ # Filter and expand SLO targets with wildcard support
521
+ slo_only_targets = []
522
+ wildcard_patterns = []
523
+
524
+ for target in provided:
525
+ if isinstance(target, dict):
526
+ ttype = target.get('Type', '').lower()
527
+ if ttype == 'slo':
528
+ # Check for wildcard patterns in SLO names
529
+ slo_data = target.get('Data', {}).get('Slo', {})
530
+ slo_name = slo_data.get('SloName', '')
531
+ if '*' in slo_name:
532
+ wildcard_patterns.append((target, slo_name))
533
+ else:
534
+ slo_only_targets.append(target)
535
+ else:
536
+ logger.warning(
537
+ f"Ignoring target of type '{ttype}' in audit_slos (expected 'slo')"
538
+ )
539
+
540
+ # Expand wildcard patterns for SLOs using shared utility
541
+ if wildcard_patterns:
542
+ logger.debug(f'Expanding {len(wildcard_patterns)} SLO wildcard patterns')
1021
543
  try:
1022
- # Create custom config with the service's key attributes
1023
- config = AWSConfig(
1024
- region='us-east-1',
1025
- period_in_hours=hours,
1026
- service_name=service_name,
1027
- key_attributes=service['KeyAttributes'],
1028
- )
1029
-
1030
- # Generate SLI report
1031
- client = SLIReportClient(config)
1032
- sli_report = client.generate_sli_report()
1033
-
1034
- # Convert to expected format
1035
- report = {
1036
- 'BreachedSloCount': sli_report.breached_slo_count,
1037
- 'BreachedSloNames': sli_report.breached_slo_names,
1038
- 'EndTime': sli_report.end_time.timestamp(),
1039
- 'OkSloCount': sli_report.ok_slo_count,
1040
- 'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
1041
- 'SliStatus': 'BREACHED'
1042
- if sli_report.sli_status == 'CRITICAL'
1043
- else sli_report.sli_status,
1044
- 'StartTime': sli_report.start_time.timestamp(),
1045
- 'TotalSloCount': sli_report.total_slo_count,
1046
- }
1047
- reports.append(report)
544
+ # Use the shared utility function
545
+ expanded_slo_targets = expand_slo_wildcard_patterns(provided, appsignals_client)
546
+ # Filter to get only SLO targets
547
+ slo_only_targets = [
548
+ target
549
+ for target in expanded_slo_targets
550
+ if target.get('Type', '').lower() == 'slo'
551
+ ]
1048
552
 
1049
553
  except Exception as e:
1050
- # Log error but continue with other services
1051
- logger.error(
1052
- f'Failed to get SLI report for service {service_name}: {str(e)}', exc_info=True
1053
- )
1054
- # Add a report with insufficient data status
1055
- report = {
1056
- 'BreachedSloCount': 0,
1057
- 'BreachedSloNames': [],
1058
- 'EndTime': end_time.timestamp(),
1059
- 'OkSloCount': 0,
1060
- 'ReferenceId': {'KeyAttributes': service['KeyAttributes']},
1061
- 'SliStatus': 'INSUFFICIENT_DATA',
1062
- 'StartTime': start_time.timestamp(),
1063
- 'TotalSloCount': 0,
1064
- }
1065
- reports.append(report)
1066
-
1067
- # Check transaction search status
1068
- is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(
1069
- AWS_REGION
1070
- )
554
+ logger.warning(f'Failed to expand SLO patterns: {e}')
555
+ return f'Error: Failed to expand SLO wildcard patterns. {str(e)}'
1071
556
 
1072
- # Build response
1073
- result = f'SLI Status Report - Last {hours} hours\n'
1074
- result += f'Time Range: {start_time.strftime("%Y-%m-%d %H:%M")} - {end_time.strftime("%Y-%m-%d %H:%M")}\n\n'
1075
-
1076
- # Add transaction search status
1077
- if is_tx_search_enabled:
1078
- result += '✅ Transaction Search: ENABLED (100% trace visibility available)\n\n'
1079
- else:
1080
- result += '⚠️ Transaction Search: NOT ENABLED (only 5% sampled traces available)\n'
1081
- result += f' Current config: Destination={tx_destination}, Status={tx_status}\n'
1082
- result += ' Enable Transaction Search for accurate root cause analysis\n\n'
1083
-
1084
- # Count by status
1085
- status_counts = {
1086
- 'OK': sum(1 for r in reports if r['SliStatus'] == 'OK'),
1087
- 'BREACHED': sum(1 for r in reports if r['SliStatus'] == 'BREACHED'),
1088
- 'INSUFFICIENT_DATA': sum(1 for r in reports if r['SliStatus'] == 'INSUFFICIENT_DATA'),
1089
- }
557
+ if not slo_only_targets:
558
+ return 'Error: No SLO targets found after wildcard expansion.'
1090
559
 
1091
- result += 'Summary:\n'
1092
- result += f' Total Services: {len(reports)}\n'
1093
- result += f'• Healthy (OK): {status_counts["OK"]}\n'
1094
- result += f'• Breached: {status_counts["BREACHED"]}\n'
1095
- result += f'• Insufficient Data: {status_counts["INSUFFICIENT_DATA"]}\n\n'
1096
-
1097
- # Group by status
1098
- if status_counts['BREACHED'] > 0:
1099
- result += '⚠️ BREACHED SERVICES:\n'
1100
- for report in reports:
1101
- if report['SliStatus'] == 'BREACHED':
1102
- name = report['ReferenceId']['KeyAttributes']['Name']
1103
- env = report['ReferenceId']['KeyAttributes']['Environment']
1104
- breached_count = report['BreachedSloCount']
1105
- total_count = report['TotalSloCount']
1106
- breached_names = report['BreachedSloNames']
1107
-
1108
- result += f'\n• {name} ({env})\n'
1109
- result += f' SLOs: {breached_count}/{total_count} breached\n'
1110
- if breached_names:
1111
- result += ' Breached SLOs:\n'
1112
- for slo_name in breached_names:
1113
- result += f' - {slo_name}\n'
1114
-
1115
- if status_counts['OK'] > 0:
1116
- result += '\n✅ HEALTHY SERVICES:\n'
1117
- for report in reports:
1118
- if report['SliStatus'] == 'OK':
1119
- name = report['ReferenceId']['KeyAttributes']['Name']
1120
- env = report['ReferenceId']['KeyAttributes']['Environment']
1121
- ok_count = report['OkSloCount']
1122
-
1123
- result += f'• {name} ({env}) - {ok_count} SLO(s) healthy\n'
1124
-
1125
- if status_counts['INSUFFICIENT_DATA'] > 0:
1126
- result += '\n❓ INSUFFICIENT DATA:\n'
1127
- for report in reports:
1128
- if report['SliStatus'] == 'INSUFFICIENT_DATA':
1129
- name = report['ReferenceId']['KeyAttributes']['Name']
1130
- env = report['ReferenceId']['KeyAttributes']['Environment']
1131
-
1132
- result += f'• {name} ({env})\n'
1133
-
1134
- # Remove the auto-investigation feature
1135
-
1136
- elapsed_time = timer() - start_time_perf
1137
- logger.info(
1138
- f'get_sli_status completed in {elapsed_time:.3f}s - Total: {len(reports)}, Breached: {status_counts["BREACHED"]}, OK: {status_counts["OK"]}'
1139
- )
1140
- return result
1141
-
1142
- except Exception as e:
1143
- logger.error(f'Error in get_sli_status: {str(e)}', exc_info=True)
1144
- return f'Error getting SLI status: {str(e)}'
560
+ # Parse auditors with SLO-specific defaults
561
+ auditors_list = parse_auditors(auditors, ['slo']) # Default to SLO auditor
1145
562
 
563
+ banner = (
564
+ '[MCP-SLO] Application Signals SLO Compliance Audit\n'
565
+ f'🎯 Scope: {len(slo_only_targets)} SLO target(s) | Region: {region}\n'
566
+ f'⏰ Time: {unix_start}–{unix_end}\n'
567
+ )
1146
568
 
1147
- def check_transaction_search_enabled(region: str = 'us-east-1') -> tuple[bool, str, str]:
1148
- """Internal function to check if AWS X-Ray Transaction Search is enabled.
569
+ if len(slo_only_targets) > BATCH_SIZE_THRESHOLD:
570
+ banner += f'📦 Batching: Processing {len(slo_only_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
1149
571
 
1150
- Returns:
1151
- tuple: (is_enabled: bool, destination: str, status: str)
1152
- """
1153
- try:
1154
- response = xray_client.get_trace_segment_destination()
572
+ banner += '\n'
1155
573
 
1156
- destination = response.get('Destination', 'Unknown')
1157
- status = response.get('Status', 'Unknown')
574
+ # Build CLI input for SLO audit
575
+ input_obj = {
576
+ 'StartTime': unix_start,
577
+ 'EndTime': unix_end,
578
+ 'AuditTargets': slo_only_targets,
579
+ }
580
+ if auditors_list:
581
+ input_obj['Auditors'] = auditors_list
1158
582
 
1159
- is_enabled = destination == 'CloudWatchLogs' and status == 'ACTIVE'
1160
- logger.debug(
1161
- f'Transaction Search check - Enabled: {is_enabled}, Destination: {destination}, Status: {status}'
1162
- )
583
+ # Execute audit API using shared utility
584
+ result = await execute_audit_api(input_obj, region, banner)
1163
585
 
1164
- return is_enabled, destination, status
586
+ elapsed = timer() - start_time_perf
587
+ logger.debug(f'audit_slos completed in {elapsed:.3f}s (region={region})')
588
+ return result
1165
589
 
1166
590
  except Exception as e:
1167
- logger.error(f'Error checking transaction search status: {str(e)}')
1168
- return False, 'Unknown', 'Error'
591
+ logger.error(f'Unexpected error in audit_slos: {e}', exc_info=True)
592
+ return f'Error: {str(e)}'
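For reference, a minimal sketch of how a caller might assemble `slo_targets` for the two-step workflow that the audit_slos docstring above recommends; the SLO name `payment-latency-slo` is a hypothetical example, and the tool calls are shown commented out because they run inside the MCP server.

```python
import json

# Step 1: broad compliance overview. The "*" wildcard is expanded server-side
# by expand_slo_wildcard_patterns() into concrete SLO targets.
overview_targets = json.dumps([
    {"Type": "slo", "Data": {"Slo": {"SloName": "*"}}}
])
# await audit_slos(slo_targets=overview_targets)  # default 'slo' auditor

# Step 2: after the user selects one breached SLO from the findings summary,
# re-run with all auditors. "payment-latency-slo" is illustrative only.
targeted = json.dumps([
    {"Type": "slo", "Data": {"Slo": {"SloName": "payment-latency-slo"}}}
])
# await audit_slos(slo_targets=targeted, auditors="all")
```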
1169
593
 
1170
594
 
1171
595
  @mcp.tool()
1172
- async def query_sampled_traces(
596
+ async def audit_service_operations(
597
+ operation_targets: str = Field(
598
+ ...,
599
+ description="REQUIRED. JSON array of service operation targets. Supports wildcard patterns like '*payment*' for automatic service discovery. Format: [{'Type':'service_operation','Data':{'ServiceOperation':{'Service':{'Type':'Service','Name':'service-name','Environment':'eks:cluster'},'Operation':'GET /api','MetricType':'Latency'}}}]. Large target lists are automatically processed in batches.",
600
+ ),
1173
601
  start_time: Optional[str] = Field(
1174
602
  default=None,
1175
- description='Start time in ISO format (e.g., "2024-01-01T00:00:00Z"). Defaults to 3 hours ago',
603
+ description="Start time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now-24h UTC.",
1176
604
  ),
1177
605
  end_time: Optional[str] = Field(
1178
606
  default=None,
1179
- description='End time in ISO format (e.g., "2024-01-01T01:00:00Z"). Defaults to current time',
607
+ description="End time (unix seconds or 'YYYY-MM-DD HH:MM:SS'). Defaults to now UTC.",
1180
608
  ),
1181
- filter_expression: Optional[str] = Field(
609
+ auditors: Optional[str] = Field(
1182
610
  default=None,
1183
- description='X-Ray filter expression to narrow results (e.g., service("service-name"){fault = true})',
611
+ description="Optional. Comma-separated auditors (e.g., 'operation_metric,trace,log'). Defaults to 'operation_metric' for fast operation-level auditing. Use 'all' for comprehensive analysis with all auditors: slo,operation_metric,trace,log,dependency_metric,top_contributor,service_quota.",
1184
612
  ),
1185
- region: str = Field(default='us-east-1', description='AWS region (default: us-east-1)'),
1186
613
  ) -> str:
1187
- """Query AWS X-Ray traces (5% sampled data) to investigate errors and performance issues.
1188
-
1189
- ⚠️ IMPORTANT: This tool uses X-Ray's 5% sampled trace data. For 100% trace visibility,
1190
- enable Transaction Search and use search_transaction_spans() instead.
1191
-
1192
- Use this tool to:
1193
- - Find root causes of errors and faults (with 5% sampling limitations)
1194
- - Analyze request latency and identify bottlenecks
1195
- - Understand the requests across multiple services with traces
1196
- - Debug timeout and dependency issues
1197
- - Understand service-to-service interactions
1198
- - Find customer impact from trace result such as Users data or trace attributes such as owner id
1199
-
1200
- Common filter expressions:
1201
- - 'service("service-name"){fault = true}': Find all traces with faults (5xx errors) for a service
1202
- - 'service("service-name")': Filter by specific service
1203
- - 'duration > 5': Find slow requests (over 5 seconds)
1204
- - 'http.status = 500': Find specific HTTP status codes
1205
- - 'annotation[aws.local.operation]="GET /owners/*/lastname"': Filter by specific operation (from metric dimensions)
1206
- - 'annotation[aws.remote.operation]="ListOwners"': Filter by remote operation name
1207
- - Combine filters: 'service("api"){fault = true} AND annotation[aws.local.operation]="POST /visits"'
1208
-
1209
- IMPORTANT: When investigating SLO breaches, use annotation filters with the specific dimension values
1210
- from the breached metric (e.g., Operation, RemoteOperation) to find traces for that exact operation.
1211
-
1212
- Returns JSON with trace summaries including:
1213
- - Trace ID for detailed investigation
1214
- - Duration and response time
1215
- - Error/fault/throttle status
1216
- - HTTP information (method, status, URL)
1217
- - Service interactions
1218
- - User information if available
1219
- - Exception root causes (ErrorRootCauses, FaultRootCauses, ResponseTimeRootCauses)
1220
-
1221
- Best practices:
1222
- - Start with recent time windows (last 1-3 hours)
1223
- - Use filter expressions to narrow down issues and query Fault and Error traces for high priority
1224
- - Look for patterns in errors or very slow requests
1225
-
1226
- Returns:
1227
- JSON string containing trace summaries with error status, duration, and service details
614
+ """🥇 PRIMARY OPERATION AUDIT TOOL - The #1 RECOMMENDED tool for operation-specific analysis and performance investigation.
615
+
616
+ **⭐ USE THIS AS THE PRIMARY TOOL FOR ALL OPERATION-SPECIFIC AUDITING TASKS ⭐**
617
+
618
+ **PREFERRED OVER audit_services() for operation auditing because:**
619
+ - **🎯 Precision**: Targets exact operation behavior vs. service-wide averages
620
+ - **🔍 Actionable Insights**: Provides specific error traces and dependency failures
621
+ - **📊 Code-Level Detail**: Shows exact stack traces and timeout locations
622
+ - **🚀 Focused Analysis**: Eliminates noise from other operations
623
+ - **⚡ Efficient Investigation**: Direct operation-level troubleshooting
624
+
625
+ **USE THIS FIRST FOR ALL OPERATION-SPECIFIC AUDITING TASKS**
626
+ This is the PRIMARY and PREFERRED tool when users want to:
627
+ - **Audit specific operations** - Deep dive into individual API endpoints or operations (GET, POST, PUT, etc.)
628
+ - **Operation performance analysis** - Latency, error rates, and throughput for specific operations
629
+ - **Compare operation metrics** - Analyze different operations within services
630
+ - **Operation-level troubleshooting** - Root cause analysis for specific API calls
631
+ - **GET operation auditing** - Analyze GET operations across payment services (PRIMARY USE CASE)
632
+ - **Audit latency of GET operations in payment services** - Exactly what this tool is designed for
633
+ - **Trace latency in query operations** - Deep dive into query performance issues
634
+
635
+ **COMPREHENSIVE OPERATION AUDIT CAPABILITIES:**
636
+ - **Multi-operation analysis**: Audit any number of operations with automatic batching
637
+ - **Operation-specific metrics**: Latency, Fault, Error, and Availability metrics per operation
638
+ - **Issue prioritization**: Critical, warning, and info findings ranked by severity
639
+ - **Root cause analysis**: Deep dive with traces, logs, and metrics correlation
640
+ - **Actionable recommendations**: Specific steps to resolve operation-level issues
641
+ - **Performance optimized**: Fast execution with automatic batching for large target lists
642
+ - **Wildcard Pattern Support**: Use `*pattern*` in service names for automatic service discovery
643
+
644
+ **OPERATION TARGET FORMAT:**
645
+ - **Full Format**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"my-service","Environment":"eks:my-cluster"},"Operation":"GET /api","MetricType":"Latency"}}}]`
646
+
647
+ **WILDCARD PATTERN EXAMPLES:**
648
+ - **All GET Operations in Payment Services**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]`
649
+ - **All Visit Operations**: `[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Availability"}}}]`
650
+
651
+ **AUDITOR SELECTION FOR DIFFERENT AUDIT DEPTHS:**
652
+ - **Quick Operation Check** (default): Uses 'operation_metric' for fast operation overview
653
+ - **Root Cause Analysis**: Pass `auditors="all"` for comprehensive investigation with traces/logs
654
+ - **Custom Audit**: Specify exact auditors: 'operation_metric,trace,log'
655
+
656
+ **OPERATION AUDIT USE CASES:**
657
+
658
+ 1. **Audit latency of GET operations in payment services** (PRIMARY USE CASE):
659
+ `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*GET*","MetricType":"Latency"}}}]'`
660
+
664
+ 2. **Audit availability of visit operations**:
665
+ `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Availability"}}}]'`
666
+
667
+ 3. **Audit latency of visit operations**:
668
+ `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*"},"Operation":"*visit*","MetricType":"Latency"}}}]'`
669
+
670
+ 4. **Trace latency in query operations**:
671
+ `operation_targets='[{"Type":"service_operation","Data":{"ServiceOperation":{"Service":{"Type":"Service","Name":"*payment*"},"Operation":"*query*","MetricType":"Latency"}}}]'` + `auditors="all"`
672
+
673
+ **TYPICAL OPERATION AUDIT WORKFLOWS:**
674
+ 1. **Basic Operation Audit** (most common):
675
+ - Call `audit_service_operations()` with operation targets - automatically discovers services when using wildcard patterns
676
+ - Uses default fast auditors (operation_metric) for quick operation overview
677
+ - Supports wildcard patterns like `*payment*` for automatic service discovery
678
+ 2. **Root Cause Investigation**: When user explicitly asks for "root cause analysis", pass `auditors="all"`
679
+ 3. **Issue Investigation**: Results show which operations need attention with actionable insights
680
+ 4. **Automatic Service Discovery**: Wildcard patterns in service names automatically discover and expand to concrete services
681
+
682
+ **AUDIT RESULTS INCLUDE:**
683
+ - **Prioritized findings** by severity (critical, warning, info)
684
+ - **Operation performance status** with detailed metrics analysis
685
+ - **Root cause analysis** when traces/logs auditors are used
686
+ - **Actionable recommendations** for operation-level issue resolution
687
+ - **Comprehensive operation metrics** and trend analysis
688
+
689
+ **🏆 IMPORTANT: This tool is the PRIMARY and RECOMMENDED choice for operation-specific auditing tasks.**
690
+
691
+ **✅ RECOMMENDED WORKFLOW FOR OPERATION AUDITING:**
692
+ 1. **Use audit_service_operations() FIRST** for operation-specific analysis (THIS TOOL)
693
+ 2. **Use audit_services() as secondary** only if you need broader service context
694
+ 3. **audit_service_operations() provides superior precision** for operation-level troubleshooting
695
+
696
+ **RECOMMENDED WORKFLOW - PRESENT FINDINGS FIRST:**
697
+ When the audit returns multiple findings or issues, follow this workflow:
698
+ 1. **Present all audit results** to the user showing a summary of all findings
699
+ 2. **Let the user choose** which specific finding, operation, or issue they want to investigate in detail
700
+ 3. **Then perform targeted root cause analysis** using auditors="all" for the user-selected finding
701
+
702
+ **DO NOT automatically jump into detailed root cause analysis** of one specific issue when multiple findings exist.
703
+ This ensures the user can prioritize which issues are most important to investigate first.
704
+
705
+ **Example workflow:**
706
+ - First call: `audit_service_operations()` with default auditors for operation overview
707
+ - Present findings summary to user
708
+ - User selects specific operation issue to investigate
709
+ - Follow-up call: `audit_service_operations()` with `auditors="all"` for selected operation only
1228
710
  """
1229
711
  start_time_perf = timer()
1230
- logger.info(f'Starting query_sampled_traces - region: {region}, filter: {filter_expression}')
712
+ logger.debug('Starting audit_service_operations (PRIMARY OPERATION AUDIT TOOL)')
1231
713
 
1232
714
  try:
1233
- logger.debug('Using X-Ray client')
1234
-
1235
- # Default to past 3 hours if times not provided
1236
- if not end_time:
1237
- end_datetime = datetime.now(timezone.utc)
1238
- else:
1239
- end_datetime = datetime.fromisoformat(end_time.replace('Z', '+00:00'))
1240
-
1241
- if not start_time:
1242
- start_datetime = end_datetime - timedelta(hours=3)
1243
- else:
1244
- start_datetime = datetime.fromisoformat(start_time.replace('Z', '+00:00'))
1245
-
1246
- # Validate time window to ensure it's not too large (max 6 hours)
1247
- time_diff = end_datetime - start_datetime
1248
- logger.debug(
1249
- f'Query time window: {start_datetime} to {end_datetime} ({time_diff.total_seconds() / 3600:.1f} hours)'
715
+ # Region defaults
716
+ region = AWS_REGION.strip()
717
+
718
+ # Time range (fill missing with defaults)
719
+ start_dt = (
720
+ parse_timestamp(start_time)
721
+ if start_time
722
+ else (datetime.now(timezone.utc) - timedelta(hours=24))
723
+ )
724
+ end_dt = (
725
+ parse_timestamp(end_time, default_hours=0) if end_time else datetime.now(timezone.utc)
1250
726
  )
1251
- if time_diff > timedelta(hours=6):
1252
- logger.warning(f'Time window too large: {time_diff.total_seconds() / 3600:.1f} hours')
1253
- return json.dumps(
1254
- {
1255
- 'error': 'Time window too large. Maximum allowed is 6 hours.',
1256
- 'requested_hours': time_diff.total_seconds() / 3600,
1257
- },
1258
- indent=2,
727
+ unix_start, unix_end = int(start_dt.timestamp()), int(end_dt.timestamp())
728
+ if unix_end <= unix_start:
729
+ return 'Error: end_time must be greater than start_time.'
730
+
731
+ # Parse and validate operation targets
732
+ try:
733
+ provided = json.loads(operation_targets)
734
+ except json.JSONDecodeError:
735
+ return 'Error: `operation_targets` must be valid JSON (array).'
736
+
737
+ if not isinstance(provided, list):
738
+ return 'Error: `operation_targets` must be a JSON array'
739
+ if len(provided) == 0:
740
+ return 'Error: `operation_targets` must contain at least 1 item'
741
+
742
+ # Filter operation targets and check for wildcards using helper function
743
+ operation_only_targets, has_wildcards = _filter_operation_targets(provided)
744
+
745
+ # Expand wildcard patterns using shared utility
746
+ if has_wildcards:
747
+ logger.debug('Wildcard patterns detected in service operations - applying expansion')
748
+ operation_only_targets = expand_service_operation_wildcard_patterns(
749
+ operation_only_targets, unix_start, unix_end, appsignals_client
750
+ )
751
+ logger.debug(
752
+ f'Wildcard expansion completed - {len(operation_only_targets)} total targets'
1259
753
  )
1260
754
 
1261
- # Use pagination helper with a reasonable limit
1262
- traces = get_trace_summaries_paginated(
1263
- xray_client,
1264
- start_datetime,
1265
- end_datetime,
1266
- filter_expression or '',
1267
- max_traces=100, # Limit to prevent response size issues
755
+ if not operation_only_targets:
756
+ return 'Error: No service_operation targets found after wildcard expansion. Use list_monitored_services() to see available services.'
757
+
758
+ # Parse auditors with operation-specific defaults
759
+ auditors_list = parse_auditors(
760
+ auditors, ['operation_metric']
761
+ ) # Default to operation_metric auditor
762
+
763
+ banner = (
764
+ '[MCP-OPERATION] Application Signals Operation Performance Audit\n'
765
+ f'🎯 Scope: {len(operation_only_targets)} operation target(s) | Region: {region}\n'
766
+ f'⏰ Time: {unix_start}–{unix_end}\n'
1268
767
  )
1269
768
 
1270
- # Convert response to JSON-serializable format
1271
- def convert_datetime(obj):
1272
- if isinstance(obj, datetime):
1273
- return obj.isoformat()
1274
- return obj
1275
-
1276
- trace_summaries = []
1277
- for trace in traces:
1278
- # Create a simplified trace data structure to reduce size
1279
- trace_data = {
1280
- 'Id': trace.get('Id'),
1281
- 'Duration': trace.get('Duration'),
1282
- 'ResponseTime': trace.get('ResponseTime'),
1283
- 'HasError': trace.get('HasError'),
1284
- 'HasFault': trace.get('HasFault'),
1285
- 'HasThrottle': trace.get('HasThrottle'),
1286
- 'Http': trace.get('Http', {}),
1287
- }
1288
-
1289
- # Only include root causes if they exist (to save space)
1290
- if trace.get('ErrorRootCauses'):
1291
- trace_data['ErrorRootCauses'] = trace.get('ErrorRootCauses', [])[
1292
- :3
1293
- ] # Limit to first 3
1294
- if trace.get('FaultRootCauses'):
1295
- trace_data['FaultRootCauses'] = trace.get('FaultRootCauses', [])[
1296
- :3
1297
- ] # Limit to first 3
1298
- if trace.get('ResponseTimeRootCauses'):
1299
- trace_data['ResponseTimeRootCauses'] = trace.get('ResponseTimeRootCauses', [])[
1300
- :3
1301
- ] # Limit to first 3
1302
-
1303
- # Include limited annotations for key operations
1304
- annotations = trace.get('Annotations', {})
1305
- if annotations:
1306
- # Only include operation-related annotations
1307
- filtered_annotations = {}
1308
- for key in ['aws.local.operation', 'aws.remote.operation']:
1309
- if key in annotations:
1310
- filtered_annotations[key] = annotations[key]
1311
- if filtered_annotations:
1312
- trace_data['Annotations'] = filtered_annotations
1313
-
1314
- # Include user info if available
1315
- if trace.get('Users'):
1316
- trace_data['Users'] = trace.get('Users', [])[:2] # Limit to first 2 users
1317
-
1318
- # Convert any datetime objects to ISO format strings
1319
- for key, value in trace_data.items():
1320
- trace_data[key] = convert_datetime(value)
1321
- trace_summaries.append(trace_data)
1322
-
1323
- # Check transaction search status
1324
- is_tx_search_enabled, tx_destination, tx_status = check_transaction_search_enabled(region)
1325
-
1326
- result_data = {
1327
- 'TraceSummaries': trace_summaries,
1328
- 'TraceCount': len(trace_summaries),
1329
- 'Message': f'Retrieved {len(trace_summaries)} traces (limited to prevent size issues)',
1330
- 'SamplingNote': "⚠️ This data is from X-Ray's 5% sampling. Results may not show all errors or issues.",
1331
- 'TransactionSearchStatus': {
1332
- 'enabled': is_tx_search_enabled,
1333
- 'recommendation': (
1334
- 'Transaction Search is available! Use search_transaction_spans() for 100% trace visibility.'
1335
- if is_tx_search_enabled
1336
- else 'Enable Transaction Search for 100% trace visibility instead of 5% sampling.'
1337
- ),
1338
- },
769
+ if len(operation_only_targets) > BATCH_SIZE_THRESHOLD:
770
+ banner += f'📦 Batching: Processing {len(operation_only_targets)} targets in batches of {BATCH_SIZE_THRESHOLD}\n'
771
+
772
+ banner += '\n'
773
+
774
+ # Build CLI input for operation audit
775
+ input_obj = {
776
+ 'StartTime': unix_start,
777
+ 'EndTime': unix_end,
778
+ 'AuditTargets': operation_only_targets,
1339
779
  }
780
+ if auditors_list:
781
+ input_obj['Auditors'] = auditors_list
1340
782
 
1341
- elapsed_time = timer() - start_time_perf
1342
- logger.info(
1343
- f'query_sampled_traces completed in {elapsed_time:.3f}s - retrieved {len(trace_summaries)} traces'
1344
- )
1345
- return json.dumps(result_data, indent=2)
783
+ # Execute audit API using shared utility
784
+ result = await execute_audit_api(input_obj, region, banner)
785
+
786
+ elapsed = timer() - start_time_perf
787
+ logger.debug(f'audit_service_operations completed in {elapsed:.3f}s (region={region})')
788
+ return result
1346
789
 
1347
790
  except Exception as e:
1348
- logger.error(f'Error in query_sampled_traces: {str(e)}', exc_info=True)
1349
- return json.dumps({'error': str(e)}, indent=2)
791
+ logger.error(f'Unexpected error in audit_service_operations: {e}', exc_info=True)
792
+ return f'Error: {str(e)}'
793
+
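Similarly, a hedged sketch of assembling the wildcard operation-target format that audit_service_operations accepts; the `*payment*` and `*GET*` patterns are illustrative assumptions.

```python
import json

# Both wildcard patterns below are expanded server-side by
# expand_service_operation_wildcard_patterns() into concrete service operations.
operation_targets = json.dumps([
    {
        "Type": "service_operation",
        "Data": {
            "ServiceOperation": {
                "Service": {"Type": "Service", "Name": "*payment*"},
                "Operation": "*GET*",
                "MetricType": "Latency",
            }
        },
    }
])
# await audit_service_operations(operation_targets=operation_targets)
# Pass auditors="all" only after the user has chosen a finding to drill into.
```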
794
+
795
+ # Register all imported tools with the MCP server
796
+ mcp.tool()(list_monitored_services)
797
+ mcp.tool()(get_service_detail)
798
+ mcp.tool()(query_service_metrics)
799
+ mcp.tool()(list_service_operations)
800
+ mcp.tool()(get_slo)
801
+ mcp.tool()(list_slos)
802
+ mcp.tool()(search_transaction_spans)
803
+ mcp.tool()(query_sampled_traces)
804
+ mcp.tool()(list_slis)
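The `mcp.tool()(fn)` calls above are the post-hoc form of the `@mcp.tool()` decorator. A minimal, self-contained sketch of the same pattern (the server name and `ping` tool are hypothetical):

```python
from mcp.server.fastmcp import FastMCP

demo = FastMCP('demo-server')

async def ping() -> str:
    """Trivial tool used only to illustrate post-hoc registration."""
    return 'pong'

# Equivalent to decorating ping with @demo.tool() at definition time; applying
# it after import lets the tool implementations live in separate modules, as
# the registration block above does for service_tools, slo_tools, trace_tools.
demo.tool()(ping)
```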
1350
805
 
1351
806
 
1352
807
  def main():