awslabs.eks-mcp-server 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +13 -0
- awslabs/eks_mcp_server/__init__.py +14 -0
- awslabs/eks_mcp_server/aws_helper.py +71 -0
- awslabs/eks_mcp_server/cloudwatch_handler.py +670 -0
- awslabs/eks_mcp_server/consts.py +33 -0
- awslabs/eks_mcp_server/eks_kb_handler.py +86 -0
- awslabs/eks_mcp_server/eks_stack_handler.py +661 -0
- awslabs/eks_mcp_server/iam_handler.py +359 -0
- awslabs/eks_mcp_server/k8s_apis.py +506 -0
- awslabs/eks_mcp_server/k8s_client_cache.py +164 -0
- awslabs/eks_mcp_server/k8s_handler.py +1151 -0
- awslabs/eks_mcp_server/logging_helper.py +52 -0
- awslabs/eks_mcp_server/models.py +271 -0
- awslabs/eks_mcp_server/server.py +151 -0
- awslabs/eks_mcp_server/templates/eks-templates/eks-with-vpc.yaml +454 -0
- awslabs/eks_mcp_server/templates/k8s-templates/deployment.yaml +49 -0
- awslabs/eks_mcp_server/templates/k8s-templates/service.yaml +18 -0
- awslabs_eks_mcp_server-0.1.1.dist-info/METADATA +596 -0
- awslabs_eks_mcp_server-0.1.1.dist-info/RECORD +23 -0
- awslabs_eks_mcp_server-0.1.1.dist-info/WHEEL +4 -0
- awslabs_eks_mcp_server-0.1.1.dist-info/entry_points.txt +2 -0
- awslabs_eks_mcp_server-0.1.1.dist-info/licenses/LICENSE +175 -0
- awslabs_eks_mcp_server-0.1.1.dist-info/licenses/NOTICE +2 -0
awslabs/eks_mcp_server/cloudwatch_handler.py
@@ -0,0 +1,670 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
# and limitations under the License.

"""CloudWatch handler for the EKS MCP Server."""

import datetime
import json
import time
from awslabs.eks_mcp_server.aws_helper import AwsHelper
from awslabs.eks_mcp_server.logging_helper import LogLevel, log_with_request_id
from awslabs.eks_mcp_server.models import CloudWatchLogsResponse, CloudWatchMetricsResponse
from mcp.server.fastmcp import Context
from mcp.types import TextContent
from pydantic import Field
from typing import Optional, Union

class CloudWatchHandler:
    """Handler for CloudWatch operations in the EKS MCP Server.

    This class provides tools for retrieving and analyzing CloudWatch logs and metrics
    from EKS clusters, enabling effective monitoring and troubleshooting.
    """

    def __init__(self, mcp, allow_sensitive_data_access=False):
        """Initialize the CloudWatch handler.

        Args:
            mcp: The MCP server instance
            allow_sensitive_data_access: Whether to allow access to sensitive data (default: False)
        """
        self.mcp = mcp
        self.allow_sensitive_data_access = allow_sensitive_data_access

        # Register tools
        self.mcp.tool(name='get_cloudwatch_logs')(self.get_cloudwatch_logs)
        self.mcp.tool(name='get_cloudwatch_metrics')(self.get_cloudwatch_metrics)
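
    # Illustrative only (not part of the released file): server.py is expected to
    # wire this handler up once at startup; the instance name below is a
    # hypothetical placeholder.
    #
    #     from mcp.server.fastmcp import FastMCP
    #
    #     mcp = FastMCP('awslabs.eks-mcp-server')
    #     CloudWatchHandler(mcp, allow_sensitive_data_access=True)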

    def resolve_time_range(
        self,
        start_time: Optional[Union[str, datetime.datetime]] = None,
        end_time: Optional[Union[str, datetime.datetime]] = None,
        minutes: int = 15,
    ) -> tuple:
        """Resolve start and end times for CloudWatch queries.

        This function is public for unit testing purposes.

        Args:
            start_time: Start time as string (ISO format) or datetime object
            end_time: End time as string (ISO format) or datetime object
            minutes: Number of minutes to look back if start_time is not provided

        Returns:
            Tuple of (start_datetime, end_datetime)
        """
        # Handle end_time
        if end_time is None:
            end_dt = datetime.datetime.now()
        elif isinstance(end_time, str):
            end_dt = datetime.datetime.fromisoformat(end_time)
        else:
            end_dt = end_time

        # Handle start_time
        if start_time is None:
            start_dt = end_dt - datetime.timedelta(minutes=minutes)
        elif isinstance(start_time, str):
            start_dt = datetime.datetime.fromisoformat(start_time)
        else:
            start_dt = start_time

        return start_dt, end_dt
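
    # Illustrative only (not part of the released file): with no explicit times,
    # resolve_time_range returns a window ending "now", e.g.
    #
    #     start_dt, end_dt = handler.resolve_time_range(minutes=30)
    #     # -> (now - 30 minutes, now)
    #     start_dt, end_dt = handler.resolve_time_range(
    #         start_time='2023-01-01T00:00:00', end_time='2023-01-01T01:00:00'
    #     )
    #     # -> (datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 1, 1, 0))
    #
    # Caveat worth knowing: datetime.fromisoformat() accepts a trailing 'Z' only
    # on Python 3.11+, so ISO strings like '2023-01-01T00:00:00Z' from the tool
    # docs may need the suffix stripped on older interpreters.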

    async def get_cloudwatch_logs(
        self,
        ctx: Context,
        resource_type: str = Field(
            ...,
            description='Resource type to search logs for. Valid values: "pod", "node", "container". This determines how logs are filtered.',
        ),
        resource_name: str = Field(
            ...,
            description='Resource name to search for in log messages (e.g., pod name, node name, container name). Used to filter logs for the specific resource.',
        ),
        cluster_name: str = Field(
            ...,
            description='Name of the EKS cluster where the resource is located. Used to construct the CloudWatch log group name.',
        ),
        log_type: str = Field(
            ...,
            description="""Log type to query. Options:
- "application": Container/application logs
- "host": Node-level system logs
- "performance": Performance metrics logs
- "dataplane": Data plane component logs
- "control-plane": EKS control plane logs
- Or provide a custom CloudWatch log group name directly""",
        ),
        minutes: int = Field(
            15,
            description='Number of minutes to look back for logs. Default: 15. Ignored if start_time is provided. Use smaller values for recent issues, larger values for historical analysis.',
        ),
        start_time: Optional[str] = Field(
            None,
            description='Start time in ISO format (e.g., "2023-01-01T00:00:00Z"). If provided, overrides the minutes parameter. IMPORTANT: Use this for precise time ranges.',
        ),
        end_time: Optional[str] = Field(
            None,
            description='End time in ISO format (e.g., "2023-01-01T01:00:00Z"). If not provided, defaults to current time. IMPORTANT: Use with start_time for precise time ranges.',
        ),
        limit: int = Field(
            50,
            description='Maximum number of log entries to return. Use lower values (10-50) for faster queries, higher values (100-1000) for more comprehensive results. IMPORTANT: Higher values may impact performance.',
        ),
        filter_pattern: Optional[str] = Field(
            None,
            description='Additional CloudWatch Logs filter pattern to apply. Uses CloudWatch Logs Insights syntax (e.g., "ERROR", "field=value"). IMPORTANT: Use this to narrow down results for specific issues.',
        ),
        fields: Optional[str] = Field(
            None,
            description='Custom fields to include in the query results (defaults to "@timestamp, @message"). Use CloudWatch Logs Insights field syntax. IMPORTANT: Only specify if you need fields beyond the default timestamp and message.',
        ),
    ) -> CloudWatchLogsResponse:
        """Get logs from CloudWatch for a specific resource.

        This tool retrieves logs from CloudWatch for Kubernetes resources in an EKS cluster,
        allowing you to analyze application behavior, troubleshoot issues, and monitor system
        health. It supports filtering by resource type, time range, and content for troubleshooting
        application errors, investigating security incidents, and analyzing startup configuration issues.

        ## Requirements
        - The server must be run with the `--allow-sensitive-data-access` flag
        - The EKS cluster must have CloudWatch logging enabled
        - The resource must exist in the specified cluster

        ## Response Information
        The response includes resource details (type, name, cluster), log group information,
        time range queried, and formatted log entries with timestamps and messages.

        ## Usage Tips
        - Start with a small time range (15-30 minutes) and expand if needed
        - Use filter_pattern to narrow down results (e.g., "ERROR", "exception")
        - For JSON logs, the tool automatically parses nested structures
        - Combine with get_k8s_events for comprehensive troubleshooting

        Args:
            ctx: MCP context
            resource_type: Resource type (pod, node, container)
            resource_name: Resource name to search for in log messages
            cluster_name: Name of the EKS cluster
            log_type: Log type (application, host, performance, dataplane, control-plane, or custom)
            minutes: Number of minutes to look back
            start_time: Start time in ISO format (overrides minutes)
            end_time: End time in ISO format (defaults to now)
            limit: Maximum number of log entries to return
            filter_pattern: Additional CloudWatch Logs filter pattern
            fields: Custom fields to include in the query results

        Returns:
            CloudWatchLogsResponse with log entries and resource information
        """
        try:
            # Check if sensitive data access is allowed
            if not self.allow_sensitive_data_access:
                error_message = (
                    'Access to CloudWatch logs requires --allow-sensitive-data-access flag'
                )
                log_with_request_id(ctx, LogLevel.ERROR, error_message)
                return CloudWatchLogsResponse(
                    isError=True,
                    content=[TextContent(type='text', text=error_message)],
                    resource_type=resource_type,
                    resource_name=resource_name,
                    cluster_name=cluster_name,
                    log_type=log_type,
                    log_group='',
                    start_time='',
                    end_time='',
                    log_entries=[],
                )

            start_dt, end_dt = self.resolve_time_range(start_time, end_time, minutes)

            # Create CloudWatch Logs client
            logs = AwsHelper.create_boto3_client('logs')

            # Determine the log group based on log_type
            known_types = {'application', 'host', 'performance', 'dataplane'}
            if log_type in known_types:
                log_group = f'/aws/containerinsights/{cluster_name}/{log_type}'
            elif log_type == 'control-plane':
                log_group = f'/aws/eks/{cluster_name}/cluster'
            else:
                log_group = log_type  # Assume user passed full log group name

            # Determine fields to include
            query_fields = fields if fields else '@timestamp, @message'

            # Construct the base query
            query = f"""
            fields {query_fields}
            | filter @message like '{resource_name}'
            """

            # Add additional filter pattern if provided
            if filter_pattern:
                query += f'\n| {filter_pattern}'

            # Add sorting and limit
            query += f'\n| sort @timestamp desc\n| limit {limit}'

            log_with_request_id(
                ctx,
                LogLevel.INFO,
                f'Starting CloudWatch Logs query for {resource_type} {resource_name} in cluster {cluster_name}',
                log_group=log_group,
                start_time=start_dt.isoformat(),
                end_time=end_dt.isoformat(),
            )

            # Start the query
            start_query_response = logs.start_query(
                logGroupName=log_group,
                startTime=int(start_dt.timestamp()),
                endTime=int(end_dt.timestamp()),
                queryString=query,
            )

            query_id = start_query_response['queryId']

            # Poll for results
            query_response = self._poll_query_results(
                ctx, logs, query_id, resource_type, resource_name
            )

            # Process results
            results = query_response['results']
            log_entries = []

            for result in results:
                entry = self._build_log_entry(result)
                log_entries.append(entry)

            log_with_request_id(
                ctx,
                LogLevel.INFO,
                f'Retrieved {len(log_entries)} log entries for {resource_type} {resource_name}',
            )

            # Return the results
            return CloudWatchLogsResponse(
                isError=False,
                content=[
                    TextContent(
                        type='text',
                        text=f'Successfully retrieved {len(log_entries)} log entries for {resource_type} {resource_name} in cluster {cluster_name}',
                    )
                ],
                resource_type=resource_type,
                resource_name=resource_name,
                cluster_name=cluster_name,
                log_type=log_type,
                log_group=log_group,
                start_time=start_dt.isoformat(),
                end_time=end_dt.isoformat(),
                log_entries=log_entries,
            )

        except Exception as e:
            error_message = f'Failed to get logs for {resource_type} {resource_name}: {str(e)}'
            log_with_request_id(ctx, LogLevel.ERROR, error_message)

            return CloudWatchLogsResponse(
                isError=True,
                content=[TextContent(type='text', text=error_message)],
                resource_type=resource_type,
                resource_name=resource_name,
                cluster_name=cluster_name,
                log_type=log_type,
                log_group='',
                start_time='',
                end_time='',
                log_entries=[],
            )
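
    # Illustrative sketch only (not part of the released file): a direct call to
    # the tool as registered above. The cluster, pod, and handler names are
    # hypothetical placeholders.
    #
    #     response = await handler.get_cloudwatch_logs(
    #         ctx,
    #         resource_type='pod',
    #         resource_name='web-7d9c6b5f4-abcde',
    #         cluster_name='demo-cluster',
    #         log_type='application',
    #         minutes=30,
    #         filter_pattern='ERROR',
    #     )
    #     for entry in response.log_entries:
    #         print(entry['timestamp'], entry['message'])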

    async def get_cloudwatch_metrics(
        self,
        ctx: Context,
        resource_type: str = Field(
            ...,
            description='Resource type to retrieve metrics for. Valid values: "pod", "node", "container", "cluster", "service". Determines the CloudWatch dimensions.',
        ),
        resource_name: str = Field(
            ...,
            description='Name of the resource to retrieve metrics for (e.g., pod name, node name). Used as a dimension value in CloudWatch.',
        ),
        cluster_name: str = Field(
            ...,
            description='Name of the EKS cluster where the resource is located. Used as the ClusterName dimension in CloudWatch.',
        ),
        metric_name: str = Field(
            ...,
            description="""Metric name to retrieve. Common examples:
- cpu_usage_total: Total CPU usage
- memory_rss: Resident Set Size memory usage
- network_rx_bytes: Network bytes received
- network_tx_bytes: Network bytes transmitted""",
        ),
        namespace: str = Field(
            ...,
            description="""CloudWatch namespace where the metric is stored. Common values:
- "ContainerInsights": For container metrics
- "AWS/EC2": For EC2 instance metrics
- "AWS/EKS": For EKS control plane metrics""",
        ),
        k8s_namespace: str = Field(
            'default',
            description='Kubernetes namespace for the resource. Used as the Namespace dimension in CloudWatch. Default: "default"',
        ),
        minutes: int = Field(
            15,
            description='Number of minutes to look back for metrics. Default: 15. Ignored if start_time is provided. IMPORTANT: Choose a time range appropriate for the metric resolution.',
        ),
        start_time: Optional[str] = Field(
            None,
            description='Start time in ISO format (e.g., "2023-01-01T00:00:00Z"). If provided, overrides the minutes parameter. IMPORTANT: Use this for precise historical analysis.',
        ),
        end_time: Optional[str] = Field(
            None,
            description='End time in ISO format (e.g., "2023-01-01T01:00:00Z"). If not provided, defaults to current time. IMPORTANT: Use with start_time for precise time ranges.',
        ),
        limit: int = Field(
            50,
            description='Maximum number of data points to return. Higher values (100-1000) provide more granular data but may impact performance. IMPORTANT: Balance between granularity and performance.',
        ),
        period: int = Field(
            60,
            description='Period in seconds for the metric data points. Default: 60 (1 minute). Lower values (1-60) provide higher resolution but may be less available. IMPORTANT: Match to your monitoring needs.',
        ),
        stat: str = Field(
            'Average',
            description="""Statistic to use for the metric aggregation:
- Average: Mean value during the period
- Sum: Total value during the period
- Maximum: Highest value during the period
- Minimum: Lowest value during the period
- SampleCount: Number of samples during the period""",
        ),
        custom_dimensions: Optional[dict] = Field(
            None,
            description='Custom dimensions to use instead of the default ones. Provide as a dictionary of dimension name-value pairs. IMPORTANT: Only use this if you need to override the standard dimensions.',
        ),
    ) -> CloudWatchMetricsResponse:
        """Get metrics from CloudWatch for a specific resource.

        This tool retrieves metrics from CloudWatch for Kubernetes resources in an EKS cluster,
        allowing you to monitor performance, resource utilization, and system health. It supports
        various resource types and metrics with flexible time ranges and aggregation options for
        monitoring CPU/memory usage, analyzing network traffic, and identifying performance bottlenecks.

        ## Requirements
        - The EKS cluster must have CloudWatch Container Insights enabled
        - The resource must exist in the specified cluster
        - The metric must be available in the specified namespace

        ## Response Information
        The response includes resource details (type, name, cluster), metric information (name, namespace),
        time range queried, and data points with timestamps and values.

        ## Usage Tips
        - Use appropriate statistics for different metrics (e.g., Average for CPU, Maximum for memory spikes)
        - Match the period to your analysis needs (smaller for detailed graphs, larger for trends)
        - For rate metrics like network traffic, Sum is often more useful than Average
        - Combine with get_cloudwatch_logs to correlate metrics with log events

        Args:
            ctx: MCP context
            resource_type: Resource type (pod, node, container, cluster, service)
            resource_name: Resource name
            cluster_name: Name of the EKS cluster
            metric_name: Metric name (e.g., cpu_usage_total, memory_rss)
            namespace: CloudWatch namespace
            k8s_namespace: Kubernetes namespace for the resource
            minutes: Number of minutes to look back
            start_time: Start time in ISO format (overrides minutes)
            end_time: End time in ISO format (defaults to now)
            limit: Maximum number of data points to return
            period: Period in seconds for the metric data points
            stat: Statistic to use for the metric
            custom_dimensions: Custom dimensions to use instead of defaults

        Returns:
            CloudWatchMetricsResponse with metric data points and resource information
        """
        try:
            start_dt, end_dt = self.resolve_time_range(start_time, end_time, minutes)

            # Create CloudWatch client
            cloudwatch = AwsHelper.create_boto3_client('cloudwatch')

            # Use custom dimensions if provided, otherwise determine based on resource_type
            dimensions = {}

            if isinstance(custom_dimensions, dict):
                # Use the provided custom dimensions directly
                dimensions = custom_dimensions
            elif custom_dimensions is not None and not hasattr(custom_dimensions, 'default'):
                # Try to convert to dict if possible
                try:
                    dimensions = dict(custom_dimensions)
                except (TypeError, ValueError):
                    # If conversion fails, use default dimensions
                    dimensions = {'ClusterName': cluster_name}
            else:
                # Set default dimensions based on resource type
                dimensions['ClusterName'] = cluster_name
                dimensions['Namespace'] = k8s_namespace

                if resource_type == 'pod':
                    dimensions['PodName'] = resource_name
                elif resource_type == 'node':
                    dimensions['NodeName'] = resource_name
                elif resource_type == 'container':
                    dimensions['ContainerName'] = resource_name
                elif resource_type == 'service':
                    dimensions['Service'] = resource_name

            log_with_request_id(
                ctx,
                LogLevel.INFO,
                f'Getting CloudWatch metrics for {resource_type} {resource_name} in cluster {cluster_name}',
                metric_name=metric_name,
                namespace=namespace,
                start_time=start_dt.isoformat(),
                end_time=end_dt.isoformat(),
            )

            # Create the metric data query
            metric_data_query = {
                'Id': 'm1',
                'ReturnData': True,
            }

            # Convert dimensions to the format expected by CloudWatch
            dimension_list = [{'Name': k, 'Value': v} for k, v in dimensions.items()]

            # Create the metric definition
            metric_def = {
                'Namespace': namespace,
                'MetricName': metric_name,
                'Dimensions': dimension_list,
            }

            # Create the metric stat with the appropriate statistics
            # Handle the case where period/stat is a Field object
            period_value = period if isinstance(period, int) else period.default
            stat_value = stat if isinstance(stat, str) else stat.default

            # Create the metric stat
            metric_stat = {'Metric': metric_def, 'Period': period_value, 'Stat': stat_value}

            # Add the metric stat to the query
            metric_data_query['MetricStat'] = metric_stat

            # Get metric data
            response = cloudwatch.get_metric_data(
                MetricDataQueries=[metric_data_query],
                StartTime=start_dt,
                EndTime=end_dt,
                MaxDatapoints=limit,
            )

            # Process results
            metric_data = response['MetricDataResults'][0]
            timestamps = [ts.isoformat() for ts in metric_data.get('Timestamps', [])]
            values = metric_data.get('Values', [])

            # Create data points
            data_points = []
            for i in range(len(timestamps)):
                if i < len(values):
                    data_points.append({'timestamp': timestamps[i], 'value': values[i]})

            log_with_request_id(
                ctx,
                LogLevel.INFO,
                f'Retrieved {len(data_points)} metric data points for {resource_type} {resource_name}',
            )

            # Return the results
            return CloudWatchMetricsResponse(
                isError=False,
                content=[
                    TextContent(
                        type='text',
                        text=f'Successfully retrieved {len(data_points)} metric data points for {resource_type} {resource_name} in cluster {cluster_name}',
                    )
                ],
                resource_type=resource_type,
                resource_name=resource_name,
                cluster_name=cluster_name,
                metric_name=metric_name,
                namespace=namespace,
                start_time=start_dt.isoformat(),
                end_time=end_dt.isoformat(),
                data_points=data_points,
            )

        except Exception as e:
            error_message = f'Failed to get metrics for {resource_type} {resource_name}: {str(e)}'
            log_with_request_id(ctx, LogLevel.ERROR, error_message)

            return CloudWatchMetricsResponse(
                isError=True,
                content=[TextContent(type='text', text=error_message)],
                resource_type=resource_type,
                resource_name=resource_name,
                cluster_name=cluster_name,
                metric_name=metric_name,
                namespace=namespace,
                start_time='',
                end_time='',
                data_points=[],
            )
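
    # Illustrative sketch only (not part of the released file): fetching pod CPU
    # from Container Insights. Cluster, pod, and handler names are hypothetical.
    #
    #     response = await handler.get_cloudwatch_metrics(
    #         ctx,
    #         resource_type='pod',
    #         resource_name='web-7d9c6b5f4-abcde',
    #         cluster_name='demo-cluster',
    #         metric_name='pod_cpu_utilization',
    #         namespace='ContainerInsights',
    #         k8s_namespace='default',
    #         period=60,
    #         stat='Average',
    #     )
    #     # The query built above would use dimensions:
    #     # ClusterName=demo-cluster, Namespace=default, PodName=web-7d9c6b5f4-abcde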

    def _poll_query_results(
        self,
        ctx,
        logs_client,
        query_id,
        resource_type,
        resource_name,
        max_attempts=60,
        initial_delay=1,
    ):
        """Poll for CloudWatch Logs query results with exponential backoff.

        Args:
            ctx: MCP context
            logs_client: Boto3 CloudWatch Logs client
            query_id: ID of the query to poll for
            resource_type: Resource type for logging
            resource_name: Resource name for logging
            max_attempts: Maximum number of polling attempts before timing out
            initial_delay: Initial delay between polling attempts in seconds

        Returns:
            Query response when complete

        Raises:
            TimeoutError: If the query does not complete within the maximum number of attempts
        """
        attempts = 0
        delay = initial_delay

        log_with_request_id(
            ctx,
            LogLevel.INFO,
            f'Polling for CloudWatch Logs query results (query_id: {query_id})',
        )

        while attempts < max_attempts:
            query_response = logs_client.get_query_results(queryId=query_id)
            status = query_response.get('status')

            if status == 'Complete':
                log_with_request_id(
                    ctx,
                    LogLevel.INFO,
                    f'CloudWatch Logs query completed successfully after {attempts + 1} attempts',
                )
                return query_response
            elif status == 'Failed':
                error_message = f'CloudWatch Logs query failed for {resource_type} {resource_name}'
                log_with_request_id(ctx, LogLevel.ERROR, error_message)
                raise Exception(error_message)
            elif status == 'Cancelled':
                error_message = (
                    f'CloudWatch Logs query was cancelled for {resource_type} {resource_name}'
                )
                log_with_request_id(ctx, LogLevel.ERROR, error_message)
                raise Exception(error_message)

            # Log progress periodically
            if attempts % 5 == 0:
                log_with_request_id(
                    ctx,
                    LogLevel.INFO,
                    f'Waiting for CloudWatch Logs query to complete (attempt {attempts + 1}/{max_attempts})',
                )

            # Sleep with exponential backoff (capped at 5 seconds)
            time.sleep(min(delay, 5))
            delay = min(delay * 1.5, 5)  # Exponential backoff with a cap
            attempts += 1

        # If we've exhausted all attempts, raise a timeout error
        error_message = f'CloudWatch Logs query timed out after {max_attempts} attempts for {resource_type} {resource_name}'
        log_with_request_id(ctx, LogLevel.ERROR, error_message)
        raise TimeoutError(error_message)
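
    # Rough worst-case bound with the defaults above (a back-of-envelope note, not
    # part of the released file): sleeps run 1, 1.5, 2.25, 3.375 s, then stay capped
    # at 5 s, so 60 attempts wait about 1 + 1.5 + 2.25 + 3.375 + 56 * 5 ≈ 288 s
    # (~4.8 minutes) before TimeoutError is raised.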

    def _build_log_entry(self, result):
        """Build a log entry from CloudWatch Logs query result.

        Args:
            result: A single result from CloudWatch Logs query

        Returns:
            Formatted log entry dictionary
        """
        entry = {}
        for field in result:
            if field['field'] == '@timestamp':
                entry['timestamp'] = field['value']
            elif field['field'] == '@message':
                message = field['value']

                # Clean up the message to make it more human-readable
                message = message.replace('\n', '')
                message = message.replace('\\"', '"')  # unescape embedded quotes

                # Try to parse JSON if the message appears to be JSON
                if message.startswith('{') and message.endswith('}'):
                    try:
                        parsed_json = json.loads(message)

                        # Format any nested JSON structures
                        parsed_json = self._format_nested_json(parsed_json)

                        entry['message'] = parsed_json
                    except json.JSONDecodeError:
                        # If it's not valid JSON, just use the cleaned message
                        entry['message'] = message
                else:
                    # For non-JSON messages, use the cleaned message
                    entry['message'] = message
            else:
                entry[field['field']] = field['value']
        return entry
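
    # Illustrative only (not part of the released file): a CloudWatch Logs Insights
    # result row is a list of {'field': ..., 'value': ...} dicts, e.g.
    #
    #     [{'field': '@timestamp', 'value': '2023-01-01 00:00:00.000'},
    #      {'field': '@message', 'value': '{"level":"error","msg":"boom"}'}]
    #
    # which _build_log_entry turns into
    #
    #     {'timestamp': '2023-01-01 00:00:00.000',
    #      'message': {'level': 'error', 'msg': 'boom'}}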

    def _format_nested_json(self, obj):
        """Format nested JSON objects for better readability.

        Args:
            obj: The JSON object to format

        Returns:
            The formatted JSON object
        """
        if isinstance(obj, dict):
            for key, value in obj.items():
                if isinstance(value, (dict, list)):
                    obj[key] = self._format_nested_json(value)
                elif isinstance(value, str) and value.startswith('{') and value.endswith('}'):
                    try:
                        obj[key] = json.loads(value)
                    except json.JSONDecodeError:
                        pass
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                obj[i] = self._format_nested_json(item)
        return obj