@zerry_jin/k8s-doctor-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.ko.md +330 -0
- package/README.md +330 -0
- package/dist/analyzers/log-analyzer.d.ts +17 -0
- package/dist/analyzers/log-analyzer.js +402 -0
- package/dist/diagnostics/cluster-health.d.ts +13 -0
- package/dist/diagnostics/cluster-health.js +176 -0
- package/dist/diagnostics/pod-diagnostics.d.ts +23 -0
- package/dist/diagnostics/pod-diagnostics.js +654 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +678 -0
- package/dist/types.d.ts +337 -0
- package/dist/types.js +6 -0
- package/dist/utils/cache.d.ts +59 -0
- package/dist/utils/cache.js +99 -0
- package/dist/utils/formatters.d.ts +48 -0
- package/dist/utils/formatters.js +129 -0
- package/dist/utils/k8s-client.d.ts +37 -0
- package/dist/utils/k8s-client.js +74 -0
- package/dist/utils/retry.d.ts +25 -0
- package/dist/utils/retry.js +70 -0
- package/package.json +65 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* K8s Doctor MCP Server
|
|
4
|
+
*
|
|
5
|
+
* MCP server for AI-powered Kubernetes cluster diagnosis and problem solving.
|
|
6
|
+
* Goes beyond simple queries - analyzes error logs, identifies root causes, and suggests solutions.
|
|
7
|
+
*
|
|
8
|
+
* @author zerry
|
|
9
|
+
* @license MIT
|
|
10
|
+
*/
|
|
11
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
12
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
13
|
+
import * as z from 'zod';
|
|
14
|
+
import { loadK8sConfig, createK8sClients } from './utils/k8s-client.js';
|
|
15
|
+
import { diagnosePod, diagnoseCrashLoop } from './diagnostics/pod-diagnostics.js';
|
|
16
|
+
import { analyzeLogs } from './analyzers/log-analyzer.js';
|
|
17
|
+
import { diagnoseClusterHealth } from './diagnostics/cluster-health.js';
|
|
18
|
+
import { formatIssues, formatBytes, formatCPU, getHealthEmoji, createTable } from './utils/formatters.js';
|
|
19
|
+
import { MemoryCache, getOrCompute } from './utils/cache.js';
|
|
20
|
+
// ============================================
// MCP Server Initialization
// ============================================

/** MCP server instance; every tool in this file is registered on it. */
const server = new McpServer({ name: 'k8s-doctor', version: '1.0.0' });

// Kubernetes clients and the loaded kubeconfig. Both start out empty and are
// filled in on first use rather than at import time.
let k8sClients = null;
let k8sConfig = null;

// Value handed to both MemoryCache instances below (30 seconds TTL).
const CACHE_TTL_MS = 30000;

// Caches for the namespace list and the per-namespace pod lists.
const namespaceCache = new MemoryCache(CACHE_TTL_MS);
const podListCache = new MemoryCache(CACHE_TTL_MS);
|
|
33
|
+
/**
 * Returns the shared Kubernetes clients, creating them on first use.
 *
 * The loaded config and the clients are kept in the module-level `k8sConfig`
 * and `k8sClients` variables so later calls reuse them. Both variables are
 * assigned only after both steps succeeded, so a failed attempt leaves no
 * half-initialized state and the next call retries from scratch.
 *
 * @returns {object} The value produced by `createK8sClients`.
 * @throws {Error} When loading the config or creating the clients fails. The
 *   original error is attached as `cause`.
 */
function getK8sClients() {
    if (k8sClients && k8sConfig) {
        return k8sClients;
    }
    try {
        const config = loadK8sConfig();
        const clients = createK8sClients(config);
        k8sConfig = config;
        k8sClients = clients;
        console.error('✅ Kubernetes connection established');
    }
    catch (error) {
        // Tolerate non-Error throws instead of failing on `.message`.
        const reason = error instanceof Error ? error.message : String(error);
        console.error('❌ Kubernetes connection failed:', reason);
        throw new Error(`Cannot connect to Kubernetes: ${reason}\nPlease verify kubectl is configured.`, { cause: error });
    }
    return k8sClients;
}
|
|
50
|
+
/**
 * Tool `diagnose-pod`: comprehensive pod diagnostics.
 *
 * Calls `diagnosePod` and renders its result as a Markdown report: pod
 * identity and health score, summary, container table, CPU/memory figures,
 * detected issues and the five first entries of `diagnostics.events`.
 *
 * Input: `namespace`, `podName`, and `detailed` (accepted by the schema for
 * compatibility; the handler does not read it).
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 *
 * NOTE(review): several heading emoji were unreadable in the reviewed copy of
 * this file and have been reconstructed — confirm against the published package.
 */
server.registerTool('diagnose-pod', {
    title: 'Comprehensive pod diagnostics',
    description: 'Analyzes pod status, logs, and events to identify root causes and suggest solutions',
    inputSchema: {
        namespace: z.string().describe('Namespace'),
        podName: z.string().describe('Pod name'),
        detailed: z.boolean().default(true).describe('Enable detailed analysis (includes logs)'),
    },
}, async ({ namespace, podName }) => {
    try {
        // Resolve the clients once; a connection failure is reported by the catch below.
        const clients = getK8sClients();
        const diagnostics = await diagnosePod(clients.core, namespace, podName, clients.metrics);
        const { podInfo } = diagnostics;
        const { cpu, memory } = diagnostics.resources;

        let result = `# 🏥 Pod Diagnosis Report\n\n`;
        result += `**Pod**: ${podInfo.name}\n`;
        result += `**Namespace**: ${podInfo.namespace}\n`;
        result += `**Status**: ${podInfo.phase}\n`;
        result += `**Node**: ${podInfo.nodeName || 'N/A'}\n`;
        result += `**Health**: ${getHealthEmoji(diagnostics.healthScore)} ${diagnostics.healthScore}/100\n\n`;

        // Summary
        result += `## 📋 Summary\n\n${diagnostics.summary}\n\n`;

        // Container Status
        result += `## 🐳 Container Status\n\n`;
        const containerRows = diagnostics.containers.map((c) => [
            c.name,
            c.ready ? '✅' : '❌',
            c.restartCount.toString(),
            c.state.running ? 'Running' :
                c.state.waiting ? `Waiting: ${c.state.waiting.reason}` :
                    c.state.terminated ? `Terminated: ${c.state.terminated.reason}` : 'Unknown',
        ]);
        result += createTable(['Name', 'Ready', 'Restarts', 'State'], containerRows);
        result += '\n\n';

        // Resource usage
        result += `## 💾 Resources\n\n`;
        result += `**CPU**:\n`;
        if (cpu.current !== undefined) {
            result += ` - Current: ${formatCPU(cpu.current)}`;
            if (cpu.usagePercent !== undefined) {
                const emoji = cpu.usagePercent >= 80 ? ' ⚠️' : '';
                result += ` (${cpu.usagePercent.toFixed(1)}%${emoji})\n`;
            }
            else {
                result += '\n';
            }
        }
        if (cpu.requested) {
            result += ` - Requested: ${formatCPU(cpu.requested)}\n`;
        }
        if (cpu.limit) {
            result += ` - Limit: ${formatCPU(cpu.limit)}\n`;
        }
        if (cpu.isThrottled) {
            result += ` - ⚠️ **WARNING**: CPU usage is high (>80%)\n`;
        }

        result += `\n**Memory**:\n`;
        if (memory.current !== undefined) {
            result += ` - Current: ${formatBytes(memory.current)}`;
            if (memory.usagePercent !== undefined) {
                const emoji = memory.usagePercent >= 90 ? ' 🔴' :
                    memory.usagePercent >= 80 ? ' ⚠️' : '';
                result += ` (${memory.usagePercent.toFixed(1)}%${emoji})\n`;
            }
            else {
                result += '\n';
            }
        }
        if (memory.requested) {
            result += ` - Requested: ${formatBytes(memory.requested)}\n`;
        }
        if (memory.limit) {
            result += ` - Limit: ${formatBytes(memory.limit)}\n`;
        }
        if (memory.isOOMRisk) {
            result += ` - 🔴 **CRITICAL**: OOM risk detected (>90%)\n`;
        }

        // Show the tip only when no usage figures exist at all. A nullish check
        // (not truthiness) keeps a genuine reading of 0 from counting as "missing".
        if (cpu.current == null && memory.current == null) {
            result += `\n💡 **Tip**: Install Metrics Server to see real-time usage:\n`;
            result += '```bash\nkubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml\n```\n';
        }
        result += '\n\n';

        // Issues
        result += formatIssues(diagnostics.issues);

        // Recent Events
        if (diagnostics.events.length > 0) {
            result += `## 📅 Recent Events (last 5)\n\n`;
            for (const event of diagnostics.events.slice(0, 5)) {
                const icon = event.type === 'Warning' ? '⚠️' : 'ℹ️';
                // A missing count is rendered as a single occurrence.
                result += `${icon} **${event.reason}** (${event.count ?? 1} times)\n`;
                result += ` ${event.message}\n\n`;
            }
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ Pod diagnosis failed: ${error.message}\n\nVerify pod exists:\n\`\`\`bash\nkubectl get pod ${podName} -n ${namespace}\n\`\`\``,
                }],
        };
    }
});
|
|
157
|
+
/**
 * Tool `debug-crashloop`: specialized CrashLoopBackOff diagnostics.
 *
 * Calls `diagnoseCrashLoop` for the given pod (optionally one container),
 * prints the issues it returns via `formatIssues`, and appends a list of
 * kubectl commands for manual follow-up.
 *
 * Input: `namespace`, `podName`, optional `containerName`.
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 *
 * NOTE(review): the title emoji was unreadable in the reviewed copy of this
 * file and has been reconstructed — confirm against the published package.
 */
server.registerTool('debug-crashloop', {
    title: 'CrashLoopBackOff Diagnostics',
    description: 'Analyzes pods in CrashLoop state by examining exit codes, logs, and events to find the root cause',
    inputSchema: {
        namespace: z.string().describe('Namespace'),
        podName: z.string().describe('Pod name'),
        containerName: z.string().optional().describe('Container name (optional)'),
    },
}, async ({ namespace, podName, containerName }) => {
    try {
        const clients = getK8sClients();
        const issues = await diagnoseCrashLoop(clients.core, clients.log, namespace, podName, containerName);

        let result = `# 🔄 CrashLoopBackOff Diagnostics\n\n`;
        result += `**Pod**: ${podName}\n`;
        result += `**Namespace**: ${namespace}\n\n`;
        if (issues.length === 0) {
            result += '✅ No CrashLoop issues detected.\n';
        }
        else {
            result += formatIssues(issues);
        }

        // Point the suggested log commands at the container that was analyzed,
        // when the caller named one.
        const containerFlag = containerName ? ` -c ${containerName}` : '';

        // Additional debugging commands
        result += `\n## 🛠️ Additional Debugging Commands\n\n`;
        result += '```bash\n';
        result += `# Check previous logs (important!)\n`;
        result += `kubectl logs ${podName} -n ${namespace}${containerFlag} --previous\n\n`;
        result += `# Check current logs\n`;
        result += `kubectl logs ${podName} -n ${namespace}${containerFlag}\n\n`;
        result += `# Check events\n`;
        result += `kubectl describe pod ${podName} -n ${namespace}\n\n`;
        result += `# Check pod YAML\n`;
        result += `kubectl get pod ${podName} -n ${namespace} -o yaml\n`;
        result += '```\n';
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ CrashLoop diagnostics failed: ${error.message}`,
                }],
        };
    }
});
|
|
206
|
+
/**
 * Tool `analyze-logs`: smart log analysis.
 *
 * Calls `analyzeLogs` and renders its result as Markdown: the summary, each
 * detected pattern with causes/solutions and up to five matched line numbers,
 * up to five repeated errors, the recommendations, and the last ten entries
 * of `analysis.errorLines`.
 *
 * Input: `namespace`, `podName`, optional `containerName`, `tailLines`
 * (default 500).
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 *
 * NOTE(review): some heading emoji were unreadable in the reviewed copy of
 * this file and have been reconstructed — confirm against the published package.
 */
server.registerTool('analyze-logs', {
    title: 'Smart Log Analysis',
    description: 'Detects error patterns in logs and suggests causes and solutions (Connection Refused, OOM, DB errors, etc.)',
    inputSchema: {
        namespace: z.string().describe('Namespace'),
        podName: z.string().describe('Pod name'),
        containerName: z.string().optional().describe('Container name (optional)'),
        tailLines: z.number().default(500).describe('Number of recent lines to analyze'),
    },
}, async ({ namespace, podName, containerName, tailLines }) => {
    try {
        const analysis = await analyzeLogs(getK8sClients().log, namespace, podName, containerName, tailLines);

        let result = `# 🔍 Log Analysis Results\n\n`;
        result += `${analysis.summary}\n\n`;

        // Detected patterns
        if (analysis.patterns.length > 0) {
            result += `## 🎯 Detected Error Patterns\n\n`;
            for (const pattern of analysis.patterns) {
                const matchCount = pattern.matchedLines.length;
                result += `### ${pattern.name} (${matchCount} occurrences)\n\n`;
                result += `**Description**: ${pattern.description}\n\n`;
                result += `**Possible Causes**:\n`;
                for (const cause of pattern.possibleCauses) {
                    result += ` - ${cause}\n`;
                }
                result += `\n**Solutions**:\n`;
                for (const solution of pattern.solutions) {
                    result += ` - ${solution}\n`;
                }
                // List at most five locations and summarize the remainder.
                result += `\n**Locations**: lines ${pattern.matchedLines.slice(0, 5).join(', ')}`;
                if (matchCount > 5) {
                    result += ` and ${matchCount - 5} more`;
                }
                result += '\n\n---\n\n';
            }
        }

        // Repeated errors
        if (analysis.repeatedErrors.length > 0) {
            result += `## 🔁 Repeated Errors\n\n`;
            for (const repeated of analysis.repeatedErrors.slice(0, 5)) {
                result += `- **${repeated.message}** (${repeated.count} times)\n`;
                result += ` Lines ${repeated.firstLine} ~ ${repeated.lastLine}\n\n`;
            }
        }

        // Recommendations
        result += `## 💡 Recommendations\n\n`;
        for (const rec of analysis.recommendations) {
            result += `${rec}\n\n`;
        }

        // Error log samples
        if (analysis.errorLines.length > 0) {
            result += `\n## ❌ Error Log Samples (last 10)\n\n\`\`\`\n`;
            for (const line of analysis.errorLines.slice(-10)) {
                result += `${line.lineNumber}: ${line.content}\n`;
            }
            result += '```\n';
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ Log analysis failed: ${error.message}`,
                }],
        };
    }
});
|
|
279
|
+
/**
 * Converts a Kubernetes CPU quantity string to millicores.
 *
 * Recognizes the `n` (nano), `u` (micro) and `m` (milli) suffixes; a value
 * without a suffix is treated as whole cores.
 *
 * @param {string} quantity CPU quantity such as "250m", "0.5" or "12345678n".
 * @returns {number} Millicores, or NaN when the numeric part cannot be parsed.
 */
function parseCpuMillicores(quantity) {
    const text = String(quantity).trim();
    // parseFloat stops at the first non-numeric character, i.e. at the suffix.
    const amount = parseFloat(text);
    switch (text.slice(-1)) {
        case 'n': return amount / 1_000_000;
        case 'u': return amount / 1_000;
        case 'm': return amount;
        default: return amount * 1000;
    }
}

/**
 * Tool `check-resources`: resource usage check.
 *
 * Reads one pod (when `podName` is given) or every pod of the namespace,
 * tries to fetch live usage via `metrics.getPodMetrics`, and renders per pod:
 * current usage with percentage of the summed container limits, a table of
 * the declared requests/limits, and warnings for missing limits, CPU >= 80%
 * and memory >= 80% / >= 90% of the limit.
 *
 * Metrics are best effort: when the metrics call fails the report falls back
 * to spec values only and suggests installing Metrics Server.
 *
 * Input: `namespace`, optional `podName`.
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 */
server.registerTool('check-resources', {
    title: 'Resource Usage Check',
    description: 'Compares pod CPU/Memory usage against limits to check for threshold violations',
    inputSchema: {
        namespace: z.string().describe('Namespace'),
        podName: z.string().optional().describe('Specific pod (optional, entire namespace if empty)'),
    },
}, async ({ namespace, podName }) => {
    try {
        const clients = getK8sClients();
        const pods = podName
            ? [await clients.core.readNamespacedPod({ name: podName, namespace })]
            : (await clients.core.listNamespacedPod({ namespace })).items;

        // Try to get metrics: pod name -> { cpu (millicores), memory (bytes) }
        const metricsMap = new Map();
        let metricsAvailable = false;
        try {
            const metrics = await clients.metrics.getPodMetrics(namespace);
            for (const podMetric of metrics.items || []) {
                const name = podMetric.metadata?.name;
                if (!name) {
                    continue;
                }
                // Sum container metrics for each pod. Usage strings are parsed
                // by unit suffix; unparseable values are skipped so one bad
                // entry cannot turn the pod total into NaN.
                let totalCpu = 0;
                let totalMem = 0;
                for (const container of podMetric.containers || []) {
                    if (container.usage?.cpu) {
                        const cpu = parseCpuMillicores(container.usage.cpu);
                        if (Number.isFinite(cpu)) {
                            totalCpu += cpu;
                        }
                    }
                    if (container.usage?.memory) {
                        const mem = parseMemoryValue(container.usage.memory);
                        if (Number.isFinite(mem)) {
                            totalMem += mem;
                        }
                    }
                }
                metricsMap.set(name, { cpu: totalCpu, memory: totalMem });
            }
            metricsAvailable = metricsMap.size > 0;
        }
        catch {
            // Deliberate best effort: Metrics Server not available, so the
            // report continues with spec values only.
        }

        let result = `# 💾 Resource Usage Check\n\n`;
        if (metricsAvailable) {
            result += `✅ **Real-time metrics available**\n\n`;
        }
        else {
            result += `⚠️ **Metrics Server not available** - showing only spec values\n\n`;
        }

        for (const pod of pods) {
            const containers = pod.spec?.containers || [];
            const podMetrics = metricsMap.get(pod.metadata?.name || '');
            result += `## Pod: ${pod.metadata?.name}\n\n`;

            // Sum the declared limits and collect the spec table rows in one pass.
            let totalCpuLimit = 0;
            let totalMemLimit = 0;
            const rows = [];
            for (const container of containers) {
                const requests = container.resources?.requests || {};
                const limits = container.resources?.limits || {};
                if (limits.cpu) {
                    totalCpuLimit += parseCpuMillicores(limits.cpu);
                }
                if (limits.memory) {
                    totalMemLimit += parseMemoryValue(limits.memory);
                }
                rows.push([
                    container.name,
                    requests.cpu || 'N/A',
                    limits.cpu || '⚠️ None',
                    requests.memory || 'N/A',
                    limits.memory || '⚠️ None',
                ]);
            }

            // Percent of limit; undefined when there is no usage or no limit.
            const cpuPercent = podMetrics && totalCpuLimit > 0
                ? (podMetrics.cpu / totalCpuLimit) * 100
                : undefined;
            const memPercent = podMetrics && totalMemLimit > 0
                ? (podMetrics.memory / totalMemLimit) * 100
                : undefined;

            // Show current usage if available
            if (podMetrics) {
                result += `**Current Usage**:\n`;
                result += ` - CPU: ${formatCPU(podMetrics.cpu)}`;
                if (cpuPercent !== undefined) {
                    const emoji = cpuPercent >= 80 ? ' ⚠️' : '';
                    result += ` (${cpuPercent.toFixed(1)}%${emoji})`;
                }
                result += '\n';
                result += ` - Memory: ${formatBytes(podMetrics.memory)}`;
                if (memPercent !== undefined) {
                    const emoji = memPercent >= 90 ? ' 🔴' : memPercent >= 80 ? ' ⚠️' : '';
                    result += ` (${memPercent.toFixed(1)}%${emoji})`;
                }
                result += '\n\n';
            }

            result += `**Resource Specs**:\n`;
            result += createTable(['Container', 'CPU Request', 'CPU Limit', 'Memory Request', 'Memory Limit'], rows);
            result += '\n';

            // Warnings
            const noLimits = containers.filter((c) => !c.resources?.limits);
            if (noLimits.length > 0) {
                result += `\n⚠️ **Warning**: ${noLimits.length} container(s) have no resource limits set\n`;
                result += `This can lead to unlimited resource consumption.\n\n`;
            }

            // Threshold warnings
            if (cpuPercent !== undefined && cpuPercent >= 80) {
                result += `⚠️ **CPU Warning**: Usage is high (${cpuPercent.toFixed(1)}%)\n`;
            }
            if (memPercent !== undefined) {
                if (memPercent >= 90) {
                    result += `🔴 **Memory Critical**: OOM risk detected (${memPercent.toFixed(1)}%)\n`;
                }
                else if (memPercent >= 80) {
                    result += `⚠️ **Memory Warning**: Usage is high (${memPercent.toFixed(1)}%)\n`;
                }
            }
        }

        if (!metricsAvailable) {
            result += `\n💡 **Tip**: Install Metrics Server to see real-time usage:\n`;
            result += '```bash\nkubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml\n```\n';
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ Resource check failed: ${error.message}`,
                }],
        };
    }
});
|
|
429
|
+
/**
 * Tool `full-diagnosis`: cluster-wide health diagnosis.
 *
 * Calls `diagnoseClusterHealth` (optionally limited to one namespace) and
 * renders node counts, pod counts, critical issues and recommendations as
 * Markdown.
 *
 * Input: optional `namespace`.
 * Output: a single text content item. Failures are logged with
 * `console.error` and reported as text (message plus stack or a serialized
 * form of the error) rather than thrown.
 *
 * NOTE(review): some emoji were unreadable in the reviewed copy of this file
 * and have been reconstructed — confirm against the published package.
 */
server.registerTool('full-diagnosis', {
    title: 'Cluster-wide Health Diagnosis',
    description: 'Comprehensively analyzes cluster nodes, pods, and resources to evaluate health',
    inputSchema: {
        namespace: z.string().optional().describe('Specific namespace only (optional, all if empty)'),
    },
}, async ({ namespace }) => {
    try {
        const health = await diagnoseClusterHealth(getK8sClients().core, namespace);
        const { nodeHealth, podHealth } = health;

        let result = `# 🏥 Cluster Health Diagnosis\n\n`;
        result += `${health.summary}\n\n`;

        // Node Health
        result += `## 🖥️ Node Status\n\n`;
        result += `- Total: ${nodeHealth.total}\n`;
        result += `- Ready: ${nodeHealth.ready} ✅\n`;
        if (nodeHealth.notReady > 0) {
            result += `- Not Ready: ${nodeHealth.notReady} ❌\n`;
        }
        result += '\n';

        // Pod Health
        result += `## 🐳 Pod Status\n\n`;
        result += `- Total: ${podHealth.total}\n`;
        result += `- Running: ${podHealth.running} ✅\n`;
        if (podHealth.pending > 0) {
            result += `- Pending: ${podHealth.pending} ⏳\n`;
        }
        if (podHealth.failed > 0) {
            result += `- Failed: ${podHealth.failed} ❌\n`;
        }
        if (podHealth.crashLooping > 0) {
            result += `- CrashLoop: ${podHealth.crashLooping} 💥\n`;
        }
        result += '\n';

        // Critical issues
        if (health.criticalIssues.length > 0) {
            result += `## 🔴 Critical Issues\n\n`;
            result += formatIssues(health.criticalIssues);
        }

        // Recommendations
        result += `## 💡 Recommendations\n\n`;
        for (const rec of health.recommendations) {
            result += `${rec}\n\n`;
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        console.error('Cluster diagnosis error:', error);
        // Build the detail text defensively: JSON.stringify throws on circular
        // structures and BigInt, and a throw here would escape the handler.
        let details = error?.stack;
        if (!details) {
            try {
                details = JSON.stringify(error, null, 2);
            }
            catch {
                details = String(error);
            }
        }
        return {
            content: [{
                    type: 'text',
                    text: `❌ Cluster diagnosis failed: ${error?.message}\n\nDetails: ${details}`,
                }],
        };
    }
});
|
|
489
|
+
/**
 * Tool `check-events`: event query and analysis.
 *
 * Lists the events of a namespace (optionally filtered to one involved object
 * by name), sorts them newest first, and renders up to 20 Warning events and,
 * when `showNormal` is set, up to 10 Normal events.
 *
 * Input: `namespace`, optional `resourceName`, `showNormal` (default false).
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 *
 * NOTE(review): the title emoji was unreadable in the reviewed copy of this
 * file and has been reconstructed — confirm against the published package.
 */
server.registerTool('check-events', {
    title: 'Event Query and Analysis',
    description: 'Queries events for specific resources or namespaces and analyzes Warning events',
    inputSchema: {
        namespace: z.string().describe('Namespace'),
        resourceName: z.string().optional().describe('Resource name (optional, entire namespace if empty)'),
        showNormal: z.boolean().default(false).describe('Show Normal events too'),
    },
}, async ({ namespace, resourceName, showNormal }) => {
    // Best available timestamp of an event, or undefined when it has none.
    const timestampOf = (event) => event.lastTimestamp || event.eventTime || event.metadata?.creationTimestamp;
    // Sort key in epoch milliseconds. Events without a parseable timestamp get
    // 0 so the comparator never yields NaN and they sort last.
    const sortKey = (event) => {
        const stamp = timestampOf(event);
        const millis = stamp ? new Date(stamp).getTime() : NaN;
        return Number.isNaN(millis) ? 0 : millis;
    };
    try {
        const eventsResponse = await getK8sClients().core.listNamespacedEvent({
            namespace,
            fieldSelector: resourceName ? `involvedObject.name=${resourceName}` : undefined,
        });
        const events = eventsResponse.items;
        // Sort by time (newest first)
        events.sort((a, b) => sortKey(b) - sortKey(a));

        let result = `# 📅 Event Analysis\n\n`;
        result += `**Namespace**: ${namespace}\n`;
        if (resourceName) {
            result += `**Resource**: ${resourceName}\n`;
        }
        result += `\n`;

        const warnings = events.filter((e) => e.type === 'Warning');
        const normals = events.filter((e) => e.type === 'Normal');
        result += `Total ${events.length} events (Warning: ${warnings.length}, Normal: ${normals.length})\n\n`;

        // Warning events
        if (warnings.length > 0) {
            result += `## ⚠️ Warning Events\n\n`;
            for (const event of warnings.slice(0, 20)) {
                result += `**${event.reason}** (${event.count || 1} times)\n`;
                result += ` - ${event.message}\n`;
                result += ` - Target: ${event.involvedObject?.kind}/${event.involvedObject?.name}\n`;
                result += ` - Time: ${timestampOf(event)}\n\n`;
            }
        }
        else {
            result += `✅ No Warning events!\n\n`;
        }

        // Normal events (optional)
        if (showNormal && normals.length > 0) {
            result += `## ℹ️ Normal Events (last 10)\n\n`;
            for (const event of normals.slice(0, 10)) {
                result += `- **${event.reason}**: ${event.message}\n`;
            }
            result += '\n';
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ Event query failed: ${error.message}`,
                }],
        };
    }
});
|
|
553
|
+
/**
 * Tool `list-namespaces`: lists all namespaces in the cluster.
 *
 * The namespace list is obtained through `getOrCompute` on `namespaceCache`
 * under the key 'all-namespaces'; the loader calls `core.listNamespace()`.
 * Each namespace is printed with its phase; anything other than `Active` is
 * marked with a cross.
 *
 * Input: none.
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 *
 * NOTE(review): the title emoji was unreadable in the reviewed copy of this
 * file and has been reconstructed — confirm against the published package.
 */
server.registerTool('list-namespaces', {
    title: 'List Namespaces',
    description: 'Lists all namespaces in the cluster',
    inputSchema: {},
}, async () => {
    try {
        // Use cache for namespace list
        const namespaces = await getOrCompute(namespaceCache, 'all-namespaces', async () => {
            const nsResponse = await getK8sClients().core.listNamespace();
            return nsResponse.items;
        });

        let result = `# 📁 Namespace List\n\n`;
        result += `Total: ${namespaces.length}\n\n`;
        for (const ns of namespaces) {
            const status = ns.status?.phase || 'Unknown';
            const icon = status === 'Active' ? '✅' : '❌';
            result += `${icon} **${ns.metadata?.name}** (${status})\n`;
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ Namespace query failed: ${error.message}`,
                }],
        };
    }
});
|
|
587
|
+
/**
 * Tool `list-pods`: lists the pods of a namespace.
 *
 * The pod list is obtained through `getOrCompute` on `podListCache` under the
 * key `pods-<namespace>`; the loader calls `core.listNamespacedPod`.
 *
 * By default only problematic pods are shown. A pod counts as healthy when
 * its phase is `Succeeded`, or when it is `Running` with no restarts and
 * every reported container ready. `showAll` lists every pod.
 *
 * Input: `namespace`, `showAll` (default false).
 * Output: a single text content item. Failures are reported as text rather
 * than thrown.
 *
 * NOTE(review): some emoji were unreadable in the reviewed copy of this file
 * and have been reconstructed — confirm against the published package.
 */
server.registerTool('list-pods', {
    title: 'List Pods',
    description: 'Lists all pods in a specific namespace',
    inputSchema: {
        namespace: z.string().describe('Namespace'),
        showAll: z.boolean().default(false).describe('Show all pods (default shows only problematic pods)'),
    },
}, async ({ namespace, showAll }) => {
    try {
        // Use cache for pod list per namespace
        const pods = await getOrCompute(podListCache, `pods-${namespace}`, async () => {
            const podsResponse = await getK8sClients().core.listNamespacedPod({ namespace });
            return podsResponse.items;
        });

        let result = `# 🐳 Pod List (${namespace})\n\n`;
        const rows = [];
        for (const pod of pods) {
            const phase = pod.status?.phase || 'Unknown';
            const statuses = pod.status?.containerStatuses ?? [];
            const restarts = statuses.reduce((sum, c) => sum + (c.restartCount || 0), 0);
            const ready = statuses.filter((c) => c.ready).length;
            const total = statuses.length;

            // Filter problematic pods. Completed pods are not a problem, while
            // a Running pod with unready containers is.
            const healthy = phase === 'Succeeded' ||
                (phase === 'Running' && restarts === 0 && ready === total);
            if (!showAll && healthy) {
                continue;
            }

            const statusIcon = healthy ? '✅' :
                phase === 'Pending' ? '⏳' :
                    phase === 'Failed' ? '❌' :
                        restarts > 5 ? '🔥' : '⚠️';
            rows.push([
                statusIcon,
                pod.metadata?.name || '',
                phase,
                `${ready}/${total}`,
                restarts.toString(),
                pod.spec?.nodeName || 'N/A',
            ]);
        }

        if (rows.length === 0) {
            result += '✅ All pods are healthy!\n';
        }
        else {
            result += createTable(['Status', 'Name', 'Phase', 'Ready', 'Restarts', 'Node'], rows);
        }
        return { content: [{ type: 'text', text: result }] };
    }
    catch (error) {
        return {
            content: [{
                    type: 'text',
                    text: `❌ Pod list query failed: ${error.message}`,
                }],
        };
    }
});
|
|
647
|
+
// ============================================
// Helper functions
// ============================================
/**
 * Converts a Kubernetes memory quantity string to a number of bytes.
 *
 * Supported suffixes:
 *  - binary:  Ki, Mi, Gi, Ti, Pi, Ei (powers of 1024)
 *  - decimal: k, M, G, T, P, E (powers of 1000); uppercase `K` is also
 *    accepted, as it was before
 *  - milli:   m (value divided by 1000)
 * A value without a suffix is returned as parsed, so plain byte counts and
 * exponent notation such as "1e3" work.
 *
 * @param {string} mem Quantity such as "128Mi", "1G" or "1048576".
 * @returns {number} Bytes, or NaN when the numeric part cannot be parsed.
 */
function parseMemoryValue(mem) {
    const text = String(mem).trim();
    const multipliers = [
        ['Ki', 2 ** 10],
        ['Mi', 2 ** 20],
        ['Gi', 2 ** 30],
        ['Ti', 2 ** 40],
        ['Pi', 2 ** 50],
        ['Ei', 2 ** 60],
        ['k', 1e3],
        ['K', 1e3],
        ['M', 1e6],
        ['G', 1e9],
        ['T', 1e12],
        ['P', 1e15],
        ['E', 1e18],
    ];
    for (const [unit, multiplier] of multipliers) {
        if (text.endsWith(unit)) {
            return parseFloat(text.slice(0, -unit.length)) * multiplier;
        }
    }
    if (text.endsWith('m')) {
        // Divide rather than multiply by 0.001 so whole results stay exact.
        return parseFloat(text.slice(0, -1)) / 1000;
    }
    return parseFloat(text);
}
|
|
666
|
+
// ============================================
// Server startup
// ============================================
/**
 * Connects the MCP server to a stdio transport and logs a startup banner
 * with `console.error`.
 *
 * NOTE(review): the banner emoji was unreadable in the reviewed copy of this
 * file and has been reconstructed — confirm against the published package.
 *
 * @returns {Promise<void>} Resolves once the server is connected.
 */
async function main() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    console.error('🏥 K8s Doctor MCP Server started');
    console.error(' Available in environments where kubectl commands work');
}

// A startup failure is logged and ends the process with a non-zero exit code.
main().catch((error) => {
    console.error('Fatal error:', error);
    process.exit(1);
});
|