@zerry_jin/k8s-doctor-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.ko.md +330 -0
- package/README.md +330 -0
- package/dist/analyzers/log-analyzer.d.ts +17 -0
- package/dist/analyzers/log-analyzer.js +402 -0
- package/dist/diagnostics/cluster-health.d.ts +13 -0
- package/dist/diagnostics/cluster-health.js +176 -0
- package/dist/diagnostics/pod-diagnostics.d.ts +23 -0
- package/dist/diagnostics/pod-diagnostics.js +654 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +678 -0
- package/dist/types.d.ts +337 -0
- package/dist/types.js +6 -0
- package/dist/utils/cache.d.ts +59 -0
- package/dist/utils/cache.js +99 -0
- package/dist/utils/formatters.d.ts +48 -0
- package/dist/utils/formatters.js +129 -0
- package/dist/utils/k8s-client.d.ts +37 -0
- package/dist/utils/k8s-client.js +74 -0
- package/dist/utils/retry.d.ts +25 -0
- package/dist/utils/retry.js +70 -0
- package/package.json +65 -0

package/dist/diagnostics/pod-diagnostics.js
ADDED
@@ -0,0 +1,654 @@
/**
 * Pod diagnostics module
 *
 * Comprehensively analyzes pod status, containers, and events
 * to identify the root cause of problems
 *
 * @author zerry
 */
import { withRetry } from '../utils/retry.js';
/**
 * Comprehensive pod diagnostics
 *
 * This is the core feature. Analyzes all pod states
 * to clearly explain "why it's not working"
 */
export async function diagnosePod(coreApi, namespace, podName, metricsApi) {
    try {
        console.error(`[diagnosePod] Starting diagnostics for pod ${podName} in namespace ${namespace}`);
        // Parallel API calls for better performance
        const [pod, eventsResponse] = await Promise.all([
            // 1. Get pod information with retry
            withRetry(() => coreApi.readNamespacedPod({ name: podName, namespace }), {
                maxAttempts: 3,
                initialDelay: 500,
            }).catch((error) => {
                console.error(`[diagnosePod] Failed to get pod info:`, error.message);
                throw new Error(`Cannot read pod ${podName}: ${error.message}`);
            }),
            // 2. Get pod events with retry
            withRetry(() => coreApi.listNamespacedEvent({
                namespace,
                fieldSelector: `involvedObject.name=${podName}`,
            }), {
                maxAttempts: 2, // Events are less critical, fewer retries
                initialDelay: 500,
            }).catch((error) => {
                console.error(`[diagnosePod] Failed to get events (non-fatal):`, error.message);
                // Return empty events instead of failing
                return { items: [] };
            }),
        ]);
        const events = parseEvents(eventsResponse.items);
        // 3. Analyze container status
        const containers = parseContainerStatuses(pod.status?.containerStatuses || []);
        // 4. Detect issues
        const issues = [];
        // Container-related issues
        issues.push(...detectContainerIssues(pod, containers, events));
        // Image pull issues
        issues.push(...detectImagePullIssues(pod, events));
        // Resource-related issues
        const resources = await analyzeResourceUsage(pod, namespace, podName, metricsApi);
        issues.push(...detectResourceIssues(pod, resources));
        // Volume mount issues
        issues.push(...detectVolumeIssues(pod, events));
        // Network issues
        issues.push(...detectNetworkIssues(pod, events));
        // 5. Calculate health score
        const healthScore = calculateHealthScore(pod, issues);
        // 6. Generate summary
        const summary = generatePodSummary(pod, issues, healthScore);
        return {
            podInfo: {
                name: pod.metadata?.name || podName,
                namespace: pod.metadata?.namespace || namespace,
                phase: pod.status?.phase || 'Unknown',
                startTime: pod.status?.startTime?.toISOString(),
                nodeName: pod.spec?.nodeName,
                hostIP: pod.status?.hostIP,
                podIP: pod.status?.podIP,
            },
            containers,
            issues,
            resources,
            events,
            summary,
            healthScore,
        };
    }
    catch (error) {
        console.error(`[diagnosePod] Fatal error:`, error);
        throw new Error(`Pod diagnosis failed: ${error.message}`);
    }
}
/**
 * Specialized CrashLoopBackOff diagnostics
 *
 * CrashLoop is really tricky, this function accurately identifies the cause
 */
export async function diagnoseCrashLoop(coreApi, logApi, namespace, podName, containerName) {
    const issues = [];
    try {
        console.error(`[diagnoseCrashLoop] Analyzing pod ${podName} in namespace ${namespace}`);
        const pod = await withRetry(() => coreApi.readNamespacedPod({ name: podName, namespace }), { maxAttempts: 3 });
        const containerStatuses = pod.status?.containerStatuses || [];
        for (const status of containerStatuses) {
            // If containerName specified, only that container
            if (containerName && status.name !== containerName)
                continue;
            const restartCount = status.restartCount || 0;
            // Detect CrashLoop
            if (restartCount > 3 || status.state?.waiting?.reason === 'CrashLoopBackOff') {
                // Check termination reason from previous state
                const lastTerminated = status.lastState?.terminated;
                let rootCause = 'unknown';
                let solution = '';
                if (lastTerminated) {
                    const exitCode = lastTerminated.exitCode;
                    // Analyze exit code
                    if (exitCode === 0) {
                        rootCause = 'Container exited normally but keeps restarting due to restart policy';
                        solution = 'Change spec.restartPolicy to "Never" or "OnFailure"\n```yaml\nspec:\n restartPolicy: OnFailure\n```';
                    }
                    else if (exitCode === 1) {
                        rootCause = 'Application error caused termination';
                        solution = 'Check logs to fix application errors\n```bash\nkubectl logs ' + podName + ' -n ' + namespace + ' -c ' + status.name + ' --previous\n```';
                    }
                    else if (exitCode === 137) {
                        rootCause = 'OOM (Out Of Memory) - Container was killed due to insufficient memory';
                        solution = 'Increase memory limit or optimize application memory usage\n```yaml\nresources:\n limits:\n memory: "512Mi" # Set higher than current\n```';
                    }
                    else if (exitCode === 143) {
                        rootCause = 'Terminated by SIGTERM - Received normal termination signal';
                        solution = 'Graceful shutdown may not be properly implemented. Try increasing terminationGracePeriodSeconds';
                    }
                    else if (exitCode === 126) {
                        rootCause = 'Permission denied - Executable file lacks execute permission';
                        solution = 'Grant execute permission with chmod +x in Dockerfile';
                    }
                    else if (exitCode === 127) {
                        rootCause = 'Command not found - CMD/ENTRYPOINT command does not exist';
                        solution = 'Verify CMD/ENTRYPOINT path in Dockerfile';
                    }
                    else {
                        rootCause = `Unknown error (exit code ${exitCode})`;
                        solution = 'Check logs to identify detailed cause';
                    }
                }
                // Find additional clues in logs
                try {
                    const { Writable } = require('stream');
                    const stream = new Writable();
                    let logData = '';
                    stream._write = (chunk, _encoding, next) => {
                        logData += chunk.toString();
                        next();
                    };
                    await withRetry(() => logApi.log(namespace, podName, status.name, stream, {
                        previous: true,
                        tailLines: 50,
                    }), { maxAttempts: 2 });
                    // Find error patterns in logs
                    const relevantLogs = [];
                    const lines = logData.split('\n');
                    for (const line of lines) {
                        if (line.toLowerCase().includes('error') ||
                            line.toLowerCase().includes('exception') ||
                            line.toLowerCase().includes('fatal') ||
                            line.toLowerCase().includes('panic')) {
                            relevantLogs.push(line.trim());
                        }
                    }
                    issues.push({
                        type: 'CrashLoopBackOff',
                        severity: 'critical',
                        message: `Container "${status.name}" has restarted ${restartCount} times`,
                        rootCause,
                        solution,
                        resource: {
                            kind: 'Pod',
                            name: podName,
                            namespace,
                        },
                        relevantLogs: relevantLogs.slice(0, 10), // Max 10 lines
                        timestamp: new Date().toISOString(),
                    });
                }
                catch (logError) {
                    console.error(`[diagnoseCrashLoop] Failed to retrieve logs for ${status.name}:`, logError.message);
                    // Add issue even if logs cannot be retrieved
                    issues.push({
                        type: 'CrashLoopBackOff',
                        severity: 'critical',
                        message: `Container "${status.name}" has restarted ${restartCount} times`,
                        rootCause,
                        solution,
                        resource: {
                            kind: 'Pod',
                            name: podName,
                            namespace,
                        },
                        timestamp: new Date().toISOString(),
                    });
                }
            }
        }
    }
    catch (error) {
        throw new Error(`CrashLoop diagnostics failed: ${error.message}`);
    }
    return issues;
}
/**
 * Parse container statuses
 */
function parseContainerStatuses(statuses) {
    return statuses.map(s => ({
        name: s.name,
        ready: s.ready || false,
        restartCount: s.restartCount || 0,
        state: s.state || {},
        lastState: s.lastState,
        image: s.image,
        imageID: s.imageID,
    }));
}
/**
 * Parse events
 */
function parseEvents(items) {
    return items
        .map(e => ({
            type: e.type,
            reason: e.reason,
            message: e.message,
            count: e.count || 1,
            firstTimestamp: e.firstTimestamp,
            lastTimestamp: e.lastTimestamp,
            source: e.source?.component,
        }))
        .sort((a, b) => new Date(b.lastTimestamp).getTime() - new Date(a.lastTimestamp).getTime());
}
/**
 * Detect container issues
 */
function detectContainerIssues(pod, containers, _events) {
    const issues = [];
    for (const container of containers) {
        // Check Waiting state
        if (container.state.waiting) {
            const reason = container.state.waiting.reason;
            const message = container.state.waiting.message;
            if (reason === 'ErrImagePull' || reason === 'ImagePullBackOff') {
                // Image pull issues are handled in a separate function
                continue;
            }
            issues.push({
                type: `Container Waiting: ${reason}`,
                severity: 'high',
                message: `Container "${container.name}" is in ${reason} state`,
                rootCause: message || 'Unknown reason',
                solution: getWaitingSolution(reason),
                resource: {
                    kind: 'Pod',
                    name: pod.metadata?.name,
                    namespace: pod.metadata?.namespace,
                },
                timestamp: new Date().toISOString(),
            });
        }
        // Check Terminated state
        if (container.state.terminated && container.state.terminated.exitCode !== 0) {
            issues.push({
                type: 'Container Terminated',
                severity: 'high',
                message: `Container "${container.name}" terminated with exit code ${container.state.terminated.exitCode}`,
                rootCause: container.state.terminated.reason || 'Unknown reason',
                solution: getTerminatedSolution(container.state.terminated.exitCode),
                resource: {
                    kind: 'Pod',
                    name: pod.metadata?.name,
                    namespace: pod.metadata?.namespace,
                },
                timestamp: new Date().toISOString(),
            });
        }
    }
    return issues;
}
/**
 * Detect image pull issues
 */
function detectImagePullIssues(pod, events) {
    const issues = [];
    const imagePullEvents = events.filter(e => e.reason === 'Failed' && e.message.includes('pull'));
    if (imagePullEvents.length > 0) {
        const event = imagePullEvents[0];
        let rootCause = 'Cannot download image';
        let solution = '';
        if (event.message.includes('not found') || event.message.includes('manifest unknown')) {
            rootCause = 'Image or tag does not exist';
            solution = '1. Verify image name and tag\n2. Test locally with docker pull <image>';
        }
        else if (event.message.includes('unauthorized') || event.message.includes('authentication')) {
            rootCause = 'Image registry authentication failed';
            solution = '```bash\nkubectl create secret docker-registry regcred \\\n --docker-server=<registry> \\\n --docker-username=<username> \\\n --docker-password=<password>\n\n# Add to Pod spec:\nspec:\n imagePullSecrets:\n - name: regcred\n```';
        }
        else if (event.message.includes('timeout')) {
            rootCause = 'Network timeout - Cannot access registry';
            solution = '1. Check cluster network connectivity\n2. Verify firewall/proxy settings\n3. Verify registry URL is correct';
        }
        issues.push({
            type: 'ImagePullBackOff',
            severity: 'critical',
            message: 'Cannot pull container image',
            rootCause,
            solution,
            resource: {
                kind: 'Pod',
                name: pod.metadata?.name,
                namespace: pod.metadata?.namespace,
            },
            relatedEvents: [event],
            timestamp: new Date().toISOString(),
        });
    }
    return issues;
}
/**
 * Analyze resource usage
 *
 * Collects real-time metrics from Metrics Server if available
 */
async function analyzeResourceUsage(pod, namespace, podName, metricsApi) {
    const containers = pod.spec?.containers || [];
    let totalCpuRequest = 0;
    let totalCpuLimit = 0;
    let totalMemRequest = 0;
    let totalMemLimit = 0;
    for (const container of containers) {
        const requests = container.resources?.requests || {};
        const limits = container.resources?.limits || {};
        totalCpuRequest += parseCPU(requests.cpu || '0');
        totalCpuLimit += parseCPU(limits.cpu || '0');
        totalMemRequest += parseMemory(requests.memory || '0');
        totalMemLimit += parseMemory(limits.memory || '0');
    }
    // Try to get real-time metrics from Metrics Server
    let currentCpu;
    let currentMem;
    let cpuUsagePercent;
    let memUsagePercent;
    if (metricsApi) {
        try {
            const metrics = await withRetry(() => metricsApi.getPodMetrics(namespace), {
                maxAttempts: 2,
                initialDelay: 500,
                shouldRetry: (error) => {
                    // Don't retry if Metrics Server is not installed
                    if (error.statusCode === 404)
                        return false;
                    return true;
                },
            });
            // Find the specific pod in the metrics list
            const podMetric = metrics.items?.find((item) => item.metadata?.name === podName);
            if (podMetric) {
                // Sum up all container metrics
                let totalCpuUsage = 0;
                let totalMemUsage = 0;
                for (const container of podMetric.containers || []) {
                    // CPU is in nanocores, convert to millicores
                    if (container.usage?.cpu) {
                        totalCpuUsage += parseMetricCPU(container.usage.cpu);
                    }
                    // Memory is in Ki, convert to bytes
                    if (container.usage?.memory) {
                        totalMemUsage += parseMetricMemory(container.usage.memory);
                    }
                }
                currentCpu = totalCpuUsage;
                currentMem = totalMemUsage;
                // Calculate usage percentages
                if (totalCpuLimit > 0) {
                    cpuUsagePercent = (currentCpu / totalCpuLimit) * 100;
                }
                if (totalMemLimit > 0) {
                    memUsagePercent = (currentMem / totalMemLimit) * 100;
                }
            }
        }
        catch (error) {
            // Metrics Server not available or pod metrics not ready
            // This is fine, we'll just show spec values
            if (error.statusCode !== 404) {
                console.error(`[analyzeResourceUsage] Failed to get metrics (non-fatal):`, error.message);
            }
        }
    }
    return {
        cpu: {
            current: currentCpu,
            requested: totalCpuRequest,
            limit: totalCpuLimit,
            usagePercent: cpuUsagePercent,
            isThrottled: cpuUsagePercent !== undefined && cpuUsagePercent >= 80,
        },
        memory: {
            current: currentMem,
            requested: totalMemRequest,
            limit: totalMemLimit,
            usagePercent: memUsagePercent,
            isOOMRisk: memUsagePercent !== undefined && memUsagePercent >= 90,
        },
    };
}
/**
 * Detect resource issues
 */
function detectResourceIssues(pod, resources) {
    const issues = [];
    // Check for high CPU usage (throttling)
    if (resources.cpu.isThrottled && resources.cpu.usagePercent !== undefined) {
        issues.push({
            type: 'High CPU Usage',
            severity: 'high',
            message: `CPU usage is high (${resources.cpu.usagePercent.toFixed(1)}%)`,
            rootCause: 'CPU limit may be too low for current workload',
            solution: `Increase CPU limit or optimize application:\n\`\`\`yaml\nresources:\n limits:\n cpu: "${Math.ceil((resources.cpu.limit || 1000) * 1.5)}m" # Increased by 50%\n\`\`\``,
            resource: {
                kind: 'Pod',
                name: pod.metadata?.name,
                namespace: pod.metadata?.namespace,
            },
            timestamp: new Date().toISOString(),
        });
    }
    // Check for OOM risk
    if (resources.memory.isOOMRisk && resources.memory.usagePercent !== undefined) {
        issues.push({
            type: 'OOM Risk',
            severity: 'critical',
            message: `Memory usage is critically high (${resources.memory.usagePercent.toFixed(1)}%)`,
            rootCause: 'Pod is at risk of OOM kill - memory usage exceeds 90% of limit',
            solution: `Increase memory limit immediately:\n\`\`\`yaml\nresources:\n limits:\n memory: "${Math.ceil((resources.memory.limit || 512 * 1024 * 1024) / (1024 * 1024) * 1.5)}Mi" # Increased by 50%\n\`\`\``,
            resource: {
                kind: 'Pod',
                name: pod.metadata?.name,
                namespace: pod.metadata?.namespace,
            },
            timestamp: new Date().toISOString(),
        });
    }
    // When resource limits are not set
    if (!resources.cpu.limit) {
        issues.push({
            type: 'Missing CPU Limit',
            severity: 'medium',
            message: 'CPU limit is not set',
            rootCause: 'CPU usage can increase without limit',
            solution: '```yaml\nresources:\n limits:\n cpu: "1000m"\n requests:\n cpu: "100m"\n```',
            resource: {
                kind: 'Pod',
                name: pod.metadata?.name,
                namespace: pod.metadata?.namespace,
            },
            timestamp: new Date().toISOString(),
        });
    }
    if (!resources.memory.limit) {
        issues.push({
            type: 'Missing Memory Limit',
            severity: 'high',
            message: 'Memory limit is not set',
            rootCause: 'Memory leak can affect entire node',
            solution: '```yaml\nresources:\n limits:\n memory: "512Mi"\n requests:\n memory: "128Mi"\n```',
            resource: {
                kind: 'Pod',
                name: pod.metadata?.name,
                namespace: pod.metadata?.namespace,
            },
            timestamp: new Date().toISOString(),
        });
    }
    return issues;
}
/**
 * Detect volume issues
 */
function detectVolumeIssues(pod, events) {
    const issues = [];
    const volumeEvents = events.filter(e => e.message.includes('volume') || e.message.includes('mount'));
    for (const event of volumeEvents) {
        if (event.type === 'Warning') {
            issues.push({
                type: 'Volume Mount Issue',
                severity: 'high',
                message: 'Volume mount failed',
                rootCause: event.message,
                solution: '1. Verify PVC is in Bound state\n2. Verify storage class is correct\n3. Check status with kubectl describe pvc <pvc-name>',
                resource: {
                    kind: 'Pod',
                    name: pod.metadata?.name,
                    namespace: pod.metadata?.namespace,
                },
                relatedEvents: [event],
                timestamp: new Date().toISOString(),
            });
        }
    }
    return issues;
}
/**
 * Detect network issues
 */
function detectNetworkIssues(pod, events) {
    const issues = [];
    const networkEvents = events.filter(e => e.message.includes('network') || e.message.includes('CNI'));
    for (const event of networkEvents) {
        if (event.type === 'Warning') {
            issues.push({
                type: 'Network Configuration Issue',
                severity: 'high',
                message: 'Network configuration problem',
                rootCause: event.message,
                solution: '1. Check CNI plugin status\n2. Check network policy\n3. Verify Pod CIDR range',
                resource: {
                    kind: 'Pod',
                    name: pod.metadata?.name,
                    namespace: pod.metadata?.namespace,
                },
                relatedEvents: [event],
                timestamp: new Date().toISOString(),
            });
        }
    }
    return issues;
}
/**
 * Calculate health score
 */
function calculateHealthScore(pod, issues) {
    let score = 100;
    // Deductions based on pod phase
    const phase = pod.status?.phase;
    if (phase === 'Failed')
        score -= 100;
    else if (phase === 'Pending')
        score -= 30;
    else if (phase === 'Unknown')
        score -= 50;
    // Deductions based on issues
    for (const issue of issues) {
        if (issue.severity === 'critical')
            score -= 30;
        else if (issue.severity === 'high')
            score -= 20;
        else if (issue.severity === 'medium')
            score -= 10;
        else if (issue.severity === 'low')
            score -= 5;
    }
    return Math.max(0, Math.min(100, score));
}
/**
 * Generate pod summary
 */
function generatePodSummary(pod, issues, healthScore) {
    const phase = pod.status?.phase || 'Unknown';
    const containerCount = pod.spec?.containers?.length || 0;
    const readyContainers = pod.status?.containerStatuses?.filter((c) => c.ready).length || 0;
    let summary = `Pod "${pod.metadata?.name}" is currently in ${phase} state.\n`;
    summary += `Containers: ${readyContainers}/${containerCount} ready\n`;
    summary += `Health: ${healthScore}/100\n\n`;
    if (issues.length === 0) {
        summary += '✅ No issues found!';
    }
    else {
        summary += `⚠️ ${issues.length} issue(s) detected.\n`;
        const critical = issues.filter(i => i.severity === 'critical').length;
        const high = issues.filter(i => i.severity === 'high').length;
        if (critical > 0)
            summary += ` - Critical: ${critical}\n`;
        if (high > 0)
            summary += ` - High: ${high}\n`;
    }
    return summary;
}
// ===== Helper functions =====
function getWaitingSolution(reason) {
    const solutions = {
        'CreateContainerConfigError': 'Check container configuration (ConfigMap, Secret, etc.)',
        'InvalidImageName': 'Verify image name format',
        'CreateContainerError': 'Check container creation settings',
    };
    return solutions[reason] || 'Check logs and events to identify the cause';
}
function getTerminatedSolution(exitCode) {
    const solutions = {
        1: 'Check application logs to fix errors',
        137: 'Increase memory limit (OOM killed)',
        143: 'Verify graceful shutdown implementation',
        126: 'Check executable permissions (chmod +x)',
        127: 'Verify CMD/ENTRYPOINT path',
    };
    return solutions[exitCode] || `Check logs for exit code ${exitCode}`;
}
function parseCPU(cpu) {
    if (cpu.endsWith('m')) {
        return parseInt(cpu.slice(0, -1));
    }
    return parseFloat(cpu) * 1000;
}
function parseMemory(mem) {
    const units = {
        'Ki': 1024,
        'Mi': 1024 * 1024,
        'Gi': 1024 * 1024 * 1024,
        'K': 1000,
        'M': 1000 * 1000,
        'G': 1000 * 1000 * 1000,
    };
    for (const [unit, multiplier] of Object.entries(units)) {
        if (mem.endsWith(unit)) {
            return parseFloat(mem.slice(0, -unit.length)) * multiplier;
        }
    }
    return parseFloat(mem);
}
/**
 * Parse CPU from Metrics API format
 * Metrics API returns nanocores (e.g., "123456789n") or millicores (e.g., "123m")
 */
function parseMetricCPU(cpu) {
    if (cpu.endsWith('n')) {
        // Nanocores to millicores: divide by 1,000,000
        return parseInt(cpu.slice(0, -1)) / 1_000_000;
    }
    else if (cpu.endsWith('m')) {
        // Already in millicores
        return parseInt(cpu.slice(0, -1));
    }
    else {
        // Cores to millicores: multiply by 1000
        return parseFloat(cpu) * 1000;
    }
}
/**
 * Parse Memory from Metrics API format
 * Metrics API returns in Ki (e.g., "123456Ki")
 */
function parseMetricMemory(mem) {
    if (mem.endsWith('Ki')) {
        return parseInt(mem.slice(0, -2)) * 1024;
    }
    else if (mem.endsWith('Mi')) {
        return parseInt(mem.slice(0, -2)) * 1024 * 1024;
    }
    else if (mem.endsWith('Gi')) {
        return parseInt(mem.slice(0, -2)) * 1024 * 1024 * 1024;
    }
    // Assume bytes
    return parseInt(mem);
}
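
Illustrative usage (not part of the package diff above): a minimal sketch of calling the exported diagnosePod and diagnoseCrashLoop functions directly. It assumes the clients are built with @kubernetes/client-node, whose v1-style CoreV1Api methods take an options object ({ name, namespace }) matching the calls in the hunk above; the import path and the pod name are placeholders, since the package's exports map and k8s-client.js wiring are not shown in this section.

// Illustrative only - not part of the published package.
// Assumes @kubernetes/client-node; the deep import path below is an assumption.
import * as k8s from '@kubernetes/client-node';
import { diagnosePod, diagnoseCrashLoop } from '@zerry_jin/k8s-doctor-mcp/dist/diagnostics/pod-diagnostics.js';

const kc = new k8s.KubeConfig();
kc.loadFromDefault(); // reads $KUBECONFIG or ~/.kube/config

const coreApi = kc.makeApiClient(k8s.CoreV1Api);
const metricsApi = new k8s.Metrics(kc); // optional; pass undefined if no Metrics Server
const logApi = new k8s.Log(kc);         // streams container logs, used by diagnoseCrashLoop

// 'my-app-7d4b9c6f5-xk2lp' is a placeholder pod name.
const report = await diagnosePod(coreApi, 'default', 'my-app-7d4b9c6f5-xk2lp', metricsApi);
console.log(report.summary); // human-readable phase, readiness, and health score
console.log(report.issues);  // detected issues with rootCause / solution text

// Dig deeper when a container is crash-looping:
const crashIssues = await diagnoseCrashLoop(coreApi, logApi, 'default', 'my-app-7d4b9c6f5-xk2lp');
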
package/dist/index.d.ts
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env node
/**
 * K8s Doctor MCP Server
 *
 * MCP server for AI-powered Kubernetes cluster diagnosis and problem solving.
 * Goes beyond simple queries - analyzes error logs, identifies root causes, and suggests solutions.
 *
 * @author zerry
 * @license MIT
 */
export {};
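
In normal use the package is launched as an MCP server (dist/index.js) rather than imported as a library. A typical client registration might look like the sketch below; note that the npx command assumes the package's bin entry points at dist/index.js, which the package.json in this diff (+65 lines, not shown in this section) would confirm, and "k8s-doctor" is just a user-chosen label.

{
  "mcpServers": {
    "k8s-doctor": {
      "command": "npx",
      "args": ["-y", "@zerry_jin/k8s-doctor-mcp"]
    }
  }
}
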