@zerry_jin/k8s-doctor-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.ko.md +330 -0
- package/README.md +330 -0
- package/dist/analyzers/log-analyzer.d.ts +17 -0
- package/dist/analyzers/log-analyzer.js +402 -0
- package/dist/diagnostics/cluster-health.d.ts +13 -0
- package/dist/diagnostics/cluster-health.js +176 -0
- package/dist/diagnostics/pod-diagnostics.d.ts +23 -0
- package/dist/diagnostics/pod-diagnostics.js +654 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +678 -0
- package/dist/types.d.ts +337 -0
- package/dist/types.js +6 -0
- package/dist/utils/cache.d.ts +59 -0
- package/dist/utils/cache.js +99 -0
- package/dist/utils/formatters.d.ts +48 -0
- package/dist/utils/formatters.js +129 -0
- package/dist/utils/k8s-client.d.ts +37 -0
- package/dist/utils/k8s-client.js +74 -0
- package/dist/utils/retry.d.ts +25 -0
- package/dist/utils/retry.js +70 -0
- package/package.json +65 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Log analysis module
|
|
3
|
+
*
|
|
4
|
+
* Rather than simply showing logs
|
|
5
|
+
* finds error patterns and analyzes root causes
|
|
6
|
+
*
|
|
7
|
+
* @author zerry
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Analyze pod logs.
 *
 * Fetches the tail of a container's log, then extracts error/warning lines,
 * matches known error patterns, groups repeated errors, and produces a
 * textual summary plus recommendations.
 *
 * @param logApi - Log client; only its `log(namespace, podName, containerName, stream, options)` method is used.
 * @param namespace - Namespace of the pod.
 * @param podName - Name of the pod.
 * @param containerName - Container to read from; an empty string is sent when omitted.
 * @param tailLines - Number of trailing log lines to request (default 500).
 * @returns Object with `totalLines`, `errorLines`, `warningLines`, `patterns`,
 *          `repeatedErrors`, `summary` and `recommendations`.
 * @throws Error with message `Log analysis failed: <reason>` when fetching or analysis fails.
 */
export async function analyzeLogs(logApi, namespace, podName, containerName, tailLines = 500) {
    try {
        // This file is an ES module (`export`), where `require` is not defined,
        // so the stream module is loaded with a dynamic import instead.
        const { Writable } = await import('node:stream');
        // A streaming decoder keeps multi-byte UTF-8 characters intact even
        // when they are split across two chunks.
        const decoder = new TextDecoder('utf-8');
        let logData = '';
        const stream = new Writable({
            write(chunk, _encoding, next) {
                logData += typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true });
                next();
            },
        });
        // Register completion listeners BEFORE starting the request so the
        // 'finish' event cannot be missed.
        const drained = new Promise((resolve, reject) => {
            stream.once('finish', resolve);
            stream.once('error', reject);
        });
        // Avoid an unhandled rejection if the request itself throws first;
        // the rejection is still observed by the `await drained` below.
        drained.catch(() => { });
        // Fetch logs
        await logApi.log(namespace, podName, containerName || '', stream, {
            tailLines,
        });
        // The log call may resolve as soon as the response starts being piped;
        // wait until the whole body has been written before reading logData.
        await drained;
        logData += decoder.decode();
        const lines = logData.split('\n').filter(line => line.trim());
        // 1. Extract error/warning lines
        const errorLines = extractErrorLines(lines);
        const warningLines = extractWarningLines(lines);
        // 2. Detect error patterns
        const patterns = detectErrorPatterns(lines);
        // 3. Find repeated errors
        const repeatedErrors = findRepeatedErrors(errorLines);
        // 4. Generate summary
        const summary = generateLogSummary(lines.length, errorLines, patterns);
        // 5. Generate recommendations
        const recommendations = generateRecommendations(patterns, repeatedErrors);
        return {
            totalLines: lines.length,
            errorLines,
            warningLines,
            patterns,
            repeatedErrors,
            summary,
            recommendations,
        };
    }
    catch (error) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Log analysis failed: ${reason}`);
    }
}
|
|
53
|
+
/**
 * Collect the log lines that look like errors.
 *
 * A line qualifies when its lower-cased text contains any of the error
 * keywords below (plain substring match). Each hit records its 1-based line
 * number, the original text, the timestamp parsed from it (if any) and the
 * fixed level 'ERROR'.
 */
function extractErrorLines(lines) {
    const markers = [
        'error',
        'exception',
        'fatal',
        'panic',
        'failed',
        'failure',
        'err:',
        'traceback',
        'stacktrace',
    ];
    const hits = [];
    for (const [index, text] of lines.entries()) {
        const lowered = text.toLowerCase();
        const looksLikeError = markers.some((marker) => lowered.includes(marker));
        if (!looksLikeError) {
            continue;
        }
        hits.push({
            lineNumber: index + 1,
            content: text,
            timestamp: extractTimestamp(text),
            level: 'ERROR',
        });
    }
    return hits;
}
|
|
88
|
+
/**
 * Collect the log lines that look like warnings.
 *
 * A line qualifies when its lower-cased text contains 'warn', 'warning' or
 * 'deprecated'. Each hit records its 1-based line number, the original text,
 * the timestamp parsed from it (if any) and the fixed level 'WARN'.
 */
function extractWarningLines(lines) {
    const needles = ['warn', 'warning', 'deprecated'];
    const mentionsWarning = (text) => {
        const lowered = text.toLowerCase();
        return needles.some((needle) => lowered.includes(needle));
    };
    const found = [];
    let position = 0;
    for (const text of lines) {
        position += 1;
        if (mentionsWarning(text)) {
            found.push({
                lineNumber: position,
                content: text,
                timestamp: extractTimestamp(text),
                level: 'WARN',
            });
        }
    }
    return found;
}
|
|
111
|
+
/**
 * Detect known error patterns in log lines.
 *
 * Every line is tested against a catalogue of commonly seen production
 * errors. For each catalogue entry that matches at least one line, the result
 * contains the entry's name, description, possible causes, solutions and
 * severity together with the 1-based numbers of the matching lines.
 *
 * @param lines - Log lines to scan.
 * @returns One result object per matched pattern, in catalogue order.
 */
function detectErrorPatterns(lines) {
    const patterns = [];
    // Pattern definitions: errors frequently seen in production
    const knownPatterns = [
        {
            name: 'Connection Refused',
            regex: /connection refused|ECONNREFUSED/i,
            description: 'Cannot connect to target service',
            causes: [
                'Target service not started yet',
                'Wrong service port',
                'Blocked by network policy',
            ],
            solutions: [
                'Check if service is running: kubectl get pods',
                'Verify service port: kubectl get svc',
                'Check network policy: kubectl get networkpolicy',
            ],
            severity: 'high',
        },
        {
            name: 'Database Connection Error',
            // The port alternatives are grouped so that a bare ":3306" or
            // ":27017" (e.g. a "listening on" line) no longer matches on its
            // own; a connection failure code must precede a well-known DB port.
            regex: /could not connect to.*database|(?:ETIMEDOUT|ECONNREFUSED).*:(?:5432|3306|27017)\b/i,
            description: 'Database connection failed',
            causes: [
                'DB service not ready',
                'Invalid connection string',
                'DB authentication failed',
            ],
            solutions: [
                'Check DB Pod status',
                'Verify environment variables (ConfigMap/Secret)',
                'Check DB service endpoints: kubectl get endpoints',
            ],
            severity: 'critical',
        },
        {
            name: 'Out of Memory',
            regex: /out of memory|OOMKilled|cannot allocate memory/i,
            description: 'Insufficient memory',
            causes: [
                'Memory limit set too low',
                'Memory leak',
                'Higher memory usage than expected',
            ],
            solutions: [
                'Increase memory limit: resources.limits.memory',
                'Profile application memory usage',
                'Increase pod count with HPA',
            ],
            severity: 'critical',
        },
        {
            name: 'File Not Found',
            regex: /no such file|ENOENT|FileNotFoundError/i,
            description: 'File or directory not found',
            causes: [
                'ConfigMap/Secret not mounted',
                'Invalid file path',
                'Volume mount failed',
            ],
            solutions: [
                'Check volumeMounts configuration',
                'Verify ConfigMap/Secret exists',
                'Verify file path is correct',
            ],
            severity: 'high',
        },
        {
            name: 'Permission Denied',
            regex: /permission denied|EACCES|access denied/i,
            description: 'Permission denied',
            causes: [
                'SecurityContext runAsUser setting',
                'Volume fsGroup not set',
                'File permission issues',
            ],
            solutions: [
                'Configure securityContext:\nfsGroup: 1000\nrunAsUser: 1000',
                'Check volume permissions',
                'Set permissions in Dockerfile',
            ],
            severity: 'high',
        },
        {
            name: 'DNS Resolution Failed',
            regex: /dns.*failed|getaddrinfo.*ENOTFOUND|name.*not known/i,
            description: 'DNS lookup failed',
            causes: [
                'CoreDNS issues',
                'Invalid service name',
                'ndots configuration problem',
            ],
            solutions: [
                'Check CoreDNS Pod: kubectl get pods -n kube-system',
                'Service name format: <service>.<namespace>.svc.cluster.local',
                'Verify dnsPolicy setting',
            ],
            severity: 'high',
        },
        {
            name: 'Port Already in Use',
            regex: /address already in use|EADDRINUSE/i,
            description: 'Port already in use',
            causes: [
                'Multiple processes using same port',
                'Previous process not terminated properly',
            ],
            solutions: [
                'Change port number',
                'Implement graceful shutdown',
                'Configure preStop hook',
            ],
            severity: 'medium',
        },
        {
            name: 'Timeout',
            regex: /timeout|timed out|ETIMEDOUT/i,
            description: 'Timeout occurred',
            causes: [
                'Response time too long',
                'Network latency',
                'Target service overloaded',
            ],
            solutions: [
                'Increase timeout value',
                'Optimize service performance',
                'Adjust readinessProbe timeout',
            ],
            severity: 'medium',
        },
        {
            name: 'Null Pointer / Undefined',
            regex: /null pointer|undefined is not|cannot read property.*undefined|NullPointerException/i,
            description: 'Null/Undefined reference',
            causes: [
                'Environment variable not set',
                'Using uninitialized variable',
            ],
            solutions: [
                'Verify ConfigMap/Secret',
                'Set default value for environment variables',
                'Fix code',
            ],
            severity: 'medium',
        },
        {
            name: 'SSL/TLS Error',
            regex: /ssl.*error|certificate.*invalid|CERT_/i,
            description: 'SSL/TLS certificate error',
            causes: [
                'Expired certificate',
                'Self-signed certificate',
                'CA bundle missing',
            ],
            solutions: [
                'Renew certificate',
                'Verify tls.crt, tls.key Secret',
                'NODE_TLS_REJECT_UNAUTHORIZED=0 (development only)',
            ],
            severity: 'high',
        },
    ];
    // Match each pattern. None of the regexes use the `g`/`y` flags, so
    // `test` is stateless and safe to reuse across lines.
    for (const pattern of knownPatterns) {
        const matchedLines = [];
        for (let i = 0; i < lines.length; i++) {
            if (pattern.regex.test(lines[i])) {
                matchedLines.push(i + 1);
            }
        }
        if (matchedLines.length > 0) {
            patterns.push({
                name: pattern.name,
                matchedLines,
                description: pattern.description,
                possibleCauses: pattern.causes,
                solutions: pattern.solutions,
                severity: pattern.severity,
            });
        }
    }
    return patterns;
}
|
|
301
|
+
/**
 * Group error lines that are the same message after normalization and report
 * the groups that occur at least three times.
 *
 * Each reported group carries the normalized message, its occurrence count,
 * the first and last line numbers where it appeared, and `isPattern: true`.
 * Groups are returned most frequent first.
 */
function findRepeatedErrors(errorLines) {
    const occurrences = new Map();
    for (const entry of errorLines) {
        // Timestamps, numbers, etc. are stripped so similar errors share a key
        const key = normalizeErrorMessage(entry.content);
        const seenAt = occurrences.get(key);
        if (seenAt === undefined) {
            occurrences.set(key, [entry.lineNumber]);
        }
        else {
            seenAt.push(entry.lineNumber);
        }
    }
    const MIN_REPEATS = 3;
    return [...occurrences.entries()]
        .filter(([, seenAt]) => seenAt.length >= MIN_REPEATS)
        .map(([message, seenAt]) => ({
            message,
            count: seenAt.length,
            firstLine: seenAt[0],
            lastLine: seenAt[seenAt.length - 1],
            isPattern: true,
        }))
        .sort((left, right) => right.count - left.count);
}
|
|
331
|
+
/**
 * Normalize an error message so that similar errors group together.
 *
 * Variable parts are replaced with placeholders, then the result is
 * lower-cased and trimmed. Replacement order matters:
 *   1. timestamps (`YYYY-MM-DD[T ]hh:mm:ss`, optional fraction, optional `Z` or offset)
 *   2. hex literals (`0x…`) - must run before the generic number rule,
 *      otherwise the leading `0` is consumed and the literal never matches
 *   3. IPv4-looking dotted quads
 *   4. `:port` suffixes (2-5 digits)
 *   5. any remaining digit runs
 *
 * @param message - Raw log line.
 * @returns The normalized, lower-cased message.
 */
function normalizeErrorMessage(message) {
    return message
        // Bounded timestamp match: the zone designator is optional, so a
        // timestamp without `Z` no longer swallows text up to the next "Z".
        .replace(/\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?/g, '<timestamp>')
        .replace(/0x[0-9a-f]+/gi, '<hex>') // Hex values
        .replace(/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/g, '<ip>') // IP address
        .replace(/:\d{2,5}/g, ':<port>') // Port number
        .replace(/\d+/g, '<num>') // Other numbers
        .toLowerCase()
        .trim();
}
|
|
346
|
+
/**
 * Extract a timestamp from a log line.
 *
 * Recognizes `YYYY-MM-DD` followed by `T` or a space and `hh:mm:ss`, with an
 * optional fractional part and an optional zone designator (`Z` or a
 * `+hh:mm` / `-hh:mm` offset). A single pattern is used so that an RFC 3339
 * offset is kept as part of the result instead of being cut off.
 *
 * @param line - Log line to inspect.
 * @returns The first matching timestamp text, or `undefined` when none is found.
 */
function extractTimestamp(line) {
    const match = line.match(/\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})?/);
    return match ? match[0] : undefined;
}
|
|
362
|
+
/**
 * Build the human-readable summary of a log analysis.
 *
 * The text states how many lines were analyzed and how many error lines were
 * found, followed either by one bullet per detected pattern (with its
 * occurrence count) or by a note that no known pattern was detected.
 */
function generateLogSummary(totalLines, errorLines, patterns) {
    const header = `Analyzed ${totalLines} lines of logs.\n` + `Errors: ${errorLines.length}\n`;
    if (!(patterns.length > 0)) {
        return header + '\n✅ No known error patterns detected.';
    }
    const bullets = patterns
        .map((found) => ` - ${found.name} (${found.matchedLines.length} occurrences)\n`)
        .join('');
    return header + `\nDetected ${patterns.length} error patterns:\n` + bullets;
}
|
|
379
|
+
/**
 * Turn detected patterns and repeated errors into a list of recommendations.
 *
 * Order of the output: a notice about critical patterns (if any), a notice
 * about the most frequent repeated error (if any), then the first solution of
 * up to three detected patterns. When nothing applies, a single "no issues"
 * entry is returned.
 */
function generateRecommendations(patterns, repeatedErrors) {
    const advice = [];
    // Critical patterns are announced first
    const criticalCount = patterns.filter(({ severity }) => severity === 'critical').length;
    if (criticalCount > 0) {
        advice.push(`🔴 Resolve ${criticalCount} Critical issue(s) as top priority`);
    }
    // The list is expected to be sorted, so the first entry is the top repeater
    if (repeatedErrors.length > 0) {
        const { message, count } = repeatedErrors[0];
        advice.push(`⚠️ "${message}" error is repeating ${count} times. Find the root cause.`);
    }
    // One hint for each of the first three patterns
    patterns.slice(0, 3).forEach((found) => {
        advice.push(`💡 ${found.name}: ${found.solutions[0]}`);
    });
    if (advice.length === 0) {
        advice.push('✅ No special issues found in current logs.');
    }
    return advice;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cluster-wide health diagnostics
|
|
3
|
+
*
|
|
4
|
+
* @author zerry
|
|
5
|
+
*/
|
|
6
|
+
import * as k8s from '@kubernetes/client-node';
|
|
7
|
+
import type { ClusterHealth } from '../types.js';
|
|
8
|
+
/**
|
|
9
|
+
* Diagnose overall cluster health
|
|
10
|
+
*
|
|
11
|
+
* Comprehensively analyzes nodes, pods, and resource utilization
|
|
12
|
+
*/
|
|
13
|
+
/**
 * @param coreApi - Core API client; used to list nodes and pods.
 * @param namespace - When given, only pods in this namespace are listed;
 *                    otherwise pods from all namespaces are listed. Nodes are
 *                    always listed cluster-wide.
 * @returns The cluster health report (score, node/pod health, critical issues,
 *          recommendations and summary).
 */
export declare function diagnoseClusterHealth(coreApi: k8s.CoreV1Api, namespace?: string): Promise<ClusterHealth>;
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cluster-wide health diagnostics
|
|
3
|
+
*
|
|
4
|
+
* @author zerry
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Diagnose overall cluster health.
 *
 * Lists all nodes and the pods of `namespace` (or of every namespace when it
 * is omitted), records an issue for every node that is not Ready and for up
 * to five Pending and five crash-looping pods, and derives a score,
 * recommendations and a summary from those findings.
 *
 * @param coreApi - Core API client exposing `listNode`, `listNamespacedPod` and `listPodForAllNamespaces`.
 * @param namespace - Optional namespace used to restrict the pod listing.
 * @returns Object with `overallScore`, `nodeHealth`, `podHealth`,
 *          `resourceUtilization`, `criticalIssues`, `recommendations` and `summary`.
 */
export async function diagnoseClusterHealth(coreApi, namespace) {
    // Cap on how many pods of each kind are reported as individual issues
    const MAX_REPORTED_PODS = 5;
    const issues = [];
    const isNodeReady = (node) => node.status?.conditions?.some((c) => c.type === 'Ready' && c.status === 'True') ?? false;
    // A pod counts as crash-looping when any container is waiting in
    // CrashLoopBackOff or has restarted more than five times.
    const isCrashLooping = (pod) => pod.status?.containerStatuses?.some((c) => c.state?.waiting?.reason === 'CrashLoopBackOff' || (c.restartCount || 0) > 5) ?? false;
    // 1. Node health
    const nodesResponse = await coreApi.listNode();
    const nodes = nodesResponse.items;
    const readyNodes = nodes.filter(isNodeReady);
    const notReadyNodes = nodes.filter((n) => !isNodeReady(n));
    // Not Ready node issues
    for (const node of notReadyNodes) {
        const nodeName = node.metadata?.name || 'unknown';
        issues.push({
            type: 'Node Not Ready',
            severity: 'critical',
            message: `Node "${nodeName}" is not in Ready state`,
            rootCause: 'Node has encountered a problem',
            // Interpolate the real node name, matching the pod commands below
            solution: `Check detailed cause with: kubectl describe node ${nodeName}`,
            resource: {
                kind: 'Node',
                name: nodeName,
                namespace: '',
            },
            timestamp: new Date().toISOString(),
        });
    }
    // 2. Pod health
    const podsResponse = namespace
        ? await coreApi.listNamespacedPod({ namespace })
        : await coreApi.listPodForAllNamespaces();
    const pods = podsResponse.items;
    const pendingPods = pods.filter((p) => p.status?.phase === 'Pending');
    const crashLoopPods = pods.filter(isCrashLooping);
    const podStats = {
        total: pods.length,
        running: pods.filter((p) => p.status?.phase === 'Running').length,
        pending: pendingPods.length,
        failed: pods.filter((p) => p.status?.phase === 'Failed').length,
        crashLooping: crashLoopPods.length,
    };
    // Pending pod issues
    for (const pod of pendingPods.slice(0, MAX_REPORTED_PODS)) {
        const podName = pod.metadata?.name || 'unknown';
        const podNamespace = pod.metadata?.namespace || 'default';
        issues.push({
            type: 'Pod Pending',
            severity: 'high',
            message: `Pod "${podName}" is in Pending state`,
            rootCause: 'Cannot be scheduled or image cannot be pulled',
            solution: `kubectl describe pod ${podName} -n ${podNamespace}`,
            resource: {
                kind: 'Pod',
                name: podName,
                namespace: podNamespace,
            },
            timestamp: new Date().toISOString(),
        });
    }
    // CrashLoop pod issues
    for (const pod of crashLoopPods.slice(0, MAX_REPORTED_PODS)) {
        const podName = pod.metadata?.name || 'unknown';
        const podNamespace = pod.metadata?.namespace || 'default';
        issues.push({
            type: 'CrashLoopBackOff',
            severity: 'critical',
            message: `Pod "${podName}" is in CrashLoop state`,
            rootCause: 'Container is repeatedly failing',
            solution: `kubectl logs ${podName} -n ${podNamespace} --previous`,
            resource: {
                kind: 'Pod',
                name: podName,
                namespace: podNamespace,
            },
            timestamp: new Date().toISOString(),
        });
    }
    // 3. Resource utilization: placeholder values, nothing is measured here
    const resourceUtilization = {
        cpu: 0,
        memory: 0,
        storage: 0,
    };
    // 4. Overall health score
    const overallScore = calculateOverallScore(nodes.length, readyNodes.length, podStats, issues);
    // 5. Filter critical issues only
    const criticalIssues = issues.filter(i => i.severity === 'critical');
    // 6. Recommendations
    const recommendations = generateClusterRecommendations(issues, podStats, nodes.length);
    // 7. Summary
    const summary = generateClusterSummary(nodes.length, readyNodes.length, podStats, overallScore);
    return {
        overallScore,
        nodeHealth: {
            total: nodes.length,
            ready: readyNodes.length,
            notReady: notReadyNodes.length,
            issues: issues.filter(i => i.resource?.kind === 'Node'),
        },
        podHealth: {
            ...podStats,
            issues: issues.filter(i => i.resource?.kind === 'Pod'),
        },
        resourceUtilization,
        criticalIssues,
        recommendations,
        summary,
    };
}
|
|
112
|
+
/**
 * Calculate the overall health score (0-100).
 *
 * Starting from 100, the score loses:
 *   - half of the percentage of nodes that are not ready,
 *   - 30% of the percentage of pods that are not running,
 *   - 5 points per crash-looping pod,
 *   - 10 / 5 / 2 points per critical / high / medium issue.
 * The result is clamped to the range [0, 100].
 *
 * An empty population (no nodes, or no pods such as in an empty namespace)
 * contributes no deduction instead of producing NaN through 0/0.
 *
 * @param totalNodes - Number of nodes in the cluster.
 * @param readyNodes - Number of nodes in Ready state.
 * @param podStats - Pod counters; `running`, `total` and `crashLooping` are read.
 * @param issues - Collected issues; only `severity` is read.
 * @returns The clamped score.
 */
function calculateOverallScore(totalNodes, readyNodes, podStats, issues) {
    // Share of `part` in `whole` as a percentage; nothing to penalize when `whole` is 0
    const percentOf = (part, whole) => (whole > 0 ? (part / whole) * 100 : 100);
    const issuePenalty = new Map([
        ['critical', 10],
        ['high', 5],
        ['medium', 2],
    ]);
    let score = 100;
    // Deduct for node status
    score -= (100 - percentOf(readyNodes, totalNodes)) * 0.5;
    // Deduct for pod status
    score -= (100 - percentOf(podStats.running, podStats.total)) * 0.3;
    // CrashLoop is severe
    score -= podStats.crashLooping * 5;
    // Deduct for issues; unknown severities cost nothing
    for (const issue of issues) {
        score -= issuePenalty.get(issue.severity) ?? 0;
    }
    return Math.max(0, Math.min(100, score));
}
|
|
136
|
+
/**
 * Produce cluster-level recommendations.
 *
 * Emits, in this order and only when applicable: a notice about critical
 * issues, a notice about crash-looping pods, a notice when more than five
 * pods are Pending, and a hint when the cluster has fewer than three nodes.
 * When none applies, a single "healthy" entry is returned.
 */
function generateClusterRecommendations(issues, podStats, totalNodes) {
    const criticalCount = issues.filter(({ severity }) => severity === 'critical').length;
    // Each rule pairs its trigger with the text to emit
    const rules = [
        [criticalCount > 0, `🔴 Resolve ${criticalCount} Critical issue(s) as top priority`],
        [podStats.crashLooping > 0, `⚠️ ${podStats.crashLooping} pod(s) in CrashLoop state. Check logs immediately`],
        [podStats.pending > 5, `⚠️ ${podStats.pending} pod(s) in Pending state. Check for resource insufficiency`],
        [totalNodes < 3, '💡 For high availability, running at least 3 nodes is recommended'],
    ];
    const advice = [];
    for (const [applies, text] of rules) {
        if (applies) {
            advice.push(text);
        }
    }
    return advice.length > 0 ? advice : ['✅ Cluster is healthy!'];
}
|
|
159
|
+
/**
 * Render the cluster summary text.
 *
 * Shows the score with one decimal, the ready/total node counts and the
 * running/total pod counts, then one warning line for each of the
 * CrashLoop, Pending and Failed counters that is above zero.
 */
function generateClusterSummary(totalNodes, readyNodes, podStats, score) {
    const segments = [
        `Cluster Health Score: ${score.toFixed(1)}/100\n\n`,
        `Nodes: ${readyNodes}/${totalNodes} Ready\n`,
        `Pods: ${podStats.running}/${podStats.total} Running\n`,
    ];
    const counters = [
        ['CrashLoop', podStats.crashLooping],
        ['Pending', podStats.pending],
        ['Failed', podStats.failed],
    ];
    for (const [label, count] of counters) {
        if (count > 0) {
            segments.push(`⚠️ ${label}: ${count}\n`);
        }
    }
    return segments.join('');
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pod diagnostics module
|
|
3
|
+
*
|
|
4
|
+
* Comprehensively analyzes pod status, containers, and events
|
|
5
|
+
* to identify the root cause of problems
|
|
6
|
+
*
|
|
7
|
+
* @author zerry
|
|
8
|
+
*/
|
|
9
|
+
import * as k8s from '@kubernetes/client-node';
|
|
10
|
+
import type { PodDiagnostics, DiagnosticIssue } from '../types.js';
|
|
11
|
+
/**
|
|
12
|
+
* Comprehensive pod diagnostics
|
|
13
|
+
*
|
|
14
|
+
* This is the core feature. Analyzes all pod states
|
|
15
|
+
* to clearly explain "why it's not working"
|
|
16
|
+
*/
|
|
17
|
+
/**
 * @param coreApi - Core API client.
 * @param namespace - Namespace of the pod to diagnose.
 * @param podName - Name of the pod to diagnose.
 * @param metricsApi - Optional metrics client.
 *                     NOTE(review): presumably used to add resource-usage data
 *                     when supplied; the implementation is not visible here - confirm.
 * @returns Promise resolving to the pod's diagnostics.
 */
export declare function diagnosePod(coreApi: k8s.CoreV1Api, namespace: string, podName: string, metricsApi?: k8s.Metrics): Promise<PodDiagnostics>;
|
|
18
|
+
/**
|
|
19
|
+
* Specialized CrashLoopBackOff diagnostics
|
|
20
|
+
*
|
|
21
|
+
* CrashLoop is really tricky, this function accurately identifies the cause
|
|
22
|
+
*/
|
|
23
|
+
/**
 * @param coreApi - Core API client.
 * @param logApi - Log client.
 *                 NOTE(review): presumably used to read the crashing container's
 *                 logs; the implementation is not visible here - confirm.
 * @param namespace - Namespace of the pod.
 * @param podName - Name of the pod.
 * @param containerName - Optional container name.
 *                        NOTE(review): behavior when omitted is not visible here - confirm.
 * @returns Promise resolving to the list of diagnosed issues.
 */
export declare function diagnoseCrashLoop(coreApi: k8s.CoreV1Api, logApi: k8s.Log, namespace: string, podName: string, containerName?: string): Promise<DiagnosticIssue[]>;
|