@zerry_jin/k8s-doctor-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,654 @@
+ /**
+ * Pod diagnostics module
+ *
+ * Comprehensively analyzes pod status, containers, and events
+ * to identify the root cause of problems
+ *
+ * @author zerry
+ */
+ import { Writable } from 'node:stream';
+ import { withRetry } from '../utils/retry.js';
+ /**
+ * Comprehensive pod diagnostics
+ *
+ * This is the core feature: it analyzes the pod's overall state
+ * to explain clearly why it is not working.
+ */
+ export async function diagnosePod(coreApi, namespace, podName, metricsApi) {
+ try {
+ console.error(`[diagnosePod] Starting diagnostics for pod ${podName} in namespace ${namespace}`);
+ // Parallel API calls for better performance
+ const [pod, eventsResponse] = await Promise.all([
+ // 1. Get pod information with retry
+ withRetry(() => coreApi.readNamespacedPod({ name: podName, namespace }), {
+ maxAttempts: 3,
+ initialDelay: 500,
+ }).catch((error) => {
+ console.error(`[diagnosePod] Failed to get pod info:`, error.message);
+ throw new Error(`Cannot read pod ${podName}: ${error.message}`);
+ }),
+ // 2. Get pod events with retry
+ withRetry(() => coreApi.listNamespacedEvent({
+ namespace,
+ fieldSelector: `involvedObject.name=${podName}`,
+ }), {
+ maxAttempts: 2, // Events are less critical, fewer retries
+ initialDelay: 500,
+ }).catch((error) => {
+ console.error(`[diagnosePod] Failed to get events (non-fatal):`, error.message);
+ // Return empty events instead of failing
+ return { items: [] };
+ }),
+ ]);
+ const events = parseEvents(eventsResponse.items);
+ // 3. Analyze container status
+ const containers = parseContainerStatuses(pod.status?.containerStatuses || []);
+ // 4. Detect issues
+ const issues = [];
+ // Container-related issues
+ issues.push(...detectContainerIssues(pod, containers, events));
+ // Image pull issues
+ issues.push(...detectImagePullIssues(pod, events));
+ // Resource-related issues
+ const resources = await analyzeResourceUsage(pod, namespace, podName, metricsApi);
+ issues.push(...detectResourceIssues(pod, resources));
+ // Volume mount issues
+ issues.push(...detectVolumeIssues(pod, events));
+ // Network issues
+ issues.push(...detectNetworkIssues(pod, events));
+ // 5. Calculate health score
+ const healthScore = calculateHealthScore(pod, issues);
+ // 6. Generate summary
+ const summary = generatePodSummary(pod, issues, healthScore);
+ return {
+ podInfo: {
+ name: pod.metadata?.name || podName,
+ namespace: pod.metadata?.namespace || namespace,
+ phase: pod.status?.phase || 'Unknown',
+ startTime: pod.status?.startTime?.toISOString(),
+ nodeName: pod.spec?.nodeName,
+ hostIP: pod.status?.hostIP,
+ podIP: pod.status?.podIP,
+ },
+ containers,
+ issues,
+ resources,
+ events,
+ summary,
+ healthScore,
+ };
+ }
+ catch (error) {
+ console.error(`[diagnosePod] Fatal error:`, error);
+ throw new Error(`Pod diagnosis failed: ${error.message}`);
+ }
+ }
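+ // Hedged usage sketch (not part of this module): with @kubernetes/client-node, a caller
+ // would typically be wired up roughly like this:
+ //   import * as k8s from '@kubernetes/client-node';
+ //   const kc = new k8s.KubeConfig();
+ //   kc.loadFromDefault();
+ //   const coreApi = kc.makeApiClient(k8s.CoreV1Api);
+ //   const metricsApi = new k8s.Metrics(kc); // optional; omit if Metrics Server is not installed
+ //   const report = await diagnosePod(coreApi, 'default', 'my-pod', metricsApi);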
+ /**
+ * Specialized CrashLoopBackOff diagnostics
+ *
+ * CrashLoopBackOff can be tricky to debug; this function pinpoints the likely cause
+ */
+ export async function diagnoseCrashLoop(coreApi, logApi, namespace, podName, containerName) {
+ const issues = [];
+ try {
+ console.error(`[diagnoseCrashLoop] Analyzing pod ${podName} in namespace ${namespace}`);
+ const pod = await withRetry(() => coreApi.readNamespacedPod({ name: podName, namespace }), { maxAttempts: 3 });
+ const containerStatuses = pod.status?.containerStatuses || [];
+ for (const status of containerStatuses) {
+ // If containerName specified, only that container
+ if (containerName && status.name !== containerName)
+ continue;
+ const restartCount = status.restartCount || 0;
+ // Detect CrashLoop
+ if (restartCount > 3 || status.state?.waiting?.reason === 'CrashLoopBackOff') {
+ // Check termination reason from previous state
+ const lastTerminated = status.lastState?.terminated;
+ let rootCause = 'unknown';
+ let solution = '';
+ if (lastTerminated) {
+ const exitCode = lastTerminated.exitCode;
+ // Analyze exit code
+ if (exitCode === 0) {
+ rootCause = 'Container exited normally but keeps restarting due to restart policy';
+ solution = 'Change spec.restartPolicy to "Never" or "OnFailure"\n```yaml\nspec:\n restartPolicy: OnFailure\n```';
+ }
+ else if (exitCode === 1) {
+ rootCause = 'Application error caused termination';
+ solution = 'Check logs to fix application errors\n```bash\nkubectl logs ' + podName + ' -n ' + namespace + ' -c ' + status.name + ' --previous\n```';
+ }
+ else if (exitCode === 137) {
+ rootCause = 'OOM (Out Of Memory) - Container was killed due to insufficient memory';
+ solution = 'Increase memory limit or optimize application memory usage\n```yaml\nresources:\n limits:\n memory: "512Mi" # Set higher than current\n```';
+ }
+ else if (exitCode === 143) {
+ rootCause = 'Terminated by SIGTERM - Received normal termination signal';
+ solution = 'Graceful shutdown may not be properly implemented. Try increasing terminationGracePeriodSeconds';
+ }
+ else if (exitCode === 126) {
+ rootCause = 'Permission denied - Executable file lacks execute permission';
+ solution = 'Grant execute permission with chmod +x in Dockerfile';
+ }
+ else if (exitCode === 127) {
+ rootCause = 'Command not found - CMD/ENTRYPOINT command does not exist';
+ solution = 'Verify CMD/ENTRYPOINT path in Dockerfile';
+ }
+ else {
+ rootCause = `Unknown error (exit code ${exitCode})`;
+ solution = 'Check logs to identify detailed cause';
+ }
+ }
+ // Find additional clues in logs
+ try {
+ let logData = '';
+ const stream = new Writable({
+ write(chunk, _encoding, next) {
+ logData += chunk.toString();
+ next();
+ },
+ });
+ await withRetry(() => logApi.log(namespace, podName, status.name, stream, {
+ previous: true,
+ tailLines: 50,
+ }), { maxAttempts: 2 });
+ // Find error patterns in logs
+ const relevantLogs = [];
+ const lines = logData.split('\n');
+ for (const line of lines) {
+ if (line.toLowerCase().includes('error') ||
+ line.toLowerCase().includes('exception') ||
+ line.toLowerCase().includes('fatal') ||
+ line.toLowerCase().includes('panic')) {
+ relevantLogs.push(line.trim());
+ }
+ }
+ issues.push({
+ type: 'CrashLoopBackOff',
+ severity: 'critical',
+ message: `Container "${status.name}" has restarted ${restartCount} times`,
+ rootCause,
+ solution,
+ resource: {
+ kind: 'Pod',
+ name: podName,
+ namespace,
+ },
+ relevantLogs: relevantLogs.slice(0, 10), // Max 10 lines
+ timestamp: new Date().toISOString(),
+ });
+ }
+ catch (logError) {
+ console.error(`[diagnoseCrashLoop] Failed to retrieve logs for ${status.name}:`, logError.message);
+ // Add issue even if logs cannot be retrieved
+ issues.push({
+ type: 'CrashLoopBackOff',
+ severity: 'critical',
+ message: `Container "${status.name}" has restarted ${restartCount} times`,
+ rootCause,
+ solution,
+ resource: {
+ kind: 'Pod',
+ name: podName,
+ namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ }
+ }
+ }
+ catch (error) {
+ throw new Error(`CrashLoop diagnostics failed: ${error.message}`);
+ }
+ return issues;
+ }
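+ // Hedged usage note: logApi above is assumed to be a @kubernetes/client-node Log instance
+ // (e.g. new k8s.Log(kc)); its log(namespace, pod, container, stream, { previous: true, tailLines })
+ // call streams the previous (crashed) container's output into the Writable sink defined above.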
+ /**
+ * Parse container statuses
+ */
+ function parseContainerStatuses(statuses) {
+ return statuses.map(s => ({
+ name: s.name,
+ ready: s.ready || false,
+ restartCount: s.restartCount || 0,
+ state: s.state || {},
+ lastState: s.lastState,
+ image: s.image,
+ imageID: s.imageID,
+ }));
+ }
+ /**
+ * Parse events
+ */
+ function parseEvents(items) {
+ return items
+ .map(e => ({
+ type: e.type,
+ reason: e.reason,
+ message: e.message,
+ count: e.count || 1,
+ firstTimestamp: e.firstTimestamp,
+ lastTimestamp: e.lastTimestamp,
+ source: e.source?.component,
+ }))
+ .sort((a, b) => new Date(b.lastTimestamp).getTime() - new Date(a.lastTimestamp).getTime());
+ }
+ /**
+ * Detect container issues
+ */
+ function detectContainerIssues(pod, containers, _events) {
+ const issues = [];
+ for (const container of containers) {
+ // Check Waiting state
+ if (container.state.waiting) {
+ const reason = container.state.waiting.reason;
+ const message = container.state.waiting.message;
+ if (reason === 'ErrImagePull' || reason === 'ImagePullBackOff') {
+ // Image pull issues are handled in a separate function
+ continue;
+ }
+ issues.push({
+ type: `Container Waiting: ${reason}`,
+ severity: 'high',
+ message: `Container "${container.name}" is in ${reason} state`,
+ rootCause: message || 'Unknown reason',
+ solution: getWaitingSolution(reason),
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ // Check Terminated state
+ if (container.state.terminated && container.state.terminated.exitCode !== 0) {
+ issues.push({
+ type: 'Container Terminated',
+ severity: 'high',
+ message: `Container "${container.name}" terminated with exit code ${container.state.terminated.exitCode}`,
+ rootCause: container.state.terminated.reason || 'Unknown reason',
+ solution: getTerminatedSolution(container.state.terminated.exitCode),
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ }
+ return issues;
+ }
+ /**
+ * Detect image pull issues
+ */
+ function detectImagePullIssues(pod, events) {
+ const issues = [];
+ const imagePullEvents = events.filter(e => e.reason === 'Failed' && e.message.includes('pull'));
+ if (imagePullEvents.length > 0) {
+ const event = imagePullEvents[0];
+ let rootCause = 'Cannot download image';
+ let solution = '';
+ if (event.message.includes('not found') || event.message.includes('manifest unknown')) {
+ rootCause = 'Image or tag does not exist';
+ solution = '1. Verify image name and tag\n2. Test locally with docker pull <image>';
+ }
+ else if (event.message.includes('unauthorized') || event.message.includes('authentication')) {
+ rootCause = 'Image registry authentication failed';
+ solution = '```bash\nkubectl create secret docker-registry regcred \\\n --docker-server=<registry> \\\n --docker-username=<username> \\\n --docker-password=<password>\n\n# Add to Pod spec:\nspec:\n imagePullSecrets:\n - name: regcred\n```';
+ }
+ else if (event.message.includes('timeout')) {
+ rootCause = 'Network timeout - Cannot access registry';
+ solution = '1. Check cluster network connectivity\n2. Verify firewall/proxy settings\n3. Verify registry URL is correct';
+ }
+ issues.push({
+ type: 'ImagePullBackOff',
+ severity: 'critical',
+ message: 'Cannot pull container image',
+ rootCause,
+ solution,
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ relatedEvents: [event],
+ timestamp: new Date().toISOString(),
+ });
+ }
+ return issues;
+ }
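+ // Illustrative (hypothetical) kubelet event messages the branches above aim to match:
+ //   'Failed to pull image "myapp:v2": ... manifest unknown'      -> missing image or tag
+ //   'Failed to pull image "private/app:1.0": ... unauthorized'   -> registry credentials missing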
+ /**
+ * Analyze resource usage
+ *
+ * Collects real-time metrics from Metrics Server if available
+ */
+ async function analyzeResourceUsage(pod, namespace, podName, metricsApi) {
+ const containers = pod.spec?.containers || [];
+ let totalCpuRequest = 0;
+ let totalCpuLimit = 0;
+ let totalMemRequest = 0;
+ let totalMemLimit = 0;
+ for (const container of containers) {
+ const requests = container.resources?.requests || {};
+ const limits = container.resources?.limits || {};
+ totalCpuRequest += parseCPU(requests.cpu || '0');
+ totalCpuLimit += parseCPU(limits.cpu || '0');
+ totalMemRequest += parseMemory(requests.memory || '0');
+ totalMemLimit += parseMemory(limits.memory || '0');
+ }
+ // Try to get real-time metrics from Metrics Server
+ let currentCpu;
+ let currentMem;
+ let cpuUsagePercent;
+ let memUsagePercent;
+ if (metricsApi) {
+ try {
+ const metrics = await withRetry(() => metricsApi.getPodMetrics(namespace), {
+ maxAttempts: 2,
+ initialDelay: 500,
+ shouldRetry: (error) => {
+ // Don't retry if Metrics Server is not installed
+ if (error.statusCode === 404)
+ return false;
+ return true;
+ },
+ });
+ // Find the specific pod in the metrics list
+ const podMetric = metrics.items?.find((item) => item.metadata?.name === podName);
+ if (podMetric) {
+ // Sum up all container metrics
+ let totalCpuUsage = 0;
+ let totalMemUsage = 0;
+ for (const container of podMetric.containers || []) {
+ // CPU is usually reported in nanocores; convert to millicores
+ if (container.usage?.cpu) {
+ totalCpuUsage += parseMetricCPU(container.usage.cpu);
+ }
+ // Memory is usually reported in Ki; convert to bytes
+ if (container.usage?.memory) {
+ totalMemUsage += parseMetricMemory(container.usage.memory);
+ }
+ }
+ currentCpu = totalCpuUsage;
+ currentMem = totalMemUsage;
+ // Calculate usage percentages
+ if (totalCpuLimit > 0) {
+ cpuUsagePercent = (currentCpu / totalCpuLimit) * 100;
+ }
+ if (totalMemLimit > 0) {
+ memUsagePercent = (currentMem / totalMemLimit) * 100;
+ }
+ }
+ }
+ catch (error) {
+ // Metrics Server not available or pod metrics not ready
+ // This is fine, we'll just show spec values
+ if (error.statusCode !== 404) {
+ console.error(`[analyzeResourceUsage] Failed to get metrics (non-fatal):`, error.message);
+ }
+ }
+ }
+ return {
+ cpu: {
+ current: currentCpu,
+ requested: totalCpuRequest,
+ limit: totalCpuLimit,
+ usagePercent: cpuUsagePercent,
+ isThrottled: cpuUsagePercent !== undefined && cpuUsagePercent >= 80,
+ },
+ memory: {
+ current: currentMem,
+ requested: totalMemRequest,
+ limit: totalMemLimit,
+ usagePercent: memUsagePercent,
+ isOOMRisk: memUsagePercent !== undefined && memUsagePercent >= 90,
+ },
+ };
+ }
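+ // Example of the returned shape (illustrative numbers): with a 500m CPU limit and ~400m observed,
+ // cpu.usagePercent is 80 and cpu.isThrottled is true; when no limits are set, usagePercent stays
+ // undefined and the "Missing CPU/Memory Limit" checks in detectResourceIssues fire instead.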
+ /**
+ * Detect resource issues
+ */
+ function detectResourceIssues(pod, resources) {
+ const issues = [];
+ // Check for high CPU usage (throttling)
+ if (resources.cpu.isThrottled && resources.cpu.usagePercent !== undefined) {
+ issues.push({
+ type: 'High CPU Usage',
+ severity: 'high',
+ message: `CPU usage is high (${resources.cpu.usagePercent.toFixed(1)}%)`,
+ rootCause: 'CPU limit may be too low for current workload',
+ solution: `Increase CPU limit or optimize application:\n\`\`\`yaml\nresources:\n limits:\n cpu: "${Math.ceil((resources.cpu.limit || 1000) * 1.5)}m" # Increased by 50%\n\`\`\``,
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ // Check for OOM risk
+ if (resources.memory.isOOMRisk && resources.memory.usagePercent !== undefined) {
+ issues.push({
+ type: 'OOM Risk',
+ severity: 'critical',
+ message: `Memory usage is critically high (${resources.memory.usagePercent.toFixed(1)}%)`,
+ rootCause: 'Pod is at risk of OOM kill - memory usage exceeds 90% of limit',
+ solution: `Increase memory limit immediately:\n\`\`\`yaml\nresources:\n limits:\n memory: "${Math.ceil((resources.memory.limit || 512 * 1024 * 1024) / (1024 * 1024) * 1.5)}Mi" # Increased by 50%\n\`\`\``,
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ // When resource limits are not set
+ if (!resources.cpu.limit) {
+ issues.push({
+ type: 'Missing CPU Limit',
+ severity: 'medium',
+ message: 'CPU limit is not set',
+ rootCause: 'CPU usage can increase without limit',
+ solution: '```yaml\nresources:\n limits:\n cpu: "1000m"\n requests:\n cpu: "100m"\n```',
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ if (!resources.memory.limit) {
+ issues.push({
+ type: 'Missing Memory Limit',
+ severity: 'high',
+ message: 'Memory limit is not set',
+ rootCause: 'Memory leak can affect entire node',
+ solution: '```yaml\nresources:\n limits:\n memory: "512Mi"\n requests:\n memory: "128Mi"\n```',
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ timestamp: new Date().toISOString(),
+ });
+ }
+ return issues;
+ }
+ /**
+ * Detect volume issues
+ */
+ function detectVolumeIssues(pod, events) {
+ const issues = [];
+ const volumeEvents = events.filter(e => e.message.includes('volume') || e.message.includes('mount'));
+ for (const event of volumeEvents) {
+ if (event.type === 'Warning') {
+ issues.push({
+ type: 'Volume Mount Issue',
+ severity: 'high',
+ message: 'Volume mount failed',
+ rootCause: event.message,
+ solution: '1. Verify PVC is in Bound state\n2. Verify storage class is correct\n3. Check status with kubectl describe pvc <pvc-name>',
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ relatedEvents: [event],
+ timestamp: new Date().toISOString(),
+ });
+ }
+ }
+ return issues;
+ }
+ /**
+ * Detect network issues
+ */
+ function detectNetworkIssues(pod, events) {
+ const issues = [];
+ const networkEvents = events.filter(e => e.message.includes('network') || e.message.includes('CNI'));
+ for (const event of networkEvents) {
+ if (event.type === 'Warning') {
+ issues.push({
+ type: 'Network Configuration Issue',
+ severity: 'high',
+ message: 'Network configuration problem',
+ rootCause: event.message,
+ solution: '1. Check CNI plugin status\n2. Check network policy\n3. Verify Pod CIDR range',
+ resource: {
+ kind: 'Pod',
+ name: pod.metadata?.name,
+ namespace: pod.metadata?.namespace,
+ },
+ relatedEvents: [event],
+ timestamp: new Date().toISOString(),
+ });
+ }
+ }
+ return issues;
+ }
+ /**
+ * Calculate health score
+ */
+ function calculateHealthScore(pod, issues) {
+ let score = 100;
+ // Deductions based on pod phase
+ const phase = pod.status?.phase;
+ if (phase === 'Failed')
+ score -= 100;
+ else if (phase === 'Pending')
+ score -= 30;
+ else if (phase === 'Unknown')
+ score -= 50;
+ // Deductions based on issues
+ for (const issue of issues) {
+ if (issue.severity === 'critical')
+ score -= 30;
+ else if (issue.severity === 'high')
+ score -= 20;
+ else if (issue.severity === 'medium')
+ score -= 10;
+ else if (issue.severity === 'low')
+ score -= 5;
+ }
+ return Math.max(0, Math.min(100, score));
+ }
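+ // Worked example: a Running pod (no phase deduction) with one critical and one medium issue
+ // scores 100 - 30 - 10 = 60; a Pending pod with a single critical issue scores 100 - 30 - 30 = 40.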
+ /**
+ * Generate pod summary
+ */
+ function generatePodSummary(pod, issues, healthScore) {
+ const phase = pod.status?.phase || 'Unknown';
+ const containerCount = pod.spec?.containers?.length || 0;
+ const readyContainers = pod.status?.containerStatuses?.filter((c) => c.ready).length || 0;
+ let summary = `Pod "${pod.metadata?.name}" is currently in ${phase} state.\n`;
+ summary += `Containers: ${readyContainers}/${containerCount} ready\n`;
+ summary += `Health: ${healthScore}/100\n\n`;
+ if (issues.length === 0) {
+ summary += '✅ No issues found!';
+ }
+ else {
+ summary += `⚠️ ${issues.length} issue(s) detected.\n`;
+ const critical = issues.filter(i => i.severity === 'critical').length;
+ const high = issues.filter(i => i.severity === 'high').length;
+ if (critical > 0)
+ summary += ` - Critical: ${critical}\n`;
+ if (high > 0)
+ summary += ` - High: ${high}\n`;
+ }
+ return summary;
+ }
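+ // Example output (illustrative pod name and values):
+ //   Pod "web-7d9f" is currently in Running state.
+ //   Containers: 1/2 ready
+ //   Health: 50/100
+ //
+ //   ⚠️ 2 issue(s) detected.
+ //    - Critical: 1
+ //    - High: 1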
+ // ===== Helper functions =====
+ function getWaitingSolution(reason) {
+ const solutions = {
+ 'CreateContainerConfigError': 'Check container configuration (ConfigMap, Secret, etc.)',
+ 'InvalidImageName': 'Verify image name format',
+ 'CreateContainerError': 'Check container creation settings',
+ };
+ return solutions[reason] || 'Check logs and events to identify the cause';
+ }
+ function getTerminatedSolution(exitCode) {
+ const solutions = {
+ 1: 'Check application logs to fix errors',
+ 137: 'Increase memory limit (OOM killed)',
+ 143: 'Verify graceful shutdown implementation',
+ 126: 'Check executable permissions (chmod +x)',
+ 127: 'Verify CMD/ENTRYPOINT path',
+ };
+ return solutions[exitCode] || `Check logs for exit code ${exitCode}`;
+ }
+ function parseCPU(cpu) {
+ if (cpu.endsWith('m')) {
+ return parseInt(cpu.slice(0, -1));
+ }
+ return parseFloat(cpu) * 1000;
+ }
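+ // e.g. parseCPU('250m') -> 250 (millicores); parseCPU('0.5') -> 500; parseCPU('2') -> 2000.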
+ function parseMemory(mem) {
+ const units = {
+ 'Ki': 1024,
+ 'Mi': 1024 * 1024,
+ 'Gi': 1024 * 1024 * 1024,
+ 'K': 1000,
+ 'M': 1000 * 1000,
+ 'G': 1000 * 1000 * 1000,
+ };
+ for (const [unit, multiplier] of Object.entries(units)) {
+ if (mem.endsWith(unit)) {
+ return parseFloat(mem.slice(0, -unit.length)) * multiplier;
+ }
+ }
+ return parseFloat(mem);
+ }
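+ // e.g. parseMemory('128Mi') -> 134217728 (bytes); parseMemory('1Gi') -> 1073741824; parseMemory('1G') -> 1000000000.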
+ /**
+ * Parse CPU from Metrics API format
+ * Metrics API returns nanocores (e.g., "123456789n") or millicores (e.g., "123m")
+ */
+ function parseMetricCPU(cpu) {
+ if (cpu.endsWith('n')) {
+ // Nanocores to millicores: divide by 1,000,000
+ return parseInt(cpu.slice(0, -1)) / 1_000_000;
+ }
+ else if (cpu.endsWith('m')) {
+ // Already in millicores
+ return parseInt(cpu.slice(0, -1));
+ }
+ else {
+ // Cores to millicores: multiply by 1000
+ return parseFloat(cpu) * 1000;
+ }
+ }
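+ // e.g. parseMetricCPU('250000000n') -> 250 (millicores); parseMetricCPU('123m') -> 123; parseMetricCPU('1') -> 1000.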
+ /**
+ * Parse Memory from Metrics API format
+ * Metrics API usually returns memory in Ki (e.g., "123456Ki")
+ */
+ function parseMetricMemory(mem) {
+ if (mem.endsWith('Ki')) {
+ return parseInt(mem.slice(0, -2)) * 1024;
+ }
+ else if (mem.endsWith('Mi')) {
+ return parseInt(mem.slice(0, -2)) * 1024 * 1024;
+ }
+ else if (mem.endsWith('Gi')) {
+ return parseInt(mem.slice(0, -2)) * 1024 * 1024 * 1024;
+ }
+ // Assume bytes
+ return parseInt(mem);
+ }
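+ // e.g. parseMetricMemory('131072Ki') -> 134217728 (bytes); parseMetricMemory('256Mi') -> 268435456.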
@@ -0,0 +1,11 @@
+ #!/usr/bin/env node
+ /**
+ * K8s Doctor MCP Server
+ *
+ * MCP server for AI-powered Kubernetes cluster diagnosis and problem solving.
+ * Goes beyond simple queries - analyzes error logs, identifies root causes, and suggests solutions.
+ *
+ * @author zerry
+ * @license MIT
+ */
+ export {};