@gtadi/k8s-node-debugger 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,16 +22,37 @@ node bin/k8s-node-debugger.js <node-name>
22
22
 
23
23
  ![Node Health view](https://raw.githubusercontent.com/goutamtadi1/k8s-node-debugger/main/docs/screenshot-health.png)
24
24
 
25
+ ### GPU Health — temperature, power, utilization, memory, ECC errors, clock throttle
26
+
27
+ ![GPU Health view](https://raw.githubusercontent.com/goutamtadi1/k8s-node-debugger/main/docs/screenshot-gpu-health.png)
28
+
29
+ ### GPU Status — driver version, CUDA, per-GPU metrics, processes
30
+
31
+ ![GPU Status view](https://raw.githubusercontent.com/goutamtadi1/k8s-node-debugger/main/docs/screenshot-gpu-status.png)
32
+
25
33
  ## Install
26
34
 
35
+ ```bash
36
+ npm install -g @gtadi/k8s-node-debugger
37
+ ```
38
+
39
+ Or run without installing:
40
+
41
+ ```bash
42
+ npx @gtadi/k8s-node-debugger <node-name>
43
+ ```
44
+
45
+ Requires `kubectl` on your PATH with an active kubeconfig. The debug image (`nicolaka/netshoot`) is pulled from Docker Hub on first use.
46
+
47
+ ### From source
48
+
27
49
  ```bash
28
50
  git clone git@github.com:goutamtadi1/k8s-node-debugger.git
29
51
  cd k8s-node-debugger
30
52
  npm install
53
+ node bin/k8s-node-debugger.js <node-name>
31
54
  ```
32
55
 
33
- Requires `kubectl` on your PATH with an active kubeconfig. The debug image (`nicolaka/netshoot`) is pulled from Docker Hub on first use.
34
-
35
56
  ## Usage
36
57
 
37
58
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gtadi/k8s-node-debugger",
3
- "version": "1.0.1",
3
+ "version": "1.0.2",
4
4
  "description": "Spin up a privileged debug pod on a target Kubernetes node and inspect its network stack (iptables, resolv.conf, conntrack, routes, sockets) from a browser UI.",
5
5
  "bin": {
6
6
  "k8s-node-debugger": "bin/k8s-node-debugger.js"
package/public/app.js CHANGED
@@ -256,6 +256,7 @@ const FANCY_PROBES = new Set([
256
256
  'iptables', 'iptables-nat',
257
257
  'conntrack', 'conntrack-stats', 'conntrack-count',
258
258
  'mem-info', 'mem-pressure', 'oom-kills', 'kubelet-logs', 'disk-usage', 'cpu-stat',
259
+ 'gpu-info', 'gpu-health', 'gpu-processes',
259
260
  ]);
260
261
 
261
262
  function buildProbePanel(probe) {
@@ -385,6 +386,9 @@ function tryFancyRender(id, output, container) {
385
386
  'kubelet-logs': () => renderKubeletLogsView(output, container),
386
387
  'disk-usage': () => renderDiskView(output, container),
387
388
  'cpu-stat': () => renderCpuView(output, container),
389
+ 'gpu-info': () => renderGpuInfoView(output, container),
390
+ 'gpu-health': () => renderGpuHealthView(output, container),
391
+ 'gpu-processes': () => renderGpuProcessesView(output, container),
388
392
  };
389
393
  if (!renderers[id]) return false;
390
394
  try { renderers[id](); return true; } catch (e) { console.error('[fancy render]', id, e); return false; }
@@ -451,6 +451,415 @@
451
451
  container.appendChild(wrap);
452
452
  }
453
453
 
454
+ /* ══════════════════════════════════════════════════════════════════════
455
+ * GPU status — nvidia-smi plain text table
456
+ * ══════════════════════════════════════════════════════════════════════ */
457
/**
 * Render the plain-text `nvidia-smi` table as the "GPU status" view: a
 * version strip, one card per GPU, and a process table.
 *
 * Falls back to showing the raw text (class `output`) when the output does
 * not contain an nvidia-smi banner or when no GPU block can be parsed.
 *
 * @param {string} raw - Raw probe output.
 * @param {HTMLElement} container - Element whose content is replaced.
 */
function renderGpuInfoView(raw, container) {
  const showRaw = () => { container.className = 'output'; container.textContent = raw; };
  if (!raw.includes('NVIDIA-SMI')) { showRaw(); return; }

  const lines = raw.split('\n');

  // Version header
  const verLine = lines.find(l => l.includes('NVIDIA-SMI')) || '';
  const smiVer = verLine.match(/NVIDIA-SMI\s+(\S+)/)?.[1] || '—';
  const driverVer = verLine.match(/Driver Version:\s+(\S+)/)?.[1] || '—';
  const cudaVer = verLine.match(/CUDA Version:\s+(\S+)/)?.[1] || '—';

  // Collect GPU data. The GPU table has ONE `|====+====+====|` separator
  // (note the `+` column junctions) followed by one block of content lines
  // per GPU, each block terminated by a `+----+` border. The table ends at
  // the first line that is neither a border nor a `|` row.
  const gpus = [];
  let inGpu = false;
  let buf = [];
  const flushBlock = () => {
    if (buf.length >= 2) {
      const g = parseNvidiaSmiGpuBlock(buf);
      if (g) gpus.push(g);
    }
    buf = [];
  };
  for (const line of lines) {
    const t = line.trim();
    // Everything from the process table onwards is handled separately below.
    if (t.includes('Processes:')) break;
    if (/^\|[=+]+\|?$/.test(t)) { inGpu = true; buf = []; continue; }
    if (!inGpu) continue;
    if (t.startsWith('+')) { flushBlock(); continue; } // next GPU may follow
    if (t.startsWith('|')) { buf.push(t); continue; }
    flushBlock();
    inGpu = false;
  }
  flushBlock();

  // Nothing recognisable (e.g. "NVIDIA-SMI has failed ..."): raw text is
  // more useful than an empty view with a bogus version strip.
  if (!gpus.length) { showRaw(); return; }

  // Processes
  let noProcs = raw.includes('No running processes found');
  const procs = [];
  if (!noProcs) {
    const procRe = /\|\s+(\d+)\s+\S+\s+\S+\s+(\d+)\s+(\S+)\s+(.*?)\s+(\d+)MiB\s+\|/;
    let inP = false;
    for (const line of lines) {
      if (line.includes('Processes:')) { inP = true; continue; }
      if (!inP) continue;
      if (/^\|[=]+/.test(line)) continue;
      if (line.startsWith('+')) break;
      if (line.includes('No running')) { noProcs = true; break; }
      const m = line.match(procRe);
      if (m) procs.push({ gpu: m[1], pid: m[2], type: m[3], name: m[4].trim(), mem: m[5] });
    }
  }

  // Severity classes shared with the other health views.
  const tempCls = (t) => (t >= 85 ? 'hv-crit' : t >= 70 ? 'hv-warn' : 'hv-ok');
  const pctCls = (v) => (v >= 90 ? 'hv-crit' : v >= 70 ? 'hv-warn' : 'hv-ok');
  const powerCls = (draw, cap) => {
    if (!cap) return 'hv-ok';
    const ratio = draw / cap;
    return ratio >= 0.95 ? 'hv-crit' : ratio >= 0.80 ? 'hv-warn' : 'hv-ok';
  };

  const memBar = (used, total) => {
    if (!total) return '';
    const pct = Math.min(Math.round(used / total * 100), 100);
    const cls = pctCls(pct);
    return `<div class="gpu-health-metric">
      <div class="gpu-health-metric-hdr">
        <span class="gpu-health-lbl">Memory</span>
        <span class="gpu-health-val ${cls}">${pct}%</span>
      </div>
      <div class="hv-gauge-bar gpu-health-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>
      <div class="gpu-health-sub">${used} MiB used · ${total} MiB total</div>
    </div>`;
  };

  const pwrBar = (draw, cap) => {
    if (!cap) return '';
    const pct = Math.min(Math.round(draw / cap * 100), 100);
    const cls = powerCls(draw, cap);
    return `<div class="gpu-health-metric">
      <div class="gpu-health-metric-hdr">
        <span class="gpu-health-lbl">Power</span>
        <span class="gpu-health-val ${cls}">${draw} W / ${cap} W</span>
      </div>
      <div class="hv-gauge-bar gpu-health-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>
    </div>`;
  };

  // Numeric fields come from parseInt and are interpolated as-is; every
  // string field goes through h() before reaching innerHTML.
  const gpuCards = gpus.map(g => `
    <div class="gpu-health-card">
      <div class="gpu-health-hdr">
        <span class="gpu-proc-idx">GPU ${h(g.index)}</span>
        <span class="gpu-health-name">${h(g.name)}</span>
      </div>
      <div class="gpu-health-row">
        <div class="gpu-health-cell">
          <span class="gpu-health-lbl">GPU Temp</span>
          <span class="gpu-health-val ${tempCls(g.temp)}">${g.temp}°C</span>
        </div>
        <div class="gpu-health-cell">
          <span class="gpu-health-lbl">Perf</span>
          <span class="gpu-health-val">${h(g.perf)}</span>
        </div>
        <div class="gpu-health-cell">
          <span class="gpu-health-lbl">GPU Util</span>
          <span class="gpu-health-val ${pctCls(g.utilGpu)}">${g.utilGpu}%</span>
        </div>
        <div class="gpu-health-cell">
          <span class="gpu-health-lbl">Fan</span>
          <span class="gpu-health-val">${g.fan !== null ? g.fan + '%' : '—'}</span>
        </div>
      </div>
      ${memBar(g.memUsed, g.memTotal)}
      ${pwrBar(g.pwrDraw, g.pwrCap)}
      <div class="gpu-info-meta">
        <div class="gpu-info-meta-row">
          <span class="gpu-health-lbl">Bus ID</span>
          <span class="gpu-info-meta-val">${h(g.busId)}</span>
        </div>
        <div class="gpu-info-meta-row">
          <span class="gpu-health-lbl">Persistence</span>
          <span class="gpu-info-meta-val ${g.persistence === 'On' ? 'hv-ok' : ''}">${h(g.persistence)}</span>
        </div>
        <div class="gpu-info-meta-row">
          <span class="gpu-health-lbl">Compute Mode</span>
          <span class="gpu-info-meta-val">${h(g.computeMode)}</span>
        </div>
        ${g.migMode ? `<div class="gpu-info-meta-row">
          <span class="gpu-health-lbl">MIG Mode</span>
          <span class="gpu-info-meta-val">${h(g.migMode)}</span>
        </div>` : ''}
      </div>
    </div>`).join('');

  const procsHtml = noProcs
    ? '<div class="gpu-info-no-procs">No running processes.</div>'
    : `<div class="hv-top-table-wrap"><table class="hv-top-table">
        <thead><tr><th>GPU</th><th>PID</th><th>Type</th><th>Process</th><th>GPU Mem</th></tr></thead>
        <tbody>${procs.map(p => `<tr>
          <td><span class="gpu-proc-idx">GPU ${h(p.gpu)}</span></td>
          <td class="gpu-proc-pid">${h(p.pid)}</td>
          <td><span class="gpu-proc-type gpu-proc-type-${h(p.type.toLowerCase())}">${h(p.type)}</span></td>
          <td class="gpu-proc-cmd">${h(p.name)}</td>
          <td class="gpu-proc-mem">${h(p.mem)} MiB</td>
        </tr>`).join('')}</tbody>
      </table></div>`;

  const wrap = document.createElement('div');
  wrap.className = 'hv-wrap gpu-health-grid';
  wrap.innerHTML = `
    <div class="gpu-info-versions">
      <div class="gpu-info-ver-item"><span class="gpu-health-lbl">NVIDIA-SMI</span><span class="gpu-info-ver-val">${h(smiVer)}</span></div>
      <div class="gpu-info-ver-item"><span class="gpu-health-lbl">Driver</span><span class="gpu-info-ver-val">${h(driverVer)}</span></div>
      <div class="gpu-info-ver-item"><span class="gpu-health-lbl">CUDA</span><span class="gpu-info-ver-val">${h(cudaVer)}</span></div>
    </div>
    ${gpuCards}
    <div class="gpu-info-procs-section">
      <div class="gpu-info-procs-title">Processes</div>
      ${procsHtml}
    </div>`;

  container.innerHTML = '';
  container.className = '';
  container.appendChild(wrap);
}
612
+
613
/**
 * Parse the content lines of one GPU entry from the default `nvidia-smi`
 * table.
 *
 * Expected layout:
 *   Line 0: | idx name persistence | bus_id disp_a | ecc |
 *   Line 1: | fan tempC perf drawW/capW | usedMiB/totalMiB | util% compute |
 *   Line 2: | ... | ... | mig |   (optional)
 *
 * @param {string[]} lines - The `|`-delimited rows of a single GPU block.
 * @returns {?Object} Parsed fields (strings for index/name/persistence/
 *   busId/dispA/ecc/perf/computeMode/migMode, integers for temp/power/
 *   memory/utilisation, `fan` is an integer or `null` for `N/A`), or `null`
 *   when line 0 or line 1 does not match the expected layout.
 */
function parseNvidiaSmiGpuBlock(lines) {
  const l0 = lines[0] || '';
  const l1 = lines[1] || '';
  const l2 = lines[2] || '';

  const m0 = l0.match(/\|\s+(\d+)\s+(.*?)\s+(On|Off)\s+\|\s+(\S+)\s+(On|Off)\s+\|\s+(\S+)\s+\|/);
  // The fan column is printed either as `N/A` or as a percentage such as
  // `41%`; the optional `%` keeps fan-equipped GPUs from being rejected.
  const m1 = l1.match(/\|\s*(N\/A|\d+)%?\s+(\d+)C\s+(\S+)\s+(\d+)W\s*\/\s*(\d+)W\s+\|\s+(\d+)MiB\s*\/\s*(\d+)MiB\s+\|\s+(\d+)%\s+(\S+)\s+\|/);
  if (!m0 || !m1) return null;

  // Third row only carries the MIG mode in the last column.
  const m2 = l2.match(/\|\s*\|\s*\|\s+(\S+)\s+\|/);
  const int = (s) => Number.parseInt(s, 10);

  return {
    index: m0[1],
    name: m0[2].trim(),
    persistence: m0[3],
    busId: m0[4],
    dispA: m0[5],
    ecc: m0[6],
    fan: m1[1] === 'N/A' ? null : int(m1[1]),
    temp: int(m1[2]),
    perf: m1[3],
    pwrDraw: int(m1[4]),
    pwrCap: int(m1[5]),
    memUsed: int(m1[6]),
    memTotal: int(m1[7]),
    utilGpu: int(m1[8]),
    computeMode: m1[9],
    migMode: m2?.[1] || null,
  };
}
641
+
642
+ /* ══════════════════════════════════════════════════════════════════════
643
+ * GPU health — nvidia-smi --query-gpu CSV
644
+ * Columns: index, name, temp.gpu, temp.mem, power.draw, power.limit,
645
+ * util.gpu, util.mem, mem.used, mem.free, mem.total,
646
+ * ecc.corrected, ecc.uncorrected, throttle_reasons
647
+ * ══════════════════════════════════════════════════════════════════════ */
648
/**
 * Render `nvidia-smi --query-gpu=... --format=csv,noheader` output as one
 * health card per GPU (temperatures, utilisation, memory, power, ECC
 * counters and decoded clock-throttle reasons).
 *
 * Only rows whose first field is a numeric GPU index are treated as data.
 * When no such row exists (empty output, or a plain message such as
 * "nvidia-smi not available ...") the raw text is shown instead of a card
 * full of blanks.
 *
 * @param {string} raw - Raw probe output.
 * @param {HTMLElement} container - Element whose content is replaced.
 */
function renderGpuHealthView(raw, container) {
  // Bit masks of the throttle-reasons hex value and their display names.
  const THROTTLE_REASONS = [
    [0x02, 'App Clock Setting'],
    [0x04, 'SW Power Cap'],
    [0x08, 'HW Slowdown'],
    [0x10, 'Sync Boost'],
    [0x20, 'SW Thermal Slowdown'],
    [0x40, 'HW Thermal Slowdown'],
    [0x80, 'HW Power Brake'],
    [0x100, 'Display Clock Setting'],
  ];

  // Parse a number, mapping anything non-numeric to null.
  const pn = (s) => {
    const n = Number.parseFloat(s);
    return Number.isNaN(n) ? null : n;
  };
  const strip = (s, unit) => pn((s || '').replace(unit, ''));

  const gpus = raw
    .split('\n')
    .map(l => l.trim())
    .filter(l => l)
    .map(l => l.split(',').map(s => s.trim()))
    // A data row starts with the GPU index; anything else is a message.
    .filter(p => p.length > 1 && /^\d+$/.test(p[0]))
    .map(p => ({
      index: p[0],
      name: p[1],
      tempGpu: pn(p[2]),
      tempMem: pn(p[3]),
      powerDraw: strip(p[4], ' W'),
      powerLimit: strip(p[5], ' W'),
      utilGpu: strip(p[6], ' %'),
      utilMem: strip(p[7], ' %'),
      memUsed: strip(p[8], ' MiB'),
      // p[9] is memory.free, which the view does not display.
      memTotal: strip(p[10], ' MiB'),
      eccCorrected: pn(p[11]),
      eccUncorrected: pn(p[12]),
      throttleRaw: (p[13] || '').trim(),
    }));

  if (!gpus.length) {
    container.className = 'output';
    container.textContent = raw;
    return;
  }

  const tempCls = (t) => (t === null ? '' : t >= 85 ? 'hv-crit' : t >= 70 ? 'hv-warn' : 'hv-ok');
  const pctCls = (v) => (v === null ? '' : v >= 90 ? 'hv-crit' : v >= 70 ? 'hv-warn' : 'hv-ok');
  const powerCls = (draw, limit) => {
    if (draw === null || !limit) return 'hv-ok';
    const ratio = draw / limit;
    return ratio >= 0.95 ? 'hv-crit' : ratio >= 0.80 ? 'hv-warn' : 'hv-ok';
  };

  // Values 0 and 1 are treated as "not throttled" and yield no tags.
  const decodeThrottle = (hex) => {
    if (!hex || hex === 'N/A') return [];
    const val = Number.parseInt(hex, 16);
    if (Number.isNaN(val) || val === 0 || val === 1) return [];
    return THROTTLE_REASONS.filter(([mask]) => val & mask).map(([, name]) => name);
  };

  const bar = (pct, cls) =>
    `<div class="hv-gauge-bar gpu-health-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>`;

  const wrap = document.createElement('div');
  wrap.className = 'hv-wrap gpu-health-grid';

  wrap.innerHTML = gpus.map(g => {
    const memPct = (g.memUsed !== null && g.memTotal) ? Math.min(Math.round(g.memUsed / g.memTotal * 100), 100) : null;
    const pwrPct = (g.powerDraw !== null && g.powerLimit) ? Math.min(Math.round(g.powerDraw / g.powerLimit * 100), 100) : null;
    const memCls = pctCls(memPct);
    const pwrCls = powerCls(g.powerDraw, g.powerLimit);
    const throttled = decodeThrottle(g.throttleRaw);
    const eccBad = g.eccUncorrected !== null && g.eccUncorrected > 0;
    const eccWarn = !eccBad && g.eccCorrected !== null && g.eccCorrected > 0;

    return `
      <div class="gpu-health-card">
        <div class="gpu-health-hdr">
          <span class="gpu-proc-idx">GPU ${h(g.index)}</span>
          <span class="gpu-health-name">${h(g.name)}</span>
        </div>

        <div class="gpu-health-row">
          <div class="gpu-health-cell">
            <span class="gpu-health-lbl">GPU temp</span>
            <span class="gpu-health-val ${tempCls(g.tempGpu)}">${g.tempGpu !== null ? g.tempGpu + '°C' : '—'}</span>
          </div>
          <div class="gpu-health-cell">
            <span class="gpu-health-lbl">Mem temp</span>
            <span class="gpu-health-val ${tempCls(g.tempMem)}">${g.tempMem !== null ? g.tempMem + '°C' : '—'}</span>
          </div>
          <div class="gpu-health-cell">
            <span class="gpu-health-lbl">GPU util</span>
            <span class="gpu-health-val ${pctCls(g.utilGpu)}">${g.utilGpu !== null ? g.utilGpu + '%' : '—'}</span>
          </div>
          <div class="gpu-health-cell">
            <span class="gpu-health-lbl">Mem util</span>
            <span class="gpu-health-val ${pctCls(g.utilMem)}">${g.utilMem !== null ? g.utilMem + '%' : '—'}</span>
          </div>
        </div>

        <div class="gpu-health-metric">
          <div class="gpu-health-metric-hdr">
            <span class="gpu-health-lbl">Memory</span>
            <span class="gpu-health-val ${memCls}">${memPct !== null ? memPct + '%' : '—'}</span>
          </div>
          ${memPct !== null ? bar(memPct, memCls) : ''}
          <div class="gpu-health-sub">${g.memUsed !== null ? Math.round(g.memUsed) + ' MiB used' : ''} ${g.memTotal ? '· ' + Math.round(g.memTotal) + ' MiB total' : ''}</div>
        </div>

        <div class="gpu-health-metric">
          <div class="gpu-health-metric-hdr">
            <span class="gpu-health-lbl">Power</span>
            <span class="gpu-health-val ${pwrCls}">${g.powerDraw !== null ? g.powerDraw.toFixed(1) + ' W' : '—'}${g.powerLimit ? ' / ' + g.powerLimit.toFixed(0) + ' W' : ''}</span>
          </div>
          ${pwrPct !== null ? bar(pwrPct, pwrCls) : ''}
        </div>

        <div class="gpu-health-ecc ${eccBad ? 'gpu-health-ecc-bad' : eccWarn ? 'gpu-health-ecc-warn' : 'gpu-health-ecc-ok'}">
          <span class="gpu-health-lbl">ECC errors</span>
          <span class="gpu-health-ecc-vals">
            <span title="Corrected (volatile)">${g.eccCorrected ?? '—'} corrected</span>
            <span class="${eccBad ? 'hv-crit' : ''}" title="Uncorrected (volatile)">${g.eccUncorrected ?? '—'} uncorrected</span>
          </span>
        </div>

        ${throttled.length ? `
        <div class="gpu-health-throttle">
          <span class="gpu-health-lbl gpu-health-throttle-lbl">Clock throttled</span>
          <div class="gpu-health-throttle-tags">${throttled.map(r => `<span class="gpu-health-throttle-tag">${h(r)}</span>`).join('')}</div>
        </div>` : ''}
      </div>`;
  }).join('');

  container.innerHTML = '';
  container.className = '';
  container.appendChild(wrap);
}
775
+
776
+ /* ══════════════════════════════════════════════════════════════════════
777
+ * GPU processes — nvidia-smi pmon or --query-compute-apps
778
+ * ══════════════════════════════════════════════════════════════════════ */
779
/**
 * Render GPU process listings. Two input formats are recognised:
 *
 *  - `nvidia-smi pmon` output, identified by a first line starting with
 *    `# gpu`. Columns are located from that header line, so both the
 *    `... enc dec command` and the `... enc dec jpg ofa command` layouts
 *    resolve the command column correctly.
 *  - `--query-compute-apps=pid,used_gpu_memory,name` CSV (no header).
 *
 * Output that matches neither format (for example the probe's
 * "nvidia-smi not available ..." message) is shown as raw text.
 *
 * @param {string} raw - Raw probe output.
 * @param {HTMLElement} container - Element whose content is replaced.
 */
function renderGpuProcessesView(raw, container) {
  const showRaw = () => { container.className = 'output'; container.textContent = raw; };

  const lines = raw.split('\n').map(l => l.trim()).filter(l => l);
  if (!lines.length) { showRaw(); return; }

  const wrap = document.createElement('div');
  wrap.className = 'hv-wrap';

  // pmon format: header lines start with '#'
  if (lines[0].startsWith('# gpu')) {
    const header = lines[0].replace(/^#\s*/, '').toLowerCase().split(/\s+/);
    // Fallback positions are the ones the view assumed before the header
    // was consulted.
    const colIndex = (name, fallback) => {
      const i = header.indexOf(name);
      return i === -1 ? fallback : i;
    };
    const col = {
      gpu: colIndex('gpu', 0),
      pid: colIndex('pid', 1),
      type: colIndex('type', 2),
      sm: colIndex('sm', 3),
      mem: colIndex('mem', 4),
      enc: colIndex('enc', 5),
      dec: colIndex('dec', 6),
      cmd: colIndex('command', 9),
    };

    const dataRows = lines.filter(l => !l.startsWith('#')).map(l => l.split(/\s+/));
    // A '-' pid marks a GPU with no process in this sample.
    const active = dataRows.filter(parts => parts[col.pid] && parts[col.pid] !== '-');

    if (!active.length) {
      const idleHtml = dataRows.map(parts => `<div class="gpu-proc-idle-row">
        <span class="gpu-proc-idx">GPU ${h(parts[col.gpu] || '?')}</span>
        <span class="gpu-proc-idle-lbl">No active processes</span>
      </div>`).join('');
      wrap.innerHTML = `<div class="gpu-proc-idle">${idleHtml}</div>`;
    } else {
      const metricCell = (v) => {
        const missing = v == null || v === '-';
        const n = Number.parseFloat(v);
        const cls = (!Number.isNaN(n) && n > 0)
          ? (n >= 80 ? 'hv-crit' : n >= 40 ? 'hv-warn' : 'gpu-proc-active') : '';
        return `<td class="${cls}">${h(missing ? '—' : v + '%')}</td>`;
      };
      const tableRows = active.map(parts => {
        const type = parts[col.type];
        // Command names may contain spaces; take everything from the column on.
        const cmd = parts.slice(col.cmd).join(' ') || '—';
        return `<tr>
          <td><span class="gpu-proc-idx">GPU ${h(parts[col.gpu])}</span></td>
          <td class="gpu-proc-pid">${h(parts[col.pid])}</td>
          <td><span class="gpu-proc-type gpu-proc-type-${h((type || '').toLowerCase())}">${h(type || '—')}</span></td>
          ${metricCell(parts[col.sm])} ${metricCell(parts[col.mem])} ${metricCell(parts[col.enc])} ${metricCell(parts[col.dec])}
          <td class="gpu-proc-cmd">${h(cmd)}</td>
        </tr>`;
      }).join('');
      wrap.innerHTML = `
        <div class="hv-top-table-wrap">
          <table class="hv-top-table">
            <thead><tr>
              <th>GPU</th><th>PID</th><th>Type</th>
              <th>SM %</th><th>Mem %</th><th>Enc %</th><th>Dec %</th>
              <th>Process</th>
            </tr></thead>
            <tbody>${tableRows}</tbody>
          </table>
        </div>`;
    }

  } else {
    // --query-compute-apps CSV: pid, used_gpu_memory, name
    const parsed = lines.map(l => {
      const p = l.split(',').map(s => s.trim());
      return { pid: p[0], mem: p[1], name: p[2] };
    });
    const rows = parsed.filter(r => /^\d+$/.test(r.pid));

    if (!rows.length) {
      // Lines that are neither a numeric pid nor a '-' placeholder are a
      // message, not process data: show them verbatim.
      const unrecognised = parsed.some(r => r.pid && r.pid !== '-');
      if (unrecognised) { showRaw(); return; }
      wrap.innerHTML = '<div class="gpu-proc-idle"><div class="gpu-proc-idle-row"><span class="gpu-proc-idle-lbl">No GPU compute processes running.</span></div></div>';
    } else {
      const tableRows = rows.map(r => `<tr>
        <td class="gpu-proc-pid">${h(r.pid)}</td>
        <td class="gpu-proc-mem">${h(r.mem ?? '—')}</td>
        <td class="gpu-proc-cmd">${h(r.name ?? '—')}</td>
      </tr>`).join('');
      wrap.innerHTML = `
        <div class="hv-top-table-wrap">
          <table class="hv-top-table">
            <thead><tr><th>PID</th><th>GPU Memory</th><th>Process</th></tr></thead>
            <tbody>${tableRows}</tbody>
          </table>
        </div>`;
    }
  }

  container.innerHTML = '';
  container.className = '';
  container.appendChild(wrap);
}
862
+
454
863
  /* ── Exports ─────────────────────────────────────────────────────────── */
455
864
  window.renderMemInfoView = renderMemInfoView;
456
865
  window.renderMemPressureView = renderMemPressureView;
@@ -458,5 +867,8 @@
458
867
  window.renderKubeletLogsView = renderKubeletLogsView;
459
868
  window.renderDiskView = renderDiskView;
460
869
  window.renderCpuView = renderCpuView;
870
+ window.renderGpuInfoView = renderGpuInfoView;
871
+ window.renderGpuHealthView = renderGpuHealthView;
872
+ window.renderGpuProcessesView = renderGpuProcessesView;
461
873
 
462
874
  })();
package/public/style.css CHANGED
@@ -864,3 +864,53 @@ kbd {
864
864
  .conn-empty { color: var(--fg-dim); font-style: italic; font-size: 13px; padding: 6px 0; }
865
865
  .conn-loading { color: var(--accent); font-size: 14px; padding: 20px; text-align: center; }
866
866
  .conn-error { color: var(--err); font-size: 13px; padding: 10px; }
867
+
868
+ /* ── GPU info view (nvidia-smi table) ──────────────────────────────────── */
869
+ .gpu-info-versions { display: flex; gap: 24px; padding: 12px 16px; background: var(--bg-2); border: 1px solid var(--border); border-radius: 8px; flex-wrap: wrap; }
870
+ .gpu-info-ver-item { display: flex; flex-direction: column; gap: 3px; }
871
+ .gpu-info-ver-val { font-family: var(--mono); font-size: 14px; font-weight: 600; color: var(--accent); }
872
+ .gpu-info-meta { display: grid; grid-template-columns: repeat(auto-fill, minmax(160px, 1fr)); gap: 8px; padding: 10px 12px; background: var(--bg-3); border-radius: 6px; border: 1px solid var(--border); }
873
+ .gpu-info-meta-row { display: flex; flex-direction: column; gap: 3px; }
874
+ .gpu-info-meta-val { font-family: var(--mono); font-size: 12px; color: var(--fg); }
875
+ .gpu-info-procs-section { display: flex; flex-direction: column; gap: 8px; }
876
+ .gpu-info-procs-title { font-size: 11px; font-weight: 600; text-transform: uppercase; letter-spacing: .08em; color: var(--fg-dim); }
877
+ .gpu-info-no-procs { font-size: 13px; color: var(--fg-dim); padding: 10px 14px; background: var(--bg-2); border: 1px solid var(--border); border-radius: 6px; }
878
+
879
+ /* ── GPU health view ────────────────────────────────────────────────────── */
880
+ .gpu-health-grid { display: flex; flex-direction: column; gap: 16px; }
881
+ .gpu-health-card { background: var(--bg-2); border: 1px solid var(--border); border-radius: 10px; padding: 18px 20px; display: flex; flex-direction: column; gap: 14px; }
882
+ .gpu-health-hdr { display: flex; align-items: center; gap: 10px; }
883
+ .gpu-health-name { font-size: 14px; font-weight: 600; color: var(--fg); }
884
+ .gpu-health-row { display: flex; gap: 0; flex-wrap: wrap; background: var(--bg-3); border-radius: 7px; overflow: hidden; border: 1px solid var(--border); }
885
+ .gpu-health-cell { flex: 1; min-width: 80px; display: flex; flex-direction: column; align-items: center; padding: 10px 8px; gap: 4px; border-right: 1px solid var(--border); }
886
+ .gpu-health-cell:last-child { border-right: none; }
887
+ .gpu-health-lbl { font-size: 10px; font-weight: 600; text-transform: uppercase; letter-spacing: .06em; color: var(--fg-dim); }
888
+ .gpu-health-val { font-size: 18px; font-weight: 700; font-family: var(--mono); }
889
+ .gpu-health-metric { display: flex; flex-direction: column; gap: 5px; }
890
+ .gpu-health-metric-hdr { display: flex; justify-content: space-between; align-items: baseline; }
891
+ .gpu-health-metric .gpu-health-lbl { font-size: 11px; }
892
+ .gpu-health-metric .gpu-health-val { font-size: 14px; }
893
+ .gpu-health-bar { height: 8px; }
894
+ .gpu-health-sub { font-size: 11px; color: var(--fg-dim); font-family: var(--mono); }
895
+ .gpu-health-ecc { display: flex; align-items: center; justify-content: space-between; padding: 8px 10px; border-radius: 6px; font-size: 12px; }
896
+ .gpu-health-ecc-ok { background: #0d2b16; border: 1px solid #1e5c3a; }
897
+ .gpu-health-ecc-warn { background: #2b2000; border: 1px solid #5a4500; }
898
+ .gpu-health-ecc-bad { background: #2b0d0d; border: 1px solid #5c2020; }
899
+ .gpu-health-ecc-vals { display: flex; gap: 14px; font-family: var(--mono); color: var(--fg-dim); }
900
+ .gpu-health-throttle { display: flex; flex-direction: column; gap: 6px; padding: 8px 10px; background: #2b2000; border: 1px solid #5a4500; border-radius: 6px; }
901
+ .gpu-health-throttle-lbl { color: #d29922; }
902
+ .gpu-health-throttle-tags { display: flex; gap: 6px; flex-wrap: wrap; }
903
+ .gpu-health-throttle-tag { padding: 2px 8px; background: #3d2e00; border: 1px solid #5a4500; border-radius: 4px; font-size: 11px; color: #d29922; font-family: var(--mono); }
904
+
905
+ /* ── GPU processes view ─────────────────────────────────────────────────── */
906
+ .gpu-proc-idle { display: flex; flex-direction: column; gap: 8px; }
907
+ .gpu-proc-idle-row { display: flex; align-items: center; gap: 12px; padding: 10px 14px; background: var(--bg-2); border: 1px solid var(--border); border-radius: 8px; }
908
+ .gpu-proc-idle-lbl { font-size: 13px; color: var(--fg-dim); }
909
+ .gpu-proc-idx { display: inline-block; padding: 2px 8px; border-radius: 5px; font-size: 11px; font-weight: 700; font-family: var(--mono); background: #0d1f3c; color: var(--accent); border: 1px solid #1e3a6e; }
910
+ .gpu-proc-pid { font-family: var(--mono); font-size: 12px; color: var(--fg-dim); }
911
+ .gpu-proc-mem { font-family: var(--mono); font-size: 12px; }
912
+ .gpu-proc-type { display: inline-block; padding: 1px 6px; border-radius: 4px; font-size: 11px; font-weight: 600; font-family: var(--mono); background: var(--bg-3); color: var(--fg-dim); border: 1px solid var(--border); }
913
+ .gpu-proc-type-c { background: #0d2b16; color: #3fb950; border-color: #1e5c3a; }
914
+ .gpu-proc-type-g { background: #0d1f3c; color: var(--accent); border-color: #1e3a6e; }
915
+ .gpu-proc-cmd { color: var(--fg-dim); font-size: 12px; font-family: var(--mono); }
916
+ .gpu-proc-active { color: var(--ok); font-family: var(--mono); }
package/src/probes.js CHANGED
@@ -229,6 +229,7 @@ const PROBES = [
229
229
  commands: [
230
230
  'nvidia-smi',
231
231
  'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi 2>/dev/null',
232
+ 'echo "nvidia-smi not available — no GPU detected on this node."',
232
233
  ],
233
234
  },
234
235
  {
@@ -238,16 +239,21 @@ const PROBES = [
238
239
  desc: 'Processes currently consuming GPU memory.',
239
240
  commands: [
240
241
  'nvidia-smi --query-compute-apps=pid,used_gpu_memory,name --format=csv,noheader 2>/dev/null | sort -t, -k2 -rn | head -30',
242
+ 'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi --query-compute-apps=pid,used_gpu_memory,name --format=csv,noheader | sort -t, -k2 -rn | head -30',
241
243
  'nvidia-smi pmon -s u -c 1 2>/dev/null',
244
+ 'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi pmon -s u -c 1',
245
+ 'echo "nvidia-smi not available — no GPU detected on this node."',
242
246
  ],
243
247
  },
244
248
  {
245
- id: 'gpu-dcgm',
246
- label: 'DCGM health',
249
+ id: 'gpu-health',
250
+ label: 'GPU health',
247
251
  group: 'GPU',
248
- desc: 'DCGM (Data Center GPU Manager) health check. Requires dcgmi to be installed.',
252
+ desc: 'Per-GPU temperature, power, utilization, memory, ECC errors, and clock throttle reasons.',
249
253
  commands: [
250
- 'dcgmi health -g 0 -j 2>/dev/null || dcgmi health -g 0 2>/dev/null || echo "dcgmi not available — DCGM is not installed on this node."',
254
+ 'nvidia-smi --query-gpu=index,name,temperature.gpu,temperature.memory,power.draw,power.limit,utilization.gpu,utilization.memory,memory.used,memory.free,memory.total,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total,clocks_throttle_reasons.active --format=csv,noheader 2>/dev/null',
255
+ 'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi --query-gpu=index,name,temperature.gpu,temperature.memory,power.draw,power.limit,utilization.gpu,utilization.memory,memory.used,memory.free,memory.total,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total,clocks_throttle_reasons.active --format=csv,noheader',
256
+ 'echo "nvidia-smi not available — no GPU detected on this node."',
251
257
  ],
252
258
  },
253
259
  ];