@gtadi/k8s-node-debugger 1.0.1 → 1.1.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,16 +22,37 @@ node bin/k8s-node-debugger.js <node-name>
22
22
 
23
23
  ![Node Health view](https://raw.githubusercontent.com/goutamtadi1/k8s-node-debugger/main/docs/screenshot-health.png)
24
24
 
25
+ ### GPU Health — temperature, power, utilization, memory, ECC errors, clock throttle
26
+
27
+ ![GPU Health view](https://raw.githubusercontent.com/goutamtadi1/k8s-node-debugger/main/docs/screenshot-gpu-health.png)
28
+
29
+ ### GPU Status — driver version, CUDA, per-GPU metrics, processes
30
+
31
+ ![GPU Status view](https://raw.githubusercontent.com/goutamtadi1/k8s-node-debugger/main/docs/screenshot-gpu-status.png)
32
+
25
33
  ## Install
26
34
 
35
+ ```bash
36
+ npm install -g @gtadi/k8s-node-debugger
37
+ ```
38
+
39
+ Or run without installing:
40
+
41
+ ```bash
42
+ npx @gtadi/k8s-node-debugger <node-name>
43
+ ```
44
+
45
+ Requires `kubectl` on your PATH with an active kubeconfig. The debug image (`nicolaka/netshoot`) is pulled from Docker Hub on first use.
46
+
47
+ ### From source
48
+
27
49
  ```bash
28
50
  git clone git@github.com:goutamtadi1/k8s-node-debugger.git
29
51
  cd k8s-node-debugger
30
52
  npm install
53
+ node bin/k8s-node-debugger.js <node-name>
31
54
  ```
32
55
 
33
- Requires `kubectl` on your PATH with an active kubeconfig. The debug image (`nicolaka/netshoot`) is pulled from Docker Hub on first use.
34
-
35
56
  ## Usage
36
57
 
37
58
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gtadi/k8s-node-debugger",
3
- "version": "1.0.1",
3
+ "version": "1.1.0",
4
4
  "description": "Spin up a privileged debug pod on a target Kubernetes node and inspect its network stack (iptables, resolv.conf, conntrack, routes, sockets) from a browser UI.",
5
5
  "bin": {
6
6
  "k8s-node-debugger": "bin/k8s-node-debugger.js"
package/public/app.js CHANGED
@@ -191,6 +191,7 @@ function buildSidebar(probes) {
191
191
  'Health': { panelId: 'health', label: 'Node Health' },
192
192
  'Firewall': { panelId: 'firewall', label: 'Firewall' },
193
193
  'Conntrack': { panelId: 'conntrack', label: 'Conntrack' },
194
+ 'Storage': { panelId: 'storage', label: 'Storage' },
194
195
  };
195
196
  if (GROUP_PANELS[grp]) {
196
197
  const { panelId, label } = GROUP_PANELS[grp];
@@ -250,12 +251,15 @@ function navItem(id, label, type) {
250
251
  const HEALTH_ORDER = ['cpu-stat', 'mem-info', 'disk-usage', 'mem-pressure', 'oom-kills', 'kubelet-logs'];
251
252
  const FIREWALL_ORDER = ['iptables', 'iptables-nat', 'nftables', 'ipvs'];
252
253
  const CONNTRACK_ORDER = ['conntrack', 'conntrack-stats', 'conntrack-count'];
254
+ const STORAGE_ORDER = ['storage-partitions', 'storage-du-tree', 'storage-containers'];
253
255
 
254
256
  // Probes that get a rich custom renderer instead of a plain <pre>.
255
257
  const FANCY_PROBES = new Set([
256
258
  'iptables', 'iptables-nat',
257
259
  'conntrack', 'conntrack-stats', 'conntrack-count',
258
260
  'mem-info', 'mem-pressure', 'oom-kills', 'kubelet-logs', 'disk-usage', 'cpu-stat',
261
+ 'gpu-info', 'gpu-health', 'gpu-processes',
262
+ 'storage-partitions', 'storage-du-tree', 'storage-containers',
259
263
  ]);
260
264
 
261
265
  function buildProbePanel(probe) {
@@ -383,8 +387,14 @@ function tryFancyRender(id, output, container) {
383
387
  'mem-pressure': () => renderMemPressureView(output, container),
384
388
  'oom-kills': () => renderOomKillsView(output, container),
385
389
  'kubelet-logs': () => renderKubeletLogsView(output, container),
386
- 'disk-usage': () => renderDiskView(output, container),
387
- 'cpu-stat': () => renderCpuView(output, container),
390
+ 'disk-usage': () => renderDiskView(output, container),
391
+ 'cpu-stat': () => renderCpuView(output, container),
392
+ 'gpu-info': () => renderGpuInfoView(output, container),
393
+ 'gpu-health': () => renderGpuHealthView(output, container),
394
+ 'gpu-processes': () => renderGpuProcessesView(output, container),
395
+ 'storage-partitions': () => renderStoragePartitionsView(output, container),
396
+ 'storage-du-tree': () => renderStorageDuTreeView(output, container),
397
+ 'storage-containers': () => renderStorageContainersView(output, container),
388
398
  };
389
399
  if (!renderers[id]) return false;
390
400
  try { renderers[id](); return true; } catch (e) { console.error('[fancy render]', id, e); return false; }
@@ -518,10 +528,11 @@ async function init() {
518
528
  session = await api('/api/session');
519
529
  const nodes = await api('/api/nodes');
520
530
 
521
- const GROUP_IDS = new Set(['Health', 'Firewall', 'Conntrack']);
531
+ const GROUP_IDS = new Set(['Health', 'Firewall', 'Conntrack', 'Storage']);
522
532
  const healthProbes = session.probes.filter(p => p.group === 'Health');
523
533
  const firewallProbes = session.probes.filter(p => p.group === 'Firewall');
524
534
  const conntrackProbes = session.probes.filter(p => p.group === 'Conntrack');
535
+ const storageProbes = session.probes.filter(p => p.group === 'Storage');
525
536
  const otherProbes = session.probes.filter(p => !GROUP_IDS.has(p.group));
526
537
 
527
538
  buildSidebar(session.probes);
@@ -529,6 +540,7 @@ async function init() {
529
540
  buildGroupPanel('health', 'Node Health', healthProbes, HEALTH_ORDER);
530
541
  buildGroupPanel('firewall', 'Firewall', firewallProbes, FIREWALL_ORDER);
531
542
  buildGroupPanel('conntrack', 'Conntrack', conntrackProbes, CONNTRACK_ORDER);
543
+ buildGroupPanel('storage', 'Storage', storageProbes, STORAGE_ORDER);
532
544
 
533
545
  renderOverview(nodes);
534
546
  showPanel('overview');
@@ -451,6 +451,670 @@
451
451
  container.appendChild(wrap);
452
452
  }
453
453
 
454
+ /* ══════════════════════════════════════════════════════════════════════
455
+ * GPU status — nvidia-smi plain text table
456
+ * ══════════════════════════════════════════════════════════════════════ */
457
+ function renderGpuInfoView(raw, container) {
458
+ if (!raw.includes('NVIDIA-SMI')) { container.className = 'output'; container.textContent = raw; return; }
459
+
460
+ const lines = raw.split('\n');
461
+
462
+ // Version header
463
+ const verLine = lines.find(l => l.includes('NVIDIA-SMI')) || '';
464
+ const smiVer = verLine.match(/NVIDIA-SMI\s+(\S+)/)?.[1] || '—';
465
+ const driverVer = verLine.match(/Driver Version:\s+(\S+)/)?.[1] || '—';
466
+ const cudaVer = verLine.match(/CUDA Version:\s+(\S+)/)?.[1] || '—';
467
+
468
+ // Collect GPU data: groups of 3 content lines between |====| ... +---+
469
+ const gpus = [];
470
+ let inGpu = false, buf = [];
471
+ for (const line of lines) {
472
+ if (/^\|[=]+\|?$/.test(line.trim())) { inGpu = true; buf = []; continue; }
473
+ if (inGpu && line.startsWith('+')) {
474
+ if (buf.length >= 2) {
475
+ const g = parseNvidiaSmiGpuBlock(buf);
476
+ if (g) gpus.push(g);
477
+ }
478
+ inGpu = false; buf = []; continue;
479
+ }
480
+ if (inGpu && line.startsWith('|') && !line.includes('Processes:') &&
481
+ !line.includes('No running') && !line.includes('GPU GI') && !line.includes('GPU GI')) {
482
+ buf.push(line);
483
+ }
484
+ }
485
+
486
+ // Processes
487
+ let noProcs = raw.includes('No running processes found');
488
+ const procs = [];
489
+ if (!noProcs) {
490
+ const procRe = /\|\s+(\d+)\s+\S+\s+\S+\s+(\d+)\s+(\S+)\s+(.*?)\s+(\d+)MiB\s+\|/;
491
+ let inP = false;
492
+ for (const line of lines) {
493
+ if (line.includes('Processes:')) { inP = true; continue; }
494
+ if (inP && /^\|[=]+/.test(line)) continue;
495
+ if (inP && line.startsWith('+')) break;
496
+ if (inP) {
497
+ if (line.includes('No running')) { noProcs = true; break; }
498
+ const m = line.match(procRe);
499
+ if (m) procs.push({ gpu: m[1], pid: m[2], type: m[3], name: m[4].trim(), mem: m[5] });
500
+ }
501
+ }
502
+ }
503
+
504
+ function tempCls(t) { return t >= 85 ? 'hv-crit' : t >= 70 ? 'hv-warn' : 'hv-ok'; }
505
+ function pctCls(v) { return v >= 90 ? 'hv-crit' : v >= 70 ? 'hv-warn' : 'hv-ok'; }
506
+ function powerCls2(d,c) { if (!c) return 'hv-ok'; const r=d/c; return r>=0.95?'hv-crit':r>=0.80?'hv-warn':'hv-ok'; }
507
+
508
+ function memBar(used, total) {
509
+ if (!total) return '';
510
+ const pct = Math.min(Math.round(used/total*100),100);
511
+ const cls = pctCls(pct);
512
+ return `<div class="gpu-health-metric">
513
+ <div class="gpu-health-metric-hdr">
514
+ <span class="gpu-health-lbl">Memory</span>
515
+ <span class="gpu-health-val ${cls}">${pct}%</span>
516
+ </div>
517
+ <div class="hv-gauge-bar gpu-health-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>
518
+ <div class="gpu-health-sub">${used} MiB used · ${total} MiB total</div>
519
+ </div>`;
520
+ }
521
+
522
+ function pwrBar(draw, cap) {
523
+ if (!cap) return '';
524
+ const pct = Math.min(Math.round(draw/cap*100),100);
525
+ const cls = powerCls2(draw, cap);
526
+ return `<div class="gpu-health-metric">
527
+ <div class="gpu-health-metric-hdr">
528
+ <span class="gpu-health-lbl">Power</span>
529
+ <span class="gpu-health-val ${cls}">${draw} W / ${cap} W</span>
530
+ </div>
531
+ <div class="hv-gauge-bar gpu-health-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>
532
+ </div>`;
533
+ }
534
+
535
+ const gpuCards = gpus.map(g => `
536
+ <div class="gpu-health-card">
537
+ <div class="gpu-health-hdr">
538
+ <span class="gpu-proc-idx">GPU ${h(g.index)}</span>
539
+ <span class="gpu-health-name">${h(g.name)}</span>
540
+ </div>
541
+ <div class="gpu-health-row">
542
+ <div class="gpu-health-cell">
543
+ <span class="gpu-health-lbl">GPU Temp</span>
544
+ <span class="gpu-health-val ${tempCls(g.temp)}">${g.temp}°C</span>
545
+ </div>
546
+ <div class="gpu-health-cell">
547
+ <span class="gpu-health-lbl">Perf</span>
548
+ <span class="gpu-health-val">${h(g.perf)}</span>
549
+ </div>
550
+ <div class="gpu-health-cell">
551
+ <span class="gpu-health-lbl">GPU Util</span>
552
+ <span class="gpu-health-val ${pctCls(g.utilGpu)}">${g.utilGpu}%</span>
553
+ </div>
554
+ <div class="gpu-health-cell">
555
+ <span class="gpu-health-lbl">Fan</span>
556
+ <span class="gpu-health-val">${g.fan !== null ? g.fan + '%' : '—'}</span>
557
+ </div>
558
+ </div>
559
+ ${memBar(g.memUsed, g.memTotal)}
560
+ ${pwrBar(g.pwrDraw, g.pwrCap)}
561
+ <div class="gpu-info-meta">
562
+ <div class="gpu-info-meta-row">
563
+ <span class="gpu-health-lbl">Bus ID</span>
564
+ <span class="gpu-info-meta-val">${h(g.busId)}</span>
565
+ </div>
566
+ <div class="gpu-info-meta-row">
567
+ <span class="gpu-health-lbl">Persistence</span>
568
+ <span class="gpu-info-meta-val ${g.persistence === 'On' ? 'hv-ok' : ''}">${h(g.persistence)}</span>
569
+ </div>
570
+ <div class="gpu-info-meta-row">
571
+ <span class="gpu-health-lbl">Compute Mode</span>
572
+ <span class="gpu-info-meta-val">${h(g.computeMode)}</span>
573
+ </div>
574
+ ${g.migMode ? `<div class="gpu-info-meta-row">
575
+ <span class="gpu-health-lbl">MIG Mode</span>
576
+ <span class="gpu-info-meta-val">${h(g.migMode)}</span>
577
+ </div>` : ''}
578
+ </div>
579
+ </div>`).join('');
580
+
581
+ const procsHtml = noProcs
582
+ ? '<div class="gpu-info-no-procs">No running processes.</div>'
583
+ : `<div class="hv-top-table-wrap"><table class="hv-top-table">
584
+ <thead><tr><th>GPU</th><th>PID</th><th>Type</th><th>Process</th><th>GPU Mem</th></tr></thead>
585
+ <tbody>${procs.map(p => `<tr>
586
+ <td><span class="gpu-proc-idx">GPU ${h(p.gpu)}</span></td>
587
+ <td class="gpu-proc-pid">${h(p.pid)}</td>
588
+ <td><span class="gpu-proc-type gpu-proc-type-${h(p.type.toLowerCase())}">${h(p.type)}</span></td>
589
+ <td class="gpu-proc-cmd">${h(p.name)}</td>
590
+ <td class="gpu-proc-mem">${h(p.mem)} MiB</td>
591
+ </tr>`).join('')}</tbody>
592
+ </table></div>`;
593
+
594
+ const wrap = document.createElement('div');
595
+ wrap.className = 'hv-wrap gpu-health-grid';
596
+ wrap.innerHTML = `
597
+ <div class="gpu-info-versions">
598
+ <div class="gpu-info-ver-item"><span class="gpu-health-lbl">NVIDIA-SMI</span><span class="gpu-info-ver-val">${h(smiVer)}</span></div>
599
+ <div class="gpu-info-ver-item"><span class="gpu-health-lbl">Driver</span><span class="gpu-info-ver-val">${h(driverVer)}</span></div>
600
+ <div class="gpu-info-ver-item"><span class="gpu-health-lbl">CUDA</span><span class="gpu-info-ver-val">${h(cudaVer)}</span></div>
601
+ </div>
602
+ ${gpuCards}
603
+ <div class="gpu-info-procs-section">
604
+ <div class="gpu-info-procs-title">Processes</div>
605
+ ${procsHtml}
606
+ </div>`;
607
+
608
+ container.innerHTML = '';
609
+ container.className = '';
610
+ container.appendChild(wrap);
611
+ }
612
+
613
+ function parseNvidiaSmiGpuBlock(lines) {
614
+ // Line 0: | idx name persistence | bus_id disp_a | ecc |
615
+ // Line 1: | fan tempC perf drawW/capW | usedMiB/totalMiB | util% compute |
616
+ // Line 2: | ... | ... | mig |
617
+ const l0 = lines[0] || '', l1 = lines[1] || '', l2 = lines[2] || '';
618
+ const m0 = l0.match(/\|\s+(\d+)\s+(.*?)\s+(On|Off)\s+\|\s+(\S+)\s+(On|Off)\s+\|\s+(\S+)\s+\|/);
619
+ const m1 = l1.match(/\|\s*(N\/A|\d+)\s+(\d+)C\s+(\S+)\s+(\d+)W\s*\/\s*(\d+)W\s+\|\s+(\d+)MiB\s*\/\s*(\d+)MiB\s+\|\s+(\d+)%\s+(\S+)\s+\|/);
620
+ if (!m0 || !m1) return null;
621
+ const m2 = l2.match(/\|\s*\|\s*\|\s+(\S+)\s+\|/);
622
+ return {
623
+ index: m0[1],
624
+ name: m0[2].trim(),
625
+ persistence: m0[3],
626
+ busId: m0[4],
627
+ dispA: m0[5],
628
+ ecc: m0[6],
629
+ fan: m1[1] === 'N/A' ? null : parseInt(m1[1]),
630
+ temp: parseInt(m1[2]),
631
+ perf: m1[3],
632
+ pwrDraw: parseInt(m1[4]),
633
+ pwrCap: parseInt(m1[5]),
634
+ memUsed: parseInt(m1[6]),
635
+ memTotal: parseInt(m1[7]),
636
+ utilGpu: parseInt(m1[8]),
637
+ computeMode: m1[9],
638
+ migMode: m2?.[1] || null,
639
+ };
640
+ }
641
+
642
+ /* ══════════════════════════════════════════════════════════════════════
643
+ * GPU health — nvidia-smi --query-gpu CSV
644
+ * Columns: index, name, temp.gpu, temp.mem, power.draw, power.limit,
645
+ * util.gpu, util.mem, mem.used, mem.free, mem.total,
646
+ * ecc.corrected, ecc.uncorrected, throttle_reasons
647
+ * ══════════════════════════════════════════════════════════════════════ */
648
+ function renderGpuHealthView(raw, container) {
649
+ const lines = raw.split('\n').map(l => l.trim()).filter(l => l);
650
+ if (!lines.length) { container.textContent = raw; return; }
651
+
652
+ const THROTTLE_REASONS = [
653
+ [0x02, 'App Clock Setting'],
654
+ [0x04, 'SW Power Cap'],
655
+ [0x08, 'HW Slowdown'],
656
+ [0x10, 'Sync Boost'],
657
+ [0x20, 'SW Thermal Slowdown'],
658
+ [0x40, 'HW Thermal Slowdown'],
659
+ [0x80, 'HW Power Brake'],
660
+ [0x100, 'Display Clock Setting'],
661
+ ];
662
+
663
+ function pn(s) { const n = parseFloat(s); return isNaN(n) ? null : n; }
664
+ function strip(s, u) { return pn((s || '').replace(u, '')); }
665
+
666
+ const gpus = lines.map(line => {
667
+ const p = line.split(',').map(s => s.trim());
668
+ return {
669
+ index: p[0],
670
+ name: p[1],
671
+ tempGpu: pn(p[2]),
672
+ tempMem: pn(p[3]),
673
+ powerDraw: strip(p[4], ' W'),
674
+ powerLimit: strip(p[5], ' W'),
675
+ utilGpu: strip(p[6], ' %'),
676
+ utilMem: strip(p[7], ' %'),
677
+ memUsed: strip(p[8], ' MiB'),
678
+ memTotal: strip(p[10], ' MiB'),
679
+ eccCorrected: pn(p[11]),
680
+ eccUncorrected: pn(p[12]),
681
+ throttleRaw: (p[13] || '').trim(),
682
+ };
683
+ });
684
+
685
+ function tempCls(t) { return t === null ? '' : t >= 85 ? 'hv-crit' : t >= 70 ? 'hv-warn' : 'hv-ok'; }
686
+ function pctCls(v) { return v === null ? '' : v >= 90 ? 'hv-crit' : v >= 70 ? 'hv-warn' : 'hv-ok'; }
687
+ function powerCls(d,l) { if (d === null || !l) return 'hv-ok'; const r = d/l; return r >= 0.95 ? 'hv-crit' : r >= 0.80 ? 'hv-warn' : 'hv-ok'; }
688
+
689
+ function decodeThrottle(raw) {
690
+ if (!raw || raw === 'N/A') return [];
691
+ const val = parseInt(raw, 16);
692
+ if (isNaN(val) || val === 0 || val === 1) return [];
693
+ return THROTTLE_REASONS.filter(([mask]) => val & mask).map(([, name]) => name);
694
+ }
695
+
696
+ function bar(pct, cls) {
697
+ return `<div class="hv-gauge-bar gpu-health-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>`;
698
+ }
699
+
700
+ const wrap = document.createElement('div');
701
+ wrap.className = 'hv-wrap gpu-health-grid';
702
+
703
+ wrap.innerHTML = gpus.map(g => {
704
+ const memPct = (g.memUsed !== null && g.memTotal) ? Math.min(Math.round(g.memUsed / g.memTotal * 100), 100) : null;
705
+ const pwrPct = (g.powerDraw !== null && g.powerLimit) ? Math.min(Math.round(g.powerDraw / g.powerLimit * 100), 100) : null;
706
+ const memCls = pctCls(memPct);
707
+ const pwrCls = powerCls(g.powerDraw, g.powerLimit);
708
+ const throttled = decodeThrottle(g.throttleRaw);
709
+ const eccBad = g.eccUncorrected !== null && g.eccUncorrected > 0;
710
+ const eccWarn = !eccBad && g.eccCorrected !== null && g.eccCorrected > 0;
711
+
712
+ return `
713
+ <div class="gpu-health-card">
714
+ <div class="gpu-health-hdr">
715
+ <span class="gpu-proc-idx">GPU ${h(g.index)}</span>
716
+ <span class="gpu-health-name">${h(g.name)}</span>
717
+ </div>
718
+
719
+ <div class="gpu-health-row">
720
+ <div class="gpu-health-cell">
721
+ <span class="gpu-health-lbl">GPU temp</span>
722
+ <span class="gpu-health-val ${tempCls(g.tempGpu)}">${g.tempGpu !== null ? g.tempGpu + '°C' : '—'}</span>
723
+ </div>
724
+ <div class="gpu-health-cell">
725
+ <span class="gpu-health-lbl">Mem temp</span>
726
+ <span class="gpu-health-val ${tempCls(g.tempMem)}">${g.tempMem !== null ? g.tempMem + '°C' : '—'}</span>
727
+ </div>
728
+ <div class="gpu-health-cell">
729
+ <span class="gpu-health-lbl">GPU util</span>
730
+ <span class="gpu-health-val ${pctCls(g.utilGpu)}">${g.utilGpu !== null ? g.utilGpu + '%' : '—'}</span>
731
+ </div>
732
+ <div class="gpu-health-cell">
733
+ <span class="gpu-health-lbl">Mem util</span>
734
+ <span class="gpu-health-val ${pctCls(g.utilMem)}">${g.utilMem !== null ? g.utilMem + '%' : '—'}</span>
735
+ </div>
736
+ </div>
737
+
738
+ <div class="gpu-health-metric">
739
+ <div class="gpu-health-metric-hdr">
740
+ <span class="gpu-health-lbl">Memory</span>
741
+ <span class="gpu-health-val ${memCls}">${memPct !== null ? memPct + '%' : '—'}</span>
742
+ </div>
743
+ ${memPct !== null ? bar(memPct, memCls) : ''}
744
+ <div class="gpu-health-sub">${g.memUsed !== null ? Math.round(g.memUsed) + ' MiB used' : ''} ${g.memTotal ? '· ' + Math.round(g.memTotal) + ' MiB total' : ''}</div>
745
+ </div>
746
+
747
+ <div class="gpu-health-metric">
748
+ <div class="gpu-health-metric-hdr">
749
+ <span class="gpu-health-lbl">Power</span>
750
+ <span class="gpu-health-val ${pwrCls}">${g.powerDraw !== null ? g.powerDraw.toFixed(1) + ' W' : '—'}${g.powerLimit ? ' / ' + g.powerLimit.toFixed(0) + ' W' : ''}</span>
751
+ </div>
752
+ ${pwrPct !== null ? bar(pwrPct, pwrCls) : ''}
753
+ </div>
754
+
755
+ <div class="gpu-health-ecc ${eccBad ? 'gpu-health-ecc-bad' : eccWarn ? 'gpu-health-ecc-warn' : 'gpu-health-ecc-ok'}">
756
+ <span class="gpu-health-lbl">ECC errors</span>
757
+ <span class="gpu-health-ecc-vals">
758
+ <span title="Corrected (volatile)">${g.eccCorrected ?? '—'} corrected</span>
759
+ <span class="${eccBad ? 'hv-crit' : ''}" title="Uncorrected (volatile)">${g.eccUncorrected ?? '—'} uncorrected</span>
760
+ </span>
761
+ </div>
762
+
763
+ ${throttled.length ? `
764
+ <div class="gpu-health-throttle">
765
+ <span class="gpu-health-lbl gpu-health-throttle-lbl">Clock throttled</span>
766
+ <div class="gpu-health-throttle-tags">${throttled.map(r => `<span class="gpu-health-throttle-tag">${h(r)}</span>`).join('')}</div>
767
+ </div>` : ''}
768
+ </div>`;
769
+ }).join('');
770
+
771
+ container.innerHTML = '';
772
+ container.className = '';
773
+ container.appendChild(wrap);
774
+ }
775
+
776
+ /* ══════════════════════════════════════════════════════════════════════
777
+ * GPU processes — nvidia-smi pmon or --query-compute-apps
778
+ * ══════════════════════════════════════════════════════════════════════ */
779
+ function renderGpuProcessesView(raw, container) {
780
+ const lines = raw.split('\n').map(l => l.trim()).filter(l => l);
781
+ if (!lines.length) { container.textContent = raw; return; }
782
+
783
+ const wrap = document.createElement('div');
784
+ wrap.className = 'hv-wrap';
785
+
786
+ // pmon format: header lines start with '#'
787
+ if (lines[0].startsWith('# gpu')) {
788
+ const dataLines = lines.filter(l => !l.startsWith('#'));
789
+ const active = dataLines.filter(l => {
790
+ const parts = l.split(/\s+/);
791
+ return parts[1] && parts[1] !== '-';
792
+ });
793
+
794
+ if (!active.length) {
795
+ const idleHtml = dataLines.map(l => {
796
+ const parts = l.split(/\s+/);
797
+ return `<div class="gpu-proc-idle-row">
798
+ <span class="gpu-proc-idx">GPU ${h(parts[0] || '?')}</span>
799
+ <span class="gpu-proc-idle-lbl">No active processes</span>
800
+ </div>`;
801
+ }).join('');
802
+ wrap.innerHTML = `<div class="gpu-proc-idle">${idleHtml}</div>`;
803
+ } else {
804
+ function metricCell(v) {
805
+ const n = parseFloat(v);
806
+ const cls = (!isNaN(n) && n > 0)
807
+ ? (n >= 80 ? 'hv-crit' : n >= 40 ? 'hv-warn' : 'gpu-proc-active') : '';
808
+ return `<td class="${cls}">${h(v === '-' ? '—' : v + '%')}</td>`;
809
+ }
810
+ const tableRows = active.map(l => {
811
+ const [gpu, pid, type, sm, mem, enc, dec, , , ...cmdParts] = l.split(/\s+/);
812
+ const cmd = cmdParts.join(' ') || '—';
813
+ return `<tr>
814
+ <td><span class="gpu-proc-idx">GPU ${h(gpu)}</span></td>
815
+ <td class="gpu-proc-pid">${h(pid)}</td>
816
+ <td><span class="gpu-proc-type gpu-proc-type-${h((type || '').toLowerCase())}">${h(type || '—')}</span></td>
817
+ ${metricCell(sm)} ${metricCell(mem)} ${metricCell(enc)} ${metricCell(dec)}
818
+ <td class="gpu-proc-cmd">${h(cmd)}</td>
819
+ </tr>`;
820
+ }).join('');
821
+ wrap.innerHTML = `
822
+ <div class="hv-top-table-wrap">
823
+ <table class="hv-top-table">
824
+ <thead><tr>
825
+ <th>GPU</th><th>PID</th><th>Type</th>
826
+ <th>SM %</th><th>Mem %</th><th>Enc %</th><th>Dec %</th>
827
+ <th>Process</th>
828
+ </tr></thead>
829
+ <tbody>${tableRows}</tbody>
830
+ </table>
831
+ </div>`;
832
+ }
833
+
834
+ } else {
835
+ // --query-compute-apps CSV: pid, used_gpu_memory, name
836
+ const rows = lines
837
+ .map(l => { const p = l.split(',').map(s => s.trim()); return { pid: p[0], mem: p[1], name: p[2] }; })
838
+ .filter(r => r.pid && r.pid !== '-');
839
+
840
+ if (!rows.length) {
841
+ wrap.innerHTML = '<div class="gpu-proc-idle"><div class="gpu-proc-idle-row"><span class="gpu-proc-idle-lbl">No GPU compute processes running.</span></div></div>';
842
+ } else {
843
+ const tableRows = rows.map(r => `<tr>
844
+ <td class="gpu-proc-pid">${h(r.pid)}</td>
845
+ <td class="gpu-proc-mem">${h(r.mem)}</td>
846
+ <td class="gpu-proc-cmd">${h(r.name)}</td>
847
+ </tr>`).join('');
848
+ wrap.innerHTML = `
849
+ <div class="hv-top-table-wrap">
850
+ <table class="hv-top-table">
851
+ <thead><tr><th>PID</th><th>GPU Memory</th><th>Process</th></tr></thead>
852
+ <tbody>${tableRows}</tbody>
853
+ </table>
854
+ </div>`;
855
+ }
856
+ }
857
+
858
+ container.innerHTML = '';
859
+ container.className = '';
860
+ container.appendChild(wrap);
861
+ }
862
+
863
+ /* ══════════════════════════════════════════════════════════════════════
864
+ * Storage — Partition overview (filtered df -hT)
865
+ * ══════════════════════════════════════════════════════════════════════ */
866
+ function renderStoragePartitionsView(raw, container) {
867
+ const lines = raw.split('\n').filter(l => l.trim());
868
+ if (lines.length < 2) { container.textContent = raw; return; }
869
+
870
+ const rows = [];
871
+ for (const line of lines.slice(1)) {
872
+ const parts = line.trim().split(/\s+/);
873
+ if (parts.length < 7) continue;
874
+ const [fs, type, size, used, avail, usePct, ...rest] = parts;
875
+ const mount = rest.join(' ');
876
+ const p = parseInt(usePct);
877
+ rows.push({ fs, type, size, used, avail, pct: isNaN(p) ? 0 : p, mount });
878
+ }
879
+
880
+ if (!rows.length) { container.textContent = raw; return; }
881
+
882
+ const wrap = document.createElement('div');
883
+ wrap.className = 'hv-wrap';
884
+
885
+ const rowsHtml = rows.map(r => {
886
+ const cls = colorCls(r.pct);
887
+ return `
888
+ <div class="hv-disk-row">
889
+ <div class="hv-disk-header">
890
+ <span class="hv-disk-mount">${h(r.mount)}</span>
891
+ <span class="hv-disk-fs">${h(r.fs)} <span style="opacity:0.5;font-size:11px">${h(r.type)}</span></span>
892
+ <div class="hv-disk-sizes">
893
+ <span class="hv-disk-pct ${cls}">${r.pct}%</span>
894
+ <span class="hv-disk-nums">${h(r.used)} used · ${h(r.avail)} free · ${h(r.size)} total</span>
895
+ </div>
896
+ </div>
897
+ <div class="hv-disk-bar"><div class="hv-disk-fill ${cls}" style="width:${r.pct}%"></div></div>
898
+ </div>`;
899
+ }).join('');
900
+
901
+ wrap.innerHTML = `<div class="hv-disk-list">${rowsHtml}</div>`;
902
+ container.innerHTML = '';
903
+ container.className = '';
904
+ container.appendChild(wrap);
905
+ }
906
+
907
+ /* ══════════════════════════════════════════════════════════════════════
908
+ * Storage — du tree drill-down
909
+ * ══════════════════════════════════════════════════════════════════════ */
910
+ function renderStorageDuTreeView(raw, container) {
911
+ const SECTION_LABELS = {
912
+ stateful: '/mnt/stateful_partition',
913
+ var: '/var',
914
+ varlib: '/var/lib',
915
+ containerd: '/var/lib/containerd',
916
+ };
917
+
918
+ const sections = {};
919
+ let cur = null;
920
+ for (const line of raw.split('\n')) {
921
+ const sec = line.match(/^=(stateful|var|varlib|containerd)=$/);
922
+ if (sec) { cur = sec[1]; sections[cur] = []; continue; }
923
+ if (cur && line.trim()) sections[cur].push(line.trim());
924
+ }
925
+
926
+ function parseHumanSize(s) {
927
+ const m = (s || '').trim().match(/^([0-9.]+)\s*([KMGTP]?)/i);
928
+ if (!m) return 0;
929
+ const n = parseFloat(m[1]);
930
+ const mul = { '': 1, K: 1024, M: 1024 ** 2, G: 1024 ** 3, T: 1024 ** 4 }[m[2].toUpperCase()] || 1;
931
+ return n * mul;
932
+ }
933
+
934
+ const wrap = document.createElement('div');
935
+ wrap.className = 'hv-wrap';
936
+
937
+ const order = ['stateful', 'var', 'varlib', 'containerd'];
938
+ const sectionsHtml = order.filter(k => sections[k]?.length).map(key => {
939
+ const entries = sections[key]
940
+ .map(line => { const [size, ...pathParts] = line.split(/\t/); return { size: size.trim(), path: pathParts.join('\t').trim() }; })
941
+ .filter(e => e.size && e.path);
942
+
943
+ if (!entries.length) return '';
944
+
945
+ const maxBytes = Math.max(...entries.map(e => parseHumanSize(e.size)), 1);
946
+
947
+ const rowsHtml = entries.slice(0, 20).map(e => {
948
+ const bytes = parseHumanSize(e.size);
949
+ const pct = Math.min(Math.round(bytes / maxBytes * 100), 100);
950
+ const cls = pct >= 80 ? 'hv-crit' : pct >= 50 ? 'hv-warn' : 'hv-ok';
951
+ const name = e.path.replace(/^.*\//, '') || e.path;
952
+ return `
953
+ <div class="st-du-row">
954
+ <div class="st-du-header">
955
+ <span class="st-du-name" title="${h(e.path)}">${h(name)}</span>
956
+ <span class="st-du-size ${pct >= 60 ? cls : ''}">${h(e.size)}</span>
957
+ </div>
958
+ <div class="hv-gauge-bar"><div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div></div>
959
+ </div>`;
960
+ }).join('');
961
+
962
+ return `
963
+ <div class="st-du-section">
964
+ <div class="st-du-section-title">${h(SECTION_LABELS[key] || key)}</div>
965
+ <div class="st-du-rows">${rowsHtml}</div>
966
+ </div>`;
967
+ }).join('');
968
+
969
+ wrap.innerHTML = sectionsHtml || '<div class="hv-info-note">No du data available — path may not exist on this node.</div>';
970
+ container.innerHTML = '';
971
+ container.className = '';
972
+ container.appendChild(wrap);
973
+ }
974
+
975
+ /* ══════════════════════════════════════════════════════════════════════
976
+ * Storage — Top containers by snapshot disk usage
977
+ * Parses three sections from the compound command:
978
+ * =SNAPS= du -d 1 …/snapshots (bytes\tpath)
979
+ * =MOUNTS= mount | grep snapshots
980
+ * =CRICTL= crictl ps -a
981
+ * ══════════════════════════════════════════════════════════════════════ */
982
+ function renderStorageContainersView(raw, container) {
983
+ const sections = {};
984
+ let cur = null;
985
+ for (const line of raw.split('\n')) {
986
+ const sec = line.match(/^=(SNAPS|MOUNTS|CRICTL)=$/);
987
+ if (sec) { cur = sec[1]; sections[cur] = []; continue; }
988
+ if (cur && line.trim()) sections[cur].push(line);
989
+ }
990
+
991
+ // 1. Parse snapshot sizes: "1234567\t/path/snapshots/932" → snapId → KB
992
+ const snapKb = new Map();
993
+ for (const line of (sections.SNAPS || [])) {
994
+ const m = line.match(/^(\d+)\s+.*\/(\d+)\s*$/);
995
+ if (m) snapKb.set(m[2], parseInt(m[1]));
996
+ }
997
+
998
+ // 2. Build snapId → containerHash from overlay mount lines.
999
+ // Mount target contains: /k8s.io/{64-char-hash}/rootfs
1000
+ // Mount options contain: snapshots/NNN/ references
1001
+ const snapToHash = new Map();
1002
+ for (const line of (sections.MOUNTS || [])) {
1003
+ const hashM = line.match(/\/k8s\.io\/([a-f0-9]{64})\//);
1004
+ if (!hashM) continue;
1005
+ const hash = hashM[1];
1006
+ for (const m of line.matchAll(/snapshots\/(\d+)\//g)) {
1007
+ if (!snapToHash.has(m[1])) snapToHash.set(m[1], hash);
1008
+ }
1009
+ }
1010
+
1011
+ // 3. Parse crictl ps -a using header positions for fixed-width columns.
1012
+ const hashToInfo = new Map();
1013
+ const crictlLines = (sections.CRICTL || []).filter(l => l.trim());
1014
+ if (crictlLines.length > 1) {
1015
+ const header = crictlLines[0];
1016
+ const colStarts = {
1017
+ CONTAINER: header.indexOf('CONTAINER'),
1018
+ IMAGE: header.indexOf('IMAGE'),
1019
+ STATE: header.indexOf('STATE'),
1020
+ NAME: header.indexOf('NAME'),
1021
+ POD: header.lastIndexOf('POD'),
1022
+ };
1023
+ for (const line of crictlLines.slice(1)) {
1024
+ if (line.startsWith('CONTAINER')) continue;
1025
+ function col(start, end) {
1026
+ return end > start ? line.substring(start, end).trim() : line.substring(start).trim();
1027
+ }
1028
+ const id = col(colStarts.CONTAINER, colStarts.IMAGE);
1029
+ const state = col(colStarts.STATE, colStarts.NAME);
1030
+ const name = col(colStarts.NAME, colStarts.POD);
1031
+ const pod = line.substring(colStarts.POD).trim();
1032
+ if (!id) continue;
1033
+ const info = { state, name, pod: pod || name };
1034
+ hashToInfo.set(id, info);
1035
+ if (id.length > 12) hashToInfo.set(id.substring(0, 12), info);
1036
+ }
1037
+ }
1038
+
1039
+ // 4. Build ranked rows: snapId sorted by KB desc → join hash → join container info
1040
+ const rows = [...snapKb.entries()]
1041
+ .sort((a, b) => b[1] - a[1])
1042
+ .slice(0, 25)
1043
+ .map(([snapId, kb]) => {
1044
+ const hash = snapToHash.get(snapId);
1045
+ const info = hash
1046
+ ? (hashToInfo.get(hash) || hashToInfo.get(hash.substring(0, 12)))
1047
+ : null;
1048
+ return { snapId, kb, hash, info };
1049
+ });
1050
+
1051
+ const wrap = document.createElement('div');
1052
+ wrap.className = 'hv-wrap';
1053
+
1054
+ if (!rows.length) {
1055
+ wrap.innerHTML = '<div class="hv-info-note">No containerd overlayfs snapshot data found. The snapshotter path may differ on this node.</div>';
1056
+ container.innerHTML = '';
1057
+ container.appendChild(wrap);
1058
+ return;
1059
+ }
1060
+
1061
+ const totalKb = [...snapKb.values()].reduce((a, b) => a + b, 0);
1062
+ const mappedCount = rows.filter(r => r.info).length;
1063
+ const maxKb = rows[0].kb || 1;
1064
+
1065
+ const listHtml = rows.map((r, i) => {
1066
+ const pct = Math.min(Math.round(r.kb / maxKb * 100), 100);
1067
+ const cls = r.kb >= maxKb * 0.5 ? 'hv-crit' : r.kb >= maxKb * 0.2 ? 'hv-warn' : 'hv-ok';
1068
+ const stateCls = r.info?.state === 'Running' ? 'hv-ok' : r.info?.state ? 'hv-warn' : '';
1069
+ const podLabel = r.info?.pod || (r.hash ? r.hash.substring(0, 16) + '…' : '—');
1070
+ const nameLabel = r.info?.name || '';
1071
+ return `
1072
+ <div class="st-cont-row">
1073
+ <div class="st-cont-header">
1074
+ <span class="st-cont-rank">#${i + 1}</span>
1075
+ <div class="st-cont-names">
1076
+ <span class="st-cont-pod">${h(podLabel)}</span>
1077
+ ${nameLabel ? `<span class="st-cont-name">${h(nameLabel)}</span>` : ''}
1078
+ </div>
1079
+ ${r.info?.state ? `<span class="st-cont-state ${stateCls}">${h(r.info.state)}</span>` : ''}
1080
+ <span class="st-cont-size ${cls}">${hBytes(r.kb)}</span>
1081
+ </div>
1082
+ <div class="hv-gauge-bar st-cont-bar">
1083
+ <div class="hv-gauge-fill ${cls}" style="width:${pct}%"></div>
1084
+ </div>
1085
+ <div class="st-cont-meta">
1086
+ <span class="st-cont-meta-tag">snap #${h(r.snapId)}</span>
1087
+ ${r.hash ? `<span class="st-cont-meta-tag">${h(r.hash.substring(0, 20))}…</span>` : '<span class="st-cont-meta-unmapped">unmapped</span>'}
1088
+ </div>
1089
+ </div>`;
1090
+ }).join('');
1091
+
1092
+ wrap.innerHTML = `
1093
+ <div class="hv-info-note">
1094
+ Containerd overlayfs snapshot layers ranked by disk usage. Active layers are mapped to pods via overlay mounts and crictl.
1095
+ Prune dangling images with: <code>nsenter -t 1 -m -u -i -n -p -- crictl rmi --prune</code>
1096
+ </div>
1097
+ <div class="st-summary">
1098
+ <div class="hv-grid-item">
1099
+ <div class="hv-grid-label">Snapshots tracked</div>
1100
+ <div class="hv-grid-val">${snapKb.size}</div>
1101
+ </div>
1102
+ <div class="hv-grid-item">
1103
+ <div class="hv-grid-label">Mapped to pods</div>
1104
+ <div class="hv-grid-val">${mappedCount} / ${rows.length}</div>
1105
+ </div>
1106
+ <div class="hv-grid-item">
1107
+ <div class="hv-grid-label">Total snapshot storage</div>
1108
+ <div class="hv-grid-val">${hBytes(totalKb)}</div>
1109
+ </div>
1110
+ </div>
1111
+ <div class="st-cont-list">${listHtml}</div>`;
1112
+
1113
+ container.innerHTML = '';
1114
+ container.className = '';
1115
+ container.appendChild(wrap);
1116
+ }
1117
+
454
1118
  /* ── Exports ─────────────────────────────────────────────────────────── */
455
1119
  window.renderMemInfoView = renderMemInfoView;
456
1120
  window.renderMemPressureView = renderMemPressureView;
@@ -458,5 +1122,11 @@
458
1122
  window.renderKubeletLogsView = renderKubeletLogsView;
459
1123
  window.renderDiskView = renderDiskView;
460
1124
  window.renderCpuView = renderCpuView;
1125
+ window.renderGpuInfoView = renderGpuInfoView; // GPU view renderers: attached to window like the existing render*View exports in this section
1126
+ window.renderGpuHealthView = renderGpuHealthView;
1127
+ window.renderGpuProcessesView = renderGpuProcessesView;
1128
+ window.renderStoragePartitionsView = renderStoragePartitionsView; // Storage view renderers: one per storage probe view
1129
+ window.renderStorageDuTreeView = renderStorageDuTreeView;
1130
+ window.renderStorageContainersView = renderStorageContainersView; // NOTE(review): presumably looked up by name from the page script — confirm against the caller
461
1131
 
462
1132
  })();
package/public/style.css CHANGED
@@ -864,3 +864,138 @@ kbd {
864
864
  .conn-empty { color: var(--fg-dim); font-style: italic; font-size: 13px; padding: 6px 0; }
865
865
  .conn-loading { color: var(--accent); font-size: 14px; padding: 20px; text-align: center; }
866
866
  .conn-error { color: var(--err); font-size: 13px; padding: 10px; }
867
+
868
+ /* ── GPU info view (nvidia-smi table) ──────────────────────────────────── */
869
+ .gpu-info-versions { display: flex; gap: 24px; padding: 12px 16px; background: var(--bg-2); border: 1px solid var(--border); border-radius: 8px; flex-wrap: wrap; } /* bordered panel; its items sit in a row and wrap when space runs out */
870
+ .gpu-info-ver-item { display: flex; flex-direction: column; gap: 3px; } /* each item stacks its children vertically */
871
+ .gpu-info-ver-val { font-family: var(--mono); font-size: 14px; font-weight: 600; color: var(--accent); }
872
+ .gpu-info-meta { display: grid; grid-template-columns: repeat(auto-fill, minmax(160px, 1fr)); gap: 8px; padding: 10px 12px; background: var(--bg-3); border-radius: 6px; border: 1px solid var(--border); } /* auto-filling grid; every column is at least 160px wide */
873
+ .gpu-info-meta-row { display: flex; flex-direction: column; gap: 3px; }
874
+ .gpu-info-meta-val { font-family: var(--mono); font-size: 12px; color: var(--fg); }
875
+ .gpu-info-procs-section { display: flex; flex-direction: column; gap: 8px; }
876
+ .gpu-info-procs-title { font-size: 11px; font-weight: 600; text-transform: uppercase; letter-spacing: .08em; color: var(--fg-dim); } /* small, dimmed, upper-cased heading */
877
+ .gpu-info-no-procs { font-size: 13px; color: var(--fg-dim); padding: 10px 14px; background: var(--bg-2); border: 1px solid var(--border); border-radius: 6px; } /* NOTE(review): presumably the placeholder shown when no processes are listed — confirm against the renderer */
878
+
879
+ /* ── GPU health view ────────────────────────────────────────────────────── */
880
+ .gpu-health-grid { display: flex; flex-direction: column; gap: 16px; } /* cards are stacked vertically with a 16px gap */
881
+ .gpu-health-card { background: var(--bg-2); border: 1px solid var(--border); border-radius: 10px; padding: 18px 20px; display: flex; flex-direction: column; gap: 14px; }
882
+ .gpu-health-hdr { display: flex; align-items: center; gap: 10px; }
883
+ .gpu-health-name { font-size: 14px; font-weight: 600; color: var(--fg); }
884
+ .gpu-health-row { display: flex; gap: 0; flex-wrap: wrap; background: var(--bg-3); border-radius: 7px; overflow: hidden; border: 1px solid var(--border); } /* overflow: hidden clips the cells to the rounded corners */
885
+ .gpu-health-cell { flex: 1; min-width: 80px; display: flex; flex-direction: column; align-items: center; padding: 10px 8px; gap: 4px; border-right: 1px solid var(--border); } /* equal-width cells separated by a right-hand divider */
886
+ .gpu-health-cell:last-child { border-right: none; } /* no divider after the final cell */
887
+ .gpu-health-lbl { font-size: 10px; font-weight: 600; text-transform: uppercase; letter-spacing: .06em; color: var(--fg-dim); }
888
+ .gpu-health-val { font-size: 18px; font-weight: 700; font-family: var(--mono); }
889
+ .gpu-health-metric { display: flex; flex-direction: column; gap: 5px; }
890
+ .gpu-health-metric-hdr { display: flex; justify-content: space-between; align-items: baseline; }
891
+ .gpu-health-metric .gpu-health-lbl { font-size: 11px; } /* label size when nested in a metric block (base rule sets 10px) */
892
+ .gpu-health-metric .gpu-health-val { font-size: 14px; } /* value size when nested in a metric block (base rule sets 18px) */
893
+ .gpu-health-bar { height: 8px; }
894
+ .gpu-health-sub { font-size: 11px; color: var(--fg-dim); font-family: var(--mono); }
895
+ .gpu-health-ecc { display: flex; align-items: center; justify-content: space-between; padding: 8px 10px; border-radius: 6px; font-size: 12px; } /* shared layout; the -ok / -warn / -bad modifiers below supply the colours */
896
+ .gpu-health-ecc-ok { background: #0d2b16; border: 1px solid #1e5c3a; } /* dark green */
897
+ .gpu-health-ecc-warn { background: #2b2000; border: 1px solid #5a4500; } /* dark amber */
898
+ .gpu-health-ecc-bad { background: #2b0d0d; border: 1px solid #5c2020; } /* dark red */
899
+ .gpu-health-ecc-vals { display: flex; gap: 14px; font-family: var(--mono); color: var(--fg-dim); }
900
+ .gpu-health-throttle { display: flex; flex-direction: column; gap: 6px; padding: 8px 10px; background: #2b2000; border: 1px solid #5a4500; border-radius: 6px; } /* same background and border colours as .gpu-health-ecc-warn */
901
+ .gpu-health-throttle-lbl { color: #d29922; }
902
+ .gpu-health-throttle-tags { display: flex; gap: 6px; flex-wrap: wrap; }
903
+ .gpu-health-throttle-tag { padding: 2px 8px; background: #3d2e00; border: 1px solid #5a4500; border-radius: 4px; font-size: 11px; color: #d29922; font-family: var(--mono); } /* small amber pill; tags wrap inside .gpu-health-throttle-tags */
904
+
905
+ /* ── GPU processes view ─────────────────────────────────────────────────── */
906
+ .gpu-proc-idle { display: flex; flex-direction: column; gap: 8px; } /* NOTE(review): "idle" presumably means a GPU with no listed processes — confirm against the renderer */
907
+ .gpu-proc-idle-row { display: flex; align-items: center; gap: 12px; padding: 10px 14px; background: var(--bg-2); border: 1px solid var(--border); border-radius: 8px; }
908
+ .gpu-proc-idle-lbl { font-size: 13px; color: var(--fg-dim); }
909
+ .gpu-proc-idx { display: inline-block; padding: 2px 8px; border-radius: 5px; font-size: 11px; font-weight: 700; font-family: var(--mono); background: #0d1f3c; color: var(--accent); border: 1px solid #1e3a6e; } /* dark blue inline badge */
910
+ .gpu-proc-pid { font-family: var(--mono); font-size: 12px; color: var(--fg-dim); }
911
+ .gpu-proc-mem { font-family: var(--mono); font-size: 12px; }
912
+ .gpu-proc-type { display: inline-block; padding: 1px 6px; border-radius: 4px; font-size: 11px; font-weight: 600; font-family: var(--mono); background: var(--bg-3); color: var(--fg-dim); border: 1px solid var(--border); } /* neutral badge; the -c / -g modifiers below override its colours */
913
+ .gpu-proc-type-c { background: #0d2b16; color: #3fb950; border-color: #1e5c3a; } /* green variant — NOTE(review): presumably nvidia-smi process type C; confirm against the renderer */
914
+ .gpu-proc-type-g { background: #0d1f3c; color: var(--accent); border-color: #1e3a6e; } /* blue variant — NOTE(review): presumably nvidia-smi process type G; confirm against the renderer */
915
+ .gpu-proc-cmd { color: var(--fg-dim); font-size: 12px; font-family: var(--mono); }
916
+ .gpu-proc-active { color: var(--ok); font-family: var(--mono); }
917
+
918
+ /* ── Storage views ──────────────────────────────────────────────────────── */
919
+
920
+ /* du tree drill-down: titled sections, each holding a vertical list of name/size rows */
921
+ .st-du-section { margin-bottom: 20px; }
922
+ .st-du-section-title { /* accent-coloured, upper-cased heading underlined by a border */
923
+ font-size: 11px; font-weight: 700; text-transform: uppercase;
924
+ letter-spacing: .08em; color: var(--accent); font-family: var(--mono);
925
+ padding: 6px 0 8px;
926
+ border-bottom: 1px solid var(--border); margin-bottom: 10px;
927
+ }
928
+ .st-du-rows { display: flex; flex-direction: column; gap: 7px; }
929
+ .st-du-row { display: flex; flex-direction: column; gap: 4px; }
930
+ .st-du-header { display: flex; align-items: center; gap: 10px; }
931
+ .st-du-name { /* takes the remaining row width; long names are clipped to one line with an ellipsis */
932
+ flex: 1; font-family: var(--mono); font-size: 12.5px;
933
+ white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
934
+ color: var(--fg);
935
+ }
936
+ .st-du-size { /* fixed-footprint, right-aligned size column (never shrinks, at least 60px wide) */
937
+ font-family: var(--mono); font-size: 12.5px; font-weight: 600;
938
+ color: var(--fg-dim); flex-shrink: 0; min-width: 60px; text-align: right;
939
+ }
940
+ .st-du-size.hv-crit { color: var(--err); } /* error colour when the element also carries hv-crit */
941
+ .st-du-size.hv-warn { color: #d29922; } /* amber when the element also carries hv-warn */
942
+
943
+ /* summary grid: a wrapping flex row of equal-width tiles */
944
+ .st-summary {
945
+ display: flex; gap: 12px; flex-wrap: wrap;
946
+ margin-bottom: 18px;
947
+ }
948
+ .st-summary .hv-grid-item { /* applies to hv-grid-item only when nested inside .st-summary */
949
+ flex: 1; min-width: 140px;
950
+ background: var(--bg-2); border: 1px solid var(--border);
951
+ border-radius: 8px; padding: 10px 14px;
952
+ }
953
+
954
+ /* container ranked list: one bordered card (.st-cont-row) per entry */
955
+ .st-cont-list { display: flex; flex-direction: column; gap: 10px; }
956
+ .st-cont-row {
957
+ background: var(--bg-2); border: 1px solid var(--border);
958
+ border-radius: 8px; padding: 11px 14px;
959
+ display: flex; flex-direction: column; gap: 6px;
960
+ }
961
+ .st-cont-header {
962
+ display: flex; align-items: center; gap: 10px; flex-wrap: wrap;
963
+ }
964
+ .st-cont-rank {
965
+ font-family: var(--mono); font-size: 11px; font-weight: 700;
966
+ color: var(--fg-dim); min-width: 26px; flex-shrink: 0;
967
+ }
968
+ .st-cont-names { flex: 1; min-width: 0; display: flex; flex-direction: column; gap: 2px; } /* min-width: 0 presumably lets this flex item shrink so the ellipsis rules on its children can apply */
969
+ .st-cont-pod { /* single line, clipped with an ellipsis */
970
+ font-family: var(--mono); font-size: 13px; font-weight: 600;
971
+ color: var(--fg); white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
972
+ }
973
+ .st-cont-name { /* smaller, dimmed line under the pod label; also clipped with an ellipsis */
974
+ font-size: 11px; color: var(--fg-dim); font-family: var(--mono);
975
+ white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
976
+ }
977
+ .st-cont-state { /* neutral badge; the hv-ok / hv-warn combinations below recolour it */
978
+ font-size: 11px; font-weight: 700; font-family: var(--mono);
979
+ padding: 2px 7px; border-radius: 4px;
980
+ background: var(--bg-3); border: 1px solid var(--border);
981
+ color: var(--fg-dim); flex-shrink: 0;
982
+ }
983
+ .st-cont-state.hv-ok { background: #0d2b16; color: var(--ok); border-color: #1e5c3a; }
984
+ .st-cont-state.hv-warn { background: #2b2000; color: #d29922; border-color: #5a4500; }
985
+ .st-cont-size {
986
+ font-family: var(--mono); font-size: 13px; font-weight: 700;
987
+ flex-shrink: 0; color: var(--fg);
988
+ }
989
+ .st-cont-size.hv-crit { color: var(--err); }
990
+ .st-cont-size.hv-warn { color: #d29922; }
991
+ .st-cont-size.hv-ok { color: var(--ok); }
992
+ .st-cont-bar { margin: 0; } /* no margin on the bar element that carries this class */
993
+ .st-cont-meta { display: flex; gap: 8px; align-items: center; flex-wrap: wrap; }
994
+ .st-cont-meta-tag { /* small bordered monospace chip */
995
+ font-family: var(--mono); font-size: 11px; color: var(--fg-dim);
996
+ background: var(--bg-3); border: 1px solid var(--border);
997
+ padding: 1px 7px; border-radius: 4px;
998
+ }
999
+ .st-cont-meta-unmapped { /* dimmed italic text at 60% opacity */
1000
+ font-size: 11px; color: var(--fg-dim); font-style: italic; opacity: 0.6;
1001
+ }
package/src/probes.js CHANGED
@@ -220,6 +220,39 @@ const PROBES = [
220
220
  ],
221
221
  },
222
222
 
223
+ // ── Storage ─────────────────────────────────────────────────────────
224
+ { // Probe entry: `df -hT` filesystem table with rows starting tmpfs/devtmpfs/overlay/shm removed
225
+ id: 'storage-partitions',
226
+ label: 'Partitions',
227
+ group: 'Storage',
228
+ desc: 'Real physical host partitions (tmpfs, devtmpfs, shm, overlay filtered out). Identifies the true data-hosting disk.',
229
+ commands: [ // two variants of the same listing — NOTE(review): presumably tried in order as fallbacks; confirm against the probe runner
230
+ "nsenter -t 1 -m -- df -hT 2>/dev/null | grep -vE '^tmpfs|^devtmpfs|^overlay|^shm'", // df run via nsenter in PID 1's mount namespace; stderr discarded
231
+ "df -hT | grep -vE '^tmpfs|^devtmpfs|^overlay|^shm'", // same listing without nsenter
232
+ ],
233
+ },
234
+ { // Probe entry: one-level `du -h -d 1` listings, each preceded by an echoed `=name=` marker line and sorted largest-first (`sort -h -r`)
235
+ id: 'storage-du-tree',
236
+ label: 'Folder drill-down',
237
+ group: 'Storage',
238
+ desc: 'Layered du drill-down — stateful partition → /var → /var/lib → containerd — to pinpoint space consumers at each level.',
239
+ commands: [ // three variants — NOTE(review): presumably tried in order as fallbacks; confirm against the probe runner
240
+ "echo '=stateful='; nsenter -t 1 -m -- du -h -d 1 /mnt/stateful_partition 2>/dev/null | sort -h -r; echo '=var='; nsenter -t 1 -m -- du -h -d 1 /mnt/stateful_partition/var 2>/dev/null | sort -h -r; echo '=varlib='; nsenter -t 1 -m -- du -h -d 1 /mnt/stateful_partition/var/lib 2>/dev/null | sort -h -r; echo '=containerd='; nsenter -t 1 -m -- du -h -d 1 /mnt/stateful_partition/var/lib/containerd 2>/dev/null | sort -h -r", // four sections under /mnt/stateful_partition, each du run via nsenter in PID 1's mount namespace
241
+ "echo '=stateful='; du -h -d 1 /mnt/stateful_partition 2>/dev/null | sort -h -r; echo '=var='; du -h -d 1 /mnt/stateful_partition/var 2>/dev/null | sort -h -r; echo '=varlib='; du -h -d 1 /mnt/stateful_partition/var/lib 2>/dev/null | sort -h -r; echo '=containerd='; du -h -d 1 /mnt/stateful_partition/var/lib/containerd 2>/dev/null | sort -h -r", // same four sections without nsenter
242
+ "echo '=varlib='; du -h -d 1 /var/lib 2>/dev/null | sort -h -r; echo '=containerd='; du -h -d 1 /var/lib/containerd 2>/dev/null | sort -h -r", // only the =varlib= and =containerd= sections, rooted at /var/lib instead of /mnt/stateful_partition
243
+ ],
244
+ },
245
+ { // Probe entry: three marker-delimited sections — =SNAPS= (snapshot dir sizes), =MOUNTS= (mount lines containing "snapshots"), =CRICTL= (`crictl ps -a`)
246
+ id: 'storage-containers',
247
+ label: 'Top containers',
248
+ group: 'Storage',
249
+ desc: 'Ranked list of containers by disk usage. Maps containerd snapshot sizes to pod names via overlay mounts and crictl.',
250
+ commands: [ // two variants — NOTE(review): presumably tried in order as fallbacks; confirm against the probe runner
251
+ "echo '=SNAPS='; nsenter -t 1 -m -- du -d 1 /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots 2>/dev/null | sort -rn | head -40; echo '=MOUNTS='; nsenter -t 1 -m -- mount 2>/dev/null | grep snapshots; echo '=CRICTL='; nsenter -t 1 -m -u -i -n -p -- crictl ps -a 2>/dev/null", // via nsenter -t 1; du output is sorted numerically descending and cut to the first 40 lines
252
+ "echo '=SNAPS='; du -d 1 /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots 2>/dev/null | sort -rn | head -40; echo '=MOUNTS='; mount 2>/dev/null | grep snapshots; echo '=CRICTL='; crictl ps -a 2>/dev/null", // same three sections without nsenter
253
+ ],
254
+ },
255
+
223
256
  // ── GPU ─────────────────────────────────────────────────────────────
224
257
  {
225
258
  id: 'gpu-info',
@@ -229,6 +262,7 @@ const PROBES = [
229
262
  commands: [
230
263
  'nvidia-smi',
231
264
  'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi 2>/dev/null',
265
+ 'echo "nvidia-smi not available — no GPU detected on this node."',
232
266
  ],
233
267
  },
234
268
  {
@@ -238,16 +272,21 @@ const PROBES = [
238
272
  desc: 'Processes currently consuming GPU memory.',
239
273
  commands: [
240
274
  'nvidia-smi --query-compute-apps=pid,used_gpu_memory,name --format=csv,noheader 2>/dev/null | sort -t, -k2 -rn | head -30',
275
+ 'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi --query-compute-apps=pid,used_gpu_memory,name --format=csv,noheader | sort -t, -k2 -rn | head -30',
241
276
  'nvidia-smi pmon -s u -c 1 2>/dev/null',
277
+ 'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi pmon -s u -c 1',
278
+ 'echo "nvidia-smi not available — no GPU detected on this node."',
242
279
  ],
243
280
  },
244
281
  {
245
- id: 'gpu-dcgm',
246
- label: 'DCGM health',
282
+ id: 'gpu-health',
283
+ label: 'GPU health',
247
284
  group: 'GPU',
248
- desc: 'DCGM (Data Center GPU Manager) health check. Requires dcgmi to be installed.',
285
+ desc: 'Per-GPU temperature, power, utilization, memory, ECC errors, and clock throttle reasons.',
249
286
  commands: [
250
- 'dcgmi health -g 0 -j 2>/dev/null || dcgmi health -g 0 2>/dev/null || echo "dcgmi not available — DCGM is not installed on this node."',
287
+ 'nvidia-smi --query-gpu=index,name,temperature.gpu,temperature.memory,power.draw,power.limit,utilization.gpu,utilization.memory,memory.used,memory.free,memory.total,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total,clocks_throttle_reasons.active --format=csv,noheader 2>/dev/null',
288
+ 'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi --query-gpu=index,name,temperature.gpu,temperature.memory,power.draw,power.limit,utilization.gpu,utilization.memory,memory.used,memory.free,memory.total,ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total,clocks_throttle_reasons.active --format=csv,noheader',
289
+ 'echo "nvidia-smi not available — no GPU detected on this node."',
251
290
  ],
252
291
  },
253
292
  ];