npm - @gtadi/k8s-node-debugger - Versions diffs - 1.0.0 - Mend

@gtadi/k8s-node-debugger 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/LICENSE +183 -0
package/README.md +118 -0
package/bin/k8s-node-debugger.js +168 -0
package/package.json +28 -0
package/public/app.js +547 -0
package/public/conntrack-view.js +569 -0
package/public/health-view.js +462 -0
package/public/index.html +90 -0
package/public/iptables-view.js +523 -0
package/public/style.css +866 -0
package/src/k8s.js +196 -0
package/src/probes.js +255 -0
package/src/server.js +187 -0

package/src/k8s.js ADDED Viewed

@@ -0,0 +1,196 @@
+'use strict';
+const { spawn, execFile } = require('child_process');
+/**
+ * Thin wrapper around the `kubectl` binary on the user's PATH. It deliberately
+ * shells out to kubectl so the active kubeconfig / current-context (including
+ * any exec auth plugins, e.g. EKS/GKE) is reused exactly as on the shell.
+ */
+const KUBECTL = process.env.KUBECTL_BIN || 'kubectl';
+const DEBUG_IMAGE = process.env.DEBUGGER_IMAGE || 'nicolaka/netshoot:latest';
+function kubectl(args, { input, timeout = 60000 } = {}) {
+  return new Promise((resolve, reject) => {
+    const child = execFile(
+      KUBECTL,
+      args,
+      { timeout, maxBuffer: 64 * 1024 * 1024 },
+      (err, stdout, stderr) => {
+        if (err) {
+          err.stdout = stdout;
+          err.stderr = stderr;
+          err.message = stderr?.trim() || err.message;
+          return reject(err);
+        }
+        resolve({ stdout, stderr });
+      }
+    );
+    if (input !== undefined) {
+      child.stdin.write(input);
+      child.stdin.end();
+    }
+  });
+}
+function buildKubectlArgs(extra, { context, kubeconfig } = {}) {
+  const args = [];
+  if (kubeconfig) args.push('--kubeconfig', kubeconfig);
+  if (context) args.push('--context', context);
+  return args.concat(extra);
+}
+async function currentContext(opts = {}) {
+  try {
+    const { stdout } = await kubectl(
+      buildKubectlArgs(['config', 'current-context'], opts)
+    );
+    return stdout.trim();
+  } catch {
+    return null;
+  }
+}
+async function listNodes(opts = {}) {
+  const { stdout } = await kubectl(
+    buildKubectlArgs(['get', 'nodes', '-o', 'json'], opts)
+  );
+  const data = JSON.parse(stdout);
+  return (data.items || []).map((n) => {
+    const addr = (n.status?.addresses || []).reduce((acc, a) => {
+      acc[a.type] = a.address;
+      return acc;
+    }, {});
+    const conditions = (n.status?.conditions || []).reduce((acc, c) => {
+      acc[c.type] = c.status;
+      return acc;
+    }, {});
+    return {
+      name: n.metadata?.name,
+      roles: Object.keys(n.metadata?.labels || {})
+        .filter((l) => l.startsWith('node-role.kubernetes.io/'))
+        .map((l) => l.replace('node-role.kubernetes.io/', '') || 'control-plane')
+        .filter(Boolean),
+      ready: conditions.Ready === 'True',
+      internalIP: addr.InternalIP,
+      hostname: addr.Hostname,
+      os: n.status?.nodeInfo?.osImage,
+      kernel: n.status?.nodeInfo?.kernelVersion,
+      kubelet: n.status?.nodeInfo?.kubeletVersion,
+      runtime: n.status?.nodeInfo?.containerRuntimeVersion,
+    };
+  });
+}
+function debugPodManifest(node, podName, namespace) {
+  return JSON.stringify({
+    apiVersion: 'v1',
+    kind: 'Pod',
+    metadata: {
+      name: podName,
+      namespace,
+      labels: { app: 'k8s-node-debugger' },
+    },
+    spec: {
+      nodeName: node,
+      hostNetwork: true,
+      hostPID: true,
+      hostIPC: true,
+      restartPolicy: 'Never',
+      // Schedule onto control-plane / tainted nodes too.
+      tolerations: [{ operator: 'Exists' }],
+      containers: [
+        {
+          name: 'debugger',
+          image: DEBUG_IMAGE,
+          imagePullPolicy: 'IfNotPresent',
+          command: ['sleep', 'infinity'],
+          securityContext: {
+            privileged: true,
+            runAsUser: 0,
+          },
+          volumeMounts: [{ name: 'host-root', mountPath: '/host' }],
+        },
+      ],
+      volumes: [{ name: 'host-root', hostPath: { path: '/' } }],
+    },
+  });
+}
+async function createDebugPod(node, { namespace = 'default', context, kubeconfig } = {}) {
+  const suffix = node.toLowerCase().replace(/[^a-z0-9-]/g, '-').slice(0, 30);
+  const podName = `node-debugger-${suffix}-${process.pid.toString(36)}`;
+  const opts = { context, kubeconfig };
+  const manifest = debugPodManifest(node, podName, namespace);
+  await kubectl(
+    buildKubectlArgs(['apply', '-n', namespace, '-f', '-'], opts),
+    { input: manifest }
+  );
+  return { podName, namespace };
+}
+async function waitForPodReady(podName, { namespace = 'default', context, kubeconfig, timeout = 120 } = {}) {
+  await kubectl(
+    buildKubectlArgs(
+      ['wait', '-n', namespace, `pod/${podName}`, '--for=condition=Ready', `--timeout=${timeout}s`],
+      { context, kubeconfig }
+    ),
+    { timeout: (timeout + 10) * 1000 }
+  );
+}
+async function deletePod(podName, { namespace = 'default', context, kubeconfig } = {}) {
+  try {
+    await kubectl(
+      buildKubectlArgs(
+        ['delete', 'pod', podName, '-n', namespace, '--ignore-not-found', '--wait=false'],
+        { context, kubeconfig }
+      )
+    );
+  } catch {
+    /* best effort */
+  }
+}
+/** Run a one-shot command inside the debug pod, returning combined output. */
+async function execInPod(podName, command, { namespace = 'default', context, kubeconfig, timeout = 60000 } = {}) {
+  const args = buildKubectlArgs(
+    ['exec', '-n', namespace, podName, '--', 'sh', '-c', command],
+    { context, kubeconfig }
+  );
+  try {
+    const { stdout, stderr } = await kubectl(args, { timeout });
+    return { ok: true, stdout, stderr };
+  } catch (err) {
+    return {
+      ok: false,
+      stdout: err.stdout || '',
+      stderr: err.stderr || err.message || String(err),
+    };
+  }
+}
+/**
+ * Spawn a streaming command in the pod. Returns the ChildProcess so callers can
+ * pipe stdout/stderr (e.g. tcpdump) over a websocket and kill it on demand.
+ */
+function streamInPod(podName, command, { namespace = 'default', context, kubeconfig } = {}) {
+  const args = buildKubectlArgs(
+    ['exec', '-i', '-n', namespace, podName, '--', 'sh', '-c', command],
+    { context, kubeconfig }
+  );
+  return spawn(KUBECTL, args);
+}
+module.exports = {
+  KUBECTL,
+  DEBUG_IMAGE,
+  currentContext,
+  listNodes,
+  createDebugPod,
+  waitForPodReady,
+  deletePod,
+  execInPod,
+  streamInPod,
+};

package/src/probes.js ADDED Viewed

@@ -0,0 +1,255 @@
+'use strict';
+/**
+ * Network-debug "probes". Because the debug pod runs with hostNetwork/hostPID,
+ * commands like iptables-save / conntrack / ip run directly against the node's
+ * host network namespace. Files that kubelet would otherwise override for the
+ * pod (resolv.conf) are read from the mounted host root at /host.
+ *
+ * Each probe declares fallback commands tried in order until one succeeds, so a
+ * single id works across legacy-iptables and nft-based nodes.
+ */
+const PROBES = [
+  {
+    id: 'iptables',
+    label: 'iptables',
+    group: 'Firewall',
+    desc: 'Full iptables ruleset (host network namespace).',
+    commands: ['iptables-save', 'iptables-legacy-save', 'iptables -S'],
+  },
+  {
+    id: 'iptables-nat',
+    label: 'iptables (nat)',
+    group: 'Firewall',
+    desc: 'NAT table — where kube-proxy / CNI install service & SNAT rules.',
+    commands: ['iptables-save -t nat', 'iptables -t nat -S'],
+  },
+  {
+    id: 'nftables',
+    label: 'nftables',
+    group: 'Firewall',
+    desc: 'nftables ruleset (modern kube-proxy / firewalls).',
+    commands: ['nft list ruleset'],
+  },
+  {
+    id: 'ipvs',
+    label: 'IPVS',
+    group: 'Firewall',
+    desc: 'IPVS virtual servers (kube-proxy ipvs mode).',
+    commands: ['ipvsadm -ln', 'ipvsadm -L -n'],
+  },
+  {
+    id: 'resolv',
+    label: 'resolv.conf',
+    group: 'DNS',
+    desc: "The node's /etc/resolv.conf (enters host mount namespace via nsenter so symlinks resolve correctly).",
+    // On many distros /etc/resolv.conf is a symlink (e.g. → /run/systemd/resolve/stub-resolv.conf).
+    // `cat /host/etc/resolv.conf` fails because the kernel resolves that symlink against the
+    // container root, not /host. nsenter --mount enters the host mount namespace, so all paths
+    // and symlinks resolve exactly as they would on the node itself.
+    commands: [
+      'nsenter --mount=/proc/1/ns/mnt -- cat /etc/resolv.conf',
+      'chroot /host cat /etc/resolv.conf',
+      'cat /host/etc/resolv.conf',
+    ],
+  },
+  {
+    id: 'nsswitch',
+    label: 'nsswitch.conf',
+    group: 'DNS',
+    desc: 'Host name-resolution order (/etc/nsswitch.conf).',
+    commands: [
+      'nsenter --mount=/proc/1/ns/mnt -- cat /etc/nsswitch.conf',
+      'chroot /host cat /etc/nsswitch.conf',
+      'cat /host/etc/nsswitch.conf',
+    ],
+  },
+  {
+    id: 'hosts',
+    label: '/etc/hosts',
+    group: 'DNS',
+    desc: 'Static host entries on the node.',
+    commands: [
+      'nsenter --mount=/proc/1/ns/mnt -- cat /etc/hosts',
+      'chroot /host cat /etc/hosts',
+      'cat /host/etc/hosts',
+    ],
+  },
+  {
+    id: 'conntrack',
+    label: 'conntrack table',
+    group: 'Conntrack',
+    desc: 'Live connection tracking table.',
+    commands: ['conntrack -L', 'cat /host/proc/net/nf_conntrack'],
+  },
+  {
+    id: 'conntrack-stats',
+    label: 'conntrack stats',
+    group: 'Conntrack',
+    desc: 'Per-CPU conntrack counters (insert, drop, early_drop...).',
+    commands: ['conntrack -S'],
+  },
+  {
+    id: 'conntrack-count',
+    label: 'conntrack count / max',
+    group: 'Conntrack',
+    desc: 'Current entry count and configured maximum.',
+    commands: [
+      'echo "count: $(cat /proc/sys/net/netfilter/nf_conntrack_count)"; echo "max:   $(cat /proc/sys/net/netfilter/nf_conntrack_max)"',
+    ],
+  },
+  {
+    id: 'routes',
+    label: 'routes (v4)',
+    group: 'Routing',
+    desc: 'IPv4 routing table.',
+    commands: ['ip route show', 'route -n'],
+  },
+  {
+    id: 'routes6',
+    label: 'routes (v6)',
+    group: 'Routing',
+    desc: 'IPv6 routing table.',
+    commands: ['ip -6 route show'],
+  },
+  {
+    id: 'rules',
+    label: 'routing rules',
+    group: 'Routing',
+    desc: 'Policy routing rules (ip rule).',
+    commands: ['ip rule show'],
+  },
+  {
+    id: 'interfaces',
+    label: 'interfaces',
+    group: 'Interfaces',
+    desc: 'All network interfaces and addresses.',
+    commands: ['ip -d addr show', 'ip addr show'],
+  },
+  {
+    id: 'links',
+    label: 'links',
+    group: 'Interfaces',
+    desc: 'Link layer details and statistics.',
+    commands: ['ip -s link show'],
+  },
+  {
+    id: 'neigh',
+    label: 'ARP / neighbors',
+    group: 'Interfaces',
+    desc: 'ARP / neighbor cache.',
+    commands: ['ip neigh show'],
+  },
+  {
+    id: 'sockets',
+    label: 'listening sockets',
+    group: 'Sockets',
+    desc: 'TCP/UDP listening sockets with owning process.',
+    commands: ['ss -tulpn', 'netstat -tulpn'],
+  },
+  {
+    id: 'sockets-all',
+    label: 'all sockets',
+    group: 'Sockets',
+    desc: 'All TCP/UDP sockets and their states.',
+    commands: ['ss -tanp', 'netstat -tanp'],
+  },
+  {
+    id: 'sysctl-net',
+    label: 'net sysctls',
+    group: 'Kernel',
+    desc: 'Key networking sysctls (forwarding, rp_filter, conntrack...).',
+    commands: [
+      "sysctl net.ipv4.ip_forward net.ipv4.conf.all.rp_filter net.bridge.bridge-nf-call-iptables net.netfilter.nf_conntrack_max net.ipv4.tcp_syncookies 2>/dev/null",
+    ],
+  },
+  // ── Health ──────────────────────────────────────────────────────────
+  {
+    id: 'mem-info',
+    label: 'Memory',
+    group: 'Health',
+    desc: 'Node memory usage, buffers, cache, and swap from /proc/meminfo.',
+    commands: ['cat /proc/meminfo'],
+  },
+  {
+    id: 'mem-pressure',
+    label: 'PSI pressure',
+    group: 'Health',
+    desc: 'Linux Pressure Stall Information (PSI) for CPU, memory, and I/O — non-zero avg10 indicates resource contention.',
+    commands: [
+      'echo "=cpu="; cat /proc/pressure/cpu 2>/dev/null || echo "n/a"; echo "=memory="; cat /proc/pressure/memory 2>/dev/null || echo "n/a"; echo "=io="; cat /proc/pressure/io 2>/dev/null || echo "n/a"',
+    ],
+  },
+  {
+    id: 'oom-kills',
+    label: 'OOM kills',
+    group: 'Health',
+    desc: 'OOM kill events from the kernel ring buffer (dmesg). Empty means no OOM events since last boot.',
+    commands: [
+      'nsenter --mount=/proc/1/ns/mnt -- dmesg --time-format=iso 2>/dev/null | grep -iE "oom|out of memory|killed process|oom_kill" | tail -60',
+      'nsenter --mount=/proc/1/ns/mnt -- dmesg 2>/dev/null | grep -iE "oom|out of memory|killed process|oom_kill" | tail -60',
+      'dmesg | grep -iE "oom|out of memory|killed process|oom_kill" | tail -60',
+    ],
+  },
+  {
+    id: 'kubelet-logs',
+    label: 'kubelet logs',
+    group: 'Health',
+    desc: 'Last 100 kubelet log lines — look for eviction events, node conditions, and errors.',
+    commands: [
+      'nsenter --mount=/proc/1/ns/mnt -- journalctl -u kubelet --no-pager -n 100 --output=short-iso 2>/dev/null',
+      'nsenter --mount=/proc/1/ns/mnt -- journalctl -u kubelet --no-pager -n 100 2>/dev/null',
+    ],
+  },
+  {
+    id: 'disk-usage',
+    label: 'Disk usage',
+    group: 'Health',
+    desc: 'Filesystem disk usage on the node (df -h). High /var/lib/kubelet or /var/lib/containerd usage triggers disk-pressure eviction.',
+    commands: ['df -h', 'df -hT'],
+  },
+  {
+    id: 'cpu-stat',
+    label: 'CPU & load',
+    group: 'Health',
+    desc: 'Load averages, CPU count, and top CPU-consuming processes.',
+    commands: [
+      'echo "=loadavg="; cat /proc/loadavg; echo "=nproc="; nproc --all; echo "=cpumodel="; grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2; echo "=procstat="; cat /proc/stat | head -1; echo "=topproc="; ps aux --sort=-%cpu | head -16',
+    ],
+  },
+  // ── GPU ─────────────────────────────────────────────────────────────
+  {
+    id: 'gpu-info',
+    label: 'GPU status',
+    group: 'GPU',
+    desc: 'NVIDIA GPU status via nvidia-smi. Only relevant on GPU-enabled nodes.',
+    commands: [
+      'nvidia-smi',
+      'nsenter --mount=/proc/1/ns/mnt -- nvidia-smi 2>/dev/null',
+    ],
+  },
+  {
+    id: 'gpu-processes',
+    label: 'GPU processes',
+    group: 'GPU',
+    desc: 'Processes currently consuming GPU memory.',
+    commands: [
+      'nvidia-smi --query-compute-apps=pid,used_gpu_memory,name --format=csv,noheader 2>/dev/null | sort -t, -k2 -rn | head -30',
+      'nvidia-smi pmon -s u -c 1 2>/dev/null',
+    ],
+  },
+  {
+    id: 'gpu-dcgm',
+    label: 'DCGM health',
+    group: 'GPU',
+    desc: 'DCGM (Data Center GPU Manager) health check. Requires dcgmi to be installed.',
+    commands: [
+      'dcgmi health -g 0 -j 2>/dev/null || dcgmi health -g 0 2>/dev/null || echo "dcgmi not available — DCGM is not installed on this node."',
+    ],
+  },
+];
+module.exports = { PROBES };

package/src/server.js ADDED Viewed

@@ -0,0 +1,187 @@
+'use strict';
+const http = require('http');
+const path = require('path');
+const express = require('express');
+const { WebSocketServer } = require('ws');
+const k8s = require('./k8s');
+const { PROBES } = require('./probes');
+/**
+ * Builds and starts the debugger server. `session` holds the pod that was
+ * created for the target node; everything in the UI operates against it.
+ */
+function createServer(session) {
+  // session: { node, podName, namespace, context, kubeconfig }
+  const podOpts = () => ({
+    namespace: session.namespace,
+    context: session.context,
+    kubeconfig: session.kubeconfig,
+  });
+  const app = express();
+  app.use(express.json());
+  app.use(express.static(path.join(__dirname, '..', 'public')));
+  app.get('/api/session', (req, res) => {
+    res.json({
+      node: session.node,
+      podName: session.podName,
+      namespace: session.namespace,
+      context: session.context || null,
+      image: k8s.DEBUG_IMAGE,
+      probes: PROBES.map(({ id, label, group, desc }) => ({ id, label, group, desc })),
+    });
+  });
+  app.get('/api/nodes', async (req, res) => {
+    try {
+      res.json(await k8s.listNodes(podOpts()));
+    } catch (err) {
+      res.status(500).json({ error: err.message });
+    }
+  });
+  // Run a single named probe (with built-in command fallbacks).
+  app.get('/api/probe/:id', async (req, res) => {
+    const probe = PROBES.find((p) => p.id === req.params.id);
+    if (!probe) return res.status(404).json({ error: 'unknown probe' });
+    let last = null;
+    for (const command of probe.commands) {
+      const result = await k8s.execInPod(session.podName, command, podOpts());
+      last = { ...result, command };
+      if (result.ok && result.stdout.trim()) break;
+    }
+    res.json({
+      id: probe.id,
+      label: probe.label,
+      command: last.command,
+      ok: last.ok,
+      output: last.stdout || '',
+      error: last.ok ? last.stderr : last.stderr || last.stdout,
+    });
+  });
+  // Connectivity prober — runs nc/curl/ping from the debug pod and
+  // correlates with conntrack entries for the target IP.
+  app.post('/api/connectivity', async (req, res) => {
+    const { target, port, protocol = 'tcp' } = req.body || {};
+    if (!target) return res.status(400).json({ error: 'target required' });
+    const opts = { ...podOpts(), timeout: 30000 };
+    const portArg = port ? String(port) : '';
+    const results = {};
+    // Ping (ICMP reachability)
+    const ping = await k8s.execInPod(session.podName,
+      `ping -c 3 -W 2 ${target} 2>&1`, opts);
+    results.ping = { ok: ping.ok, output: ping.stdout + ping.stderr };
+    // TCP / HTTP / HTTPS
+    if (protocol === 'http' || protocol === 'https') {
+      const url = `${protocol}://${target}${portArg ? ':' + portArg : ''}`;
+      const curl = await k8s.execInPod(session.podName,
+        `curl -sv --connect-timeout 5 --max-time 10 "${url}" 2>&1 | head -80`, opts);
+      results.curl = { ok: curl.ok, output: curl.stdout + curl.stderr, url };
+    } else if (portArg) {
+      const nc = await k8s.execInPod(session.podName,
+        `nc -zv -w 5 ${target} ${portArg} 2>&1`, opts);
+      results.nc = { ok: nc.ok, output: nc.stdout + nc.stderr };
+    }
+    // DNS resolution
+    const dns = await k8s.execInPod(session.podName,
+      `getent hosts ${target} 2>&1 || nslookup ${target} 2>&1 | head -20`, opts);
+    results.dns = { ok: dns.ok, output: dns.stdout + dns.stderr };
+    // Matching conntrack entries
+    const ct = await k8s.execInPod(session.podName,
+      `conntrack -L 2>/dev/null | grep -F "${target}" | head -20`, opts);
+    results.conntrack = { ok: ct.ok, output: ct.stdout };
+    // Trace route (best-effort)
+    const tr = await k8s.execInPod(session.podName,
+      `traceroute -n -m 10 -w 1 ${target} 2>&1 || tracepath -n ${target} 2>&1 | head -20`, opts);
+    results.traceroute = { ok: tr.ok, output: tr.stdout + tr.stderr };
+    res.json({ target, port: portArg || null, protocol, results });
+  });
+  // Arbitrary command execution from the UI.
+  app.post('/api/exec', async (req, res) => {
+    const command = (req.body && req.body.command || '').trim();
+    if (!command) return res.status(400).json({ error: 'command required' });
+    const result = await k8s.execInPod(session.podName, command, {
+      ...podOpts(),
+      timeout: 120000,
+    });
+    res.json({
+      command,
+      ok: result.ok,
+      output: result.stdout || '',
+      error: result.stderr || '',
+    });
+  });
+  const server = http.createServer(app);
+  // ---- Streaming terminal over WebSocket ----------------------------------
+  // Each connection runs one command at a time; long-running commands
+  // (tcpdump, conntrack -E, ping) stream until the client sends a signal.
+  const wss = new WebSocketServer({ server, path: '/ws/term' });
+  wss.on('connection', (ws) => {
+    let current = null;
+    const send = (type, data) => {
+      if (ws.readyState === ws.OPEN) ws.send(JSON.stringify({ type, data }));
+    };
+    ws.on('message', (raw) => {
+      let msg;
+      try {
+        msg = JSON.parse(raw.toString());
+      } catch {
+        return;
+      }
+      if (msg.type === 'run') {
+        const command = (msg.command || '').trim();
+        if (!command) return;
+        if (current) {
+          send('stderr', '\r\n[a command is already running — interrupt it first]\r\n');
+          return;
+        }
+        send('started', command);
+        const child = k8s.streamInPod(session.podName, command, podOpts());
+        current = child;
+        child.stdout.on('data', (d) => send('stdout', d.toString()));
+        child.stderr.on('data', (d) => send('stderr', d.toString()));
+        child.on('close', (code) => {
+          current = null;
+          send('exit', code);
+        });
+        child.on('error', (err) => {
+          current = null;
+          send('stderr', `\r\n[exec error] ${err.message}\r\n`);
+          send('exit', -1);
+        });
+      } else if (msg.type === 'signal') {
+        if (current) {
+          current.kill(msg.signal === 'SIGKILL' ? 'SIGKILL' : 'SIGINT');
+        }
+      } else if (msg.type === 'stdin') {
+        if (current && current.stdin.writable) current.stdin.write(msg.data);
+      }
+    });
+    ws.on('close', () => {
+      if (current) current.kill('SIGKILL');
+    });
+  });
+  return server;
+}
+module.exports = { createServer };