npm - @link-assistant/hive-mind - Versions diffs - 2.0.2 → 2.0.4 - Mend

@link-assistant/hive-mind 2.0.2 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/CHANGELOG.md +138 -0
package/package.json +1 -1
package/src/bot-lifecycle.lib.mjs +128 -0
package/src/bot-logger.lib.mjs +253 -0
package/src/cleanup.lib.mjs +22 -4
package/src/cleanup.mjs +15 -2
package/src/cleanup.os.lib.mjs +94 -8
package/src/isolation-runner.lib.mjs +378 -11
package/src/session-monitor.lib.mjs +389 -18
package/src/session-resume.lib.mjs +269 -0
package/src/session-status.lib.mjs +141 -0
package/src/session-store.lib.mjs +232 -0
package/src/telegram-bot.mjs +65 -13
package/src/telegram-command-execution.lib.mjs +3 -1
package/src/telegram-terminal-watch-command.lib.mjs +47 -6
package/src/work-session-formatting.lib.mjs +44 -11

package/src/session-resume.lib.mjs ADDED Viewed

@@ -0,0 +1,269 @@
+#!/usr/bin/env node
+/**
+ * Issue #1927 (review follow-up): resume planning for killed `/solve` sessions.
+ *
+ * When a detached `/solve` session is OOM/SIGKILL-ed, the surviving parent
+ * (the Telegram bot, or `/hive`) can relaunch the work with the AI tool's
+ * `--resume <sessionId>` flow instead of starting from scratch. Two facts make
+ * that safe and correct, and this module encodes both so every call site agrees:
+ *
+ *   1. **Use the LAST session id.** A single `/solve` run can spin up *many*
+ *      tool sessions — auto-continue across usage-limit resets, uncommitted-
+ *      changes restarts (`solve.watch`), and manual `--resume` chains. Every one
+ *      prints a `Session ID:` marker to the captured log in chronological order,
+ *      and start-command also renames the per-session log to `<sessionId>.log`.
+ *      The most advanced context lives in the *last* of these, so resuming must
+ *      pick the last id — never the first. {@link selectLastSessionId} /
+ *      {@link findLatestSessionLogId} enforce that rule.
+ *
+ *   2. **Never storm.** Auto-resuming a killed session must be bounded so a job
+ *      that reliably OOMs cannot spawn an infinite relaunch loop (which would be
+ *      worse than the silent hang #1927 set out to fix). {@link planKilledSessionResume}
+ *      caps the number of automatic resumes per session (default 1) and only ever
+ *      acts on a session that actually *can* be resumed.
+ *
+ * The module is pure and dependency-free apart from an injectable `fs`, so it is
+ * trivially unit-testable and importable from the bot, the monitor, or `/hive`
+ * without pulling in heavy transitive dependencies.
+ */
+import fs from 'node:fs';
+import path from 'node:path';
+// A tool session id printed to the log. Claude/codex/gemini all emit a
+// `Session ID: <id>` marker (sometimes prefixed with 📌 and/or wrapped in
+// backticks for Markdown). We capture the first non-space, non-backtick token
+// after the label, which covers UUIDs and the slug-style ids other tools use.
+// The label match is case-insensitive (`i`). The regex is global (`g`), which
+// makes it stateful through `lastIndex`: callers must rewind it before a scan.
+// Note that `\s*` also matches line breaks, so a label at the end of a line
+// captures the first token of the following line.
+const SESSION_ID_MARKER_RE = /Session ID:\s*`?([^\s`]+)`?/gi;
+// Canonical UUID v4-ish shape used by Claude Code session ids and by the
+// `<sessionId>.log` files start-command writes. Used to validate directory
+// scans so unrelated `*.log` files are never mistaken for a session.
+// Only the 8-4-4-4-12 hex layout is checked (case-insensitively); the version
+// and variant nibbles are not validated.
+const SESSION_LOG_UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
+/**
+ * Extract every tool session id printed to a log, in the order they appear.
+ *
+ * Consecutive duplicates are collapsed (a single tool run prints its id more
+ * than once — startup, completion, verbose footer — and that is one session,
+ * not three) while still preserving order across genuinely different sessions.
+ *
+ * @param {string} text - Log text
+ * @returns {string[]} Ordered session ids (possibly empty)
+ */
+export function extractSessionIds(text) {
+  if (!text || typeof text !== 'string') return [];
+  const ids = [];
+  let match;
+  SESSION_ID_MARKER_RE.lastIndex = 0;
+  while ((match = SESSION_ID_MARKER_RE.exec(text)) !== null) {
+    const id = match[1];
+    // Skip obvious non-ids that can follow the label in prose/log output.
+    if (!id || id.toLowerCase() === 'unknown' || id.toLowerCase() === 'n/a') continue;
+    if (ids[ids.length - 1] !== id) ids.push(id);
+  }
+  return ids;
+}
+/**
+ * The session id to resume from a log: the LAST one printed (requirement:
+ * "when we have multiple sessions in a single /solve call we use last of them").
+ *
+ * @param {string} text - Log text
+ * @returns {string|null}
+ */
+export function selectLastSessionId(text) {
+  const ids = extractSessionIds(text);
+  return ids.length > 0 ? ids[ids.length - 1] : null;
+}
+/**
+ * Read the LAST tool session id from a `/solve` execution log. Only the tail of
+ * the file is scanned (the most recent session marker lives near the end), so
+ * this stays cheap on multi-megabyte logs. Never throws — a missing/unreadable
+ * log yields `null`.
+ *
+ * @param {string} logPath
+ * @param {Object} [options]
+ * @param {Object} [options.fsImpl=fs] - Injectable fs (for tests)
+ * @param {number} [options.tailBytes=262144] - Trailing bytes to scan (256 KiB)
+ * @param {boolean} [options.verbose]
+ * @returns {string|null}
+ */
+export function readLastSessionIdFromLog(logPath, options = {}) {
+  const { fsImpl = fs, tailBytes = 262144, verbose = false } = options;
+  if (!logPath) return null;
+  try {
+    const stat = fsImpl.statSync(logPath);
+    const start = Math.max(0, stat.size - tailBytes);
+    const fd = fsImpl.openSync(logPath, 'r');
+    try {
+      const length = stat.size - start;
+      const buffer = Buffer.alloc(length);
+      fsImpl.readSync(fd, buffer, 0, length, start);
+      const id = selectLastSessionId(buffer.toString('utf8'));
+      if (verbose && id) {
+        console.log(`[VERBOSE] session-resume: last tool session id in ${logPath} is ${id}`);
+      }
+      return id;
+    } finally {
+      fsImpl.closeSync(fd);
+    }
+  } catch (error) {
+    if (verbose) {
+      console.log(`[VERBOSE] session-resume: could not read session id from ${logPath}: ${error.message}`);
+    }
+    return null;
+  }
+}
+/**
+ * Find the id of the most-recently-modified `<sessionId>.log` in a directory.
+ *
+ * start-command renames each tool session's log to `<sessionId>.log`, so the
+ * newest such file is the last session of the run — a second, filesystem-based
+ * source for the "use the last session" rule that works even when the captured
+ * stdout log has been rotated away. Never throws.
+ *
+ * @param {Object} options
+ * @param {string} options.dir - Directory holding `<sessionId>.log` files
+ * @param {Object} [options.fsImpl=fs] - Injectable fs (for tests)
+ * @param {boolean} [options.verbose]
+ * @returns {string|null}
+ */
+export function findLatestSessionLogId({ dir, fsImpl = fs, verbose = false } = {}) {
+  if (!dir) return null;
+  try {
+    const entries = fsImpl.readdirSync(dir);
+    let bestId = null;
+    let bestMtime = -Infinity;
+    for (const entry of entries) {
+      if (!entry.endsWith('.log')) continue;
+      const id = entry.slice(0, -'.log'.length);
+      if (!SESSION_LOG_UUID_RE.test(id)) continue;
+      let mtime;
+      try {
+        mtime = fsImpl.statSync(path.join(dir, entry)).mtimeMs;
+      } catch {
+        continue;
+      }
+      if (mtime > bestMtime) {
+        bestMtime = mtime;
+        bestId = id;
+      }
+    }
+    if (verbose && bestId) {
+      console.log(`[VERBOSE] session-resume: latest <sessionId>.log in ${dir} is ${bestId}`);
+    }
+    return bestId;
+  } catch (error) {
+    if (verbose) {
+      console.log(`[VERBOSE] session-resume: could not scan ${dir} for session logs: ${error.message}`);
+    }
+    return null;
+  }
+}
+function quoteArg(value) {
+  const str = String(value);
+  // Quote only when needed; keep already-safe tokens (URLs, flags) readable.
+  if (/^[A-Za-z0-9_./:@=-]+$/.test(str)) return str;
+  return `"${str.replaceAll('\\', '\\\\').replaceAll('"', '\\"')}"`;
+}
+/**
+ * Drop any pre-existing `--resume`/`-r <id>` pair from an args array so a fresh
+ * resume id can be appended without conflict. Pure; returns a new array.
+ *
+ * @param {string[]} args
+ * @returns {string[]}
+ */
+export function stripResumeFlag(args) {
+  if (!Array.isArray(args)) return [];
+  const out = [];
+  for (let i = 0; i < args.length; i++) {
+    const a = args[i];
+    if (a === '--resume' || a === '-r') {
+      i += 1; // skip the value too
+      continue;
+    }
+    if (typeof a === 'string' && (a.startsWith('--resume=') || a.startsWith('-r='))) continue;
+    out.push(a);
+  }
+  return out;
+}
+/**
+ * Build the command that resumes a killed `/solve` session with its LAST tool
+ * session id. Only `/solve` sessions are resumable this way — `/hive` and other
+ * commands return `null` (the caller surfaces nothing rather than a bogus
+ * command). When the original args were persisted they are reused verbatim
+ * (minus any stale `--resume`); otherwise a minimal `<url> [--tool]` command is
+ * reconstructed from the persisted session info.
+ *
+ * @param {Object} options
+ * @param {Object} options.sessionInfo - Persisted session info (command/url/tool/args)
+ * @param {string} options.lastSessionId - The session id to resume from
+ * @param {string} [options.binary] - Override the invoked binary (default: the command)
+ * @returns {{ binary: string, args: string[], display: string }|null}
+ */
+export function buildResumeCommand({ sessionInfo = {}, lastSessionId = null, binary = null } = {}) {
+  if (!lastSessionId) return null;
+  const command = sessionInfo.command || 'solve';
+  if (command !== 'solve') return null; // only /solve is resumable via --resume
+  const url = sessionInfo.url || (Array.isArray(sessionInfo.args) ? sessionInfo.args[0] : null);
+  if (!url) return null;
+  const bin = binary || command;
+  let args;
+  if (Array.isArray(sessionInfo.args) && sessionInfo.args.length > 0) {
+    args = stripResumeFlag(sessionInfo.args);
+  } else {
+    args = [url];
+    if (sessionInfo.tool && sessionInfo.tool !== 'claude') args.push('--tool', sessionInfo.tool);
+  }
+  args = [...args, '--resume', lastSessionId];
+  return { binary: bin, args, display: `${bin} ${args.map(quoteArg).join(' ')}` };
+}
+/**
+ * Decide whether — and how — a killed `/solve` session should be auto-resumed by
+ * a surviving parent, bounding the number of automatic attempts so a reliably
+ * crashing job can never storm.
+ *
+ * @param {Object} options
+ * @param {Object} options.sessionInfo - Persisted session info
+ * @param {string|null} [options.lastSessionId] - LAST tool session id (from the log)
+ * @param {number} [options.attempts=0] - Resume attempts already made for this session
+ * @param {number} [options.maxAttempts=1] - Hard cap on automatic resumes
+ * @returns {{ resumable: boolean, reason: string, command: object|null, attempt: number }}
+ */
+export function planKilledSessionResume({ sessionInfo = {}, lastSessionId = null, attempts = 0, maxAttempts = 1 } = {}) {
+  if (!lastSessionId) {
+    return { resumable: false, reason: 'no-session-id', command: null, attempt: attempts };
+  }
+  const command = buildResumeCommand({ sessionInfo, lastSessionId });
+  if (!command) {
+    return { resumable: false, reason: 'not-resumable', command: null, attempt: attempts };
+  }
+  if (attempts >= maxAttempts) {
+    return { resumable: false, reason: 'max-attempts-reached', command, attempt: attempts };
+  }
+  return { resumable: true, reason: 'ready', command, attempt: attempts + 1 };
+}
+/**
+ * Markdown section surfaced under a killed-session completion message so an
+ * operator (or an automation) can resume the work with one copy-paste. Purely
+ * additive — returns `''` when there is nothing to resume.
+ *
+ * @param {Object} options
+ * @param {string|null} options.lastSessionId
+ * @param {{ display: string }|null} options.command
+ * @returns {string}
+ */
+export function formatResumeSection({ lastSessionId = null, command = null } = {}) {
+  if (!lastSessionId || !command) return '';
+  return `♻️ *Resume from last session* \`${lastSessionId}\`:\n\`\`\`\n${command.display}\n\`\`\``;
+}

package/src/session-status.lib.mjs ADDED Viewed

@@ -0,0 +1,141 @@
+/**
+ * Shared session-status vocabulary and exit-code classification.
+ *
+ * Issue #1927: a detached `/solve` was OOM-killed (exit 137) but the Telegram
+ * bot never reported the failure. Two gaps in the status vocabulary contributed:
+ *
+ *   1. start-command only emits `executing`/`executed`; it has no notion of a
+ *      *killed* session, and a signal exit (137 = 128+SIGKILL) was treated the
+ *      same as any other completion — or, worse, hidden entirely.
+ *   2. The sets that decide "is this running / terminal / a failure" were
+ *      duplicated across isolation-runner, session-monitor and work-session
+ *      formatting, so a fix in one place silently disagreed with another.
+ *
+ * This module is the single source of truth for that vocabulary and for mapping
+ * a process exit code to a signal/kill label. It is intentionally
+ * dependency-free (pure JS, no Node built-ins) so every layer can import it
+ * without pulling heavy transitive deps (command-stream, i18n, …).
+ *
+ * @see https://github.com/link-assistant/hive-mind/issues/1927
+ */
+function norm(status) {
+  return String(status || '')
+    .trim()
+    .toLowerCase();
+}
+/**
+ * Normalize an exit code to a finite integer or null.
+ * @param {*} value
+ * @returns {number|null}
+ */
+export function normalizeExitCode(value) {
+  if (value === null || value === undefined || value === '') return null;
+  const numeric = Number(value);
+  return Number.isFinite(numeric) ? numeric : null;
+}
+// All entries in the sets below are lower-case; the predicates in this module
+// trim and lower-case a status before looking it up.
+//
+// A session that is still executing. start-command emits `executing`; hive-mind
+// historically also accepted `running`.
+export const RUNNING_SESSION_STATUSES = new Set(['executing', 'running']);
+// Statuses that mean the process was killed (by a signal) rather than exiting on
+// its own. Surfaced to the user as an explicit "killed" rather than a generic
+// failure so an OOM/SIGKILL is recognizable. (Issue #1927 requirement #1.)
+// Note that `terminated` and `sigterm` are members too, so a SIGTERM exit also
+// counts as killed for these checks.
+export const KILLED_SESSION_STATUSES = new Set(['killed', 'terminated', 'dead', 'oom', 'oom-killed', 'oomkilled', 'sigkill', 'sigterm', 'sigsegv']);
+// Statuses that mean the session ended unsuccessfully (a non-zero/abnormal
+// outcome). Kills are a subset of failures: the killed set is spread in, so the
+// two sets cannot drift apart.
+export const FAILURE_SESSION_STATUSES = new Set(['failed', 'cancelled', 'canceled', 'error', 'timeout', 'timedout', 'timed_out', ...KILLED_SESSION_STATUSES]);
+// Statuses that mean the session is no longer executing (success or failure).
+// A superset of the original {executed, completed, failed, cancelled, canceled,
+// error} plus the kill/timeout vocabulary added for issue #1927. Built from the
+// two success statuses plus every failure status.
+export const TERMINAL_SESSION_STATUSES = new Set(['executed', 'completed', ...FAILURE_SESSION_STATUSES]);
+/**
+ * @param {string} status
+ * @returns {boolean} True when the session is still executing.
+ */
+export function isExecutingSessionStatus(status) {
+  return RUNNING_SESSION_STATUSES.has(norm(status));
+}
+/**
+ * @param {string} status
+ * @returns {boolean} True when the session is no longer executing.
+ */
+export function isTerminalSessionStatus(status) {
+  return TERMINAL_SESSION_STATUSES.has(norm(status));
+}
+/**
+ * @param {string} status
+ * @returns {boolean} True when the session was killed by a signal.
+ */
+export function isKilledSessionStatus(status) {
+  return KILLED_SESSION_STATUSES.has(norm(status));
+}
+/**
+ * @param {string} status
+ * @returns {boolean} True when the session ended unsuccessfully.
+ */
+export function isFailureSessionStatus(status) {
+  return FAILURE_SESSION_STATUSES.has(norm(status));
+}
+// POSIX signals that commonly terminate a wrapped command, with the reason we
+// surface to the user. Exit codes above 128 encode the signal as `128 + signum`
+// (the shell/Node convention), so 137 → SIGKILL, 143 → SIGTERM, 139 → SIGSEGV.
+// Keys are signal numbers (not exit codes). `name` is the symbolic signal name
+// and `reason` is the human-readable text shown to the user. Signals missing
+// from this table get a generic description built by the lookup code.
+const SIGNAL_DESCRIPTIONS = {
+  1: { name: 'SIGHUP', reason: 'hung up (SIGHUP)' },
+  2: { name: 'SIGINT', reason: 'interrupted (SIGINT)' },
+  3: { name: 'SIGQUIT', reason: 'quit (SIGQUIT)' },
+  6: { name: 'SIGABRT', reason: 'aborted (SIGABRT)' },
+  // SIGKILL cannot be caught, so the reason names both likely causes.
+  9: { name: 'SIGKILL', reason: 'killed — out of memory or forced kill (SIGKILL)' },
+  11: { name: 'SIGSEGV', reason: 'crashed — segmentation fault (SIGSEGV)' },
+  15: { name: 'SIGTERM', reason: 'terminated (SIGTERM)' },
+};
+/**
+ * Describe a signal-based exit code (anything above 128).
+ *
+ * @param {*} exitCode
+ * @returns {{signal: string, signalNumber: number, reason: string}|null}
+ *   Signal details, or null when the exit code is not a signal exit.
+ */
+export function describeExitSignal(exitCode) {
+  const code = normalizeExitCode(exitCode);
+  if (code === null || code <= 128) return null;
+  const signalNumber = code - 128;
+  const info = SIGNAL_DESCRIPTIONS[signalNumber] || { name: `SIG${signalNumber}`, reason: `killed by signal ${signalNumber}` };
+  return { signal: info.name, signalNumber, reason: info.reason };
+}
+/**
+ * Map an exit code to a canonical session status string.
+ *
+ *   - 0            → 'executed'  (success)
+ *   - 137,139,…    → 'killed'    (SIGKILL/SIGSEGV/etc.)
+ *   - 143,130      → 'terminated'(SIGTERM/SIGINT — orderly termination)
+ *   - other != 0   → 'failed'
+ *   - null         → null        (unknown)
+ *
+ * @param {*} exitCode
+ * @returns {string|null}
+ */
+export function classifyExitStatus(exitCode) {
+  const code = normalizeExitCode(exitCode);
+  if (code === null) return null;
+  if (code === 0) return 'executed';
+  const signal = describeExitSignal(code);
+  if (signal) {
+    // SIGTERM/SIGINT are orderly terminations; everything else above 128 is a
+    // hard kill/crash.
+    if (signal.signalNumber === 15 || signal.signalNumber === 2) return 'terminated';
+    return 'killed';
+  }
+  return 'failed';
+}

package/src/session-store.lib.mjs ADDED Viewed

@@ -0,0 +1,232 @@
+/**
+ * Durable persistence for tracked Telegram work sessions.
+ *
+ * Issue #1927: the session monitor kept its registry purely in-memory
+ * (`activeSessions` Map). When the bot process was killed/restarted that map was
+ * lost, so a /solve running in a detached `$` session became an orphan the bot
+ * could never report on — it just vanished. Requirement #2 asks the bot to
+ * "detect restart … and if after bot start we have commands in `$`, try to
+ * resume them, if they started before bot start time." Requirement #4 asks that
+ * we never destroy previous data.
+ *
+ * This module persists the minimal, plain-data subset of each session's
+ * metadata to disk so that after a restart the monitor can reload its registry
+ * and keep watching detached sessions to completion. Two artifacts are written:
+ *
+ *   - `sessions.json`  — an atomically-rewritten snapshot of the *current* set
+ *     of tracked sessions (the source of truth for resume).
+ *   - `sessions-events.jsonl` — an append-only, timestamped audit log of every
+ *     track/complete event. It is never truncated, so the full history of what
+ *     ran (and when it ended) survives even total failures.
+ *
+ * The store is dependency-free and fully injectable for unit testing.
+ *
+ * @see https://github.com/link-assistant/hive-mind/issues/1927
+ */
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+// Allow-list of sessionInfo keys copied into the on-disk snapshot; any key not
+// listed here is dropped during serialization.
+// Only plain, serializable metadata is persisted. Runtime-only fields (the bot
+// instance, cached limits snapshots, transient error strings) are deliberately
+// excluded so the snapshot stays small and safe to reload.
+// `args` (#1927 review follow-up) is persisted so a killed /solve can be resumed
+// with its exact original invocation plus `--resume <lastSessionId>`.
+// `startTime` is the one field that is transformed (to an ISO string) rather
+// than copied verbatim.
+const PERSISTABLE_FIELDS = ['chatId', 'messageId', 'startTime', 'url', 'command', 'isolationBackend', 'sessionId', 'tool', 'infoBlock', 'urlContext', 'requesterUserId', 'showLimits', 'locale', 'logPath', 'args'];
+/**
+ * Resolve the directory durable bot state is written to. Honors
+ * HIVE_MIND_STATE_DIR, then a stable per-user fallback. Never throws.
+ *
+ * @param {object} [env=process.env]
+ * @param {Function} [homedir=os.homedir]
+ * @returns {string} Absolute directory path
+ */
+export function resolveBotStateDir(env = process.env, homedir = os.homedir) {
+  const explicit = String(env.HIVE_MIND_STATE_DIR || '').trim();
+  if (explicit) return explicit;
+  const home = (() => {
+    try {
+      return homedir();
+    } catch {
+      return '/tmp';
+    }
+  })();
+  return path.join(home, '.hive-mind', 'state');
+}
+function toIso(value) {
+  if (!value) return null;
+  if (value instanceof Date) return Number.isNaN(value.getTime()) ? null : value.toISOString();
+  const date = new Date(value);
+  return Number.isNaN(date.getTime()) ? null : date.toISOString();
+}
+/**
+ * Reduce a sessionInfo object to its persistable subset, normalizing the
+ * startTime to an ISO string.
+ * @param {object} sessionInfo
+ * @returns {object}
+ */
+export function serializeSessionInfo(sessionInfo = {}) {
+  const out = {};
+  for (const field of PERSISTABLE_FIELDS) {
+    if (sessionInfo[field] === undefined) continue;
+    if (field === 'startTime') {
+      const iso = toIso(sessionInfo.startTime);
+      if (iso) out.startTime = iso;
+      continue;
+    }
+    out[field] = sessionInfo[field];
+  }
+  return out;
+}
+/**
+ * Rehydrate a persisted session record, converting startTime back to a Date.
+ * @param {object} record
+ * @returns {object}
+ */
+export function deserializeSessionInfo(record = {}) {
+  const out = { ...record };
+  if (out.startTime) {
+    const date = new Date(out.startTime);
+    if (!Number.isNaN(date.getTime())) out.startTime = date;
+  }
+  return out;
+}
+/**
+ * Create a durable session store bound to a directory.
+ *
+ * @param {object} [options]
+ * @param {string} [options.dir] - State directory (default: resolveBotStateDir()).
+ * @param {object} [options.fsImpl=fs] - Injectable fs (for tests).
+ * @param {Function} [options.now] - Injectable clock returning a Date.
+ * @param {boolean} [options.verbose=false]
+ * @param {object} [options.logger] - Optional bot logger for structured events.
+ * @returns {object} Session store instance.
+ */
+export function createSessionStore(options = {}) {
+  const { dir = resolveBotStateDir(), fsImpl = fs, now = () => new Date(), verbose = false, logger = null } = options;
+  const snapshotPath = path.join(dir, 'sessions.json');
+  const eventsPath = path.join(dir, 'sessions-events.jsonl');
+  let disabled = false;
+  function log(level, message, meta) {
+    if (logger && typeof logger[level] === 'function') logger[level](message, meta);
+    else if (verbose) console.log(`[session-store] ${message}${meta ? ` ${JSON.stringify(meta)}` : ''}`);
+  }
+  function ensureDir() {
+    if (disabled) return false;
+    try {
+      fsImpl.mkdirSync(dir, { recursive: true });
+      return true;
+    } catch (error) {
+      disabled = true;
+      log('error', `Could not create state dir ${dir}: ${error.message} — persistence disabled`);
+      return false;
+    }
+  }
+  function readSnapshotMap() {
+    try {
+      const raw = fsImpl.readFileSync(snapshotPath, 'utf8');
+      const parsed = JSON.parse(raw);
+      if (parsed && typeof parsed === 'object' && parsed.sessions && typeof parsed.sessions === 'object') {
+        return parsed.sessions;
+      }
+      return {};
+    } catch {
+      // Missing or corrupt snapshot is non-fatal — start from empty.
+      return {};
+    }
+  }
+  function writeSnapshotMap(sessions) {
+    if (!ensureDir()) return;
+    const payload = JSON.stringify({ version: 1, updatedAt: toIso(now()), sessions }, null, 2);
+    const tmpPath = `${snapshotPath}.tmp`;
+    try {
+      // Atomic replace: write tmp then rename so a crash mid-write never leaves
+      // a half-written snapshot.
+      fsImpl.writeFileSync(tmpPath, payload);
+      fsImpl.renameSync(tmpPath, snapshotPath);
+    } catch (error) {
+      log('error', `Could not write session snapshot: ${error.message}`);
+    }
+  }
+  function appendEvent(type, sessionName, data) {
+    if (!ensureDir()) return;
+    const entry = { ts: toIso(now()), type, sessionName, ...data };
+    try {
+      fsImpl.appendFileSync(eventsPath, JSON.stringify(entry) + '\n');
+    } catch (error) {
+      log('error', `Could not append session event: ${error.message}`);
+    }
+  }
+  return {
+    get snapshotPath() {
+      return snapshotPath;
+    },
+    get eventsPath() {
+      return eventsPath;
+    },
+    get disabled() {
+      return disabled;
+    },
+    /**
+     * Persist (upsert) a tracked session and append a `track` audit event.
+     * @param {string} sessionName
+     * @param {object} sessionInfo
+     */
+    persist(sessionName, sessionInfo) {
+      if (!sessionName) return;
+      const sessions = readSnapshotMap();
+      const serialized = serializeSessionInfo(sessionInfo);
+      serialized.persistedAt = toIso(now());
+      sessions[sessionName] = serialized;
+      writeSnapshotMap(sessions);
+      appendEvent('track', sessionName, { sessionInfo: serialized });
+      log('debug', `Persisted session ${sessionName}`, { command: serialized.command, url: serialized.url });
+    },
+    /**
+     * Remove a session from the snapshot and append a `complete` audit event.
+     * The event records the terminal status/exit code so the history survives
+     * even though the live snapshot no longer lists the session.
+     * @param {string} sessionName
+     * @param {object} [meta] - { status, exitCode }
+     */
+    remove(sessionName, meta = {}) {
+      if (!sessionName) return;
+      const sessions = readSnapshotMap();
+      if (sessionName in sessions) {
+        delete sessions[sessionName];
+        writeSnapshotMap(sessions);
+      }
+      appendEvent('complete', sessionName, { status: meta.status ?? null, exitCode: meta.exitCode ?? null });
+      log('debug', `Removed session ${sessionName} from snapshot`, meta);
+    },
+    /**
+     * Load all persisted sessions as `{ sessionName, sessionInfo }` records with
+     * startTime rehydrated to a Date.
+     * @returns {Array<{sessionName: string, sessionInfo: object}>}
+     */
+    load() {
+      const sessions = readSnapshotMap();
+      const out = [];
+      for (const [sessionName, record] of Object.entries(sessions)) {
+        out.push({ sessionName, sessionInfo: deserializeSessionInfo(record) });
+      }
+      return out;
+    },
+  };
+}