npm - switchroom - Versions diffs - 0.12.21 → 0.12.23 - Mend

switchroom 0.12.21 → 0.12.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/telegram-plugin/uat/scenarios/jtbd-fast-trivial-dm.test.ts ADDED Viewed

@@ -0,0 +1,127 @@
+/**
+ * JTBD scenario — short happy path: trivial questions reply FAST.
+ *
+ * Serves: `reference/know-what-my-agent-is-doing.md` — the short-path
+ * contract: a question with no real work should produce a plain reply
+ * with no ceremony (no soft-commit, no progress chunks) within a tight
+ * budget. Users judge agent speed on THIS path more than any other.
+ *
+ * Also serves: the always-on vision (`reference/vision.md`). An agent
+ * that takes 30+ seconds to answer "what's 2+2" is not "always-on" —
+ * it's awake but unresponsive.
+ *
+ * ## Targets
+ *
+ * From `reference/conversational-pacing.md` and the post-v0.12.22
+ * baseline measurements:
+ *
+ *   - **TTFO p95 (vision target):** < 30s — the published contract.
+ *     This test asserts the FAST-trivial case, not p95, so we tighten.
+ *   - **Trivial-prompt TTFO (this test):** < 12s as hard contract,
+ *     < 6s as the vision target. The mtcute post-restart UAT measured
+ *     19.4s on a COLD-START fresh-restart; a warm fast-trivial should
+ *     be materially faster — the dominant cost on cold start is
+ *     boot+session-resume which doesn't apply here.
+ *   - **Soft-commit ceremony:** must NOT fire for trivial prompts.
+ *     If the reply contains a soft-commit preamble ("let me check
+ *     that for you, back in a few"), the conversational-pacing
+ *     prompt classified the trivial prompt as slow — a regression.
+ *
+ * ## What this catches that other UATs don't
+ *
+ * - `jtbd-soft-commit-dm.test.ts` exercises slow prompts (the soft
+ *   commit SHOULD fire). This test asserts the inverse — fast prompts
+ *   should skip ceremony.
+ * - `jtbd-always-on-after-restart-dm.test.ts` asserts <120s after a
+ *   cold restart. This test asserts <12s on a warm agent — a much
+ *   tighter bar that catches steady-state latency regressions
+ *   (model swap, MCP server slowdown, gateway middleware cost, etc.).
+ * - `smoke-dm-reply.test.ts` confirms the agent replies AT ALL but
+ *   has no latency assertion — a 50s reply would pass smoke. This
+ *   one fails.
+ *
+ * ## Forensic signal on a yellow-band pass
+ *
+ * If TTFO lands in 6-12s, the test passes but logs a forensic warning
+ * so a future regression in this code path is visible BEFORE it
+ * crosses the hard contract. Yellow-band drift is the canary for
+ * "something's getting slower" — better to chase it at 8s than at 28s.
+ */
+import { describe, it, expect } from "vitest";
+import { spinUp } from "../harness.js";
+// Name of the agent this scenario asks the harness to spin up.
+const AGENT = "test-harness";
+// Hard contract for trivial-prompt TTFO.
+const HARD_TTFO_MS = 12_000;
+// Vision target: trivial prompts feel near-instant.
+const VISION_TTFO_MS = 6_000;
+// The DM sent to the agent; the header comment classifies it as a trivial
+// question that should need no real work.
+const TRIVIAL_PROMPT = "Reply with just the number: what is 2 + 2?";
+// Case-insensitive patterns treated as soft-commit preamble when found in
+// the first reply. A match on any one of them only produces a console
+// warning in the test below; it does not fail the test.
+const SOFT_COMMIT_PHRASES = [
+  /let me/i,
+  /back in/i,
+  /one (sec|moment)/i,
+  /checking/i,
+  /looking into/i,
+  /hold on/i,
+];
+describe("uat: short happy path — trivial prompt is FAST", () => {
+  // One scenario: send a single trivial DM and time the first bot reply.
+  it(
+    `trivial prompt → reply lands within ${HARD_TTFO_MS / 1000}s`,
+    async () => {
+      const sc = await spinUp({ agent: AGENT });
+      try {
+        // Clock starts before the send, so TTFO includes sendDM itself.
+        const sendStart = Date.now();
+        await sc.sendDM(TRIVIAL_PROMPT);
+        // Wait 5s past the hard contract so a slightly-late reply is
+        // reported by the explicit TTFO error below (presumably instead
+        // of a harness timeout — confirm against the harness).
+        const firstReply = await sc.expectMessage(/\S/, {
+          from: "bot",
+          timeout: HARD_TTFO_MS + 5_000,
+        });
+        const ttfo = Date.now() - sendStart;
+        expect(firstReply.text.length).toBeGreaterThan(0);
+        // Hard contract: fail with a descriptive message at or above the budget.
+        if (ttfo >= HARD_TTFO_MS) {
+          throw new Error(
+            `[fast-trivial] TTFO=${ttfo}ms exceeds hard contract ` +
+            `${HARD_TTFO_MS}ms — trivial-prompt latency regression.`,
+          );
+        }
+        // Cannot fail here: the throw above already covers ttfo >= HARD_TTFO_MS.
+        expect(ttfo).toBeLessThan(HARD_TTFO_MS);
+        // Soft-commit detection is warn-only: a match is logged, not asserted.
+        const triggeredSoftCommit = SOFT_COMMIT_PHRASES.some((re) =>
+          re.test(firstReply.text),
+        );
+        if (triggeredSoftCommit) {
+          console.warn(
+            `[fast-trivial] First reply contains soft-commit phrasing — ` +
+            `the conversational-pacing prompt likely classified the ` +
+            `trivial prompt as slow. Text: ${JSON.stringify(firstReply.text.slice(0, 200))}`,
+          );
+        }
+        // Yellow band: between the vision target and the hard contract the
+        // test still passes but emits a warning instead of the success log.
+        if (ttfo >= VISION_TTFO_MS) {
+          console.warn(
+            `[fast-trivial] TTFO=${ttfo}ms — passed hard contract ` +
+            `(${HARD_TTFO_MS}ms) but slower than the vision target ` +
+            `(${VISION_TTFO_MS}ms). Forensic canary for delivery-path drift.`,
+          );
+        } else {
+          console.log(
+            `[fast-trivial] TTFO=${ttfo}ms — within vision target ` +
+            `(<${VISION_TTFO_MS}ms). Snappy.`,
+          );
+        }
+      } finally {
+        // Always release the harness session, pass or fail.
+        await sc.tearDown();
+      }
+    },
+    // Per-test timeout: 10s beyond the expectMessage wait above.
+    HARD_TTFO_MS + 15_000,
+  );
+});

package/telegram-plugin/uat/scenarios/jtbd-memory-survives-restart-dm.test.ts ADDED Viewed

@@ -0,0 +1,239 @@
+/**
+ * JTBD scenario — memory survives across restart (the "fleet differentiator").
+ *
+ * Serves: `reference/remember-across-sessions.md` — the JTBD says:
+ *
+ *   *Outcome:* The agent brings back relevant facts, preferences,
+ *   decisions, and open threads from past conversations, in the right
+ *   moment, without the user reminding it.
+ *
+ *   *Stakes:* An agent with no memory is a stranger every time. The
+ *   user stops sharing context because they're tired of repeating
+ *   it. The relationship never compounds.
+ *
+ * Memory IS the moat. If hindsight silently drops captures, or if a
+ * restart wipes recent recall, the multi-agent specialist proposition
+ * collapses to "9 chatbots with no context, each costing a separate
+ * conversation thread to bring up to speed." This is the most
+ * expensive trust-leak in the product because regressions are
+ * invisible for days (the user keeps re-explaining, attributing the
+ * cost to "agents are like that" not "switchroom broke memory").
+ *
+ * ## Contract this asserts
+ *
+ * 1. **Capture works**: agent confirms it remembers a unique token in
+ *    its first reply (capture-side observable via reply content).
+ * 2. **Survival works**: after a marker-safe restart of the agent, the
+ *    same token is recalled in response to a follow-up question.
+ * 3. **Timing is reasonable**: post-restart recall reply lands within
+ *    the always-on cold-start budget (vision target <30s; hard
+ *    contract <120s, same as `jtbd-always-on-after-restart-dm.test.ts`).
+ *
+ * ## What this catches that other UATs don't
+ *
+ * - `jtbd-always-on-after-restart-dm.test.ts` asserts the agent REPLIES
+ *   post-restart. This asserts the agent REMEMBERS post-restart.
+ * - `jtbd-status-query-dm.test.ts` and friends test conversational
+ *   pacing. None test memory.
+ * - No existing UAT exercises hindsight recall as a vision contract.
+ *
+ * ## Honest scope caveat
+ *
+ * Hindsight capture is opportunistic (the agent decides when to
+ * remember, not the user). This test uses an EXPLICIT capture prompt
+ * ("please remember exactly this token") which heavily biases the
+ * model toward capturing it. A future scenario should test IMPLICIT
+ * recall (the agent inferring relevance without being asked) — the
+ * harder + more valuable JTBD case — but that's flaky against any
+ * single model, so we start with the explicit-capture baseline as the
+ * floor.
+ */
+import { describe, it, expect, beforeAll } from "vitest";
+import { execSync } from "node:child_process";
+import { randomBytes } from "node:crypto";
+import { spinUp } from "../harness.js";
+// Agent under test; it is restarted between the capture and recall phases.
+const AGENT = "test-harness";
+// Timeout (ms) passed to the synchronous restart command.
+const RESTART_BUDGET_MS = 90_000;
+// Timeout (ms) for the capture-phase reply.
+const CAPTURE_REPLY_BUDGET_MS = 60_000;
+// Hard contract (ms) for the post-restart recall reply.
+const RECALL_REPLY_BUDGET_MS = 120_000;
+// Vision target (ms) for the recall reply; exceeding it only logs a warning.
+const VISION_RECALL_BUDGET_MS = 30_000;
+// Unique per-run token so we know the model isn't echoing a stale
+// answer from a prior cached conversation.
+// 8 random bytes → 16 upper-case hex characters after the fixed prefix.
+const TOKEN = `SWITCHROOM_UAT_MEM_${randomBytes(8).toString("hex").toUpperCase()}`;
+/**
+ * Probes whether `sudo -n true` succeeds within 2 seconds.
+ *
+ * @returns `true` when the probe command exits cleanly; `false` when
+ *   `execSync` throws for any reason (non-zero exit, timeout, spawn error).
+ */
+function canShellSudo(): boolean {
+  let probeSucceeded = true;
+  try {
+    execSync("sudo -n true", { stdio: "ignore", timeout: 2_000 });
+  } catch {
+    // Any failure of the probe is treated as "sudo not usable here".
+    probeSucceeded = false;
+  }
+  return probeSucceeded;
+}
+/**
+ * Restarts the named switchroom agent through `sudo -n`, blocking until
+ * the CLI command returns.
+ *
+ * The invoking shell's PATH and HOME are forwarded to the command via
+ * `env`. Both expansions are double-quoted so that a value containing
+ * whitespace stays a single `env` argument instead of being word-split
+ * by the shell (which would make `env` treat the remainder as the
+ * command to run).
+ *
+ * @param name Agent name. It is interpolated into the shell command
+ *   unescaped, so callers must pass a trusted, shell-safe identifier.
+ * @throws Whatever `execSync` throws — e.g. on a non-zero exit status or
+ *   when the command outlives RESTART_BUDGET_MS.
+ */
+function restartAgent(name: string): void {
+  execSync(
+    `sudo -n env PATH="$PATH" HOME="$HOME" switchroom agent restart ${name} --force`,
+    // stdin is closed; stdout/stderr are piped rather than inherited.
+    { stdio: ["ignore", "pipe", "pipe"], timeout: RESTART_BUDGET_MS },
+  );
+}
+const sudoOk = canShellSudo(); // probed once at module load; selects describe vs describe.skip below
+// UNSKIPPED 2026-05-20 after root-cause + fix.
+//
+// Original failure: the first live run on 2026-05-20 FAILED — after
+// capture → restart → recall, the agent replied "I don't have that
+// token — no SWITCHROOM_UAT_MEM_* value was ever shared with me to
+// remember." Documented at the time as a known vision gap.
+//
+// Root cause: the vendored hindsight-memory plugin's default
+// `retainEveryNTurns: 10` throttled auto-retention to every 10
+// turns. A 2-turn UAT session (capture turn → restart) NEVER reached
+// the threshold, so the Stop hook's retain.py skipped (`turn_count %
+// retain_every_n != 0`) and the token never persisted. The recall
+// query at the new boot found nothing.
+//
+// Fix: switchroom's scaffold (src/agents/scaffold.ts) now applies a
+// post-copy override that sets `retainEveryNTurns: 1` in the
+// per-agent settings.json. Every turn end retains. Vendor file
+// stays untouched. See project_hindsight_memory_gap_root_cause.md.
+//
+// Live re-run after the fix: capture TTFO=22.7s, recall TTFO=14.1s,
+// token round-tripped successfully. The remember-across-sessions
+// JTBD is now met for single-turn explicit-memory prompts.
+//
+// HISTORICAL NOTE (pre-fix; superseded by the root cause above).
+// Hypotheses considered when the test first failed (any/all):
+//   - Hindsight capture is opportunistic — the model decides when to
+//     invoke `hindsight_save`. The "please remember exactly this
+//     token" prompt didn't trigger a save in the model's judgment.
+//   - The post-turn Stop hook (which writes hindsight) may not have
+//     flushed before the marker-safe restart killed the container.
+//   - Recall at the new boot may not query hindsight pre-reply.
+//
+// While that gap was open, this UAT was SKIPPED but kept in-tree as an
+// EXECUTABLE SPECIFICATION of the contract, to be unskipped once the
+// underlying memory pipeline was fixed — passing this test is the gate
+// for `remember-across-sessions` being a satisfied JTBD.
+//
+// It was tracked as: memory-pipeline work in the post-Phase-2b roadmap.
+//
+// (Memory is the moat — see comment block above. Shipping the test
+// as a known-failing skip was judged more honest than not shipping it
+// at all.) The suite now runs whenever passwordless sudo is available.
+(sudoOk ? describe : describe.skip)(
+  // The suite is registered as skipped when the sudo probe failed, since
+  // the restart phase shells out through `sudo -n`.
+  "uat: memory survives across restart (remember-across-sessions JTBD)",
+  () => {
+    it(
+      "agent remembers a unique token after capture → restart → recall",
+      async () => {
+        // --- Phase 1: Capture ---
+        const sc1 = await spinUp({ agent: AGENT });
+        try {
+          // Clock starts before the send, so TTFO includes sendDM itself.
+          const captureStart = Date.now();
+          await sc1.sendDM(
+            `Please remember exactly this token for later: ${TOKEN}. ` +
+            `Confirm in your reply that you've noted it. ` +
+            `(This is a memory-survival UAT — store it via hindsight.)`,
+          );
+          const captureReply = await sc1.expectMessage(/\S/, {
+            from: "bot",
+            timeout: CAPTURE_REPLY_BUDGET_MS,
+          });
+          const captureTtfo = Date.now() - captureStart;
+          // The agent's first reply should acknowledge the token. We
+          // don't require the token to be echoed verbatim (the agent
+          // may say "noted" without repeating), but we DO require a
+          // non-empty reply that doesn't error.
+          expect(captureReply.text.length).toBeGreaterThan(0);
+          // Capture TTFO is logged for diagnostics only; it is not asserted.
+          console.log(
+            `[memory-survives] capture phase: TTFO=${captureTtfo}ms, ` +
+            `reply length=${captureReply.text.length}`,
+          );
+          // Brief settle so any async hindsight write has time to flush
+          // before we kill the container. Hindsight captures are
+          // typically post-turn-end via a Stop hook; turn-complete
+          // signals from the gateway run within ~1-3s after the reply.
+          await new Promise((r) => setTimeout(r, 10_000));
+        } finally {
+          await sc1.tearDown();
+        }
+        // --- Phase 2: Restart ---
+        // Synchronous: blocks until the restart command returns or throws.
+        restartAgent(AGENT);
+        // Settle so the bridge sidecar reattaches and the new claude
+        // session loads hindsight before the recall inbound arrives.
+        await new Promise((r) => setTimeout(r, 8_000));
+        // --- Phase 3: Recall ---
+        // A fresh harness session is used; the first one was torn down above.
+        const sc2 = await spinUp({ agent: AGENT });
+        try {
+          const recallStart = Date.now();
+          // The prompt gives only the fixed prefix, never the random suffix.
+          await sc2.sendDM(
+            `Earlier I asked you to remember a token starting with ` +
+            `SWITCHROOM_UAT_MEM_. What was the full token? ` +
+            `Reply with the token only, no extra text.`,
+          );
+          // Wait 5s past the recall budget so a slightly-late reply reaches
+          // the explicit TTFO check below.
+          const recallReply = await sc2.expectMessage(/\S/, {
+            from: "bot",
+            timeout: RECALL_REPLY_BUDGET_MS + 5_000,
+          });
+          const recallTtfo = Date.now() - recallStart;
+          expect(recallReply.text.length).toBeGreaterThan(0);
+          // HARD CONTRACT — memory survival. If the token doesn't
+          // appear, hindsight either didn't capture it OR the recall
+          // failed to surface it.
+          // The check is an exact, case-sensitive substring match.
+          const tokenInReply = recallReply.text.includes(TOKEN);
+          if (!tokenInReply) {
+            throw new Error(
+              `[memory-survives] CONTRACT FAILED: token ${TOKEN} not ` +
+              `present in recall reply. Either hindsight capture missed ` +
+              `the original message (likely if the post-turn-end Stop ` +
+              `hook didn't run before restart) OR the recall query ` +
+              `didn't find the entry. Reply was: ` +
+              `${JSON.stringify(recallReply.text.slice(0, 400))}`,
+            );
+          }
+          // Cannot fail here: the throw above already covers the false case.
+          expect(tokenInReply).toBe(true);
+          // Timing contract — recall on a cold-restarted agent should
+          // still feel "always-on". Same bound as the post-restart
+          // first-message UAT.
+          if (recallTtfo >= RECALL_REPLY_BUDGET_MS) {
+            throw new Error(
+              `[memory-survives] recall TTFO=${recallTtfo}ms exceeds ` +
+              `${RECALL_REPLY_BUDGET_MS}ms — matches the wedge symptom`,
+            );
+          }
+          expect(recallTtfo).toBeLessThan(RECALL_REPLY_BUDGET_MS);
+          // Yellow band: slower than the vision target but within contract
+          // passes with a warning instead of the success log.
+          if (recallTtfo >= VISION_RECALL_BUDGET_MS) {
+            console.warn(
+              `[memory-survives] recall TTFO=${recallTtfo}ms — passed ` +
+              `contract (${RECALL_REPLY_BUDGET_MS}ms) but slower than ` +
+              `vision target (${VISION_RECALL_BUDGET_MS}ms). Hindsight ` +
+              `query latency canary.`,
+            );
+          } else {
+            console.log(
+              `[memory-survives] recall TTFO=${recallTtfo}ms — ` +
+              `within vision target. Token round-tripped successfully.`,
+            );
+          }
+        } finally {
+          await sc2.tearDown();
+        }
+      },
+      // Outer budget: capture + 10s settle + restart + 8s settle + recall.
+      // The trailing 10s is the only slack for both spinUp/tearDown pairs
+      // and the extra 5s on the recall wait.
+      CAPTURE_REPLY_BUDGET_MS + 10_000 + RESTART_BUDGET_MS + 8_000 + RECALL_REPLY_BUDGET_MS + 10_000,
+    );
+  },
+);

package/telegram-plugin/uat/scenarios/jtbd-wake-audit-content-dm.test.ts ADDED Viewed

@@ -0,0 +1,145 @@
+/**
+ * JTBD scenario — wake-audit content visibility post-restart.
+ *
+ * Serves: `reference/restart-and-know-what-im-running.md` — the JTBD:
+ *
+ *   *Outcome:* After any restart, the user is told what config is live.
+ *   Model, tools, skills, memory backend, auth state. **No need to ask.**
+ *
+ *   *Stakes:* If the user has to probe to find out what they're talking
+ *   to, they don't know what they're talking to. Agents drift silently,
+ *   bad configs ship unnoticed, and trust leaks away a turn at a time.
+ *
+ * The existing `jtbd-always-on-after-restart-dm.test.ts` UAT validates
+ * that the agent REPLIES post-restart. This one validates that the
+ * agent's content reflects awareness of its own config — i.e. that
+ * the wake-audit / boot card is doing its job.
+ *
+ * ## Soft contract (this version)
+ *
+ * The strictest contract — the JTBD's "no need to ask" — would require
+ * observing a proactive wake-audit message immediately after restart
+ * without any user prompt. That requires harness support for observing
+ * `editMessageText` events (the boot card is an edit of a pinned
+ * message, not a fresh send), which `mtcute` doesn't currently
+ * surface in the same way as `sendMessage`.
+ *
+ * This UAT relaxes to: after restart, the user asks "what are you
+ * running?" — the agent's reply must contain identifiable config
+ * signals (model name OR "claude" OR an MCP server name OR "skill"
+ * OR "memory" OR "switchroom"). A fully amnesiac agent that says
+ * "I'm an AI assistant" would fail this.
+ *
+ * A FUTURE strict UAT should observe the boot card edit directly
+ * — that's the true vision contract. This is the floor.
+ */
+import { describe, it, expect } from "vitest";
+import { execSync } from "node:child_process";
+import { spinUp } from "../harness.js";
+// Agent under test; it is restarted and then messaged through the harness.
+const AGENT = "test-harness";
+// Timeout (ms) passed to the synchronous restart command.
+const RESTART_BUDGET_MS = 90_000;
+// Timeout (ms) to wait for the agent's reply to the config question.
+const REPLY_BUDGET_MS = 60_000;
+// Config signals — at least ONE must appear in the agent's reply to
+// the "what are you running" question. These cover:
+//   - model identity (`claude`, `sonnet`, `opus`, `haiku`, `model`)
+//   - tooling layer (`switchroom`, `mcp`, `tool`)
+//   - capability surface (`skill`, `memory`, `hindsight`)
+//   - operational state (`agent`, `running`, `version`)
+// Matching is case-insensitive and whole-word (`\b…\b`). Because of the
+// trailing `\b`, a bare `skill` would NOT match "skills", so the
+// countable nouns carry an optional plural `s`; singular and plural
+// forms are therefore matched as distinct strings. `tool` is included
+// so the pattern agrees with the list above.
+const CONFIG_SIGNAL_REGEX =
+  /\b(claude|sonnet|opus|haiku|switchroom|mcp|hindsight|tools?|skills?|memory|agents?|models?|running|version)\b/i;
+/**
+ * Checks whether `sudo -n true` can be run successfully from this
+ * process, allowing the probe at most 2 seconds.
+ *
+ * @returns `false` if `execSync` throws for any reason (non-zero exit,
+ *   timeout, spawn failure); `true` otherwise.
+ */
+function canShellSudo(): boolean {
+  let sudoUsable = true;
+  try {
+    execSync("sudo -n true", { stdio: "ignore", timeout: 2_000 });
+  } catch {
+    // The cause is deliberately not inspected: any failure means "skip".
+    sudoUsable = false;
+  }
+  return sudoUsable;
+}
+/**
+ * Restarts the named switchroom agent through `sudo -n`, blocking until
+ * the CLI command returns.
+ *
+ * The invoking shell's PATH and HOME are forwarded to the command via
+ * `env`. Both expansions are double-quoted so that a value containing
+ * whitespace stays a single `env` argument instead of being word-split
+ * by the shell (which would make `env` treat the remainder as the
+ * command to run).
+ *
+ * @param name Agent name. It is interpolated into the shell command
+ *   unescaped, so callers must pass a trusted, shell-safe identifier.
+ * @throws Whatever `execSync` throws — e.g. on a non-zero exit status or
+ *   when the command outlives RESTART_BUDGET_MS.
+ */
+function restartAgent(name: string): void {
+  execSync(
+    `sudo -n env PATH="$PATH" HOME="$HOME" switchroom agent restart ${name} --force`,
+    // stdin is closed; stdout/stderr are piped rather than inherited.
+    { stdio: ["ignore", "pipe", "pipe"], timeout: RESTART_BUDGET_MS },
+  );
+}
+// Probed once at module load; selects describe vs describe.skip below,
+// since the scenario shells out through `sudo -n` to restart the agent.
+const sudoOk = canShellSudo();
+(sudoOk ? describe : describe.skip)(
+  "uat: wake-audit content post-restart (restart-and-know JTBD)",
+  () => {
+    it(
+      "agent describes its own config when asked post-restart",
+      async () => {
+        // Synchronous: blocks until the restart command returns or throws.
+        restartAgent(AGENT);
+        // Settle for bridge re-attach.
+        await new Promise((r) => setTimeout(r, 8_000));
+        const sc = await spinUp({ agent: AGENT });
+        try {
+          await sc.sendDM(
+            "Briefly: what model are you running, and what tools/skills do " +
+            "you have available? One short paragraph is fine.",
+          );
+          const reply = await sc.expectMessage(/\S/, {
+            from: "bot",
+            timeout: REPLY_BUDGET_MS,
+          });
+          expect(reply.text.length).toBeGreaterThan(0);
+          // The reply must include AT LEAST ONE config signal. A
+          // generic "I'm an AI assistant ready to help" without any
+          // model/tool reference would fail — that's the failure mode
+          // we want to catch.
+          // exec() on this non-global regex yields the first match or null.
+          const matchedSignal = CONFIG_SIGNAL_REGEX.exec(reply.text);
+          if (matchedSignal == null) {
+            throw new Error(
+              `[wake-audit-content] CONTRACT FAILED: agent reply to ` +
+              `"what are you running?" contained NO config signals ` +
+              `(model, tools, skills, mcp, memory, etc.). The ` +
+              `\`restart-and-know-what-im-running\` JTBD requires the ` +
+              `user to know what's live without probing — at minimum ` +
+              `the agent should respond to a direct question. Reply: ` +
+              `${JSON.stringify(reply.text.slice(0, 400))}`,
+            );
+          }
+          // Cannot fail here: the throw above already covers the null case.
+          expect(matchedSignal).not.toBeNull();
+          console.log(
+            `[wake-audit-content] config signal "${matchedSignal[0]}" ` +
+            `present in reply. Length=${reply.text.length}, snippet: ` +
+            `${JSON.stringify(reply.text.slice(0, 120))}`,
+          );
+          // Optional: count how many distinct signals appeared. A
+          // wake-audit-rich reply mentions several (model + skills +
+          // mcp). A bare-minimum compliant reply mentions one.
+          // The pattern is rebuilt with the global flag so match() returns
+          // every occurrence rather than only the first.
+          const allSignals =
+            reply.text.match(new RegExp(CONFIG_SIGNAL_REGEX.source, "gi")) ?? [];
+          // Lower-cased so differently-cased repeats count once.
+          const uniqueSignals = new Set(allSignals.map((s) => s.toLowerCase()));
+          // Richness is informational only: fewer than two distinct signals
+          // logs a warning but does not fail the test.
+          if (uniqueSignals.size < 2) {
+            console.warn(
+              `[wake-audit-content] only ${uniqueSignals.size} distinct ` +
+              `config signal(s) present — reply meets the floor but ` +
+              `is not config-rich. Vision target: model + tools + ` +
+              `skills + memory all visible in the wake-audit.`,
+            );
+          } else {
+            console.log(
+              `[wake-audit-content] config-rich reply: ${uniqueSignals.size} ` +
+              `distinct signals: ${Array.from(uniqueSignals).slice(0, 6).join(", ")}`,
+            );
+          }
+        } finally {
+          // Always release the harness session, pass or fail.
+          await sc.tearDown();
+        }
+      },
+      // Per-test timeout: restart + 8s settle + reply budget + 10s slack.
+      RESTART_BUDGET_MS + 8_000 + REPLY_BUDGET_MS + 10_000,
+    );
+  },
+);