npm - switchroom - Versions diffs - 0.13.3 → 0.13.5 - Mend

switchroom 0.13.3 → 0.13.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/README.md +36 -45
package/dist/agent-scheduler/index.js +80 -80
package/dist/auth-broker/index.js +80 -80
package/dist/cli/drive-write-pretool.mjs +10 -10
package/dist/cli/skill-validate-pretool.mjs +72 -72
package/dist/cli/switchroom.js +485 -566
package/dist/host-control/main.js +99 -99
package/dist/vault/approvals/kernel-server.js +82 -82
package/dist/vault/broker/server.js +83 -83
package/package.json +1 -1
package/profiles/_base/start.sh.hbs +8 -8
package/profiles/_shared/telegram-style.md.hbs +1 -1
package/profiles/_shared/vault-protocol.md.hbs +12 -0
package/profiles/default/CLAUDE.md +192 -0
package/profiles/default/CLAUDE.md.hbs +1 -1
package/telegram-plugin/dist/bridge/bridge.js +112 -112
package/telegram-plugin/dist/gateway/gateway.js +210 -192
package/telegram-plugin/dist/server.js +160 -160
package/telegram-plugin/runtime-metrics.ts +14 -8
package/telegram-plugin/silence-poke.ts +49 -1
package/telegram-plugin/tests/silence-poke.test.ts +135 -3
package/telegram-plugin/uat/scenarios/bridge-flap-resilience-dm.test.ts +166 -0
package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts +217 -0
package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts +16 -11

package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts ADDED Viewed

@@ -0,0 +1,217 @@
+/**
+ * JTBD scenario — guaranteed fast acknowledgement (human-feel UX epic).
+ *
+ * Serves: `reference/conversational-pacing.md` and the JTBD
+ * "talking to my agent feels like talking to a capable person".
+ *
+ * A person you message answers in a beat — "got it", "on it, checking
+ * now" — before the work is done. PR #1633 made that opening
+ * acknowledgement a *guarantee*, split across two layers:
+ *
+ *   - the conversational-pacing prompt teaches the model to open with
+ *     a short human one-liner unless the real answer lands in a second
+ *     or two;
+ *   - the silence-poke subsystem *enforces* it — a ~10s ack-budget
+ *     poke fires when nothing at all has been sent this turn, nudging
+ *     the model to acknowledge before it does more work.
+ *
+ * This UAT drives a FUZZY set of non-trivial prompt shapes — research,
+ * multi-step compute, open-ended advice, code, reflective asks. Every
+ * one needs real work, so a turn that goes silent for tens of seconds
+ * is a black box. The invariant under test: the user sees a sign of
+ * life FAST, every time, across every prompt shape.
+ *
+ * ## Targets
+ *
+ *   - **Hard contract:** the first outbound lands within `ACK_HARD_MS`
+ *     for every prompt. This is a tight *latency target*, not a
+ *     framework guarantee. The silence-poke ack rung is a *nudge*
+ *     piggybacked on the model's next tool result (`consumeArmedPoke`
+ *     drained at the gateway tool-result chokepoint) — not a
+ *     framework-composed send. It helps the model along, but a
+ *     pure-reasoning prompt that issues no tool call never drains the
+ *     nudge, so the bound ultimately depends on model latency. It
+ *     still has teeth: pre-#1633 a slow prompt's first outbound was
+ *     the full answer, often 30-60s out, so 20s cleanly separates the
+ *     fixed behaviour from a regression. A failure here means the
+ *     agent left the user on a silent chat — a real pacing defect.
+ *   - **Vision target (soft, per-case forensic):** the first outbound
+ *     lands within `ACK_VISION_MS` and is short — a genuine
+ *     acknowledgement, not a full-answer dump. The model self-acking
+ *     quickly is what makes it *feel* human. Logged, not failed: real
+ *     model runs vary, and the prompt explicitly lets a turn skip the
+ *     ack when the answer itself arrives in the first couple of seconds.
+ *
+ * ## Relationship to adjacent UATs
+ *
+ *   - `jtbd-fast-trivial-dm.test.ts` — TRIVIAL prompts: the answer
+ *     itself should land fast, no ack ceremony. This file is the
+ *     non-trivial inverse: real work, but a fast *acknowledgement*.
+ *   - `jtbd-soft-commit-dm.test.ts` — the predecessor: a single slow
+ *     prompt, a looser "first reply within 30s" floor. This file is
+ *     the stronger, fuzzed successor of that contract.
+ *
+ * Each case is a single inbound; cases run sequentially. As with the
+ * other fuzz files, a prior turn may still be finishing in the
+ * background when the next case starts — an accepted, noted risk.
+ */
+import { describe, it, expect } from "vitest";
+import { spinUp } from "../harness.js";
+const AGENT = "test-harness"; // agent name handed to spinUp() for every case
+// Hard contract: a sign of life within this budget, every prompt.
+// A tight latency target — well above a healthy self-ack (~3-8s on a
+// warm agent) and well below the pre-#1633 silent-then-dump regression
+// (30-60s). Model-dependent, not a framework guarantee (see header
+// doc), so it carries generous headroom for mtcute polling jitter and
+// for a model that leans on the ack-poke nudge instead of self-acking.
+const ACK_HARD_MS = 20_000; // milliseconds; a TTFO at or above this fails the case
+// Vision target: the model self-acknowledges in a beat, fast enough
+// that the ack-poke nudge never has to come into it.
+const ACK_VISION_MS = 8_000; // milliseconds; only selects which log line is emitted, never fails
+// A first outbound at or under this length reads as an acknowledgement
+// one-liner rather than a full-answer dump. Mirrors the >200-char
+// "long answer" heuristic in jtbd-soft-commit-dm, with headroom for a
+// persona-voiced ack ("on it — pulling the os-release and hostname now").
+const ACK_LEN_CEILING = 320; // compared (<=) against the trimmed first-outbound text length
+interface AckCase { // one fuzz case: a label plus the prompt sent for it
+  name: string; // short label; interpolated into the test title and the log/error lines
+  /** A prompt that genuinely needs more than a second or two of work,
+   *  so an instant full answer is not a legitimate ack-skip. */
+  prompt: string; // passed unchanged to sc.sendDM() as the case's single inbound
+}
+const ACK_CASES: readonly AckCase[] = [ // each entry becomes one it() case, registered in array order
+  // ─── Research / multi-source read ─────────────────────────────
+  {
+    name: "machine-summary research",
+    prompt:
+      "Read /etc/os-release and /etc/hostname, then tell me in one "
+      + "sentence what kind of machine this is.",
+  },
+  // ─── Multi-step compute ───────────────────────────────────────
+  {
+    name: "compound date math",
+    prompt:
+      "Work out what day of the week it is today, then tell me how "
+      + "many days are left until the end of this month.",
+  },
+  // ─── Open-ended advice ("take your time") ─────────────────────
+  {
+    name: "open-ended prioritisation",
+    prompt:
+      "I've got a free afternoon and three half-finished side "
+      + "projects. Help me decide what to focus on. Take your time.",
+  },
+  // ─── Summarise / explain ──────────────────────────────────────
+  {
+    name: "plain-language summary",
+    prompt:
+      "Give me a 3-bullet summary of what a Linux container actually "
+      + "is, in plain language.",
+  },
+  // ─── Code task ────────────────────────────────────────────────
+  {
+    name: "bash one-liner with explanation",
+    prompt:
+      "Write me a small bash one-liner that counts the total number "
+      + "of lines across all .ts files under the current directory, "
+      + "and explain how it works.",
+  },
+  // ─── Reflective / vague-but-real ──────────────────────────────
+  {
+    name: "reflective open ask",
+    prompt:
+      "Something feels off with how I'm spending my mornings lately. "
+      + "Help me think through it.",
+  },
+  // ─── Comparison / judgement ───────────────────────────────────
+  {
+    name: "tech comparison",
+    prompt:
+      "Compare REST and GraphQL for a small side project — which "
+      + "would you pick and why?",
+  },
+  // ─── Investigate the box ──────────────────────────────────────
+  {
+    name: "disk-usage investigation",
+    prompt:
+      "Have a look at what's taking up the most space under /var/log "
+      + "and summarise what you find.",
+  },
+]; // every entry uses the same assertions; only the prompt text varies
+describe("uat: guaranteed fast acknowledgement — fuzzy prompt shapes", () => {
+  for (const tc of ACK_CASES) { // registers one test per prompt shape
+    it(
+      `[ack] ${tc.name} — sign of life within ${ACK_HARD_MS / 1000}s`,
+      async () => {
+        const sc = await spinUp({ agent: AGENT }); // fresh harness handle per case; released in the finally below
+        try {
+          const sendStart = Date.now(); // TTFO clock starts before the send, so it includes the sendDM await
+          await sc.sendDM(tc.prompt);
+          const firstOutbound = await sc.expectMessage(/\S/, {
+            from: "bot",
+            timeout: ACK_HARD_MS + 6_000, // waits 6s past the hard budget — presumably so a slightly-late reply reaches the TTFO check below; confirm expectMessage's timeout behaviour
+          });
+          const ttfo = Date.now() - sendStart; // wall-clock ms from just before the send until expectMessage resolved
+          const len = firstOutbound.text.trim().length;
+          // Invariant: the outbound is a real, non-empty message.
+          expect(len).toBeGreaterThan(0); // NOTE(review): likely already implied by the /\S/ pattern above — confirm expectMessage matches on .text
+          // Hard contract: a sign of life FAST. A latency target, not
+          // a framework guarantee (see header doc) — but a failure
+          // here is a real pacing defect, so it fails the build.
+          if (ttfo >= ACK_HARD_MS) {
+            throw new Error(
+              `[ack] ${tc.name}: TTFO=${ttfo}ms exceeds the hard `
+              + `contract ${ACK_HARD_MS}ms — the user sat on a silent `
+              + `chat. The fast-ack path (pacing prompt + ack-poke `
+              + `nudge) is not delivering. First outbound: `
+              + `${JSON.stringify(firstOutbound.text.slice(0, 200))}`,
+            );
+          }
+          expect(ttfo).toBeLessThan(ACK_HARD_MS); // cannot fail here: the throw above already covers ttfo >= ACK_HARD_MS
+          // Forensic, soft: did the model self-acknowledge in a beat,
+          // or did it only get there with the ack-poke nudge?
+          const looksLikeAck = len <= ACK_LEN_CEILING;
+          if (ttfo < ACK_VISION_MS && looksLikeAck) {
+            console.log(
+              `[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
+              + `short acknowledgement. Feels human.`,
+            );
+          } else if (ttfo < ACK_VISION_MS && !looksLikeAck) {
+            // Fast but long: the answer itself arrived quickly. The
+            // pacing prompt explicitly sanctions skipping the ack when
+            // the answer lands in the first couple of seconds.
+            console.log(
+              `[ack] ${tc.name}: TTFO=${ttfo}ms, ${len} chars — fast `
+              + `full answer (legitimate ack-skip).`,
+            );
+          } else {
+            // Passed the hard contract but slower than the vision
+            // target — the canary for the model needing the ack-poke
+            // nudge instead of acknowledging promptly on its own.
+            console.warn(
+              `[ack] ${tc.name}: TTFO=${ttfo}ms (vision target `
+              + `<${ACK_VISION_MS}ms), ${len} chars`
+              + `${looksLikeAck ? "" : " — and long, not an ack one-liner"}`
+              + `. The model did not acknowledge promptly on its own.`,
+            );
+          }
+        } finally {
+          await sc.tearDown(); // runs on pass, assertion failure, or thrown error alike
+        }
+      },
+      ACK_HARD_MS + 45_000, // per-test timeout in ms (third argument to it()); covers spinUp, the wait, and tearDown
+    );
+  }
+});

package/telegram-plugin/uat/scenarios/jtbd-soft-commit-dm.test.ts CHANGED Viewed

@@ -1,16 +1,21 @@
 /**
- * JTBD scenario — soft-commit for slow turns.
+ * JTBD scenario — first sign of life on a slow turn.
  *
- * The new conversational-pacing prompt (#1122) instructs the agent
- * to send a one-liner "let me check, back in a few" before slow
- * work. This UAT exercises that behaviour: send a prompt that
- * obviously needs >15s, expect the FIRST outbound to be a short
- * soft-commit message, with the final answer landing later.
+ * The conversational-pacing prompt instructs the agent to open with
+ * an acknowledgement before slow work. (The original ">15s soft
+ * commit" bullet this file was named for was superseded by the
+ * guaranteed "Open with an acknowledgement" bullet in PR #1633 —
+ * acknowledge every turn unless the answer lands in a second or two.)
  *
- * Not strict — the agent's allowed to skip the soft-commit if it
- * judges the work is fast enough. The assertion is "the user does
- * NOT see a long silent gap before the first sign of life": either
- * a soft-commit OR the actual reply lands within 20s.
+ * This UAT exercises a single slow prompt and asserts the loose
+ * floor: the user does NOT see a long silent gap before the first
+ * sign of life — a reply lands within 30s.
+ *
+ * The stronger, fuzzed successor of this contract is
+ * `jtbd-fast-ack-dm.test.ts` — varied prompt shapes, a tight 20s
+ * hard latency target (model-dependent, not a framework guarantee —
+ * see that file's header). This file is retained as a minimal
+ * single-prompt floor.
  */
 import { describe, it, expect } from "vitest";
@@ -26,7 +31,7 @@ const SLOW_PROMPT = (
 describe("uat: soft-commit pacing", () => {
   it(
-    "user asks slow question → first reply lands within 20s",
+    "user asks slow question → first reply lands within 30s",
     async () => {
       const sc = await spinUp({ agent: "test-harness" });
       try {