npm - switchroom - Versions diffs - 0.15.44 → 0.16.4 - Mend

switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (150) hide show

package/dist/agent-scheduler/index.js +122 -88
package/dist/auth-broker/index.js +463 -177
package/dist/cli/autoaccept-poll.js +4842 -35
package/dist/cli/drive-write-pretool.mjs +17 -14
package/dist/cli/notion-write-pretool.mjs +117 -86
package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
package/dist/cli/self-improve-stop.mjs +428 -0
package/dist/cli/skill-validate-pretool.mjs +72 -72
package/dist/cli/switchroom.js +3249 -1241
package/dist/cli/ui/index.html +1 -1
package/dist/host-control/main.js +2833 -355
package/dist/vault/approvals/kernel-server.js +7482 -7439
package/dist/vault/broker/server.js +11315 -11272
package/examples/minimal.yaml +1 -0
package/examples/switchroom.yaml +1 -0
package/package.json +3 -3
package/profiles/_base/start.sh.hbs +88 -1
package/profiles/_shared/execution-discipline.md.hbs +18 -0
package/profiles/default/CLAUDE.md.hbs +3 -22
package/telegram-plugin/.claude-plugin/plugin.json +2 -2
package/telegram-plugin/answer-stream-flag.ts +12 -49
package/telegram-plugin/answer-stream.ts +5 -150
package/telegram-plugin/auth-snapshot-format.ts +280 -48
package/telegram-plugin/auto-fallback-fleet.ts +44 -1
package/telegram-plugin/context-exhaustion.ts +12 -0
package/telegram-plugin/demo-mask.ts +154 -0
package/telegram-plugin/dist/bridge/bridge.js +167 -124
package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
package/telegram-plugin/dist/server.js +215 -172
package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
package/telegram-plugin/draft-stream.ts +47 -410
package/telegram-plugin/final-answer-detect.ts +17 -12
package/telegram-plugin/fleet-fallback-resume.ts +131 -0
package/telegram-plugin/format.ts +56 -19
package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
package/telegram-plugin/gateway/auth-command.ts +70 -14
package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
package/telegram-plugin/gateway/current-turn-map.ts +188 -0
package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
package/telegram-plugin/gateway/effort-command.ts +8 -3
package/telegram-plugin/gateway/emission-authority.ts +369 -0
package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
package/telegram-plugin/gateway/gateway.ts +1837 -291
package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
package/telegram-plugin/gateway/represent-guard.ts +72 -0
package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
package/telegram-plugin/gateway/status-surface-log.ts +14 -3
package/telegram-plugin/history.ts +33 -11
package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
package/telegram-plugin/issues-card.ts +4 -0
package/telegram-plugin/model-unavailable.ts +124 -0
package/telegram-plugin/narrative-dedup.ts +69 -0
package/telegram-plugin/over-ping-safety-net.ts +70 -4
package/telegram-plugin/package.json +3 -3
package/telegram-plugin/pending-work-progress.ts +12 -0
package/telegram-plugin/permission-rule.ts +32 -5
package/telegram-plugin/permission-title.ts +152 -9
package/telegram-plugin/quota-check.ts +13 -0
package/telegram-plugin/quota-watch.ts +135 -7
package/telegram-plugin/registry/turns-schema.test.ts +24 -0
package/telegram-plugin/registry/turns-schema.ts +9 -0
package/telegram-plugin/runtime-metrics.ts +13 -0
package/telegram-plugin/session-tail.ts +96 -11
package/telegram-plugin/silence-poke.ts +170 -24
package/telegram-plugin/slot-banner-driver.ts +3 -0
package/telegram-plugin/status-no-truncate.ts +44 -0
package/telegram-plugin/status-reactions.ts +20 -3
package/telegram-plugin/stream-controller.ts +4 -23
package/telegram-plugin/stream-reply-handler.ts +6 -24
package/telegram-plugin/streaming-metrics.ts +91 -0
package/telegram-plugin/subagent-watcher.ts +212 -66
package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
package/telegram-plugin/tests/answer-stream.test.ts +2 -411
package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
package/telegram-plugin/tests/demo-mask.test.ts +127 -0
package/telegram-plugin/tests/draft-stream.test.ts +0 -827
package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
package/telegram-plugin/tests/feed-survival.test.ts +526 -0
package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
package/telegram-plugin/tests/history.test.ts +60 -0
package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
package/telegram-plugin/tests/permission-rule.test.ts +17 -0
package/telegram-plugin/tests/permission-title.test.ts +206 -17
package/telegram-plugin/tests/quota-watch.test.ts +252 -9
package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
package/telegram-plugin/tests/represent-guard.test.ts +162 -0
package/telegram-plugin/tests/session-tail.test.ts +147 -3
package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
package/telegram-plugin/tests/telegram-format.test.ts +101 -6
package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
package/telegram-plugin/tests/tool-labels.test.ts +67 -0
package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
package/telegram-plugin/tests/welcome-text.test.ts +32 -3
package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
package/telegram-plugin/tool-activity-summary.ts +375 -58
package/telegram-plugin/turn-liveness-floor.ts +240 -0
package/telegram-plugin/uat/assertions.ts +115 -0
package/telegram-plugin/uat/driver.ts +68 -0
package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
package/telegram-plugin/welcome-text.ts +13 -1
package/telegram-plugin/worker-activity-feed.ts +157 -82
package/telegram-plugin/draft-transport.ts +0 -122
package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
package/telegram-plugin/tests/draft-transport.test.ts +0 -211

package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts ADDED Viewed

@@ -0,0 +1,396 @@
+/**
+ * Foreground activity-feed visibility across the silence-fallback threshold.
+ *
+ * ## Cause class
+ *
+ * A foreground turn that does REAL sequential work (several tool calls, each
+ * followed by a model-thinking gap) can trip the silence-fallback timer
+ * (`SILENCE_FALLBACK_MS`, default 300 000 ms) even while it is visibly
+ * progressing. The silence clock is reset by:
+ *
+ *   - A fresh `reply` or `stream_reply` first-emit (any real user-visible send).
+ *   - `SILENCE_LIVENESS_PRODUCTION` ON (the default): a new tool-activity label
+ *     appearing on the feed, or an answer-stream draft update.
+ *
+ * Crucially, every render resets the clock to ZERO — so the failure is NOT
+ * cumulative across many short gaps. The feed only darkens on a SINGLE
+ * continuous no-render window longer than the threshold. Heartbeat edits keep
+ * the message visually advancing but do NOT count as liveness, so they do not
+ * reset the clock; only a real tool-label render or an answer-stream draft does.
+ *
+ * This file is the DETERMINISTIC guard half of the pair. The fleet runs
+ * `SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS=1` (set in defaults.env — it is the
+ * fleet default, despite a stale gateway.ts comment that still says "OFF, canary
+ * on marko"). With the defer ON, a single long IN-FLIGHT tool does NOT trip the
+ * base fallback: the defer holds it back while the tool runs (up to the hard
+ * ceiling, default 15 min) and the feed heartbeat keeps editing the live
+ * message. So the CORRECT behaviour for a long in-flight turn is: the feed stays
+ * lit. This guard pins exactly that. A regression that breaks the defer, stops
+ * the heartbeat, or nulls `currentTurn` for an in-flight turn darkens the feed,
+ * and this test catches it.
+ *
+ * The workload is one ~33 s no-output command (`timeout 33 tail -f /dev/null`;
+ * a bare `sleep` is blocked by the Claude Code harness) — well under the
+ * default 15 min hard ceiling — so under prod config the feed must stay live
+ * across the shrunk 20 s base fallback. A prompt of several FAST steps would not
+ * exercise the silence window at all (each tool start re-renders and resets the
+ * clock); one long stretch is what holds the clock open.
+ *
+ * This guard does NOT reproduce #680's exact trigger — silent model thinking
+ * BETWEEN tools, with no tool in-flight. The defer does not cover that vector
+ * and it cannot be forced deterministically; it lives in the sibling best-effort
+ * scenario `jtbd-foreground-feed-thinkgap-dm.test.ts`. Together the pair covers
+ * the feed-visibility invariant deterministically (here) and the true #680
+ * vector best-effort (there).
+ *
+ * This scenario shrinks the base fallback to 20 s via
+ * `SWITCHROOM_SILENCE_FALLBACK_MS=20000` on the test-harness agent so the
+ * window is exercised within a test budget instead of 5 minutes.
+ *
+ * ## Required env precondition (operator must set on test-harness agent)
+ *
+ *   SWITCHROOM_SILENCE_FALLBACK_MS=20000
+ *
+ * Set this under the `test-harness` agent's `env:` block in
+ * `~/.switchroom/switchroom.yaml`, then restart the agent
+ * (`switchroom agent restart test-harness --wait --force`) before running
+ * this scenario. The scenario detects whether the threshold is plausibly
+ * shrunk by reading `SWITCHROOM_UAT_SILENCE_FALLBACK_MS` from the test
+ * env (a parallel knob populated by the UAT `.env` file) and skips with a
+ * message if it is unset, unparseable, or greater than 30 000.
+ *
+ * Without that guard, a run against the default threshold would still execute,
+ * but the timing assertions would be vacuous (the default 300 s threshold far
+ * exceeds the test budget) and the test would exit before the scenario had a
+ * chance to catch a regression. The skip keeps the failure signal honest.
+ *
+ * ## What it asserts (the gap no existing scenario covers)
+ *
+ * 1. **Feed opened.** An activity-feed message (`→`/`✓` lines) appears in the
+ *    DM at some point during the turn — the agent started reporting progress.
+ *
+ * 2. **Feed survived the fallback window.** After the silence-fallback interval
+ *    has elapsed from the point the feed was first observed, the feed message
+ *    is still present and carries at least one more edit that arrived AFTER
+ *    the threshold mark. If `currentTurn` was nulled mid-turn, the gateway
+ *    stops sending activity-feed edits and the message goes stale or disappears
+ *    — this assertion catches that.
+ *
+ * 3. **Final answer arrives.** A substantive reply (≥ 150 chars) eventually
+ *    lands, confirming the turn completed rather than being wedged.
+ *
+ * ## Failure shapes
+ *
+ *   (a) Feed never opened — the activity feed did not paint at all. Either the
+ *       agent never used tools or the very first drainActivitySummary call
+ *       failed. Distinct from the regression; both are failures.
+ *
+ *   (b) Feed went dark — the feed message was present before the fallback mark
+ *       but received no fresh edit after the mark. This IS the regression this
+ *       test exists to catch: `currentTurn` was nulled mid-turn, silencing the
+ *       live feed while the agent was still working.
+ *
+ *   (c) No final answer — the turn never produced a substantive reply. Possibly
+ *       the fallback also dropped the answer path (compound regression), or the
+ *       prompt was too slow for the overall test budget.
+ *
+ * ## Tolerances
+ *
+ * The feed edit observation is polled via `driver.getMessage` (the same
+ * technique used by `jtbd-worker-activity-feed-dm`). Because mtcute's live
+ * `observeMessages` may miss edits that arrive before the observer is attached,
+ * we cross-check by comparing the snapshot taken when the feed first opens
+ * (before the `FALLBACK_WINDOW_MS` wait) against a fresh fetch just after the
+ * wait. A changed body confirms a live edit occurred during the window; an
+ * UNCHANGED body with no subsequent new edit in the live stream is the
+ * regression signal.
+ *
+ * The prompt is engineered so the model makes ONE long in-flight tool call
+ * (`timeout 33 tail -f /dev/null`, ~33 s), giving a single continuous
+ * no-render stretch that straddles the shrunk 20 s fallback. See the
+ * `SEQUENTIAL_WORK_PROMPT` doc comment for why several fast steps would not work.
+ */
+import { describe, expect, it } from "vitest";
+import { spinUp } from "../harness.js";
+import { isActivityFeedMessage } from "../assertions.js";
+import type { ObservedMessage } from "../driver.js";
+/**
+ * The shrunk fallback threshold the operator must set on the test-harness
+ * agent. The test reads a parallel UAT-env knob so we can detect whether
+ * the precondition is satisfied without reaching into the agent's process env.
+ *
+ * Set `SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000` in the repo-root `.env`
+ * alongside `SWITCHROOM_SILENCE_FALLBACK_MS=20000` in the agent's env block.
+ *
+ * When the variable is unset (or not numeric) the empty-string fallback makes
+ * `parseInt` return `NaN`, which the scenario's precondition guard treats as
+ * "not met".
+ */
+const PRECONDITION_FALLBACK_MS = Number.parseInt(
+  process.env.SWITCHROOM_UAT_SILENCE_FALLBACK_MS ?? "",
+  10,
+);
+/**
+ * How long to wait between the "before-mark" snapshot (taken as soon as the
+ * feed first appears) and the "after-mark" fetch of the same message. We want
+ * the fallback timer to have clearly elapsed by the time we take the
+ * "after-mark" snapshot. Chosen to be safely above the shrunk 20 s fallback
+ * while staying comfortably within the test budget.
+ */
+const FALLBACK_WINDOW_MS = 25_000;
+/**
+ * How long to poll after the fallback window for a fresh feed edit. Short
+ * enough not to waste budget but long enough for one more drainActivitySummary
+ * cycle to land (the feed heartbeat fires roughly every 5–8 s).
+ */
+const POST_MARK_EDIT_WAIT_MS = 15_000;
+/**
+ * A substantive answer is at least this many characters. Avoids latching
+ * onto a brief "on it" ack or a stub.
+ */
+const MIN_ANSWER_CHARS = 150;
+/**
+ * Overall test budget (passed to vitest as the per-test timeout). Includes:
+ *   - spinUp settle: ~8 s
+ *   - turn onset (first tool + first feed paint): ~20 s
+ *   - FALLBACK_WINDOW_MS: 25 s
+ *   - POST_MARK_EDIT_WAIT_MS: 15 s
+ *   - final-answer wait: ~30 s
+ *   - headroom: ~20 s
+ *
+ * NOTE(review): the per-phase deadlines inside the test body are more generous
+ * than the estimates above (90 s to see the feed open, 60 s for the final
+ * answer). Their worst-case sum exceeds this budget, so on a slow run the
+ * vitest timeout can fire before a phase's own deadline and its diagnostic
+ * message.
+ */
+const OVERALL_BUDGET_MS = 150_000;
+/**
+ * Workload prompt: one ~33 s no-output command, `timeout 33 tail -f /dev/null`.
+ * Despite the constant's name, the work is a SINGLE long tool call, not a
+ * sequence of steps (see "Why not several fast steps" below).
+ * NOTE: standalone `sleep` is blocked by the Claude Code harness ("foreground
+ * sleep is blocked"), so this is the hook-safe equivalent of a long in-flight
+ * no-op. The tool's label renders once at its start (resetting the clock to
+ * zero), then nothing renders for ~33 s while the tool is in-flight. Under prod
+ * config (defer ON) the base fallback is HELD during that in-flight stretch, so
+ * the feed must stay live and the heartbeat keeps editing it. That is the
+ * invariant this guard asserts; a regression that lets `currentTurn` get nulled
+ * mid-stretch breaks it.
+ *
+ * The prompt explicitly asks the model to give the Bash call a short
+ * DESCRIPTION. That matters: an empty-label tool is dropped from the activity
+ * feed (it never opens), which would fail the test on "feed never appeared"
+ * (shape a) for the wrong reason. A labelled tool opens the feed reliably.
+ *
+ * Why not several fast steps: each fast tool start emits a fresh label that
+ * resets the clock, so the silence window never opens at all and the test would
+ * pass vacuously (a false green). One long stretch is what holds it open.
+ *
+ * We do NOT use run_in_background — this must be a FOREGROUND turn so
+ * currentTurn stays in place and the silence clock applies to it directly.
+ */
+const SEQUENTIAL_WORK_PROMPT =
+  "Use the Bash tool to run EXACTLY this command — and give the tool call a " +
+  'short description such as "long-running wait" so it is clearly labelled: ' +
+  "`timeout 33 tail -f /dev/null`. It runs for about 33 seconds and then exits " +
+  "on its own (a non-zero timeout exit code is expected and fine). Do not run " +
+  "any other tool while it is running. After it finishes, reply with a short " +
+  "paragraph (a few sentences) telling me it completed and that you waited " +
+  "about 33 seconds.";
+// UAT scenario: a foreground turn with one long in-flight tool call must keep
+// its activity feed alive across the (shrunk) silence-fallback window.
+//
+// Phases:
+//   1. wait for the activity-feed message to open;
+//   2. wait FALLBACK_WINDOW_MS, then check the feed message still exists and
+//      was edited;
+//   3. wait for a substantive final answer.
+describe("uat: foreground activity-feed visibility across silence-fallback threshold", () => {
+  it(
+    "feed remains live and receives edits after the shrunk fallback window",
+    async (ctx) => {
+      // Precondition guard: if the operator hasn't shrunk the silence
+      // fallback to ≤ 30 000 ms on the test-harness agent (and mirrored
+      // it into the UAT env), the timing assertions are vacuous. Mark the
+      // test as SKIPPED (not passed) with a clear message, so the report does
+      // not show a false green. Unset/unparseable (NaN) and non-positive
+      // values are rejected too.
+      if (
+        !Number.isFinite(PRECONDITION_FALLBACK_MS) ||
+        PRECONDITION_FALLBACK_MS <= 0 ||
+        PRECONDITION_FALLBACK_MS > 30_000
+      ) {
+        console.warn(
+          "[uat/foreground-feed-visibility] SKIPPED — precondition not met.\n" +
+            "  This scenario requires SWITCHROOM_SILENCE_FALLBACK_MS=20000 set on\n" +
+            "  the test-harness agent AND SWITCHROOM_UAT_SILENCE_FALLBACK_MS=20000\n" +
+            "  in the repo-root .env. Without it the silence fallback does not fire\n" +
+            "  within the test window and the regression cannot be detected.\n" +
+            "  See the header doc comment in this file for setup instructions.",
+        );
+        ctx.skip();
+        // Defensive: never fall through into the scenario after a skip.
+        return;
+      }
+      const sc = await spinUp({ agent: "test-harness" });
+      try {
+        // Start observing BEFORE sending so no activity-feed messages are missed.
+        const iter = sc.driver
+          .observeMessages(sc.botUserId)
+          [Symbol.asyncIterator]();
+        /**
+         * Pull the next observed message, giving up at `deadline` (epoch ms).
+         *
+         * Resolves to `null` when the deadline passes first, when the stream
+         * ends, or when the stream yields a nullish value; callers treat all
+         * three as "stop draining". The timeout timer is always cleared so no
+         * timer outlives the call.
+         *
+         * When the deadline wins, the `iter.next()` call stays pending. Every
+         * caller below breaks out and then fails its assertion in that case,
+         * so the stream is never read again after a timeout.
+         */
+        const nextObserved = async (deadline: number): Promise<ObservedMessage | null> => {
+          let timer: ReturnType<typeof setTimeout> | undefined;
+          const timedOut = new Promise<null>((resolve) => {
+            timer = setTimeout(() => resolve(null), Math.max(0, deadline - Date.now()));
+          });
+          try {
+            const result = await Promise.race([iter.next(), timedOut]);
+            if (result === null || result.done || result.value == null) return null;
+            return result.value as ObservedMessage;
+          } finally {
+            clearTimeout(timer);
+          }
+        };
+        await sc.sendDM(SEQUENTIAL_WORK_PROMPT);
+        console.log("[foreground-feed] prompt sent; watching for activity-feed message…");
+        // ── Assertion 1: feed opened ─────────────────────────────────────────
+        // Drain the live message stream until we see an activity-feed message
+        // (lines matching `→ …` or `✓ …`). Give generous budget for the agent to
+        // start tools and open the feed.
+        let feedMsg: ObservedMessage | null = null;
+        const feedDeadline = Date.now() + 90_000;
+        while (Date.now() < feedDeadline) {
+          const m = await nextObserved(feedDeadline);
+          if (m === null) break;
+          // Skip our own echo; any other message is a candidate and is
+          // accepted only if it looks like an activity feed.
+          if (m.senderUserId === sc.driverUserId) continue;
+          if (isActivityFeedMessage(m)) {
+            feedMsg = m;
+            break;
+          }
+        }
+        expect(
+          feedMsg,
+          "Failure shape (a): the foreground activity-feed message never appeared. " +
+            "Either the agent did not use tools, the prompt was too fast for the " +
+            "feed to paint, or drainActivitySummary failed on every attempt this turn.",
+        ).not.toBeNull();
+        // feedMsg is confirmed non-null beyond this point.
+        const { messageId: feedId } = feedMsg!;
+        console.log(
+          `[foreground-feed] feed opened (id=${feedId}): ` +
+            JSON.stringify(feedMsg!.text.slice(0, 120)),
+        );
+        // ── Snapshot before the fallback mark ───────────────────────────────
+        // Record the feed body and clock, then wait FALLBACK_WINDOW_MS so the
+        // shrunk base fallback has definitely elapsed. During this wait the
+        // single in-flight stretch (`timeout 33 tail -f /dev/null`) emits no
+        // renders, but the defer (prod default ON) HOLDS the base fallback
+        // while the tool is in-flight, so currentTurn survives and the feed
+        // heartbeat keeps editing the message. We assert those heartbeat edits
+        // keep landing; if a regression nulls currentTurn mid-stretch, the
+        // edits stop and we catch it.
+        const beforeMarkText = feedMsg!.text;
+        const markAt = Date.now() + FALLBACK_WINDOW_MS;
+        console.log(
+          `[foreground-feed] waiting ${FALLBACK_WINDOW_MS}ms for fallback window to elapse…`,
+        );
+        await new Promise((r) => setTimeout(r, FALLBACK_WINDOW_MS));
+        // ── Assertion 2: feed survived the fallback window ───────────────────
+        // Fetch the feed message directly. If currentTurn was nulled mid-turn,
+        // the gateway either stopped editing (stale text) or the message may have
+        // been deleted by clearActivitySummary (null). Either condition is the
+        // regression.
+        const afterMark = await sc.driver.getMessage(sc.botUserId, feedId);
+        console.log(
+          `[foreground-feed] feed state after ${FALLBACK_WINDOW_MS}ms mark ` +
+            `(id=${feedId}): ` +
+            JSON.stringify(afterMark?.text?.slice(0, 120) ?? null),
+        );
+        expect(
+          afterMark,
+          "Failure shape (b): the activity-feed message was deleted after the " +
+            `silence-fallback window (${FALLBACK_WINDOW_MS}ms). This means ` +
+            "currentTurn was nulled mid-turn by the silence-fallback handler, " +
+            "which then triggered clearActivitySummary and removed the live feed " +
+            "message. The regression: a productive foreground turn went dark " +
+            "because a continuous no-render window exceeded the " +
+            `shrunk threshold (SWITCHROOM_SILENCE_FALLBACK_MS=${PRECONDITION_FALLBACK_MS}).`,
+        ).not.toBeNull();
+        // A changed body proves the feed was edited at least once since it
+        // opened.
+        // NOTE(review): the baseline is the snapshot taken at feed-open, so a
+        // change here shows an edit landed somewhere inside the window — not
+        // strictly after the threshold. Only the live-stream check below
+        // compares against `markAt`. Consider re-baselining on `afterMark` if
+        // a strict "edited after the mark" guarantee is wanted.
+        const bodyChangedAfterMark = afterMark!.text !== beforeMarkText;
+        // Also drain any edits that arrive during the POST_MARK_EDIT_WAIT_MS
+        // window to catch the next drainActivitySummary cycle if the polled
+        // snapshot was taken slightly before the edit landed.
+        let sawFeedEditAfterMark = bodyChangedAfterMark;
+        const postMarkDeadline = Date.now() + POST_MARK_EDIT_WAIT_MS;
+        while (!sawFeedEditAfterMark && Date.now() < postMarkDeadline) {
+          const m = await nextObserved(postMarkDeadline);
+          if (m === null) break;
+          if (m.senderUserId === sc.driverUserId) continue;
+          // An edit of the feed message that arrived after the mark
+          if (m.edited && m.messageId === feedId && m.date.getTime() >= markAt) {
+            sawFeedEditAfterMark = true;
+            console.log(
+              `[foreground-feed] feed edit confirmed after mark (id=${feedId}): ` +
+                JSON.stringify(m.text.slice(0, 120)),
+            );
+          }
+        }
+        expect(
+          sawFeedEditAfterMark,
+          "Failure shape (b): the activity-feed message still exists but received " +
+            `no edit after the ${FALLBACK_WINDOW_MS}ms fallback window. This is the ` +
+            "feed-went-dark regression: currentTurn was nulled mid-turn so no further " +
+            "drainActivitySummary calls fired. The feed body was frozen at " +
+            `${JSON.stringify(beforeMarkText.slice(0, 80))} and did not advance.`,
+        ).toBe(true);
+        // ── Assertion 3: final answer lands ─────────────────────────────────
+        // Collect from the live stream until a substantive bot reply arrives.
+        // This confirms the turn completed — the fallback did not also drop
+        // the answer path.
+        let finalAnswer: ObservedMessage | null = null;
+        const answerDeadline = Date.now() + 60_000;
+        while (Date.now() < answerDeadline) {
+          const m = await nextObserved(answerDeadline);
+          if (m === null) break;
+          if (m.senderUserId === sc.driverUserId) continue;
+          if (m.edited) continue; // edits are feed updates, not the answer
+          if (isActivityFeedMessage(m)) continue; // skip feed-only sends
+          if (m.text.trim().length >= MIN_ANSWER_CHARS) {
+            finalAnswer = m;
+            break;
+          }
+        }
+        console.log(
+          `[foreground-feed] final answer (id=${finalAnswer?.messageId ?? "NONE"}): ` +
+            JSON.stringify(finalAnswer?.text?.slice(0, 180) ?? null),
+        );
+        expect(
+          finalAnswer,
+          "Failure shape (c): no final answer arrived after the silence-fallback " +
+            `window. The turn did not produce a substantive reply (≥${MIN_ANSWER_CHARS} chars). ` +
+            "If the feed-gone-dark assertion also failed, the fallback may have " +
+            "suppressed the entire turn's output. If only this assertion failed, " +
+            "the turn is still in flight past the test budget — increase the prompt " +
+            "timeout or check that the agent is not wedged.",
+        ).not.toBeNull();
+        // Close the observer on the success path only: here the last
+        // `iter.next()` has resolved. After a timeout a `next()` is still
+        // pending, and for an async generator `return()` queues behind it.
+        // NOTE(review): presumably `tearDown()` disposes the observer on the
+        // failure paths — confirm.
+        await iter.return?.();
+      } finally {
+        await sc.tearDown();
+      }
+    },
+    OVERALL_BUDGET_MS,
+  );
+});

package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts ADDED Viewed

@@ -0,0 +1,202 @@
+/**
+ * Liveness-driven feed open — a thinking-only turn still surfaces a live feed.
+ *
+ * ## Cause class (the #680 dark-turn, true vector)
+ *
+ * The activity feed is TOOL-driven: it opens only when a tool emits a non-null
+ * label (`drainActivitySummary`). A turn dominated by model thinking, or by
+ * suppressed-by-design tools (typing / memory recall / reply), emits no label —
+ * so the feed never opens and a long turn reads as pure silence until the 300s
+ * silence-poke. Turn #680 was exactly this: 335 s alive, `tools=4`, yet
+ * `feedOpened=false / activityMsgId=none` the entire time.
+ *
+ * The fix (gateway `feedHeartbeatTick`): once a turn has been alive for
+ * `FEED_LIVENESS_OPEN_MS` with no labelled tool yet, open a minimal `Working…`
+ * feed and let the existing 6 s heartbeat climb its elapsed. The first real
+ * tool label takes over and its edit replaces the placeholder; a pure-thinking
+ * turn finalizes to `✓ Working…` rather than freezing on the live line.
+ *
+ * ## Precondition (set on the test-harness agent for this run)
+ *
+ *   SWITCHROOM_FEED_LIVENESS_OPEN_MS=6000   (shrinks the default 12 s so the
+ *                                            window is exercised within budget)
+ *
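+ * The scenario does not hard-require it — at the 12 s default the feed still
+ * opens, just later, and the 110 s drain deadline below tolerates either. Note
+ * that the 8 s "too short" cutoff sits below the 12 s default, so without the
+ * override a turn that answers in 8–12 s with no feed lands in HARD FAIL.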
+ *
+ * ## What it asserts (asymmetric — non-determinism is handled, not faked)
+ *
+ * The trigger is a "think, then answer at length, use NO tools" prompt. The
+ * model's exact behaviour is not fully forceable, so the branches are:
+ *
+ *   PASS         — a `Working…` activity-feed message appeared. ONLY the
+ *                  liveness path produces a bare "Working…" feed (a tool would
+ *                  carry a tool label), so this is positive proof the timer
+ *                  opened the feed on a tool-less turn.
+ *   INCONCLUSIVE — the agent used a tool anyway (feed opened with a tool label,
+ *                  no "Working…"). The liveness path was not exercised; not a
+ *                  failure of the fix. Warn + pass.
+ *   INCONCLUSIVE — the turn was too short (answer landed before the threshold).
+ *                  Warn + pass.
+ *   HARD FAIL    — the turn ran clearly longer than the threshold with NO feed
+ *                  of any kind, yet produced an answer. Liveness should have
+ *                  opened a feed and did not — the regression this guard exists
+ *                  to catch.
+ *   FAIL         — no answer at all within budget (wedged).
+ */
+import { describe, expect, it } from "vitest";
+import { spinUp } from "../harness.js";
+import { isActivityFeedMessage } from "../assertions.js";
+import type { ObservedMessage } from "../driver.js";
+/**
+ * Minimum trimmed length, in characters, for a non-feed, non-edited bot message
+ * to count as the substantive answer (skips a brief ack).
+ */
+const MIN_ANSWER_CHARS = 200;
+/**
+ * Overall test budget in milliseconds, passed to `it` as the test timeout. The
+ * message-drain loop in the test uses its own 110 s deadline, so roughly 40 s
+ * of this budget remains for everything outside that loop.
+ */
+const OVERALL_BUDGET_MS = 150_000;
+/**
+ * Workload: think, then answer at length, with NO tools. The long answer
+ * generation holds the turn open past the liveness threshold with no tool
+ * label, which is exactly the condition that should open the `Working…` feed.
+ * The model may still choose to use a tool; the test treats that outcome as
+ * INCONCLUSIVE rather than as a failure.
+ */
+const THINKING_WORKLOAD_PROMPT =
+  "Do NOT use any tools at all for this — no Bash, no Read, no memory search, " +
+  "nothing. Just think carefully and then write me a thorough, detailed " +
+  "explanation (at least 450 words) of how the TCP three-way handshake works, " +
+  "including SYN, SYN-ACK, ACK, sequence numbers, and what happens if the final " +
+  "ACK is lost. Take your time getting it right, then reply with the full essay " +
+  "in one message.";
+describe("uat: liveness-driven feed open (thinking-only turn stays visible)", () => {
+  // Scenario: send a tool-free "think, then answer at length" prompt to the
+  // test-harness agent, drain the bot's messages for up to 110 s, and classify
+  // the outcome:
+  //   - a feed message whose text contains "Working"  -> PASS
+  //   - a feed message without "Working" (tool label) -> INCONCLUSIVE, but an
+  //     answer is still required
+  //   - no feed, answer landed in under 8 s           -> INCONCLUSIVE
+  //   - no feed, no answer                            -> FAIL (wedged)
+  //   - no feed, answer after 8 s or more             -> HARD FAIL (#680)
+  it(
+    "opens a 'Working…' feed for a turn that emits no tool label",
+    async () => {
+      const sc = await spinUp({ agent: "test-harness" });
+      try {
+        const iter = sc.driver
+          .observeMessages(sc.botUserId)
+          [Symbol.asyncIterator]();
+        try {
+          await sc.sendDM(THINKING_WORKLOAD_PROMPT);
+          const sentAt = Date.now();
+          console.log("[liveness-feed] prompt sent; watching for feed + answer…");
+          // Drain the stream until EITHER a feed message appears OR a substantive
+          // answer lands. Track which kind of feed (liveness vs tool) we saw.
+          let livenessFeed: ObservedMessage | null = null;
+          let toolFeed: ObservedMessage | null = null;
+          let answer: ObservedMessage | null = null;
+          let answerAt = 0;
+          const deadline = Date.now() + 110_000;
+          while (Date.now() < deadline) {
+            if (livenessFeed && answer) break;
+            const remaining = deadline - Date.now();
+            // Race the next message against the remaining budget. The timer is
+            // cleared as soon as the race settles so that a long-running drain
+            // does not leave one pending 110 s timer behind per observed message.
+            let timer: ReturnType<typeof setTimeout> | undefined;
+            const next = await Promise.race([
+              iter.next(),
+              new Promise<{ done: true; value: undefined }>((r) => {
+                timer = setTimeout(
+                  () => r({ done: true, value: undefined }),
+                  Math.max(0, remaining),
+                );
+              }),
+            ]).finally(() => clearTimeout(timer));
+            if (next.done || next.value == null) break;
+            const m = next.value as ObservedMessage;
+            // Ignore the driver's own messages (the prompt we just sent).
+            if (m.senderUserId === sc.driverUserId) continue;
+            if (isActivityFeedMessage(m)) {
+              // A "Working…" feed body is the liveness placeholder; anything else
+              // is a tool-label feed.
+              if (/Working/.test(m.text)) {
+                if (!livenessFeed) {
+                  livenessFeed = m;
+                  console.log(
+                    `[liveness-feed] LIVENESS feed opened at +${Date.now() - sentAt}ms: ` +
+                      JSON.stringify(m.text.slice(0, 120)),
+                  );
+                }
+              } else if (!toolFeed) {
+                toolFeed = m;
+                console.log(
+                  `[liveness-feed] tool-label feed opened at +${Date.now() - sentAt}ms: ` +
+                    JSON.stringify(m.text.slice(0, 120)),
+                );
+              }
+              continue;
+            }
+            // Edits of non-feed messages are never treated as the answer.
+            if (m.edited) continue;
+            if (m.text.trim().length >= MIN_ANSWER_CHARS && !answer) {
+              answer = m;
+              answerAt = Date.now();
+              console.log(
+                `[liveness-feed] answer landed at +${answerAt - sentAt}ms (len=${m.text.trim().length}).`,
+              );
+            }
+          }
+          // If no answer landed, measure the span up to "now" (end of the drain).
+          const turnSpanMs = (answerAt || Date.now()) - sentAt;
+          // ── Branch resolution ─────────────────────────────────────────────
+          if (livenessFeed) {
+            // PASS: the liveness timer opened a feed on a tool-less turn.
+            expect(
+              livenessFeed,
+              "liveness feed should be present in the PASS branch",
+            ).not.toBeNull();
+            // Confirm it carries the in-progress placeholder shape.
+            expect(livenessFeed!.text).toMatch(/Working/);
+            console.log(
+              "[liveness-feed] PASS — liveness-driven feed open confirmed.",
+            );
+            return;
+          }
+          if (toolFeed) {
+            console.warn(
+              "[liveness-feed] INCONCLUSIVE — the agent used a tool, so the feed " +
+                "opened via the normal tool-label path and the liveness timer was " +
+                "not exercised. Not a failure of the fix.",
+            );
+            // Still require the turn completed.
+            expect(
+              answer,
+              "even in the tool-feed branch the turn must complete with an answer",
+            ).not.toBeNull();
+            return;
+          }
+          // No feed of any kind appeared.
+          // NOTE(review): this 8 s cutoff is hard-coded and the test runner cannot
+          // see the agent's configured liveness threshold; confirm it still fits
+          // if the agent runs with a threshold above 8 s.
+          if (answer && turnSpanMs < 8_000) {
+            console.warn(
+              `[liveness-feed] INCONCLUSIVE — turn completed in ${turnSpanMs}ms, ` +
+                "below the liveness threshold; no feed was expected.",
+            );
+            return;
+          }
+          // No feed, and the turn ran long enough that liveness SHOULD have opened.
+          expect(
+            answer,
+            "FAIL — no answer arrived within budget; the turn may be wedged.",
+          ).not.toBeNull();
+          // livenessFeed is necessarily null here, so this assertion always throws:
+          // it exists to report the HARD FAIL with a descriptive message.
+          expect(
+            livenessFeed,
+            "HARD FAIL — the turn ran for " +
+              `${turnSpanMs}ms (well past the liveness threshold) with NO activity ` +
+              "feed of any kind, yet produced an answer. The liveness timer should " +
+              "have opened a 'Working…' feed and did not — this is the #680 " +
+              "dark-turn regression.",
+          ).not.toBeNull();
+        } finally {
+          // Close the iterator on every exit path (early return or failed
+          // assertion). The close is deliberately not awaited: if a next() is
+          // still pending after a timed-out race, an async generator queues
+          // return() behind it and awaiting here could block teardown.
+          void (async () => {
+            try {
+              await iter.return?.();
+            } catch (err) {
+              console.warn("[liveness-feed] iterator close failed:", err);
+            }
+          })();
+        }
+      } finally {
+        await sc.tearDown();
+      }
+    },
+    OVERALL_BUDGET_MS,
+  );
+});