npm - switchroom - Versions diffs - 0.15.44 → 0.16.4 - Mend

switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (150) hide show

package/dist/agent-scheduler/index.js +122 -88
package/dist/auth-broker/index.js +463 -177
package/dist/cli/autoaccept-poll.js +4842 -35
package/dist/cli/drive-write-pretool.mjs +17 -14
package/dist/cli/notion-write-pretool.mjs +117 -86
package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
package/dist/cli/self-improve-stop.mjs +428 -0
package/dist/cli/skill-validate-pretool.mjs +72 -72
package/dist/cli/switchroom.js +3249 -1241
package/dist/cli/ui/index.html +1 -1
package/dist/host-control/main.js +2833 -355
package/dist/vault/approvals/kernel-server.js +7482 -7439
package/dist/vault/broker/server.js +11315 -11272
package/examples/minimal.yaml +1 -0
package/examples/switchroom.yaml +1 -0
package/package.json +3 -3
package/profiles/_base/start.sh.hbs +88 -1
package/profiles/_shared/execution-discipline.md.hbs +18 -0
package/profiles/default/CLAUDE.md.hbs +3 -22
package/telegram-plugin/.claude-plugin/plugin.json +2 -2
package/telegram-plugin/answer-stream-flag.ts +12 -49
package/telegram-plugin/answer-stream.ts +5 -150
package/telegram-plugin/auth-snapshot-format.ts +280 -48
package/telegram-plugin/auto-fallback-fleet.ts +44 -1
package/telegram-plugin/context-exhaustion.ts +12 -0
package/telegram-plugin/demo-mask.ts +154 -0
package/telegram-plugin/dist/bridge/bridge.js +167 -124
package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
package/telegram-plugin/dist/server.js +215 -172
package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
package/telegram-plugin/draft-stream.ts +47 -410
package/telegram-plugin/final-answer-detect.ts +17 -12
package/telegram-plugin/fleet-fallback-resume.ts +131 -0
package/telegram-plugin/format.ts +56 -19
package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
package/telegram-plugin/gateway/auth-command.ts +70 -14
package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
package/telegram-plugin/gateway/current-turn-map.ts +188 -0
package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
package/telegram-plugin/gateway/effort-command.ts +8 -3
package/telegram-plugin/gateway/emission-authority.ts +369 -0
package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
package/telegram-plugin/gateway/gateway.ts +1837 -291
package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
package/telegram-plugin/gateway/represent-guard.ts +72 -0
package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
package/telegram-plugin/gateway/status-surface-log.ts +14 -3
package/telegram-plugin/history.ts +33 -11
package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
package/telegram-plugin/issues-card.ts +4 -0
package/telegram-plugin/model-unavailable.ts +124 -0
package/telegram-plugin/narrative-dedup.ts +69 -0
package/telegram-plugin/over-ping-safety-net.ts +70 -4
package/telegram-plugin/package.json +3 -3
package/telegram-plugin/pending-work-progress.ts +12 -0
package/telegram-plugin/permission-rule.ts +32 -5
package/telegram-plugin/permission-title.ts +152 -9
package/telegram-plugin/quota-check.ts +13 -0
package/telegram-plugin/quota-watch.ts +135 -7
package/telegram-plugin/registry/turns-schema.test.ts +24 -0
package/telegram-plugin/registry/turns-schema.ts +9 -0
package/telegram-plugin/runtime-metrics.ts +13 -0
package/telegram-plugin/session-tail.ts +96 -11
package/telegram-plugin/silence-poke.ts +170 -24
package/telegram-plugin/slot-banner-driver.ts +3 -0
package/telegram-plugin/status-no-truncate.ts +44 -0
package/telegram-plugin/status-reactions.ts +20 -3
package/telegram-plugin/stream-controller.ts +4 -23
package/telegram-plugin/stream-reply-handler.ts +6 -24
package/telegram-plugin/streaming-metrics.ts +91 -0
package/telegram-plugin/subagent-watcher.ts +212 -66
package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
package/telegram-plugin/tests/answer-stream.test.ts +2 -411
package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
package/telegram-plugin/tests/demo-mask.test.ts +127 -0
package/telegram-plugin/tests/draft-stream.test.ts +0 -827
package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
package/telegram-plugin/tests/feed-survival.test.ts +526 -0
package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
package/telegram-plugin/tests/history.test.ts +60 -0
package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
package/telegram-plugin/tests/permission-rule.test.ts +17 -0
package/telegram-plugin/tests/permission-title.test.ts +206 -17
package/telegram-plugin/tests/quota-watch.test.ts +252 -9
package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
package/telegram-plugin/tests/represent-guard.test.ts +162 -0
package/telegram-plugin/tests/session-tail.test.ts +147 -3
package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
package/telegram-plugin/tests/telegram-format.test.ts +101 -6
package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
package/telegram-plugin/tests/tool-labels.test.ts +67 -0
package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
package/telegram-plugin/tests/welcome-text.test.ts +32 -3
package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
package/telegram-plugin/tool-activity-summary.ts +375 -58
package/telegram-plugin/turn-liveness-floor.ts +240 -0
package/telegram-plugin/uat/assertions.ts +115 -0
package/telegram-plugin/uat/driver.ts +68 -0
package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
package/telegram-plugin/welcome-text.ts +13 -1
package/telegram-plugin/worker-activity-feed.ts +157 -82
package/telegram-plugin/draft-transport.ts +0 -122
package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
package/telegram-plugin/tests/draft-transport.test.ts +0 -211

package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts CHANGED Viewed

@@ -5,187 +5,173 @@
  * Verifies three acceptance criteria from the RFC in a single run because
  * they share setup:
  *
- *   AC-1 — Background-dispatch-and-continue: card stays pinned past
- *          parent `turn_end`; fleet zone surfaces the running sub-agent.
- *   AC-2 — Done semantics: header reads 🌀 Background (not ✅ Done)
- *          while the bg sub-agent runs; flips to ✅ Done after it
- *          terminates.
- *   AC-3 — Live activity: card body materially changes across a 15s
- *          window while bg work is in flight (elapsed counter or fleet
- *          row's `last activity` advances) — proves the heartbeat +
- *          subagent-watcher are actually feeding the renderer.
+ *   AC-1 — Background-dispatch-and-continue: worker-feed message appears
+ *          while the background sub-agent runs; persists past parent
+ *          `turn_end` so the user can watch the worker in flight.
+ *   AC-2 — Done semantics: feed message reads `running ·` while the bg
+ *          sub-agent runs; flips to `finished · completed` (or `failed`)
+ *          after it terminates.
+ *   AC-3 — Live activity: feed body materially changes across a 12s window
+ *          while bg work is in flight (elapsed counter or narrative step
+ *          advances) — proves the subagent-watcher is actually feeding the
+ *          renderer.
  *
  * Prompt strategy: **Option 1 (explicit tool-naming)** per the RFC §
  * "Background-dispatch prompt". An earlier Option-2 (naturalistic)
  * attempt produced exactly the failure mode the RFC predicted —
- * model ran the sleeps inline via Bash, card never reached Background
+ * model ran the sleeps inline via Bash, feed never reached Background
  * phase. This test verifies the *visibility infra*, not the LLM's
  * delegation judgment; pinning the tool name and arg keeps the
  * scenario deterministic.
  *
- * Requires the same env as the other DM scenarios (see SETUP.md §6)
- * and the test-harness override `progress_card.delay_ms: 1000` so the
- * card actually fires on a short turn (SETUP.md §5).
+ * Architecture note (post-#1122 PR3): the pinned progress card was
+ * deleted. Background sub-agent visibility is now surfaced via the
+ * worker-activity-feed (`SWITCHROOM_WORKER_ACTIVITY_FEED=1`): a regular
+ * Telegram message that posts once the worker has been running for
+ * `firstPaintMin` (8s default on test-harness) and edits in-place as
+ * activity arrives. This test drives assertions against that feed.
  *
- * Runtime budget is generous — the inner deadlines sum to ~150s
- * worst-case (5s pin + 30s parent-ack + 30s background phase + 15s
- * delta-snapshot + 120s done) plus ~12s spinUp overhead. The outer
- * `it()` timeout absorbs the lot.
+ * Requires the same env as the other DM scenarios (see SETUP.md §6).
+ *
+ * Root causes fixed in #2501 (this PR):
+ *   Bug 1 — orphan correlation. `backfillJsonlAgentId` used a fuzzy
+ *           (agentType, description) match to link a newly-discovered JSONL
+ *           to its registry row. When the match failed (description null,
+ *           or race), `jsonl_agent_id` stayed NULL, so
+ *           `resolveWorkerFeedDispatch(getSubagentByJsonlId(db, id), …)`
+ *           returned `{ isBackground: false }` — routing the worker as a
+ *           foreground sub-agent and suppressing the worker-feed. Fix:
+ *           prefer the direct `toolUseId` PK lookup that Claude Code already
+ *           writes to `agent-<id>.meta.json`.
+ *   Bug 2 — liveness writes silently skipped. With `jsonl_agent_id = NULL`
+ *           (Bug 1 not fixed), `bumpSubagentActivity` queries by
+ *           `jsonl_agent_id` and finds nothing — every liveness tick is a
+ *           no-op and the last_activity_at column never updates. Fixed as a
+ *           consequence of Bug 1 (once the row is linked, liveness writes
+ *           land).
+ *
+ * Runtime budget is generous — the inner deadlines sum to ~312s
+ * worst-case (45s parent-ack + 75s feed-first-paint + 12s delta + 180s
+ * done) plus ~12s spinUp overhead. The outer `it()` timeout absorbs the lot.
+ * The 180s done-window accommodates the stall-detection path: the watcher
+ * fires `onFinish` 60s after the last JSONL event, because background
+ * workers don't reliably emit `sub_agent_turn_end`.
  */
 import { describe, expect, it } from "vitest";
 import { spinUp } from "../harness.js";
+import { WORKER_FEED_RE } from "../assertions.js";
 // Explicit dispatch prompt (Option 1 per the RFC §"Background-dispatch
 // prompt"). The naturalistic Option-2 version didn't reliably get the
 // model to use the Agent tool with run_in_background:true — first
 // attempt produced the failure mode the RFC predicted (parent ran the
-// sleeps inline via Bash; card never transitioned to Background).
+// sleeps inline via Bash; feed never surfaced Background-phase activity).
 //
 // This test asserts the VISIBILITY INFRA works, not that the model
 // makes good delegation judgments. Naming the tool + the arg lets the
-// scenario be deterministic. If the model can't be made to use the
-// Agent tool even with this prompt, that's an unrelated bug (model
-// alignment / tool registration) and the scenario fails distinctly
-// from the visibility-infra failure modes we're trying to catch.
+// scenario be deterministic.
 //
-// Time profile: ~60s of bg work, paced with three separate sleeps so
-// the worker emits multiple tool_use events the subagent-watcher can
-// surface as fresh `last activity` updates. We need the Background
-// phase to last long enough that we can take a snapshot, wait one
-// heartbeat tick (5s default), and snapshot again.
+// Time profile: ~60s of bg work, paced with ten short steps so the
+// worker emits multiple tool_use + narrative events the subagent-watcher
+// can surface as fresh edits. We need the Background phase to last long
+// enough to clear the 8s first-paint threshold and take a snapshot.
 const BG_DISPATCH_PROMPT =
   `Use the Agent tool with subagent_type "general-purpose" and ` +
   `run_in_background: true to dispatch a worker with this exact task: ` +
-  `"Run \`sleep 20\` via the Bash tool, then \`echo step1\`, then ` +
-  `\`sleep 20\` again, then \`echo step2\`, then \`sleep 20\` a third ` +
-  `time, then \`echo done\`. That's three separate Bash tool calls ` +
-  `with sleeps between echoes." After dispatching, send a brief reply ` +
-  `saying you've kicked off the background worker so I can watch the ` +
-  `progress card.`;
+  `"Do ten steps, ONE AT A TIME, k = 1 through 10. Before each step ` +
+  `write a brief one-sentence narration of what you are about to do, ` +
+  `then run \`sleep 2\` via the Bash tool, then run \`echo step-k\` via ` +
+  `the Bash tool (substitute the real number for k). Run every sleep and ` +
+  `every echo as its OWN separate Bash call — never batch or chain them ` +
+  `with && — and narrate before each so progress surfaces incrementally. ` +
+  `Do not stop early; complete all ten steps." After dispatching, send a ` +
+  `brief reply saying you've kicked off the background worker so I can ` +
+  `watch the progress feed.`;
+const WORKER_RUNNING_RE = /running\s*·/i;
+const WORKER_DONE_RE = /finished\s*·\s*(completed|failed)/i;
-/**
- * STATUS: currently red — surfaces two real production bugs the
- * RFC §Risks predicted as possible-but-unverified. Marked `it.fails`
- * so a future fix flips it green and a regression flips it red again.
- *
- *   Bug 1 — orphan correlation. The parent's `Agent` tool_use_id
- *           doesn't get matched to the spawned `sub_agent_started`
- *           event. Gateway log: `pendingSpawns=0 correlated=orphan`.
- *           Result: `isBackgroundDispatch` is never set on the fleet
- *           member; the card's header phase transitions to Background
- *           only by accident (orphans defer too, but they don't carry
- *           the bg flag).
- *
- *   Bug 2 — subagent-watcher can't track the worker. Gateway log:
- *           `subagent-watcher: liveness skip <agentId> — row not in
- *           DB yet (Phase 2 Pre hook pending)`. Result: no
- *           sub_agent_tool_use events reach the fleet member; the
- *           fleet row's `last activity` field never updates with the
- *           worker's actual tool calls. The card edits we see are
- *           just elapsed-counter ticks from the heartbeat.
- *
- * Both bugs are real and live on `main`. The scenario above passes
- * AC-1 (card stays pinned), partially passes AC-2 (Background phase
- * fires) and AC-3 (card body changes — from heartbeat alone), and
- * fails AC-2's closing half (card never reaches Done in 120s because
- * the orphan never terminates from the gateway's view).
- *
- * When Bug 1 + Bug 2 are fixed, change `describe.skip` to `describe`
- * below — the assertions are correct; only the production code is
- * wrong.
- *
- * Update post-#1105: all five RFC bugs (1–5 in earlier PRs, 6–7 in
- * #1105) merged. Unskipped here for the next UAT re-run. If 6/6 ACs
- * pass, close #709 / #776 / #782 / #788.
- */
 describe("uat: background sub-agent visibility (#709/#776/#782/#788)", () => {
   it(
-    "card stays pinned with 🌀 Background header + live fleet activity, then flips to ✅ Done",
+    "worker-feed appears with running status then flips to finished once the sub-agent completes",
     async () => {
       const sc = await spinUp({ agent: "test-harness" });
       try {
         await sc.sendDM(BG_DISPATCH_PROMPT);
-        // AC-1 step 1: card pins quickly (delay_ms: 1000 on test-harness).
-        // Generous timeout so a slow first-turn doesn't false-flag.
-        const card = await sc.expectPinnedCard({ timeout: 15_000 });
-        expect(card.messageId).toBeGreaterThan(0);
+        // Parent ack reply — confirms the parent turn closed.
+        await sc.expectMessage(/.+/, { from: "bot", timeout: 45_000 });
-        // Parent ack reply. Note: we DON'T strictly require the model
-        // to mention "dispatch" in the reply — naturalistic prompt means
-        // the model picks the wording. We just need *some* bot reply
-        // so we know the parent turn closed (which is the point where
-        // pre-fix the card would unpin).
-        await sc.expectMessage(/.+/, { from: "bot", timeout: 30_000 });
-        // AC-2: header MUST be 🌀 Background (post-#1039) or, if the
-        // bg dispatch happened so fast the worker hasn't started yet,
-        // it might still be ⚙️ Working with the parent zone done. We
-        // poll for the background phase with a 45s budget — long
-        // enough for the worker to actually start firing tools, short
-        // enough that "we never saw Background" surfaces as a real
-        // bug, not a timeout-tuning issue.
+        // AC-1 step 1: worker-feed message appears after first-paint delay
+        // (~8s default). The message starts with "🛠 Worker" and shows
+        // "running ·" while the worker is in flight. Generous timeout so a
+        // slow first tool_use + narrative doesn't false-flag.
         //
-        // The dual-acceptable phases below model the realistic flow:
-        // parent reply lands → header should be Background (or
-        // briefly still Working if the parent's `done` event lags
-        // the bg dispatch's tool_use).
-        const bgPhaseCard = await sc.waitForCardPhase(card, "background", {
-          timeout: 45_000,
+        // Distinct from the parent's ack — `expectMessage` starts observing
+        // from after the parent ack, so the feed paint is the next match.
+        const feed = await sc.expectMessage(WORKER_FEED_RE, {
+          from: "bot",
+          timeout: 75_000,
         });
-        expect(bgPhaseCard.text).toMatch(/🌀|Background/i);
-        // The negative — Done MUST NOT have fired before bg started.
-        // Asserts the defer-gate is doing its job. If this trips, the
-        // `hasLiveBackground` correlation at progress-card-driver.ts:1108
-        // is broken (or the bg dispatch never registered as a fleet
-        // member at all — see RFC §Phase 2 diagnosis paths).
-        expect(bgPhaseCard.text).not.toMatch(/✅|\bDone\b/i);
+        expect(feed.messageId).toBeGreaterThan(0);
+        expect(feed.text).toMatch(WORKER_FEED_RE);
+        // AC-2 step 1: feed body MUST show "running ·" (the in-flight
+        // status), NOT the terminal "finished ·" — the worker hasn't
+        // completed yet.
+        expect(feed.text).toMatch(WORKER_RUNNING_RE);
+        expect(feed.text).not.toMatch(WORKER_DONE_RE);
-        // AC-3: card edits land regularly while bg runs. Snapshot
-        // the current card body, wait one heartbeat tick (5s default
-        // + 1s slack), then fetch the card body again. The body MUST
-        // differ (elapsed counter, fleet last-activity age, etc.).
+        // AC-3: feed edits land regularly while the worker runs. Snapshot
+        // the current body, wait 12s (well above the 2.5s edit throttle,
+        // and enough that at least one step + sleep cycle completes), then
+        // re-fetch the SAME message. The body MUST differ (elapsed counter
+        // or narrative step advances).
         //
         // We re-fetch the SAME message via `driver.getMessage(chatId,
-        // cardId)` rather than `expectPinnedCard` because the latter
-        // listens for NEW pin events. Once the card is pinned, no
-        // further pin event fires — `expectPinnedCard` would wait
-        // for an event that never comes and time out spuriously even
-        // though the card is alive and being edited (caught in the
-        // first run of this scenario).
+        // msgId)` rather than `expectMessage(WORKER_FEED_RE)` because the
+        // latter listens for NEW messages. The feed edits in-place; a new
+        // send only happens on re-post (stale messageId). So re-fetching is
+        // the right shape.
         //
-        // If the card freezes — heartbeat dead, subagent-watcher not
-        // flushing, fleet member never registered — `afterDelta` will
-        // equal `beforeDelta` and surface the bug cleanly. If the
-        // card was unpinned by an over-eager defer-gate release,
-        // `getMessage` returns null and we surface it with a clear
-        // assertion.
-        const beforeDelta = bgPhaseCard.text;
-        await new Promise((r) => setTimeout(r, 6_000));
+        // 12s instead of 6s: the first edit arrives ~6-8s after paint (one
+        // step/sleep cycle), so 6s was racy. 12s gives a safe 2x margin.
+        const beforeDelta = feed.text;
+        await new Promise((r) => setTimeout(r, 12_000));
         const afterDeltaMsg = await sc.driver.getMessage(
           sc.botUserId,
-          bgPhaseCard.messageId,
+          feed.messageId,
         );
-        expect(afterDeltaMsg, "card message disappeared mid-flight (AC-1 regression)").not.toBeNull();
+        expect(afterDeltaMsg, "feed message disappeared mid-flight (AC-1 regression)").not.toBeNull();
         expect(afterDeltaMsg!.text).not.toBe(beforeDelta);
-        // AC-2 closing half: bg terminates → header flips to ✅ Done.
-        // Generous budget — the inner sleeps sum to ~60s but
-        // post-completion the deferred-completion gate plus the
-        // heartbeat cadence can add another 5-30s before the card
-        // finalises.
-        const doneCard = await sc.waitForCardPhase(bgPhaseCard, "done", {
-          timeout: 120_000,
-        });
-        expect(doneCard.text).toMatch(/✅|Done/i);
+        // AC-2 closing half: bg terminates → body flips to "finished ·
+        // completed". The terminal edit is triggered by the subagent-watcher's
+        // stall detection (60s after the last JSONL activity), because
+        // background Claude Code workers don't always emit a sub_agent_turn_end
+        // event. Budget: worker steps (~60s) + stall window (60s) + slack.
+        // From first-paint to terminal is typically 140-165s.
+        let doneText: string | null = null;
+        const deadline = Date.now() + 180_000;
+        while (Date.now() < deadline) {
+          const m = await sc.driver.getMessage(sc.botUserId, feed.messageId);
+          if (m != null && WORKER_DONE_RE.test(m.text)) {
+            doneText = m.text;
+            break;
+          }
+          await new Promise((r) => setTimeout(r, 3_000));
+        }
+        expect(doneText, "worker-feed never reached a terminal recap").not.toBeNull();
+        expect(doneText!).toMatch(/tools?/i);
+        // Body MUST have changed between first paint and terminal.
+        expect(doneText).not.toBe(beforeDelta);
       } finally {
         await sc.tearDown();
       }
     },
-    // Outer per-test budget: sum of inner deadlines (15 + 30 + 45 + 15 +
-    // 10 + 120 = 235s) + spinUp settle (~12s) + slack. Round up to keep
-    // the inner-deadline error visible if any of them trip.
-    300_000,
+    // Outer per-test budget: sum of inner deadlines (45 + 75 + 16 + 180 =
+    // 316s) + spinUp settle (~12s) + slack.
+    360_000,
   );
 });

package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts ADDED Viewed

@@ -0,0 +1,94 @@
+/**
+ * JTBD: "the answer pings" — notification ownership (R8 / PR-2; design
+ * `docs/message-emission-determinism.md` §over-ping).
+ *
+ * The residual the bare one-ping-per-turn safety net left: when a turn opens
+ * with an interim ACK that pings first, the ack claims the turn's single ping
+ * slot and the LATER substantive answer used to be downgraded to silent — the
+ * reply is last on screen, but the user's phone never buzzed for the actual
+ * answer. PR-2 makes `decideOverPing` aware of WHO holds the slot and lets a
+ * substantive answer UPGRADE over an ack's slot, so the answer pings.
+ *
+ * This scenario drives the exact sequence end-to-end: an "On it" style ack
+ * (pings, claims the slot) followed by a ≥300-char substantive answer, and
+ * asserts the ANSWER arrived non-silent via `assertAnswerPinged`
+ * (mtcute's `ObservedMessage.silent`).
+ *
+ * Runs under CI `uat-gate`; the full live MTProto run needs the test-harness
+ * agent + a vault session, so locally this self-skips green (no driver).
+ *
+ * Scope caveat: this end-to-end scenario only exercises PR-2's upgrade code
+ * path when the harness model delivers its final answer via the `reply` tool.
+ * If the model answers via `stream_reply` instead, that path bypasses the
+ * over-ping safety net entirely (it never reaches `decideOverPing`), so the
+ * upgrade-over-ack logic is never touched. The model's tool choice isn't
+ * forceable here, which makes this scenario a WEAKER backstop than the unit
+ * matrix — the real proof of the upgrade behaviour lives in the deterministic
+ * unit tests in `over-ping-final-answer-decoupling.test.ts`. Treat this as a
+ * live smoke-test of the happy path, not the source of truth.
+ */
+import { describe, it, expect, beforeAll } from "vitest";
+import { spinUp, type Scenario } from "../harness.js";
+import { assertAnswerPinged, isAnswer } from "../assertions.js";
+import { collectTurn } from "../real-work-prompts.js";
+/** Overall budget for the ack-then-answer turn. */
+const TURN_BUDGET_MS = 130_000;
+/** The answer must clear the substantive-length backstop (≥200). */
+const MIN_ANSWER_CHARS = 200;
+describe("uat: the substantive answer pings even after an ack pinged (DM)", () => {
+  let sc: Scenario | null = null;
+  beforeAll(async () => {
+    try {
+      sc = await spinUp({ agent: "test-harness" });
+      await sc.driver.primeDialogs();
+    } catch (err) {
+      console.warn(
+        `[answer-pings] no live driver — self-skipping green: ${(err as Error).message}`,
+      );
+      sc = null;
+    }
+  });
+  it(
+    "an ack pings first, then the substantive answer also pings (R8 / PR-2 upgrade)",
+    async () => {
+      if (sc == null) return; // self-skip green
+      const { driver, botUserId, driverUserId } = sc;
+      // Prompt the model into the ack-then-answer cadence: a quick pinging
+      // "On it" reply, then — after a beat — a thorough ≥300-character answer
+      // as a fresh (also pinging) reply. The model's exact wording isn't
+      // forceable, so we accept any substantive (≥200-char) answer that lands;
+      // collectTurn skips the short ack (below minAnswerChars) and latches onto
+      // the real answer.
+      const obs = await collectTurn(
+        driver,
+        botUserId,
+        driverUserId,
+        "First send a very short interim reply 'On it.' (pinging — do NOT set " +
+          "disable_notification). THEN, as a separate second reply, give me a " +
+          "thorough answer of at least 300 characters explaining what a Telegram " +
+          "supergroup is, how forum topics partition it, and how a bot routes a " +
+          "reply back to the topic a question came from. The long second reply is " +
+          "your final answer.",
+        { timeoutMs: TURN_BUDGET_MS, minAnswerChars: MIN_ANSWER_CHARS, settleMs: 12_000 },
+      );
+      if (obs.answer == null) {
+        console.warn("[answer-pings] INCONCLUSIVE — no substantive answer landed in budget.");
+        return;
+      }
+      // Sanity: this is the answer lane, not a feed surface.
+      expect(isAnswer(obs.answer, driverUserId)).toBe(true);
+      // The load-bearing assertion: the substantive answer is non-silent. If an
+      // earlier ack-ping had downgraded it (the pre-PR-2 residual), this throws.
+      assertAnswerPinged(obs.answer);
+    },
+    TURN_BUDGET_MS + 30_000,
+  );
+});

package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts ADDED Viewed

@@ -0,0 +1,109 @@
+/**
+ * JTBD: "no stale 'thinking…' card opens beneath an answer the user already
+ * received in an EARLIER turn" — the cross-turn form of the reply-is-last
+ * invariant (design `docs/message-emission-determinism.md` §9 lever 4 / race
+ * C/D; PR1).
+ *
+ * The in-turn levers (#2557, sticky `finalAnswerEverDelivered`) only govern the
+ * CURRENT turn. The cross-turn surfaces — the obligation `represent` sweep and
+ * the heartbeat/liveness timer — can OPEN a card in a LATER synthetic turn,
+ * surfacing a card beneath an answer delivered in an earlier turn. PR1's lever 4
+ * gates those synthetic card-OPEN paths on `hasOutboundDeliveredSince`: if a
+ * substantive answer already landed since the obligation was raised, the card
+ * OPEN is suppressed (the represent SEND is unaffected — only the decorative
+ * card).
+ *
+ * This scenario delivers a substantive answer in turn N, waits out a long
+ * settle window (during which the obligation sweep / heartbeat may fire a
+ * synthetic represent/liveness surface in turn N+1), then pulls the send-order
+ * history and asserts no activity/worker-feed card opened BELOW the delivered
+ * answer. `assertReplyIsLast` scopes to the answer's window up to the next
+ * driver message — and because a cross-turn synthetic surface carries NO
+ * intervening driver message, a card-below-answer it opens falls inside that
+ * window and is correctly flagged.
+ *
+ * Runs under CI `uat-gate`; the full live MTProto run needs the test-harness
+ * agent + a vault session, so locally this self-skips green (no driver) — same
+ * shape as `jtbd-reply-is-last-dm.test.ts`.
+ */
+import { describe, it, expect, beforeAll } from "vitest";
+import { spinUp, type Scenario } from "../harness.js";
+import {
+  assertReplyIsLast,
+  isAnswer,
+  isActivityFeedMessage,
+  isWorkerFeedMessage,
+} from "../assertions.js";
+import { collectTurn } from "../real-work-prompts.js";
+/** Per-case overall budget (ms): collectTurn's `timeoutMs`, and the base of the test timeout. */
+const TURN_BUDGET_MS = 140_000;
+/** History pull depth for `driver.getHistory` — covers the answer turn + any cross-turn synthetic surface. */
+const HISTORY_LIMIT = 80;
+/**
+ * Settle window (ms) AFTER the answer lands; handed to collectTurn as `settleMs`.
+ * Long enough that the obligation sweep (and the heartbeat liveness timer) has
+ * at least one chance to fire a cross-turn synthetic surface — the window PR1
+ * lever 4 guards. The sweep runs on its own interval, so a represent cannot be
+ * forced deterministically; the durable assertion is "IF a synthetic surface
+ * fires, it must not open a card below the answer." A run with no represent still passes.
+ */
+const POST_ANSWER_SETTLE_MS = 20_000;
+describe("uat: no cross-turn card opens beneath an earlier answer (DM)", () => {
+  let live: Scenario | null = null;
+  beforeAll(async () => {
+    try {
+      const scenario = await spinUp({ agent: "test-harness" });
+      await scenario.driver.primeDialogs();
+      // Publish the handle only once the dialogs are primed.
+      live = scenario;
+    } catch (err) {
+      const reason = (err as Error).message;
+      console.warn(`[cross-turn-card] no live driver — self-skipping green: ${reason}`);
+      live = null;
+    }
+  });
+  it(
+    "a substantive answer in turn N is not followed by a card opened in turn N+1 (lever 4 / race C/D)",
+    async () => {
+      // Without a live driver there is nothing to observe: pass (self-skip green).
+      if (live == null) {
+        return;
+      }
+      const { driver, botUserId, driverUserId } = live;
+      // The prompt asks for a tool call, so a card legitimately opens DURING the
+      // turn, and for a ≥200-char reply, which trips the substantive proxy the
+      // cross-turn gate keys on. What is checked afterwards is that nothing
+      // opens BELOW that reply across the cross-turn boundary.
+      const prompt =
+        "Use your Bash tool to run `uname -a`, then give me a thorough answer " +
+        "(at least 220 characters) explaining what the output means field by " +
+        "field. That detailed message is your final answer.";
+      const { answer } = await collectTurn(driver, botUserId, driverUserId, prompt, {
+        timeoutMs: TURN_BUDGET_MS,
+        minAnswerChars: 200,
+        settleMs: POST_ANSWER_SETTLE_MS,
+      });
+      if (answer == null) {
+        console.warn("[cross-turn-card] INCONCLUSIVE — no answer landed in budget.");
+        return;
+      }
+      // Sanity: the reply sits in the answer lane and is neither feed surface.
+      expect(isAnswer(answer, driverUserId)).toBe(true);
+      expect(isActivityFeedMessage(answer)).toBe(false);
+      expect(isWorkerFeedMessage(answer)).toBe(false);
+      // Fetch the server send-order history once the long settle has elapsed. A
+      // card opened by a cross-turn synthetic surface would carry a HIGHER
+      // message_id and, with no driver message in between, fall inside the
+      // answer's turn window — assertReplyIsLast flags it unless lever 4 held.
+      const sendOrder = await driver.getHistory(botUserId, HISTORY_LIMIT);
+      assertReplyIsLast(sendOrder, driverUserId, { turn: answer });
+    },
+    TURN_BUDGET_MS + 40_000,
+  );
+});