npm - switchroom - Versions diffs - 0.14.66 → 0.14.67 - Mend

switchroom 0.14.66 → 0.14.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/cli/switchroom.js +453 -325
package/package.json +1 -1
package/telegram-plugin/dist/gateway/gateway.js +24 -7
package/telegram-plugin/gateway/gateway.ts +42 -2
package/telegram-plugin/silence-poke.ts +25 -0
package/telegram-plugin/tests/silence-liveness-wiring.test.ts +67 -0
package/telegram-plugin/tests/silence-poke.test.ts +42 -0
package/telegram-plugin/uat/real-work-prompts.ts +332 -0
package/telegram-plugin/uat/scenarios/fuzz-real-work-channel.test.ts +82 -0
package/telegram-plugin/uat/scenarios/fuzz-real-work-dm.test.ts +64 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "switchroom",
-  "version": "0.14.66",
+  "version": "0.14.67",
   "description": "Run Claude Code 24/7 on your Claude Pro/Max subscription over Telegram. Open-source alternative to OpenClaw and NanoClaw — no API keys.",
   "type": "module",
   "bin": {

package/telegram-plugin/dist/gateway/gateway.js CHANGED Viewed

@@ -39034,6 +39034,13 @@ function noteOutbound2(key, now) {
   s.lastOutboundAt = now;
   s.fallbackFired = false;
 }
+function noteProduction(key, now) {
+  const s = state2.get(key);
+  if (s == null)
+    return;
+  s.lastOutboundAt = now;
+  s.fallbackFired = false;
+}
 function noteThinking(key, now) {
   const s = state2.get(key);
   if (s == null)
@@ -52763,11 +52770,11 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
 }
 // ../src/build-info.ts
-var VERSION = "0.14.66";
-var COMMIT_SHA = "0f4f029d";
-var COMMIT_DATE = "2026-06-05T07:05:45Z";
-var LATEST_PR = 2167;
-var COMMITS_AHEAD_OF_TAG = 2;
+var VERSION = "0.14.67";
+var COMMIT_SHA = "dcade213";
+var COMMIT_DATE = "2026-06-05T08:22:01Z";
+var LATEST_PR = 2171;
+var COMMITS_AHEAD_OF_TAG = 4;
 // gateway/boot-version.ts
 function formatRelativeAgo(iso) {
@@ -54075,7 +54082,7 @@ function findLatestEndedTurnForChat(chatId) {
   return latest;
 }
 function resolveAnswerThreadWithLog(chatId, explicitThreadId, originTurn, liveTurn, surface) {
-  const recovered = LATE_REPLY_TOPIC_RECOVERY_ENABLED && explicitThreadId == null && originTurn == null && liveTurn?.sessionThreadId == null ? findLatestEndedTurnForChat(chatId) : null;
+  const recovered = LATE_REPLY_TOPIC_RECOVERY_ENABLED && explicitThreadId == null && originTurn == null && liveTurn == null ? findLatestEndedTurnForChat(chatId) : null;
   const threadId = resolveAnswerThreadId({
     explicitThreadId,
     originResolved: originTurn != null,
@@ -55260,6 +55267,7 @@ function parsePositiveMsEnv(name, fallbackMs) {
 var SILENCE_FALLBACK_MS = parsePositiveMsEnv("SWITCHROOM_SILENCE_FALLBACK_MS", 300000);
 var SILENCE_FALLBACK_HARD_MS = parsePositiveMsEnv("SWITCHROOM_SILENCE_FALLBACK_HARD_MS", 900000);
 var SILENCE_DEFER_INFLIGHT_TOOLS = process.env.SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS === "1";
+var SILENCE_LIVENESS_PRODUCTION = process.env.SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== "0";
 startTimer({
   thresholdsMs: { fallback: SILENCE_FALLBACK_MS, fallbackHardCeiling: SILENCE_FALLBACK_HARD_MS },
   deferFallbackWhileToolInFlight: SILENCE_DEFER_INFLIGHT_TOOLS,
@@ -55351,8 +55359,11 @@ startTimer({
       const sib = silenceMsForKey(siblingKey, fbNow);
       return sib == null || sib >= DEFAULT_THRESHOLDS.fallback;
     });
-    if (turnMatchesFallback && currentTurn === wedgedTurn)
+    if (turnMatchesFallback && currentTurn === wedgedTurn && wedgedTurn != null) {
+      process.stderr.write(`telegram gateway: ${formatTurnLifecycle("clear", "silence_fallback", wedgedTurn, Date.now())}
+`);
       currentTurn = null;
+    }
     try {
       clearSilentEndState(fbKey);
     } catch {}
@@ -58076,6 +58087,9 @@ function handleSessionEvent(ev) {
       const rendered = appendActivityLabel(turn.mirrorLines, ev.label);
       if (rendered != null) {
         turn.lastToolLabelAt = Date.now();
+        if (SILENCE_LIVENESS_PRODUCTION && currentTurn === turn) {
+          noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
+        }
         turn.activityPendingRender = composeTurnActivity(turn) ?? rendered;
         if (turn.activityInFlight == null) {
           turn.activityInFlight = drainActivitySummary(turn);
@@ -58130,6 +58144,9 @@ function handleSessionEvent(ev) {
               logStreamingEvent(metricEv);
               if (currentTurn === turn) {
                 noteSignal(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
+                if (SILENCE_LIVENESS_PRODUCTION) {
+                  noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
+                }
               }
             },
             checkDedup: (text) => {

package/telegram-plugin/gateway/gateway.ts CHANGED Viewed

@@ -1930,11 +1930,17 @@ function resolveAnswerThreadWithLog(
   liveTurn: CurrentTurn | null,
   surface: 'reply' | 'stream_reply',
 ): number | undefined {
+  // Recover ONLY for a genuinely LATE reply — no live turn at all. Gating on
+  // `liveTurn?.sessionThreadId == null` (the original) also fired for a
+  // threadless DM that still had a live turn, marking every DM reply
+  // `via=recovered`/RECOVERED in the telemetry (routing result unchanged —
+  // DM → undefined — but it drowned the real supergroup recoveries the marker
+  // exists to surface). `liveTurn == null` is the precise late-reply condition.
   const recovered =
     LATE_REPLY_TOPIC_RECOVERY_ENABLED &&
     explicitThreadId == null &&
     originTurn == null &&
-    liveTurn?.sessionThreadId == null
+    liveTurn == null
       ? findLatestEndedTurnForChat(chatId)
       : null
   const threadId = resolveAnswerThreadId({
@@ -4673,6 +4679,12 @@ function parsePositiveMsEnv(name: string, fallbackMs: number): number {
 const SILENCE_FALLBACK_MS = parsePositiveMsEnv('SWITCHROOM_SILENCE_FALLBACK_MS', 300_000)
 const SILENCE_FALLBACK_HARD_MS = parsePositiveMsEnv('SWITCHROOM_SILENCE_FALLBACK_HARD_MS', 900_000)
 const SILENCE_DEFER_INFLIGHT_TOOLS = process.env.SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS === '1'
+// Production-liveness (2026-06-05 UAT finding). Count an activity-feed render or
+// an answer-stream draft update as liveness for the silence clock, so a long
+// tool/composition turn that's visibly producing doesn't trip the 300s fallback
+// and null currentTurn mid-work. Default ON; SWITCHROOM_SILENCE_LIVENESS_PRODUCTION=0
+// restores the legacy "only a real reply resets the clock" behaviour.
+const SILENCE_LIVENESS_PRODUCTION = process.env.SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'
 silencePoke.startTimer({
   thresholdsMs: { fallback: SILENCE_FALLBACK_MS, fallbackHardCeiling: SILENCE_FALLBACK_HARD_MS },
@@ -4889,7 +4901,16 @@ silencePoke.startTimer({
     // returns null and the regular teardown short-circuits. Without
     // this, the late event would re-emit `turn_ended` AND clobber
     // whatever fresh turn the next inbound started.
-    if (turnMatchesFallback && currentTurn === wedgedTurn) currentTurn = null
+    if (turnMatchesFallback && currentTurn === wedgedTurn && wedgedTurn != null) {
+      // Status-surface observability: emit the lifecycle CLEAR for the
+      // silence-poke teardown so a fallback-nulled turn has a turn-lifecycle
+      // line like every other clear path (the framework-fallback line below is
+      // its own format — this makes the dark-out greppable in the same shape).
+      process.stderr.write(
+        `telegram gateway: ${formatTurnLifecycle('clear', 'silence_fallback', wedgedTurn, Date.now())}\n`,
+      )
+      currentTurn = null
+    }
     // Best-effort: clear any pending silent-end marker so the Stop hook
     // doesn't double-block when claude eventually exits the wedged turn.
     try {
@@ -9452,6 +9473,16 @@ function handleSessionEvent(ev: SessionEvent): void {
         // the " · Ns" elapsed restarts from this step (and the feed itself just
         // advanced, so it isn't stale).
         turn.lastToolLabelAt = Date.now()
+        // Production-liveness: a NEW model-driven activity label is genuine
+        // liveness (the model emitted a new step), so reset the silence-poke
+        // clock — this is the safe site, NOT drainActivitySummary, because the
+        // framework feedHeartbeatTick also drains (climbing-elapsed re-renders)
+        // and would falsely reset the clock forever on a hung-mid-tool turn,
+        // reintroducing the #1556 dangling-turn wedge. Only the model emitting a
+        // fresh label reaches here.
+        if (SILENCE_LIVENESS_PRODUCTION && currentTurn === turn) {
+          silencePoke.noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now())
+        }
         // Recompose so any active foreground sub-agent's nested block (Model A)
         // is preserved when the parent appends its own step. composeTurnActivity
         // == the flat render when no foreground sub-agent is active.
@@ -9612,6 +9643,15 @@ function handleSessionEvent(ev: SessionEvent): void {
                   statusKey(turn.sessionChatId, turn.sessionThreadId),
                   Date.now(),
                 )
+                // Production-liveness: a draft update is the agent visibly
+                // composing — reset the silence-poke clock so a long
+                // compose-only turn (no tools, no reply yet) isn't torn down.
+                if (SILENCE_LIVENESS_PRODUCTION) {
+                  silencePoke.noteProduction(
+                    statusKey(turn.sessionChatId, turn.sessionThreadId),
+                    Date.now(),
+                  )
+                }
               }
             },
             // #646 — wire the shared outboundDedup into the answer-stream

package/telegram-plugin/silence-poke.ts CHANGED Viewed

@@ -196,6 +196,31 @@ export function noteOutbound(key: string, now: number): void {
   s.fallbackFired = false
 }
+/**
+ * Record observable PRODUCTION that isn't a final reply — an activity-feed
+ * render (`→/✓` edit-in-place message) or an answer-stream draft update. Resets
+ * the silence clock exactly like a reply.
+ *
+ * Why this exists (2026-06-05): the header's "only a real reply counts; tool
+ * churn / the model ripping through 20 tool calls is still SILENT to the user"
+ * rule predates the live activity feed (#2162) and the compose draft. Those
+ * surfaces ARE user-visible now, so a turn actively rendering them is NOT
+ * silent — yet the 300s fallback (which nulls `currentTurn` and kills the very
+ * feed/draft the user is watching) still fired on a long tool/composition turn,
+ * darkening the live status mid-work. Counting production as liveness makes the
+ * fallback fire only on GENUINE silence (no reply, no feed, no draft, no tool
+ * events for the window) — a real wedge. A wedged agent produces nothing
+ * observable, so its clock is never reset and it still recovers.
+ *
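+ * No-op when the key has no turn state. The SWITCHROOM_SILENCE_LIVENESS_PRODUCTION
+ * kill switch is NOT checked here — the gateway call sites gate on it before calling.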
+ */
+export function noteProduction(key: string, now: number): void {
+  const s = state.get(key)
+  if (s == null) return
+  s.lastOutboundAt = now
+  s.fallbackFired = false
+}
 /**
  * Record a `thinking` session event. Used to pick "still thinking…" vs
  * "still working…" wording for the 300s framework fallback.

package/telegram-plugin/tests/silence-liveness-wiring.test.ts ADDED Viewed

@@ -0,0 +1,67 @@
+/**
+ * Silence-poke production-liveness — heartbeat-safety guard (2026-06-05).
+ *
+ * The production-liveness fix resets the silence clock on observable production
+ * so a long WORKING turn doesn't dark out. The load-bearing constraint: the
+ * reset must fire ONLY on MODEL-driven production, NEVER from the framework
+ * `feedHeartbeatTick` — a model-INDEPENDENT setInterval that re-renders a
+ * climbing " · Ns" elapsed every 6s (defeating the feed's content-dedup). If the
+ * reset lived in `drainActivitySummary` (which the heartbeat drains), a
+ * hung-but-bridge-connected agent would have its 300s silence clock reset every
+ * 6s forever, the load-bearing silence-poke unwedge would NEVER fire, and the
+ * conversation would be pinned — the #1556 permanent dangling-turn wedge.
+ *
+ * An adversarial review panel caught exactly this in an earlier revision. These
+ * are STRUCTURAL assertions (the gateway IIFE can't be instantiated in-process —
+ * same pattern as multitopic-routing-wiring.test) that pin the reset to the
+ * model-driven sites so a refactor can't silently reintroduce the regression.
+ * The behavioural counterpart (noteProduction resets; STOP producing → fires)
+ * lives in silence-poke.test.ts; this guards the WIRING the heartbeat must not
+ * cross.
+ */
+import { describe, it, expect } from 'vitest'
+import { readFileSync } from 'node:fs'
+import { resolve } from 'node:path'
+const gatewaySrc = readFileSync(resolve(__dirname, '..', 'gateway', 'gateway.ts'), 'utf-8')
+function between(src: string, startMarker: string, endMarker: string): string {
+  const after = src.split(startMarker)[1] ?? ''
+  return after.split(endMarker)[0] ?? ''
+}
+describe('silence-poke production-liveness — heartbeat safety', () => {
+  it('drainActivitySummary must NOT reset the silence clock (the framework heartbeat drains here)', () => {
+    const body = between(gatewaySrc, 'async function drainActivitySummary', '\nfunction feedHeartbeatTick')
+    expect(body.length).toBeGreaterThan(100) // sanity: the slice found the function body
+    expect(body).not.toMatch(/noteProduction/)
+  })
+  it('feedHeartbeatTick itself must NOT reset the silence clock (model-independent re-render)', () => {
+    const body = between(gatewaySrc, 'function feedHeartbeatTick(): void {', '\n}')
+    expect(body.length).toBeGreaterThan(50)
+    expect(body).not.toMatch(/noteProduction/)
+  })
+  it('the MODEL-driven tool-label append IS the reset site, gated on the live turn', () => {
+    // appendActivityLabel returns a fresh render only when the model emits a NEW
+    // labelled step — the genuine liveness signal the heartbeat can never forge.
+    const block = between(
+      gatewaySrc,
+      'const rendered = appendActivityLabel(turn.mirrorLines, ev.label)',
+      '\n      return',
+    )
+    expect(block).toMatch(/silencePoke\.noteProduction/)
+    expect(block).toMatch(/currentTurn === turn/)
+  })
+  it('the answer-stream draft onMetric reset is model-driven and gated on the live turn', () => {
+    const block = between(gatewaySrc, 'onMetric: (metricEv) => {', '\n            },')
+    expect(block).toMatch(/silencePoke\.noteProduction/)
+    expect(block).toMatch(/currentTurn === turn/)
+  })
+  it('production-liveness is behind the default-ON SWITCHROOM_SILENCE_LIVENESS_PRODUCTION kill switch', () => {
+    expect(gatewaySrc).toMatch(/SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'/)
+  })
+})

package/telegram-plugin/tests/silence-poke.test.ts CHANGED Viewed

@@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'
 import {
   startTurn,
   noteOutbound,
+  noteProduction,
   noteThinking,
   noteToolStart,
   noteToolEnd,
@@ -136,6 +137,47 @@ describe('silence-poke — outbound resets the silence clock', () => {
   })
 })
+// Production-liveness (2026-06-05): an activity-feed render or draft update is
+// the agent visibly working — it resets the silence clock so a long
+// tool/composition turn isn't torn down mid-work.
+describe('silence-poke — noteProduction resets the silence clock', () => {
+  it('a feed/draft render at 250s pushes the fallback measurement to it', () => {
+    const fx = setupDeps()
+    startTurn('k', 0)
+    noteProduction('k', 250_000)
+    __tickForTests(300_000) // 50s since production — no fire
+    expect(fx.fallbacks).toHaveLength(0)
+    __tickForTests(550_000) // 300s since production — fires
+    expect(fx.fallbacks).toHaveLength(1)
+  })
+  it('repeated production every 60s keeps a long turn alive indefinitely', () => {
+    const fx = setupDeps()
+    startTurn('k', 0)
+    for (let t = 60_000; t <= 600_000; t += 60_000) {
+      noteProduction('k', t)
+      __tickForTests(t)
+    }
+    // 10 min of steady feed/draft renders — never torn down.
+    expect(fx.fallbacks).toHaveLength(0)
+  })
+  it('production STOPS → the fallback fires 300s after the last render (genuine wedge)', () => {
+    const fx = setupDeps()
+    startTurn('k', 0)
+    noteProduction('k', 100_000) // last render at 100s, then silence
+    __tickForTests(390_000) // 290s since last render — no fire
+    expect(fx.fallbacks).toHaveLength(0)
+    __tickForTests(401_000) // 301s since last render — fires
+    expect(fx.fallbacks).toHaveLength(1)
+  })
+  it('is a no-op for an unknown key (no turn state)', () => {
+    setupDeps()
+    expect(() => noteProduction('nope', 1_000)).not.toThrow()
+  })
+})
 // Pin the contract the gateway must uphold for ABNORMAL turn-ends:
 // every code path that abandons a turn before turn_end (context-
 // exhaust bail, gateway-side wedge timeout, silent-end recovery)

package/telegram-plugin/uat/real-work-prompts.ts ADDED Viewed

@@ -0,0 +1,332 @@
+/**
+ * Real-work UAT coverage — human-style prompts that trigger actual work
+ * (multi-tool, web research, sub-agents, background workers) plus a turn
+ * collector + bug detectors for the failure classes the conversational fuzz
+ * never exercised.
+ *
+ * Why this exists: the existing fuzz scenarios send conversational prompts
+ * ("hey how's it going", emoji, markdown edge-cases) → trivial fast replies.
+ * The status-surface and reply-ordering bugs (live feed going dark mid-work,
+ * the orphaned-reply backstop flushing a fragment then the real answer landing
+ * late and out of order, late replies misrouting) only manifest when the agent
+ * does REAL work — uses tools/MCPs, spawns sub-agents, researches long enough to
+ * cross the silence-poke / orphaned-reply thresholds. These prompts provoke that
+ * work in a human voice; `collectTurn` captures the whole bot-message sequence;
+ * `analyzeTurn` flags the known bug signatures.
+ *
+ * Harness limits (see CLAUDE.md): mtcute observes real sendMessage/editMessageText
+ * (so the activity feed `→/✓` and worker feed `🛠` ARE observable) but NOT drafts
+ * or reactions, and has no forum-topic API (channel scenarios use the General
+ * topic — they prove DM-vs-channel routing, not correct-topic-among-many, which
+ * the gateway unit thread-assertions pin). So work-triggering is probabilistic on
+ * a generic agent: the UNIVERSAL invariants (a substantive answer arrives, in the
+ * right surface, not as an orphaned fragment) are hard; the work-specific surfaces
+ * (feed painted, worker surfaced) are reported and only hard-checked once their
+ * precondition is observed.
+ */
+import type { Driver, ObservedMessage } from "./driver.js";
+import { isWorkerFeedMessage, isActivityFeedMessage } from "./assertions.js";
+export type WorkKind =
+  | "research" // web/multi-source research → multi-tool, long
+  | "multitool" // several tool calls, sequential
+  | "subagent" // delegates to a foreground sub-agent
+  | "bgworker" // dispatches a background worker (the 🛠 feed)
+  | "compound" // first X then Y then summarise — ordered multi-step
+  | "web"; // current/recent info → forces a web fetch
+export interface RealWorkCase {
+  name: string;
+  /** Human-style prompt that should provoke real work. */
+  prompt: string;
+  kind: WorkKind;
+  /** Generous budget — deep research can run minutes. */
+  timeoutMs: number;
+  /** The substantive answer must be at least this long; a backstop fragment /
+   *  bare ack is shorter, so this distinguishes "the answer landed" from "only a
+   *  stub landed". */
+  minAnswerChars: number;
+  /** When set, names the surface ("worker" or "activity") this prompt is meant to
+   *  trigger reliably. Used for the semi-prescriptive but natural-sounding
+   *  bgworker/multitool cases. Note: `analyzeTurn` reports a missing surface as a
+   *  soft `surface-missing` warning, not as a hard violation. */
+  requireSurface?: "worker" | "activity";
+}
+/**
+ * The case set. The first block is fully human-style (probabilistic work); the
+ * `requireSurface` block phrases the dispatch naturally but reliably enough to
+ * hard-assert the surface. Keep prompts provider-agnostic so they run on the
+ * generic test-harness agent (no marko-specific MCPs).
+ */
+export const REAL_WORK_CASES: RealWorkCase[] = [
+  {
+    name: "deep research, take your time",
+    prompt:
+      "Can you research the current state of WebAssembly outside the browser — " +
+      "the main server-side runtimes, who's actually using it in production, and " +
+      "the real limitations today? Take your time and give me a proper rundown, " +
+      "not a one-liner.",
+    kind: "research",
+    timeoutMs: 180_000,
+    minAnswerChars: 400,
+  },
+  {
+    name: "current info, forces a lookup",
+    prompt:
+      "What's the latest with the Bun JavaScript runtime — the recent releases " +
+      "and whether people consider it production-ready yet? Check, don't guess.",
+    kind: "web",
+    timeoutMs: 150_000,
+    minAnswerChars: 300,
+  },
+  {
+    name: "multi-angle investigation",
+    prompt:
+      "Dig into Postgres vs SQLite for a small SaaS backend — look at it from a " +
+      "few angles (concurrency, ops burden, cost at scale) and tell me which " +
+      "you'd actually pick and why.",
+    kind: "multitool",
+    timeoutMs: 150_000,
+    minAnswerChars: 400,
+  },
+  {
+    name: "compound sequential ask",
+    prompt:
+      "First work out what today's date is, then how many days are left until the " +
+      "end of this quarter, then suggest three concrete milestones I could hit " +
+      "before then. Do it in that order.",
+    kind: "compound",
+    timeoutMs: 120_000,
+    minAnswerChars: 250,
+  },
+  {
+    name: "invite delegation",
+    prompt:
+      "I need a proper comparison of Stripe vs Paddle vs Lemon Squeezy for selling " +
+      "a digital product — pricing, who handles sales tax, and payout timing. Farm " +
+      "it out to a sub-agent if that's faster; just give me the bottom line at the end.",
+    kind: "subagent",
+    timeoutMs: 180_000,
+    minAnswerChars: 350,
+  },
+  {
+    name: "long sourced briefing (crosses thresholds)",
+    prompt:
+      "Give me a thorough, well-sourced briefing on the EU AI Act — what it covers, " +
+      "the risk tiers, the key deadlines, and what a small AI startup actually has to " +
+      "do. Be comprehensive; I'd rather wait and get depth.",
+    kind: "research",
+    timeoutMs: 360_000,
+    minAnswerChars: 500,
+  },
+  // ── reliably-triggering, still natural voice ──────────────────────────────
+  {
+    name: "background worker, ping me when done",
+    prompt:
+      "Don't answer this inline — actually dispatch a background worker for it " +
+      "(Task / Agent with run_in_background: true) so I can keep chatting while it " +
+      "runs, and ping me when it's done. The task: go through, ONE step at a time " +
+      "with a one-line note on each (run a quick command or jot a note per step so " +
+      "there's visible progress), the eight most common email-deliverability " +
+      "mistakes a solo founder makes — SPF, DKIM, DMARC, warmup, list hygiene, " +
+      "content, sending cadence, monitoring. Pace it over a couple of minutes; do " +
+      "all eight, then hand back the summary.",
+    kind: "bgworker",
+    // Generous: if the agent declines to background it and composes inline, a
+    // paced 8-step answer can run past 5 min (and, with no tracked tool in
+    // flight, trip the 300s silence-poke — see the 2026-06-05 UAT finding).
+    timeoutMs: 360_000,
+    minAnswerChars: 250,
+    requireSurface: "worker",
+  },
+  {
+    name: "step-by-step so the feed paints",
+    prompt:
+      "Walk through, ONE step at a time (run a quick command or note for each so I " +
+      "can see progress), how you'd debug a Linux box that's suddenly out of disk " +
+      "space — six steps: df, du on the big dirs, find large files, check logs, " +
+      "check deleted-but-open files, then a cleanup plan. Then give me the recap.",
+    kind: "multitool",
+    timeoutMs: 180_000,
+    minAnswerChars: 300,
+    requireSurface: "activity",
+  },
+];
+/** What the collector observed across one turn. */
+export interface TurnObservation {
+  /** Every bot message (initial sends only; edits tracked separately). */
+  botMessages: ObservedMessage[];
+  /** Edit events seen (worker/activity feeds grow via edits). */
+  edits: ObservedMessage[];
+  /** The first substantive answer (non-feed, >= minAnswerChars), or null. */
+  answer: ObservedMessage | null;
+  /** ms from send to the answer (or to timeout). */
+  answerLatencyMs: number;
+  /** Whether an activity feed (`→/✓`) message was seen. */
+  sawActivityFeed: boolean;
+  /** Whether a worker feed (`🛠 Worker`) message was seen. */
+  sawWorkerFeed: boolean;
+}
+/**
+ * Send `prompt` and collect the bot's message sequence until a substantive
+ * answer lands (+ a short settle to catch trailing/late sends — the very window
+ * the orphaned-reply bug lives in) or `timeoutMs` elapses.
+ *
+ * The message iterator is created BEFORE the send and is always closed on the
+ * way out, including when the send itself throws.
+ * NOTE(review): if `observeMessages` is a lazy async generator, its subscription
+ * only starts at the first `next()`, i.e. after the send — confirm in the driver.
+ *
+ * @param driver - harness driver used to send the prompt and observe the chat.
+ * @param chatId - chat to send into and observe.
+ * @param driverUserId - sender id of the driver account; its messages are skipped.
+ * @param prompt - text to send.
+ * @param opts - `timeoutMs`: overall budget measured from just before the send;
+ *   `minAnswerChars`: minimum trimmed length for a non-feed message to count as
+ *   the answer; `settleMs`: extra collection window after the answer (default 6s).
+ * @returns what was observed; `answer` is null when no qualifying message arrived.
+ */
+export async function collectTurn(
+  driver: Driver,
+  chatId: number,
+  driverUserId: number,
+  prompt: string,
+  opts: { timeoutMs: number; minAnswerChars: number; settleMs?: number },
+): Promise<TurnObservation> {
+  const settleMs = opts.settleMs ?? 6_000;
+  const botMessages: ObservedMessage[] = [];
+  const edits: ObservedMessage[] = [];
+  let answer: ObservedMessage | null = null;
+  let sawActivityFeed = false;
+  let sawWorkerFeed = false;
+  const startedAt = Date.now();
+  const iterator = driver.observeMessages(chatId)[Symbol.asyncIterator]();
+  try {
+    // Begin observing, then send (observeMessages backfills nothing, but the send
+    // round-trips after the iterator is live).
+    await driver.sendText(chatId, prompt);
+    let settleDeadline = Number.POSITIVE_INFINITY;
+    while (true) {
+      const remaining =
+        Math.min(opts.timeoutMs - (Date.now() - startedAt), settleDeadline - Date.now());
+      if (remaining <= 0) break;
+      // Race the next message against the remaining budget. The timer MUST be
+      // cleared once the race settles: otherwise every observed message leaves a
+      // live timer of up to `timeoutMs` behind, and those pile up and keep the
+      // event loop alive long after the turn has been collected.
+      let timer: ReturnType<typeof setTimeout> | undefined;
+      const budgetElapsed = new Promise<{ done: true; value: undefined }>((r) => {
+        timer = setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining));
+      });
+      const next = await Promise.race([iterator.next(), budgetElapsed]).finally(() =>
+        clearTimeout(timer),
+      );
+      if (next.done || next.value == null) {
+        // timed out (either overall or settle) or the stream ended — stop
+        break;
+      }
+      const m = next.value as ObservedMessage;
+      if (m.senderUserId === driverUserId) continue; // our own echo
+      if (m.edited) {
+        edits.push(m);
+        if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
+        if (isActivityFeedMessage(m)) sawActivityFeed = true;
+        continue;
+      }
+      botMessages.push(m);
+      if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
+      else if (isActivityFeedMessage(m)) sawActivityFeed = true;
+      else if (answer == null && m.text.trim().length >= opts.minAnswerChars) {
+        answer = m;
+        // Got the answer; keep collecting for `settleMs` to catch a late
+        // fragment/duplicate/misrouted trailing send.
+        settleDeadline = Date.now() + settleMs;
+      }
+    }
+  } finally {
+    // Close the observation on every exit path (normal completion, timeout, or a
+    // throwing send) so the driver-side subscription is not leaked.
+    void iterator.return?.();
+  }
+  return {
+    botMessages,
+    edits,
+    answer,
+    answerLatencyMs: answer ? answer.date.getTime() - startedAt : Date.now() - startedAt,
+    sawActivityFeed,
+    sawWorkerFeed,
+  };
+}
+export interface TurnViolation {
+  code:
+    | "no-answer"
+    | "orphaned-fragment"
+    | "surface-missing"
+    | "wrong-surface";
+  detail: string;
+}
+/**
+ * Run the bug detectors over one collected turn.
+ *
+ * Returned `violations` (hard):
+ *  - no-answer: `obs.answer` is null.
+ *  - orphaned-fragment: some non-feed bot message other than the answer has
+ *    1–149 trimmed characters and is dated at least 8s before the answer — the
+ *    orphaned-reply backstop signature (fragment flushed, real reply late).
+ *  - wrong-surface: any collected send or edit carries a `chatId` other than
+ *    `expected.chatId`.
+ *
+ * Returned `warnings` (soft):
+ *  - surface-missing: `requireSurface` is "worker" and no worker feed was seen,
+ *    or it is "activity" and neither an activity nor a worker feed was seen.
+ *    The agent may legitimately have answered inline, so this is reported
+ *    rather than failed.
+ */
+export function analyzeTurn(
+  obs: TurnObservation,
+  expected: { requireSurface?: "worker" | "activity"; chatId: number },
+): { violations: TurnViolation[]; warnings: TurnViolation[] } {
+  const violations: TurnViolation[] = [];
+  const warnings: TurnViolation[] = [];
+  const { answer } = obs;
+  if (answer == null) {
+    violations.push({
+      code: "no-answer",
+      detail: `no substantive reply within budget (saw ${obs.botMessages.length} bot msg(s), ` +
+        `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed})`,
+    });
+  } else {
+    // A stub is a short, non-empty, non-feed message that predates the answer by
+    // at least 8s; a quick ack right before the answer falls under that gap.
+    const stubs = obs.botMessages.filter((msg) => {
+      if (msg.messageId === answer.messageId) return false;
+      if (isWorkerFeedMessage(msg) || isActivityFeedMessage(msg)) return false;
+      const trimmedLength = msg.text.trim().length;
+      if (trimmedLength === 0 || trimmedLength >= 150) return false;
+      return answer.date.getTime() - msg.date.getTime() >= 8_000;
+    });
+    const firstStub = stubs[0];
+    if (firstStub != null) {
+      violations.push({
+        code: "orphaned-fragment",
+        detail: `${stubs.length} stub message(s) landed ≥8s before the answer ` +
+          `(e.g. ${JSON.stringify(firstStub.text.slice(0, 60))}) — the orphaned-reply ` +
+          `backstop signature.`,
+      });
+    }
+  }
+  switch (expected.requireSurface) {
+    case "worker":
+      if (!obs.sawWorkerFeed) {
+        warnings.push({ code: "surface-missing", detail: "expected a 🛠 worker feed; agent likely answered inline" });
+      }
+      break;
+    case "activity":
+      // A worker feed also satisfies the activity expectation.
+      if (!obs.sawActivityFeed && !obs.sawWorkerFeed) {
+        warnings.push({ code: "surface-missing", detail: "expected a →/✓ activity feed; none appeared" });
+      }
+      break;
+    default:
+      break;
+  }
+  const stray = obs.botMessages.concat(obs.edits).filter((msg) => msg.chatId !== expected.chatId);
+  const firstStray = stray[0];
+  if (firstStray != null) {
+    violations.push({
+      code: "wrong-surface",
+      detail: `${stray.length} bot message(s) landed in chat ${firstStray.chatId}, expected ${expected.chatId}`,
+    });
+  }
+  return { violations, warnings };
+}
+/**
+ * Render a single-line summary of a collected turn for the test log.
+ *
+ * @param name - label for the case, echoed after the `[real-work]` tag.
+ * @param obs - the collected turn.
+ * @returns `answer=<trimmed chars>ch@<rounded seconds>s` (or `answer=NONE`),
+ *   followed by the send/edit counts and the two feed flags.
+ */
+export function summarizeTurn(name: string, obs: TurnObservation): string {
+  let answerPart = "NONE";
+  if (obs.answer) {
+    const chars = obs.answer.text.trim().length;
+    const seconds = Math.round(obs.answerLatencyMs / 1000);
+    answerPart = `${chars}ch@${seconds}s`;
+  }
+  const segments = [
+    `[real-work] ${name}: answer=${answerPart}`,
+    `botMsgs=${obs.botMessages.length} edits=${obs.edits.length}`,
+    `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed}`,
+  ];
+  return segments.join(" ");
+}