switchroom 0.14.66 → 0.14.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "switchroom",
3
- "version": "0.14.66",
3
+ "version": "0.14.67",
4
4
  "description": "Run Claude Code 24/7 on your Claude Pro/Max subscription over Telegram. Open-source alternative to OpenClaw and NanoClaw — no API keys.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -39034,6 +39034,13 @@ function noteOutbound2(key, now) {
39034
39034
  s.lastOutboundAt = now;
39035
39035
  s.fallbackFired = false;
39036
39036
  }
39037
+ function noteProduction(key, now) {
39038
+ const s = state2.get(key);
39039
+ if (s == null)
39040
+ return;
39041
+ s.lastOutboundAt = now;
39042
+ s.fallbackFired = false;
39043
+ }
39037
39044
  function noteThinking(key, now) {
39038
39045
  const s = state2.get(key);
39039
39046
  if (s == null)
@@ -52763,11 +52770,11 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
52763
52770
  }
52764
52771
 
52765
52772
  // ../src/build-info.ts
52766
- var VERSION = "0.14.66";
52767
- var COMMIT_SHA = "0f4f029d";
52768
- var COMMIT_DATE = "2026-06-05T07:05:45Z";
52769
- var LATEST_PR = 2167;
52770
- var COMMITS_AHEAD_OF_TAG = 2;
52773
+ var VERSION = "0.14.67";
52774
+ var COMMIT_SHA = "dcade213";
52775
+ var COMMIT_DATE = "2026-06-05T08:22:01Z";
52776
+ var LATEST_PR = 2171;
52777
+ var COMMITS_AHEAD_OF_TAG = 4;
52771
52778
 
52772
52779
  // gateway/boot-version.ts
52773
52780
  function formatRelativeAgo(iso) {
@@ -54075,7 +54082,7 @@ function findLatestEndedTurnForChat(chatId) {
54075
54082
  return latest;
54076
54083
  }
54077
54084
  function resolveAnswerThreadWithLog(chatId, explicitThreadId, originTurn, liveTurn, surface) {
54078
- const recovered = LATE_REPLY_TOPIC_RECOVERY_ENABLED && explicitThreadId == null && originTurn == null && liveTurn?.sessionThreadId == null ? findLatestEndedTurnForChat(chatId) : null;
54085
+ const recovered = LATE_REPLY_TOPIC_RECOVERY_ENABLED && explicitThreadId == null && originTurn == null && liveTurn == null ? findLatestEndedTurnForChat(chatId) : null;
54079
54086
  const threadId = resolveAnswerThreadId({
54080
54087
  explicitThreadId,
54081
54088
  originResolved: originTurn != null,
@@ -55260,6 +55267,7 @@ function parsePositiveMsEnv(name, fallbackMs) {
55260
55267
  var SILENCE_FALLBACK_MS = parsePositiveMsEnv("SWITCHROOM_SILENCE_FALLBACK_MS", 300000);
55261
55268
  var SILENCE_FALLBACK_HARD_MS = parsePositiveMsEnv("SWITCHROOM_SILENCE_FALLBACK_HARD_MS", 900000);
55262
55269
  var SILENCE_DEFER_INFLIGHT_TOOLS = process.env.SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS === "1";
55270
+ var SILENCE_LIVENESS_PRODUCTION = process.env.SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== "0";
55263
55271
  startTimer({
55264
55272
  thresholdsMs: { fallback: SILENCE_FALLBACK_MS, fallbackHardCeiling: SILENCE_FALLBACK_HARD_MS },
55265
55273
  deferFallbackWhileToolInFlight: SILENCE_DEFER_INFLIGHT_TOOLS,
@@ -55351,8 +55359,11 @@ startTimer({
55351
55359
  const sib = silenceMsForKey(siblingKey, fbNow);
55352
55360
  return sib == null || sib >= DEFAULT_THRESHOLDS.fallback;
55353
55361
  });
55354
- if (turnMatchesFallback && currentTurn === wedgedTurn)
55362
+ if (turnMatchesFallback && currentTurn === wedgedTurn && wedgedTurn != null) {
55363
+ process.stderr.write(`telegram gateway: ${formatTurnLifecycle("clear", "silence_fallback", wedgedTurn, Date.now())}
55364
+ `);
55355
55365
  currentTurn = null;
55366
+ }
55356
55367
  try {
55357
55368
  clearSilentEndState(fbKey);
55358
55369
  } catch {}
@@ -58076,6 +58087,9 @@ function handleSessionEvent(ev) {
58076
58087
  const rendered = appendActivityLabel(turn.mirrorLines, ev.label);
58077
58088
  if (rendered != null) {
58078
58089
  turn.lastToolLabelAt = Date.now();
58090
+ if (SILENCE_LIVENESS_PRODUCTION && currentTurn === turn) {
58091
+ noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
58092
+ }
58079
58093
  turn.activityPendingRender = composeTurnActivity(turn) ?? rendered;
58080
58094
  if (turn.activityInFlight == null) {
58081
58095
  turn.activityInFlight = drainActivitySummary(turn);
@@ -58130,6 +58144,9 @@ function handleSessionEvent(ev) {
58130
58144
  logStreamingEvent(metricEv);
58131
58145
  if (currentTurn === turn) {
58132
58146
  noteSignal(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
58147
+ if (SILENCE_LIVENESS_PRODUCTION) {
58148
+ noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
58149
+ }
58133
58150
  }
58134
58151
  },
58135
58152
  checkDedup: (text) => {
@@ -1930,11 +1930,17 @@ function resolveAnswerThreadWithLog(
1930
1930
  liveTurn: CurrentTurn | null,
1931
1931
  surface: 'reply' | 'stream_reply',
1932
1932
  ): number | undefined {
1933
+ // Recover ONLY for a genuinely LATE reply — no live turn at all. Gating on
1934
+ // `liveTurn?.sessionThreadId == null` (the original) also fired for a
1935
+ // threadless DM that still had a live turn, marking every DM reply
1936
+ // `via=recovered`/RECOVERED in the telemetry (routing result unchanged —
1937
+ // DM → undefined — but it drowned the real supergroup recoveries the marker
1938
+ // exists to surface). `liveTurn == null` is the precise late-reply condition.
1933
1939
  const recovered =
1934
1940
  LATE_REPLY_TOPIC_RECOVERY_ENABLED &&
1935
1941
  explicitThreadId == null &&
1936
1942
  originTurn == null &&
1937
- liveTurn?.sessionThreadId == null
1943
+ liveTurn == null
1938
1944
  ? findLatestEndedTurnForChat(chatId)
1939
1945
  : null
1940
1946
  const threadId = resolveAnswerThreadId({
@@ -4673,6 +4679,12 @@ function parsePositiveMsEnv(name: string, fallbackMs: number): number {
4673
4679
  const SILENCE_FALLBACK_MS = parsePositiveMsEnv('SWITCHROOM_SILENCE_FALLBACK_MS', 300_000)
4674
4680
  const SILENCE_FALLBACK_HARD_MS = parsePositiveMsEnv('SWITCHROOM_SILENCE_FALLBACK_HARD_MS', 900_000)
4675
4681
  const SILENCE_DEFER_INFLIGHT_TOOLS = process.env.SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS === '1'
4682
+ // Production-liveness (2026-06-05 UAT finding). Count an activity-feed render or
4683
+ // an answer-stream draft update as liveness for the silence clock, so a long
4684
+ // tool/composition turn that's visibly producing doesn't trip the 300s fallback
4685
+ // and null currentTurn mid-work. Default ON; SWITCHROOM_SILENCE_LIVENESS_PRODUCTION=0
4686
+ // restores the legacy "only a real reply resets the clock" behaviour.
4687
+ const SILENCE_LIVENESS_PRODUCTION = process.env.SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'
4676
4688
 
4677
4689
  silencePoke.startTimer({
4678
4690
  thresholdsMs: { fallback: SILENCE_FALLBACK_MS, fallbackHardCeiling: SILENCE_FALLBACK_HARD_MS },
@@ -4889,7 +4901,16 @@ silencePoke.startTimer({
4889
4901
  // returns null and the regular teardown short-circuits. Without
4890
4902
  // this, the late event would re-emit `turn_ended` AND clobber
4891
4903
  // whatever fresh turn the next inbound started.
4892
- if (turnMatchesFallback && currentTurn === wedgedTurn) currentTurn = null
4904
+ if (turnMatchesFallback && currentTurn === wedgedTurn && wedgedTurn != null) {
4905
+ // Status-surface observability: emit the lifecycle CLEAR for the
4906
+ // silence-poke teardown so a fallback-nulled turn has a turn-lifecycle
4907
+ // line like every other clear path (the framework-fallback line below is
4908
+ // its own format — this makes the dark-out greppable in the same shape).
4909
+ process.stderr.write(
4910
+ `telegram gateway: ${formatTurnLifecycle('clear', 'silence_fallback', wedgedTurn, Date.now())}\n`,
4911
+ )
4912
+ currentTurn = null
4913
+ }
4893
4914
  // Best-effort: clear any pending silent-end marker so the Stop hook
4894
4915
  // doesn't double-block when claude eventually exits the wedged turn.
4895
4916
  try {
@@ -9452,6 +9473,16 @@ function handleSessionEvent(ev: SessionEvent): void {
9452
9473
  // the " · Ns" elapsed restarts from this step (and the feed itself just
9453
9474
  // advanced, so it isn't stale).
9454
9475
  turn.lastToolLabelAt = Date.now()
9476
+ // Production-liveness: a NEW model-driven activity label is genuine
9477
+ // liveness (the model emitted a new step), so reset the silence-poke
9478
+ // clock — this is the safe site, NOT drainActivitySummary, because the
9479
+ // framework feedHeartbeatTick also drains (climbing-elapsed re-renders)
9480
+ // and would falsely reset the clock forever on a hung-mid-tool turn,
9481
+ // reintroducing the #1556 dangling-turn wedge. Only the model emitting a
9482
+ // fresh label reaches here.
9483
+ if (SILENCE_LIVENESS_PRODUCTION && currentTurn === turn) {
9484
+ silencePoke.noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now())
9485
+ }
9455
9486
  // Recompose so any active foreground sub-agent's nested block (Model A)
9456
9487
  // is preserved when the parent appends its own step. composeTurnActivity
9457
9488
  // == the flat render when no foreground sub-agent is active.
@@ -9612,6 +9643,15 @@ function handleSessionEvent(ev: SessionEvent): void {
9612
9643
  statusKey(turn.sessionChatId, turn.sessionThreadId),
9613
9644
  Date.now(),
9614
9645
  )
9646
+ // Production-liveness: a draft update is the agent visibly
9647
+ // composing — reset the silence-poke clock so a long
9648
+ // compose-only turn (no tools, no reply yet) isn't torn down.
9649
+ if (SILENCE_LIVENESS_PRODUCTION) {
9650
+ silencePoke.noteProduction(
9651
+ statusKey(turn.sessionChatId, turn.sessionThreadId),
9652
+ Date.now(),
9653
+ )
9654
+ }
9615
9655
  }
9616
9656
  },
9617
9657
  // #646 — wire the shared outboundDedup into the answer-stream
@@ -196,6 +196,31 @@ export function noteOutbound(key: string, now: number): void {
196
196
  s.fallbackFired = false
197
197
  }
198
198
 
199
+ /**
200
+ * Record observable PRODUCTION that isn't a final reply — an activity-feed
201
+ * render (`→/✓` edit-in-place message) or an answer-stream draft update. Resets
202
+ * the silence clock exactly like a reply.
203
+ *
204
+ * Why this exists (2026-06-05): the header's "only a real reply counts; tool
205
+ * churn / the model ripping through 20 tool calls is still SILENT to the user"
206
+ * rule predates the live activity feed (#2162) and the compose draft. Those
207
+ * surfaces ARE user-visible now, so a turn actively rendering them is NOT
208
+ * silent — yet the 300s fallback (which nulls `currentTurn` and kills the very
209
+ * feed/draft the user is watching) still fired on a long tool/composition turn,
210
+ * darkening the live status mid-work. Counting production as liveness makes the
211
+ * fallback fire only on GENUINE silence (no reply, no feed, no draft, no tool
212
+ * events for the window) — a real wedge. A wedged agent produces nothing
213
+ * observable, so its clock is never reset and it still recovers.
214
+ *
215
+ * No-op when the key has no turn. The SWITCHROOM_SILENCE_LIVENESS_PRODUCTION=0 kill switch is enforced by the gateway call sites, not inside this function.
216
+ */
217
+ export function noteProduction(key: string, now: number): void {
218
+ const s = state.get(key)
219
+ if (s == null) return
220
+ s.lastOutboundAt = now
221
+ s.fallbackFired = false
222
+ }
223
+
199
224
  /**
200
225
  * Record a `thinking` session event. Used to pick "still thinking…" vs
201
226
  * "still working…" wording for the 300s framework fallback.
@@ -0,0 +1,67 @@
1
+ /**
2
+ * Silence-poke production-liveness — heartbeat-safety guard (2026-06-05).
3
+ *
4
+ * The production-liveness fix resets the silence clock on observable production
5
+ * so a long WORKING turn doesn't dark out. The load-bearing constraint: the
6
+ * reset must fire ONLY on MODEL-driven production, NEVER from the framework
7
+ * `feedHeartbeatTick` — a model-INDEPENDENT setInterval that re-renders a
8
+ * climbing " · Ns" elapsed every 6s (defeating the feed's content-dedup). If the
9
+ * reset lived in `drainActivitySummary` (which the heartbeat drains), a
10
+ * hung-but-bridge-connected agent would have its 300s silence clock reset every
11
+ * 6s forever, the load-bearing silence-poke unwedge would NEVER fire, and the
12
+ * conversation would be pinned — the #1556 permanent dangling-turn wedge.
13
+ *
14
+ * An adversarial review panel caught exactly this in an earlier revision. These
15
+ * are STRUCTURAL assertions (the gateway IIFE can't be instantiated in-process —
16
+ * same pattern as multitopic-routing-wiring.test) that pin the reset to the
17
+ * model-driven sites so a refactor can't silently reintroduce the regression.
18
+ * The behavioural counterpart (noteProduction resets; STOP producing → fires)
19
+ * lives in silence-poke.test.ts; this guards the WIRING the heartbeat must not
20
+ * cross.
21
+ */
22
+ import { describe, it, expect } from 'vitest'
23
+ import { readFileSync } from 'node:fs'
24
+ import { resolve } from 'node:path'
25
+
26
+ const gatewaySrc = readFileSync(resolve(__dirname, '..', 'gateway', 'gateway.ts'), 'utf-8')
27
+
28
+ function between(src: string, startMarker: string, endMarker: string): string {
29
+ const after = src.split(startMarker)[1] ?? ''
30
+ return after.split(endMarker)[0] ?? ''
31
+ }
32
+
33
+ describe('silence-poke production-liveness — heartbeat safety', () => {
34
+ it('drainActivitySummary must NOT reset the silence clock (the framework heartbeat drains here)', () => {
35
+ const body = between(gatewaySrc, 'async function drainActivitySummary', '\nfunction feedHeartbeatTick')
36
+ expect(body.length).toBeGreaterThan(100) // sanity: the slice found the function body
37
+ expect(body).not.toMatch(/noteProduction/)
38
+ })
39
+
40
+ it('feedHeartbeatTick itself must NOT reset the silence clock (model-independent re-render)', () => {
41
+ const body = between(gatewaySrc, 'function feedHeartbeatTick(): void {', '\n}')
42
+ expect(body.length).toBeGreaterThan(50)
43
+ expect(body).not.toMatch(/noteProduction/)
44
+ })
45
+
46
+ it('the MODEL-driven tool-label append IS the reset site, gated on the live turn', () => {
47
+ // appendActivityLabel returns a fresh render only when the model emits a NEW
48
+ // labelled step — the genuine liveness signal the heartbeat can never forge.
49
+ const block = between(
50
+ gatewaySrc,
51
+ 'const rendered = appendActivityLabel(turn.mirrorLines, ev.label)',
52
+ '\n return',
53
+ )
54
+ expect(block).toMatch(/silencePoke\.noteProduction/)
55
+ expect(block).toMatch(/currentTurn === turn/)
56
+ })
57
+
58
+ it('the answer-stream draft onMetric reset is model-driven and gated on the live turn', () => {
59
+ const block = between(gatewaySrc, 'onMetric: (metricEv) => {', '\n },')
60
+ expect(block).toMatch(/silencePoke\.noteProduction/)
61
+ expect(block).toMatch(/currentTurn === turn/)
62
+ })
63
+
64
+ it('production-liveness is behind the default-ON SWITCHROOM_SILENCE_LIVENESS_PRODUCTION kill switch', () => {
65
+ expect(gatewaySrc).toMatch(/SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'/)
66
+ })
67
+ })
@@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'
2
2
  import {
3
3
  startTurn,
4
4
  noteOutbound,
5
+ noteProduction,
5
6
  noteThinking,
6
7
  noteToolStart,
7
8
  noteToolEnd,
@@ -136,6 +137,47 @@ describe('silence-poke — outbound resets the silence clock', () => {
136
137
  })
137
138
  })
138
139
 
140
+ // Production-liveness (2026-06-05): an activity-feed render or draft update is
141
+ // the agent visibly working — it resets the silence clock so a long
142
+ // tool/composition turn isn't torn down mid-work.
143
+ describe('silence-poke — noteProduction resets the silence clock', () => {
144
+ it('a feed/draft render at 250s pushes the fallback measurement to it', () => {
145
+ const fx = setupDeps()
146
+ startTurn('k', 0)
147
+ noteProduction('k', 250_000)
148
+ __tickForTests(300_000) // 50s since production — no fire
149
+ expect(fx.fallbacks).toHaveLength(0)
150
+ __tickForTests(550_000) // 300s since production — fires
151
+ expect(fx.fallbacks).toHaveLength(1)
152
+ })
153
+
154
+ it('repeated production every 60s keeps a long turn alive indefinitely', () => {
155
+ const fx = setupDeps()
156
+ startTurn('k', 0)
157
+ for (let t = 60_000; t <= 600_000; t += 60_000) {
158
+ noteProduction('k', t)
159
+ __tickForTests(t)
160
+ }
161
+ // 10 min of steady feed/draft renders — never torn down.
162
+ expect(fx.fallbacks).toHaveLength(0)
163
+ })
164
+
165
+ it('production STOPS → the fallback fires 300s after the last render (genuine wedge)', () => {
166
+ const fx = setupDeps()
167
+ startTurn('k', 0)
168
+ noteProduction('k', 100_000) // last render at 100s, then silence
169
+ __tickForTests(390_000) // 290s since last render — no fire
170
+ expect(fx.fallbacks).toHaveLength(0)
171
+ __tickForTests(401_000) // 301s since last render — fires
172
+ expect(fx.fallbacks).toHaveLength(1)
173
+ })
174
+
175
+ it('is a no-op for an unknown key (no turn state)', () => {
176
+ setupDeps()
177
+ expect(() => noteProduction('nope', 1_000)).not.toThrow()
178
+ })
179
+ })
180
+
139
181
  // Pin the contract the gateway must uphold for ABNORMAL turn-ends:
140
182
  // every code path that abandons a turn before turn_end (context-
141
183
  // exhaust bail, gateway-side wedge timeout, silent-end recovery)
@@ -0,0 +1,332 @@
1
+ /**
2
+ * Real-work UAT coverage — human-style prompts that trigger actual work
3
+ * (multi-tool, web research, sub-agents, background workers) plus a turn
4
+ * collector + bug detectors for the failure classes the conversational fuzz
5
+ * never exercised.
6
+ *
7
+ * Why this exists: the existing fuzz scenarios send conversational prompts
8
+ * ("hey how's it going", emoji, markdown edge-cases) → trivial fast replies.
9
+ * The status-surface and reply-ordering bugs (live feed going dark mid-work,
10
+ * the orphaned-reply backstop flushing a fragment then the real answer landing
11
+ * late and out of order, late replies misrouting) only manifest when the agent
12
+ * does REAL work — uses tools/MCPs, spawns sub-agents, researches long enough to
13
+ * cross the silence-poke / orphaned-reply thresholds. These prompts provoke that
14
+ * work in a human voice; `collectTurn` captures the whole bot-message sequence;
15
+ * `analyzeTurn` flags the known bug signatures.
16
+ *
17
+ * Harness limits (see CLAUDE.md): mtcute observes real sendMessage/editMessageText
18
+ * (so the activity feed `→/✓` and worker feed `🛠` ARE observable) but NOT drafts
19
+ * or reactions, and has no forum-topic API (channel scenarios use the General
20
+ * topic — they prove DM-vs-channel routing, not correct-topic-among-many, which
21
+ * the gateway unit thread-assertions pin). So work-triggering is probabilistic on
22
+ * a generic agent: the UNIVERSAL invariants (a substantive answer arrives, in the
23
+ * right surface, not as an orphaned fragment) are hard; the work-specific surfaces
24
+ * (feed painted, worker surfaced) are reported and only hard-checked once their
25
+ * precondition is observed.
26
+ */
27
+
28
+ import type { Driver, ObservedMessage } from "./driver.js";
29
+ import { isWorkerFeedMessage, isActivityFeedMessage } from "./assertions.js";
30
+
31
+ export type WorkKind =
32
+ | "research" // web/multi-source research → multi-tool, long
33
+ | "multitool" // several tool calls, sequential
34
+ | "subagent" // delegates to a foreground sub-agent
35
+ | "bgworker" // dispatches a background worker (the 🛠 feed)
36
+ | "compound" // first X then Y then summarise — ordered multi-step
37
+ | "web"; // current/recent info → forces a web fetch
38
+
39
+ export interface RealWorkCase {
40
+ name: string;
41
+ /** Human-style prompt that should provoke real work. */
42
+ prompt: string;
43
+ kind: WorkKind;
44
+ /** Generous budget — deep research can run minutes. */
45
+ timeoutMs: number;
46
+ /** The substantive answer must be at least this long; a backstop fragment /
47
+ * bare ack is shorter, so this distinguishes "the answer landed" from "only a
48
+ * stub landed". */
49
+ minAnswerChars: number;
50
+ /** When true, this prompt RELIABLY triggers the named surface, so the scenario
51
+ * hard-asserts it appeared (not just reports it). Used for the semi-prescriptive
52
+ * but natural-sounding bgworker/subagent cases. */
53
+ requireSurface?: "worker" | "activity";
54
+ }
55
+
56
+ /**
57
+ * The case set. The first block is fully human-style (probabilistic work); the
58
+ * `requireSurface` block phrases the dispatch naturally but reliably enough to
59
+ * hard-assert the surface. Keep prompts provider-agnostic so they run on the
60
+ * generic test-harness agent (no marko-specific MCPs).
61
+ */
62
+ export const REAL_WORK_CASES: RealWorkCase[] = [
63
+ {
64
+ name: "deep research, take your time",
65
+ prompt:
66
+ "Can you research the current state of WebAssembly outside the browser — " +
67
+ "the main server-side runtimes, who's actually using it in production, and " +
68
+ "the real limitations today? Take your time and give me a proper rundown, " +
69
+ "not a one-liner.",
70
+ kind: "research",
71
+ timeoutMs: 180_000,
72
+ minAnswerChars: 400,
73
+ },
74
+ {
75
+ name: "current info, forces a lookup",
76
+ prompt:
77
+ "What's the latest with the Bun JavaScript runtime — the recent releases " +
78
+ "and whether people consider it production-ready yet? Check, don't guess.",
79
+ kind: "web",
80
+ timeoutMs: 150_000,
81
+ minAnswerChars: 300,
82
+ },
83
+ {
84
+ name: "multi-angle investigation",
85
+ prompt:
86
+ "Dig into Postgres vs SQLite for a small SaaS backend — look at it from a " +
87
+ "few angles (concurrency, ops burden, cost at scale) and tell me which " +
88
+ "you'd actually pick and why.",
89
+ kind: "multitool",
90
+ timeoutMs: 150_000,
91
+ minAnswerChars: 400,
92
+ },
93
+ {
94
+ name: "compound sequential ask",
95
+ prompt:
96
+ "First work out what today's date is, then how many days are left until the " +
97
+ "end of this quarter, then suggest three concrete milestones I could hit " +
98
+ "before then. Do it in that order.",
99
+ kind: "compound",
100
+ timeoutMs: 120_000,
101
+ minAnswerChars: 250,
102
+ },
103
+ {
104
+ name: "invite delegation",
105
+ prompt:
106
+ "I need a proper comparison of Stripe vs Paddle vs Lemon Squeezy for selling " +
107
+ "a digital product — pricing, who handles sales tax, and payout timing. Farm " +
108
+ "it out to a sub-agent if that's faster; just give me the bottom line at the end.",
109
+ kind: "subagent",
110
+ timeoutMs: 180_000,
111
+ minAnswerChars: 350,
112
+ },
113
+ {
114
+ name: "long sourced briefing (crosses thresholds)",
115
+ prompt:
116
+ "Give me a thorough, well-sourced briefing on the EU AI Act — what it covers, " +
117
+ "the risk tiers, the key deadlines, and what a small AI startup actually has to " +
118
+ "do. Be comprehensive; I'd rather wait and get depth.",
119
+ kind: "research",
120
+ timeoutMs: 360_000,
121
+ minAnswerChars: 500,
122
+ },
123
+ // ── reliably-triggering, still natural voice ──────────────────────────────
124
+ {
125
+ name: "background worker, ping me when done",
126
+ prompt:
127
+ "Don't answer this inline — actually dispatch a background worker for it " +
128
+ "(Task / Agent with run_in_background: true) so I can keep chatting while it " +
129
+ "runs, and ping me when it's done. The task: go through, ONE step at a time " +
130
+ "with a one-line note on each (run a quick command or jot a note per step so " +
131
+ "there's visible progress), the eight most common email-deliverability " +
132
+ "mistakes a solo founder makes — SPF, DKIM, DMARC, warmup, list hygiene, " +
133
+ "content, sending cadence, monitoring. Pace it over a couple of minutes; do " +
134
+ "all eight, then hand back the summary.",
135
+ kind: "bgworker",
136
+ // Generous: if the agent declines to background it and composes inline, a
137
+ // paced 8-step answer can run past 5 min (and, with no tracked tool in
138
+ // flight, trip the 300s silence-poke — see the 2026-06-05 UAT finding).
139
+ timeoutMs: 360_000,
140
+ minAnswerChars: 250,
141
+ requireSurface: "worker",
142
+ },
143
+ {
144
+ name: "step-by-step so the feed paints",
145
+ prompt:
146
+ "Walk through, ONE step at a time (run a quick command or note for each so I " +
147
+ "can see progress), how you'd debug a Linux box that's suddenly out of disk " +
148
+ "space — six steps: df, du on the big dirs, find large files, check logs, " +
149
+ "check deleted-but-open files, then a cleanup plan. Then give me the recap.",
150
+ kind: "multitool",
151
+ timeoutMs: 180_000,
152
+ minAnswerChars: 300,
153
+ requireSurface: "activity",
154
+ },
155
+ ];
156
+
157
+ /** What the collector observed across one turn. */
158
+ export interface TurnObservation {
159
+ /** Every bot message (initial sends only; edits tracked separately). */
160
+ botMessages: ObservedMessage[];
161
+ /** Edit events seen (worker/activity feeds grow via edits). */
162
+ edits: ObservedMessage[];
163
+ /** The first substantive answer (non-feed, >= minAnswerChars), or null. */
164
+ answer: ObservedMessage | null;
165
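+ /** ms from just before the prompt is sent to the answer's message timestamp; with no answer, to the moment collection stopped (timeout or stream end). */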
166
+ answerLatencyMs: number;
167
+ /** Whether an activity feed (`→/✓`) message was seen. */
168
+ sawActivityFeed: boolean;
169
+ /** Whether a worker feed (`🛠 Worker`) message was seen. */
170
+ sawWorkerFeed: boolean;
171
+ }
172
+
173
+ /**
174
+ * Send `prompt` and collect the bot's message sequence until a substantive
175
+ * answer lands (+ a short settle to catch trailing/late sends — the very window
176
+ * the orphaned-reply bug lives in) or `timeoutMs` elapses. Observing starts
177
+ * BEFORE the send so nothing is missed.
178
+ */
179
+ export async function collectTurn(
180
+ driver: Driver,
181
+ chatId: number,
182
+ driverUserId: number,
183
+ prompt: string,
184
+ opts: { timeoutMs: number; minAnswerChars: number; settleMs?: number },
185
+ ): Promise<TurnObservation> {
186
+ const settleMs = opts.settleMs ?? 6_000;
187
+ const botMessages: ObservedMessage[] = [];
188
+ const edits: ObservedMessage[] = [];
189
+ let answer: ObservedMessage | null = null;
190
+ let sawActivityFeed = false;
191
+ let sawWorkerFeed = false;
192
+
193
+ const startedAt = Date.now();
194
+ const iterator = driver.observeMessages(chatId)[Symbol.asyncIterator]();
195
+ // Begin observing, then send (observeMessages backfills nothing, but the send
196
+ // round-trips after the iterator is live).
197
+ await driver.sendText(chatId, prompt);
198
+
199
+ let settleDeadline = Number.POSITIVE_INFINITY;
200
+ while (true) {
201
+ const remaining =
202
+ Math.min(opts.timeoutMs - (Date.now() - startedAt), settleDeadline - Date.now());
203
+ if (remaining <= 0) break;
204
+ const next = await Promise.race([
205
+ iterator.next(),
206
+ new Promise<{ done: true; value: undefined }>((r) =>
207
+ setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
208
+ ),
209
+ ]);
210
+ if (next.done || next.value == null) {
211
+ // timed out (either overall or settle) — stop
212
+ break;
213
+ }
214
+ const m = next.value as ObservedMessage;
215
+ if (m.senderUserId === driverUserId) continue; // our own echo
216
+ if (m.edited) {
217
+ edits.push(m);
218
+ if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
219
+ if (isActivityFeedMessage(m)) sawActivityFeed = true;
220
+ continue;
221
+ }
222
+ botMessages.push(m);
223
+ if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
224
+ else if (isActivityFeedMessage(m)) sawActivityFeed = true;
225
+ else if (answer == null && m.text.trim().length >= opts.minAnswerChars) {
226
+ answer = m;
227
+ // Got the answer; keep collecting for `settleMs` to catch a late
228
+ // fragment/duplicate/misrouted trailing send.
229
+ settleDeadline = Date.now() + settleMs;
230
+ }
231
+ }
232
+ void iterator.return?.();
233
+ return {
234
+ botMessages,
235
+ edits,
236
+ answer,
237
+ answerLatencyMs: answer ? answer.date.getTime() - startedAt : Date.now() - startedAt,
238
+ sawActivityFeed,
239
+ sawWorkerFeed,
240
+ };
241
+ }
242
+
243
+ export interface TurnViolation {
244
+ code:
245
+ | "no-answer"
246
+ | "orphaned-fragment"
247
+ | "surface-missing"
248
+ | "wrong-surface";
249
+ detail: string;
250
+ }
251
+
252
+ /**
253
+ * Bug detectors over a collected turn. Splits HARD violations (the universal
254
+ * invariants that must always hold) from SOFT warnings (work-specific surfaces
255
+ * that are probabilistic on a generic agent — whether it dispatches a worker /
256
+ * sub-agent is its judgment, so a missing feed is reported, not failed).
257
+ *
258
+ * Hard violations:
259
+ * - no-answer: no substantive reply arrived at all (the answer never landed).
260
+ * - orphaned-fragment: a short non-ack bot text landed, THEN ≥8s later a much
261
+ * longer answer — the orphaned-reply backstop signature (fragment flushed,
262
+ * real reply late). A short message that is itself the only substantive reply,
263
+ * or a brief "on it" ack followed promptly, does not count.
264
+ * - wrong-surface (channel): a bot message landed outside the expected chat.
265
+ *
266
+ * Soft warnings:
267
+ * - surface-missing: a `requireSurface` case never showed its feed. The agent
268
+ * may have answered inline (a legitimate choice) — reported for the bug hunt,
269
+ * not a hard fail. When the feed DOES appear, the summary + gateway telemetry
270
+ * confirm it surfaced correctly.
271
+ */
272
+ export function analyzeTurn(
273
+ obs: TurnObservation,
274
+ expected: { requireSurface?: "worker" | "activity"; chatId: number },
275
+ ): { violations: TurnViolation[]; warnings: TurnViolation[] } {
276
+ const violations: TurnViolation[] = [];
277
+ const warnings: TurnViolation[] = [];
278
+ if (obs.answer == null) {
279
+ violations.push({
280
+ code: "no-answer",
281
+ detail: `no substantive reply within budget (saw ${obs.botMessages.length} bot msg(s), ` +
282
+ `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed})`,
283
+ });
284
+ }
285
+
286
+ // orphaned-fragment: a non-feed text shorter than 150 chars, sent ≥8s before
287
+ // the answer, that isn't a quick ack right before the answer.
288
+ if (obs.answer != null) {
289
+ const fragments = obs.botMessages.filter(
290
+ (m) =>
291
+ m.messageId !== obs.answer!.messageId &&
292
+ !isWorkerFeedMessage(m) &&
293
+ !isActivityFeedMessage(m) &&
294
+ m.text.trim().length > 0 &&
295
+ m.text.trim().length < 150 &&
296
+ obs.answer!.date.getTime() - m.date.getTime() >= 8_000,
297
+ );
298
+ if (fragments.length > 0) {
299
+ violations.push({
300
+ code: "orphaned-fragment",
301
+ detail: `${fragments.length} stub message(s) landed ≥8s before the answer ` +
302
+ `(e.g. ${JSON.stringify(fragments[0]!.text.slice(0, 60))}) — the orphaned-reply ` +
303
+ `backstop signature.`,
304
+ });
305
+ }
306
+ }
307
+
308
+ if (expected.requireSurface === "worker" && !obs.sawWorkerFeed) {
309
+ warnings.push({ code: "surface-missing", detail: "expected a 🛠 worker feed; agent likely answered inline" });
310
+ }
311
+ if (expected.requireSurface === "activity" && !obs.sawActivityFeed && !obs.sawWorkerFeed) {
312
+ warnings.push({ code: "surface-missing", detail: "expected a →/✓ activity feed; none appeared" });
313
+ }
314
+
315
+ const stray = [...obs.botMessages, ...obs.edits].filter((m) => m.chatId !== expected.chatId);
316
+ if (stray.length > 0) {
317
+ violations.push({
318
+ code: "wrong-surface",
319
+ detail: `${stray.length} bot message(s) landed in chat ${stray[0]!.chatId}, expected ${expected.chatId}`,
320
+ });
321
+ }
322
+ return { violations, warnings };
323
+ }
324
+
325
+ /** One-line human summary of a turn for the test log (bug-hunt forensics). */
326
+ export function summarizeTurn(name: string, obs: TurnObservation): string {
327
+ return (
328
+ `[real-work] ${name}: answer=${obs.answer ? `${obs.answer.text.trim().length}ch@${Math.round(obs.answerLatencyMs / 1000)}s` : "NONE"} ` +
329
+ `botMsgs=${obs.botMessages.length} edits=${obs.edits.length} ` +
330
+ `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed}`
331
+ );
332
+ }