switchroom 0.14.66 → 0.14.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ /**
2
+ * Silence-poke production-liveness — heartbeat-safety guard (2026-06-05).
3
+ *
4
+ * The production-liveness fix resets the silence clock on observable production
5
+ * so a long WORKING turn doesn't dark out. The load-bearing constraint: the
6
+ * reset must fire ONLY on MODEL-driven production, NEVER from the framework
7
+ * `feedHeartbeatTick` — a model-INDEPENDENT setInterval that re-renders a
8
+ * climbing " · Ns" elapsed every 6s (defeating the feed's content-dedup). If the
9
+ * reset lived in `drainActivitySummary` (which the heartbeat drains), a
10
+ * hung-but-bridge-connected agent would have its 300s silence clock reset every
11
+ * 6s forever, the load-bearing silence-poke unwedge would NEVER fire, and the
12
+ * conversation would be pinned — the #1556 permanent dangling-turn wedge.
13
+ *
14
+ * An adversarial review panel caught exactly this in an earlier revision. These
15
+ * are STRUCTURAL assertions (the gateway IIFE can't be instantiated in-process —
16
+ * same pattern as multitopic-routing-wiring.test) that pin the reset to the
17
+ * model-driven sites so a refactor can't silently reintroduce the regression.
18
+ * The behavioural counterpart (noteProduction resets; STOP producing → fires)
19
+ * lives in silence-poke.test.ts; this guards the WIRING the heartbeat must not
20
+ * cross.
21
+ */
22
+ import { describe, it, expect } from 'vitest'
23
+ import { readFileSync } from 'node:fs'
24
+ import { resolve } from 'node:path'
25
+
26
// Raw text of gateway.ts, read once at module load; the structural assertions
// slice and pattern-match this string instead of instantiating the gateway.
const gatewayPath = resolve(__dirname, '..', 'gateway', 'gateway.ts')
const gatewaySrc = readFileSync(gatewayPath, 'utf-8')
27
+
28
/**
 * Extracts the text of `src` lying strictly between the first occurrence of
 * `startMarker` and the first occurrence of `endMarker` that follows it.
 *
 * Returns '' when EITHER marker is absent. A missing end marker must not
 * silently widen the slice to "the rest of the file": the positive assertions
 * (`toMatch(/silencePoke\.noteProduction/)`) would then pass on any later,
 * unrelated occurrence and the structural guard would stop guarding anything.
 * An empty slice makes both the length sanity checks and the positive matches
 * fail loudly instead.
 *
 * @param src         Full source text to slice.
 * @param startMarker Literal text that precedes the wanted region (excluded).
 * @param endMarker   Literal text that terminates the wanted region (excluded).
 * @returns The enclosed text, or '' if the region cannot be delimited.
 */
function between(src: string, startMarker: string, endMarker: string): string {
  const startAt = src.indexOf(startMarker)
  if (startAt === -1) return ''
  const from = startAt + startMarker.length
  const endAt = src.indexOf(endMarker, from)
  return endAt === -1 ? '' : src.slice(from, endAt)
}
32
+
33
describe('silence-poke production-liveness — heartbeat safety', () => {
  // Asserts a heartbeat-reachable slice was actually found (length sanity
  // check) and contains no silence-clock reset.
  const expectNoReset = (body: string, minLength: number): void => {
    expect(body.length).toBeGreaterThan(minLength)
    expect(body).not.toMatch(/noteProduction/)
  }

  // Asserts a model-driven slice resets the silence clock AND gates that reset
  // on the turn still being the live one.
  const expectGatedReset = (block: string): void => {
    expect(block).toMatch(/silencePoke\.noteProduction/)
    expect(block).toMatch(/currentTurn === turn/)
  }

  it('drainActivitySummary must NOT reset the silence clock (the framework heartbeat drains here)', () => {
    const drainBody = between(
      gatewaySrc,
      'async function drainActivitySummary',
      '\nfunction feedHeartbeatTick',
    )
    expectNoReset(drainBody, 100) // sanity: the slice found the function body
  })

  it('feedHeartbeatTick itself must NOT reset the silence clock (model-independent re-render)', () => {
    const tickBody = between(gatewaySrc, 'function feedHeartbeatTick(): void {', '\n}')
    expectNoReset(tickBody, 50)
  })

  it('the MODEL-driven tool-label append IS the reset site, gated on the live turn', () => {
    // A fresh render comes back from appendActivityLabel only when the model
    // emits a NEW labelled step — a liveness signal the heartbeat cannot forge.
    const startMarker = 'const rendered = appendActivityLabel(turn.mirrorLines, ev.label)'
    const endMarker = '\n return'
    expectGatedReset(between(gatewaySrc, startMarker, endMarker))
  })

  it('the answer-stream draft onMetric reset is model-driven and gated on the live turn', () => {
    expectGatedReset(between(gatewaySrc, 'onMetric: (metricEv) => {', '\n },'))
  })

  it('production-liveness is behind the default-ON SWITCHROOM_SILENCE_LIVENESS_PRODUCTION kill switch', () => {
    const killSwitch = /SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'/
    expect(gatewaySrc).toMatch(killSwitch)
  })
})
@@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'
2
2
  import {
3
3
  startTurn,
4
4
  noteOutbound,
5
+ noteProduction,
5
6
  noteThinking,
6
7
  noteToolStart,
7
8
  noteToolEnd,
@@ -136,6 +137,47 @@ describe('silence-poke — outbound resets the silence clock', () => {
136
137
  })
137
138
  })
138
139
 
140
// Production-liveness (2026-06-05): a rendered activity-feed update or draft
// update shows the agent is visibly working, so it resets the silence clock and
// a long tool/composition turn is not torn down mid-work.
describe('silence-poke — noteProduction resets the silence clock', () => {
  it('a feed/draft render at 250s pushes the fallback measurement to it', () => {
    const deps = setupDeps()
    startTurn('k', 0)
    noteProduction('k', 250_000)

    // Only 50s since the last production: too early to fire.
    __tickForTests(300_000)
    expect(deps.fallbacks).toHaveLength(0)

    // 300s since the last production: the fallback fires.
    __tickForTests(550_000)
    expect(deps.fallbacks).toHaveLength(1)
  })

  it('repeated production every 60s keeps a long turn alive indefinitely', () => {
    const deps = setupDeps()
    startTurn('k', 0)
    for (let minute = 1; minute <= 10; minute += 1) {
      const at = minute * 60_000
      noteProduction('k', at)
      __tickForTests(at)
    }
    // Ten minutes of steady feed/draft renders must never tear the turn down.
    expect(deps.fallbacks).toHaveLength(0)
  })

  it('production STOPS → the fallback fires 300s after the last render (genuine wedge)', () => {
    const deps = setupDeps()
    startTurn('k', 0)
    // Final render lands at 100s; nothing is produced afterwards.
    noteProduction('k', 100_000)

    __tickForTests(390_000) // 290s of silence: still below the threshold
    expect(deps.fallbacks).toHaveLength(0)

    __tickForTests(401_000) // 301s of silence: the fallback fires
    expect(deps.fallbacks).toHaveLength(1)
  })

  it('is a no-op for an unknown key (no turn state)', () => {
    setupDeps()
    const callWithUnknownKey = (): void => {
      noteProduction('nope', 1_000)
    }
    expect(callWithUnknownKey).not.toThrow()
  })
})
180
+
139
181
  // Pin the contract the gateway must uphold for ABNORMAL turn-ends:
140
182
  // every code path that abandons a turn before turn_end (context-
141
183
  // exhaust bail, gateway-side wedge timeout, silent-end recovery)
@@ -0,0 +1,332 @@
1
+ /**
2
+ * Real-work UAT coverage — human-style prompts that trigger actual work
3
+ * (multi-tool, web research, sub-agents, background workers) plus a turn
4
+ * collector + bug detectors for the failure classes the conversational fuzz
5
+ * never exercised.
6
+ *
7
+ * Why this exists: the existing fuzz scenarios send conversational prompts
8
+ * ("hey how's it going", emoji, markdown edge-cases) → trivial fast replies.
9
+ * The status-surface and reply-ordering bugs (live feed going dark mid-work,
10
+ * the orphaned-reply backstop flushing a fragment then the real answer landing
11
+ * late and out of order, late replies misrouting) only manifest when the agent
12
+ * does REAL work — uses tools/MCPs, spawns sub-agents, researches long enough to
13
+ * cross the silence-poke / orphaned-reply thresholds. These prompts provoke that
14
+ * work in a human voice; `collectTurn` captures the whole bot-message sequence;
15
+ * `analyzeTurn` flags the known bug signatures.
16
+ *
17
+ * Harness limits (see CLAUDE.md): mtcute observes real sendMessage/editMessageText
18
+ * (so the activity feed `→/✓` and worker feed `🛠` ARE observable) but NOT drafts
19
+ * or reactions, and has no forum-topic API (channel scenarios use the General
20
+ * topic — they prove DM-vs-channel routing, not correct-topic-among-many, which
21
+ * the gateway unit thread-assertions pin). So work-triggering is probabilistic on
22
+ * a generic agent: the UNIVERSAL invariants (a substantive answer arrives, in the
23
+ * right surface, not as an orphaned fragment) are hard; the work-specific surfaces
24
+ * (feed painted, worker surfaced) are reported and only hard-checked once their
25
+ * precondition is observed.
26
+ */
27
+
28
+ import type { Driver, ObservedMessage } from "./driver.js";
29
+ import { isWorkerFeedMessage, isActivityFeedMessage } from "./assertions.js";
30
+
31
/** The category of real work a prompt is meant to provoke. */
export type WorkKind =
  /** Web / multi-source research: multi-tool and long-running. */
  | "research"
  /** Several tool calls, run sequentially. */
  | "multitool"
  /** Delegates to a foreground sub-agent. */
  | "subagent"
  /** Dispatches a background worker (the 🛠 feed). */
  | "bgworker"
  /** "First X, then Y, then summarise": an ordered multi-step ask. */
  | "compound"
  /** Current / recent information, which forces a web fetch. */
  | "web";
38
+
39
/** One real-work scenario: a prompt plus the budget and thresholds used to judge it. */
export interface RealWorkCase {
  /** Short label used in test titles and log lines. */
  name: string;
  /** Human-style prompt that should provoke real work. */
  prompt: string;
  /** Which category of work the prompt targets. */
  kind: WorkKind;
  /** Collection budget in ms. Kept generous because deep research can run for minutes. */
  timeoutMs: number;
  /**
   * Minimum length of the substantive answer. A backstop fragment or bare ack
   * is shorter than this, which is what separates "the answer landed" from
   * "only a stub landed".
   */
  minAnswerChars: number;
  /**
   * Set when the prompt RELIABLY triggers the named surface, so the scenario
   * hard-asserts that it appeared instead of merely reporting it. Used by the
   * semi-prescriptive but natural-sounding bgworker/subagent cases.
   */
  requireSurface?: "worker" | "activity";
}
55
+
56
/**
 * The real-work case set.
 *
 * The leading entries are fully human-style, so whether they provoke work is
 * probabilistic. The trailing `requireSurface` entries phrase the dispatch
 * naturally but reliably enough for the surface to be hard-asserted. Prompts
 * stay provider-agnostic so they run on the generic test-harness agent (no
 * marko-specific MCPs).
 */
export const REAL_WORK_CASES: RealWorkCase[] = [
  {
    name: "deep research, take your time",
    prompt: [
      "Can you research the current state of WebAssembly outside the browser — ",
      "the main server-side runtimes, who's actually using it in production, and ",
      "the real limitations today? Take your time and give me a proper rundown, ",
      "not a one-liner.",
    ].join(""),
    kind: "research",
    timeoutMs: 180_000,
    minAnswerChars: 400,
  },
  {
    name: "current info, forces a lookup",
    prompt: [
      "What's the latest with the Bun JavaScript runtime — the recent releases ",
      "and whether people consider it production-ready yet? Check, don't guess.",
    ].join(""),
    kind: "web",
    timeoutMs: 150_000,
    minAnswerChars: 300,
  },
  {
    name: "multi-angle investigation",
    prompt: [
      "Dig into Postgres vs SQLite for a small SaaS backend — look at it from a ",
      "few angles (concurrency, ops burden, cost at scale) and tell me which ",
      "you'd actually pick and why.",
    ].join(""),
    kind: "multitool",
    timeoutMs: 150_000,
    minAnswerChars: 400,
  },
  {
    name: "compound sequential ask",
    prompt: [
      "First work out what today's date is, then how many days are left until the ",
      "end of this quarter, then suggest three concrete milestones I could hit ",
      "before then. Do it in that order.",
    ].join(""),
    kind: "compound",
    timeoutMs: 120_000,
    minAnswerChars: 250,
  },
  {
    name: "invite delegation",
    prompt: [
      "I need a proper comparison of Stripe vs Paddle vs Lemon Squeezy for selling ",
      "a digital product — pricing, who handles sales tax, and payout timing. Farm ",
      "it out to a sub-agent if that's faster; just give me the bottom line at the end.",
    ].join(""),
    kind: "subagent",
    timeoutMs: 180_000,
    minAnswerChars: 350,
  },
  {
    name: "long sourced briefing (crosses thresholds)",
    prompt: [
      "Give me a thorough, well-sourced briefing on the EU AI Act — what it covers, ",
      "the risk tiers, the key deadlines, and what a small AI startup actually has to ",
      "do. Be comprehensive; I'd rather wait and get depth.",
    ].join(""),
    kind: "research",
    timeoutMs: 360_000,
    minAnswerChars: 500,
  },
  // ── reliably-triggering, still natural voice ──────────────────────────────
  {
    name: "background worker, ping me when done",
    prompt: [
      "Don't answer this inline — actually dispatch a background worker for it ",
      "(Task / Agent with run_in_background: true) so I can keep chatting while it ",
      "runs, and ping me when it's done. The task: go through, ONE step at a time ",
      "with a one-line note on each (run a quick command or jot a note per step so ",
      "there's visible progress), the eight most common email-deliverability ",
      "mistakes a solo founder makes — SPF, DKIM, DMARC, warmup, list hygiene, ",
      "content, sending cadence, monitoring. Pace it over a couple of minutes; do ",
      "all eight, then hand back the summary.",
    ].join(""),
    kind: "bgworker",
    // Generous budget: if the agent declines to background the task and
    // composes inline, a paced 8-step answer can run past 5 min (and, with no
    // tracked tool in flight, trip the 300s silence-poke — see the 2026-06-05
    // UAT finding).
    timeoutMs: 360_000,
    minAnswerChars: 250,
    requireSurface: "worker",
  },
  {
    name: "step-by-step so the feed paints",
    prompt: [
      "Walk through, ONE step at a time (run a quick command or note for each so I ",
      "can see progress), how you'd debug a Linux box that's suddenly out of disk ",
      "space — six steps: df, du on the big dirs, find large files, check logs, ",
      "check deleted-but-open files, then a cleanup plan. Then give me the recap.",
    ].join(""),
    kind: "multitool",
    timeoutMs: 180_000,
    minAnswerChars: 300,
    requireSurface: "activity",
  },
];
156
+
157
/** Everything the collector saw during a single turn. */
export interface TurnObservation {
  /** Bot messages as initially sent; edits are recorded in `edits` instead. */
  botMessages: ObservedMessage[];
  /** Edit events observed (worker/activity feeds grow through edits). */
  edits: ObservedMessage[];
  /** First substantive answer (non-feed and >= minAnswerChars), or null if none arrived. */
  answer: ObservedMessage | null;
  /** Milliseconds from the send to the answer, or to the timeout when there was none. */
  answerLatencyMs: number;
  /** True once an activity feed (`→/✓`) message was seen. */
  sawActivityFeed: boolean;
  /** True once a worker feed (`🛠 Worker`) message was seen. */
  sawWorkerFeed: boolean;
}
172
+
173
/**
 * Send `prompt` and collect the bot's message sequence until a substantive
 * answer lands (plus a short settle window to catch trailing/late sends — the
 * very window the orphaned-reply bug lives in) or `opts.timeoutMs` elapses.
 *
 * The message iterator is created BEFORE the prompt is sent.
 * NOTE(review): whether the underlying subscription is live before the first
 * `next()` call depends on how `driver.observeMessages` is implemented —
 * confirm against the driver.
 *
 * Classification of each observed message:
 * - sent by `driverUserId`: ignored (our own echo);
 * - `edited`: pushed to `edits`, and may set either feed flag;
 * - otherwise: pushed to `botMessages`; a worker/activity feed message sets its
 *   flag, and the first remaining message whose trimmed text is at least
 *   `opts.minAnswerChars` long becomes `answer` and starts the settle window.
 *
 * @param driver       Harness driver used to send the prompt and observe the chat.
 * @param chatId       Chat to send to and observe.
 * @param driverUserId Sender id of the driver account, used to drop its own messages.
 * @param prompt       Text to send.
 * @param opts         `timeoutMs`: overall budget; `minAnswerChars`: answer threshold;
 *                     `settleMs`: post-answer collection window (default 6000 ms).
 * @returns What was observed. `answerLatencyMs` is measured from just before the
 *          send to the answer's `date`, or to "now" when no answer arrived.
 */
export async function collectTurn(
  driver: Driver,
  chatId: number,
  driverUserId: number,
  prompt: string,
  opts: { timeoutMs: number; minAnswerChars: number; settleMs?: number },
): Promise<TurnObservation> {
  const settleMs = opts.settleMs ?? 6_000;
  const botMessages: ObservedMessage[] = [];
  const edits: ObservedMessage[] = [];
  let answer: ObservedMessage | null = null;
  let sawActivityFeed = false;
  let sawWorkerFeed = false;

  const startedAt = Date.now();
  const iterator = driver.observeMessages(chatId)[Symbol.asyncIterator]();
  try {
    // Create the iterator first, then send, so the reply cannot race ahead of it.
    await driver.sendText(chatId, prompt);

    let settleDeadline = Number.POSITIVE_INFINITY;
    while (true) {
      const remaining = Math.min(
        opts.timeoutMs - (Date.now() - startedAt),
        settleDeadline - Date.now(),
      );
      if (remaining <= 0) break;

      // Race the next message against the remaining budget. The timer MUST be
      // cleared once the race settles: otherwise every received message leaves
      // behind a pending timer of up to `timeoutMs` (minutes), which piles up
      // and keeps the event loop alive long after the turn was collected.
      let timer: ReturnType<typeof setTimeout> | undefined;
      const next = await Promise.race([
        iterator.next(),
        new Promise<{ done: true; value: undefined }>((r) => {
          timer = setTimeout(() => r({ done: true, value: undefined }), remaining);
        }),
      ]).finally(() => clearTimeout(timer));

      if (next.done || next.value == null) {
        // Timed out (overall budget or settle window), or the stream ended.
        break;
      }
      const m = next.value as ObservedMessage;
      if (m.senderUserId === driverUserId) continue; // our own echo
      if (m.edited) {
        edits.push(m);
        if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
        if (isActivityFeedMessage(m)) sawActivityFeed = true;
        continue;
      }
      botMessages.push(m);
      if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
      else if (isActivityFeedMessage(m)) sawActivityFeed = true;
      else if (answer == null && m.text.trim().length >= opts.minAnswerChars) {
        answer = m;
        // Got the answer; keep collecting for `settleMs` to catch a late
        // fragment/duplicate/misrouted trailing send.
        settleDeadline = Date.now() + settleMs;
      }
    }
  } finally {
    // Always release the iterator, including when the send or the loop throws.
    // Deliberately not awaited: a `next()` may still be pending, and its
    // completion could block `return()`. Best-effort cleanup, so a rejection
    // here is ignored instead of surfacing as an unhandled rejection.
    void iterator.return?.()?.catch(() => undefined);
  }
  return {
    botMessages,
    edits,
    answer,
    answerLatencyMs: answer ? answer.date.getTime() - startedAt : Date.now() - startedAt,
    sawActivityFeed,
    sawWorkerFeed,
  };
}
242
+
243
/** A single finding about a collected turn, reported either as a hard violation or a soft warning. */
export interface TurnViolation {
  /** Machine-readable finding class. */
  code: "no-answer" | "orphaned-fragment" | "surface-missing" | "wrong-surface";
  /** Human-readable explanation for the test log. */
  detail: string;
}
251
+
252
/**
 * Runs the bug detectors over a collected turn and sorts the findings into
 * HARD violations (universal invariants that must always hold) and SOFT
 * warnings (work-specific surfaces that are probabilistic on a generic agent:
 * whether it dispatches a worker / sub-agent is its own judgment, so a missing
 * feed is reported, not failed).
 *
 * Hard violations:
 *  - no-answer: no substantive reply arrived at all.
 *  - orphaned-fragment: a non-empty, non-feed bot text shorter than 150 chars
 *    landed at least 8s before the answer — the orphaned-reply backstop
 *    signature (fragment flushed, real reply late). A short message that is
 *    itself the answer, or one sent less than 8s before it, does not count.
 *  - wrong-surface: a bot message or edit landed outside `expected.chatId`.
 *
 * Soft warnings:
 *  - surface-missing: a `requireSurface` case never showed its feed. The agent
 *    may have answered inline (a legitimate choice), so this is reported for
 *    the bug hunt and is not a hard fail. For "activity", a worker feed also
 *    satisfies the requirement.
 *
 * @param obs      The turn as captured by the collector.
 * @param expected The chat every message must be in, plus the optional required surface.
 * @returns Violations and warnings, each in detection order.
 */
export function analyzeTurn(
  obs: TurnObservation,
  expected: { requireSurface?: "worker" | "activity"; chatId: number },
): { violations: TurnViolation[]; warnings: TurnViolation[] } {
  const violations: TurnViolation[] = [];
  const warnings: TurnViolation[] = [];
  const { answer } = obs;

  if (answer == null) {
    violations.push({
      code: "no-answer",
      detail:
        `no substantive reply within budget (saw ${obs.botMessages.length} bot msg(s), ` +
        `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed})`,
    });
  } else {
    // orphaned-fragment: short non-feed texts that preceded the answer by ≥8s.
    const stubs = obs.botMessages.filter((m) => {
      if (m.messageId === answer.messageId) return false;
      if (isWorkerFeedMessage(m)) return false;
      if (isActivityFeedMessage(m)) return false;
      const visibleLength = m.text.trim().length;
      if (visibleLength === 0 || visibleLength >= 150) return false;
      return answer.date.getTime() - m.date.getTime() >= 8_000;
    });
    const firstStub = stubs[0];
    if (firstStub !== undefined) {
      violations.push({
        code: "orphaned-fragment",
        detail:
          `${stubs.length} stub message(s) landed ≥8s before the answer ` +
          `(e.g. ${JSON.stringify(firstStub.text.slice(0, 60))}) — the orphaned-reply ` +
          `backstop signature.`,
      });
    }
  }

  switch (expected.requireSurface) {
    case "worker":
      if (!obs.sawWorkerFeed) {
        warnings.push({
          code: "surface-missing",
          detail: "expected a 🛠 worker feed; agent likely answered inline",
        });
      }
      break;
    case "activity":
      if (!(obs.sawActivityFeed || obs.sawWorkerFeed)) {
        warnings.push({
          code: "surface-missing",
          detail: "expected a →/✓ activity feed; none appeared",
        });
      }
      break;
    default:
      break;
  }

  // wrong-surface: anything (send or edit) observed outside the expected chat.
  const stray: ObservedMessage[] = [];
  for (const m of obs.botMessages) {
    if (m.chatId !== expected.chatId) stray.push(m);
  }
  for (const m of obs.edits) {
    if (m.chatId !== expected.chatId) stray.push(m);
  }
  const firstStray = stray[0];
  if (firstStray !== undefined) {
    violations.push({
      code: "wrong-surface",
      detail: `${stray.length} bot message(s) landed in chat ${firstStray.chatId}, expected ${expected.chatId}`,
    });
  }

  return { violations, warnings };
}
324
+
325
/**
 * Formats a one-line, human-readable summary of a turn for the test log
 * (bug-hunt forensics).
 *
 * @param name Case label shown after the `[real-work]` tag.
 * @param obs  The collected turn.
 * @returns The tagged line: answer size and latency (or `NONE`), message and
 *          edit counts, and both feed flags.
 */
export function summarizeTurn(name: string, obs: TurnObservation): string {
  let answerPart = "NONE";
  if (obs.answer) {
    const chars = obs.answer.text.trim().length;
    const seconds = Math.round(obs.answerLatencyMs / 1000);
    answerPart = `${chars}ch@${seconds}s`;
  }
  const fields = [
    `[real-work] ${name}:`,
    `answer=${answerPart}`,
    `botMsgs=${obs.botMessages.length}`,
    `edits=${obs.edits.length}`,
    `activityFeed=${obs.sawActivityFeed}`,
    `workerFeed=${obs.sawWorkerFeed}`,
  ];
  return fields.join(" ");
}
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Real-work UAT (channel) — the DM real-work suite, in a forum supergroup.
3
+ * Proves the status surface (activity/worker feed) AND the answer land IN the
4
+ * channel under genuine work — not leaked to the owner DM — and that a late
5
+ * reply after a long tool turn doesn't escape the channel. Self-skips green when
6
+ * SWITCHROOM_UAT_CHAT_ID is unset or the chat isn't a resolvable supergroup.
7
+ *
8
+ * mtcute has no forum-topic API, so this uses the supergroup's General topic: it
9
+ * proves DM-vs-channel routing, not correct-topic-among-many (the gateway unit
10
+ * thread-assertions pin that). See real-work-prompts.ts.
11
+ */
12
import { afterAll, beforeAll, describe, expect, it } from "vitest";
import { spinUp, type Scenario } from "../harness.js";
import {
  REAL_WORK_CASES,
  collectTurn,
  analyzeTurn,
  summarizeTurn,
} from "../real-work-prompts.js";
20
+
21
/** Target supergroup id; NaN when SWITCHROOM_UAT_CHAT_ID is unset or not parseable as an integer. */
const SUPERGROUP_ID = Number.parseInt(process.env.SWITCHROOM_UAT_CHAT_ID ?? "", 10);

describe("uat: real-work channel — status + answer land in the supergroup", () => {
  // One scenario is shared by every case in this suite. It stays null when the
  // suite self-skips because the env var is unset.
  let sc: Scenario | null = null;
  // True only when the driver can resolve the supergroup, i.e. the cases can run.
  let postable = false;

  beforeAll(async () => {
    if (!Number.isFinite(SUPERGROUP_ID)) {
      console.warn("[uat] SWITCHROOM_UAT_CHAT_ID unset — skipping real-work channel suite");
      return;
    }
    sc = await spinUp({ agent: "test-harness" });
    await sc.driver.primeDialogs();
    postable = await sc.driver.canResolve(SUPERGROUP_ID);
    if (!postable) {
      console.warn(`[uat] supergroup ${SUPERGROUP_ID} not resolvable — skipping real-work channel suite`);
    }
  });

  // Release the shared scenario once the suite is done. Without this the
  // scenario created in beforeAll is never torn down, including on the
  // "not resolvable" skip path where it was spun up but never used.
  afterAll(async () => {
    if (sc == null) return;
    const scenario = sc;
    sc = null;
    await scenario.tearDown();
  });

  for (const fc of REAL_WORK_CASES) {
    it(
      `[real-work-sg] ${fc.name} (${fc.kind}) — answer + surface land in the channel`,
      async () => {
        if (sc == null || !postable) return; // self-skip green
        await sc.driver.primeDialogs();
        const obs = await collectTurn(
          sc.driver,
          SUPERGROUP_ID,
          sc.driverUserId,
          fc.prompt,
          { timeoutMs: fc.timeoutMs, minAnswerChars: fc.minAnswerChars },
        );
        // Forensic log lines: printed for every case, including passing ones.
        console.log(summarizeTurn(`sg:${fc.name}`, obs));
        if (obs.answer != null) {
          console.log(
            `[real-work-sg] ${fc.name} answer: ${JSON.stringify(obs.answer.text.slice(0, 180))}`,
          );
        }

        const { violations, warnings } = analyzeTurn(obs, {
          requireSurface: fc.requireSurface,
          chatId: SUPERGROUP_ID, // wrong-surface detector = leaked out of the channel
        });
        // Soft findings are logged only; hard violations fail the case.
        for (const w of warnings) {
          console.warn(`[real-work-sg] ${fc.name}: WARN ${w.code}: ${w.detail}`);
        }
        if (violations.length > 0) {
          throw new Error(
            `[real-work-sg] ${fc.name}: ${violations.length} invariant violation(s):\n` +
              violations.map((x) => ` - ${x.code}: ${x.detail}`).join("\n"),
          );
        }
        // Every observed bot message must be in the channel (the routing proof).
        for (const m of [...obs.botMessages, ...obs.edits]) {
          expect(m.chatId).toBe(SUPERGROUP_ID);
        }
        expect(obs.answer).not.toBeNull();
      },
      // Per-test timeout: the collection budget plus 45s of headroom.
      fc.timeoutMs + 45_000,
    );
  }
});
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Real-work UAT (DM) — human-style prompts that trigger genuine work
3
+ * (multi-tool / web research / sub-agents / background workers), asserting the
4
+ * status-surface + reply-ordering invariants the conversational fuzz never
5
+ * exercised. The status-dark, orphaned-reply-fragment, and late-reply bugs only
6
+ * appear when the agent actually does work; these prompts provoke it in a human
7
+ * voice, `collectTurn` captures the whole bot-message sequence, and `analyzeTurn`
8
+ * flags the known bug signatures. See real-work-prompts.ts for rationale + the
9
+ * mtcute harness limits.
10
+ */
11
+ import { describe, it, expect } from "vitest";
12
+ import { spinUp } from "../harness.js";
13
+ import {
14
+ REAL_WORK_CASES,
15
+ collectTurn,
16
+ analyzeTurn,
17
+ summarizeTurn,
18
+ } from "../real-work-prompts.js";
19
+
20
describe("uat: real-work DM — status surface + ordering under genuine work", () => {
  REAL_WORK_CASES.forEach((fc) => {
    const title = `[real-work] ${fc.name} (${fc.kind}) — answer lands, surface holds`;
    // Per-test timeout: the collection budget plus 45s of headroom.
    const testTimeoutMs = fc.timeoutMs + 45_000;

    it(
      title,
      async () => {
        // Each case gets its own scenario, torn down whether it passes or fails.
        const sc = await spinUp({ agent: "test-harness" });
        try {
          const collectOpts = { timeoutMs: fc.timeoutMs, minAnswerChars: fc.minAnswerChars };
          const obs = await collectTurn(sc.driver, sc.botUserId, sc.driverUserId, fc.prompt, collectOpts);

          // Forensic log: the bug hunt reads these lines to spot dark feeds,
          // late fragments, and surface gaps even on cases that "pass".
          console.log(summarizeTurn(fc.name, obs));
          if (obs.answer != null) {
            const preview = JSON.stringify(obs.answer.text.slice(0, 180));
            console.log(`[real-work] ${fc.name} answer: ${preview}`);
          }

          const { violations, warnings } = analyzeTurn(obs, {
            requireSurface: fc.requireSurface,
            chatId: sc.botUserId,
          });
          // Soft findings are logged only; hard violations fail the case.
          warnings.forEach((w) => {
            console.warn(`[real-work] ${fc.name}: WARN ${w.code}: ${w.detail}`);
          });
          if (violations.length > 0) {
            const lines = violations.map((x) => ` - ${x.code}: ${x.detail}`).join("\n");
            throw new Error(
              `[real-work] ${fc.name}: ${violations.length} invariant violation(s):\n` + lines,
            );
          }
          expect(obs.answer).not.toBeNull();
        } finally {
          await sc.tearDown();
        }
      },
      testTimeoutMs,
    );
  });
});