switchroom 0.14.66 → 0.14.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +556 -325
- package/dist/cli/ui/index.html +103 -38
- package/package.json +1 -1
- package/telegram-plugin/answer-stream-flag.ts +19 -0
- package/telegram-plugin/dist/gateway/gateway.js +35 -11
- package/telegram-plugin/gateway/gateway.ts +71 -7
- package/telegram-plugin/silence-poke.ts +25 -0
- package/telegram-plugin/tests/answer-stream-flag.test.ts +19 -1
- package/telegram-plugin/tests/draft-retirement-wiring.test.ts +52 -0
- package/telegram-plugin/tests/silence-liveness-wiring.test.ts +67 -0
- package/telegram-plugin/tests/silence-poke.test.ts +42 -0
- package/telegram-plugin/uat/real-work-prompts.ts +332 -0
- package/telegram-plugin/uat/scenarios/fuzz-real-work-channel.test.ts +82 -0
- package/telegram-plugin/uat/scenarios/fuzz-real-work-dm.test.ts +64 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Silence-poke production-liveness — heartbeat-safety guard (2026-06-05).
|
|
3
|
+
*
|
|
4
|
+
* The production-liveness fix resets the silence clock on observable production
|
|
5
|
+
* so a long WORKING turn doesn't dark out. The load-bearing constraint: the
|
|
6
|
+
* reset must fire ONLY on MODEL-driven production, NEVER from the framework
|
|
7
|
+
* `feedHeartbeatTick` — a model-INDEPENDENT setInterval that re-renders a
|
|
8
|
+
* climbing " · Ns" elapsed every 6s (defeating the feed's content-dedup). If the
|
|
9
|
+
* reset lived in `drainActivitySummary` (which the heartbeat drains), a
|
|
10
|
+
* hung-but-bridge-connected agent would have its 300s silence clock reset every
|
|
11
|
+
* 6s forever, the load-bearing silence-poke unwedge would NEVER fire, and the
|
|
12
|
+
* conversation would be pinned — the #1556 permanent dangling-turn wedge.
|
|
13
|
+
*
|
|
14
|
+
* An adversarial review panel caught exactly this in an earlier revision. These
|
|
15
|
+
* are STRUCTURAL assertions (the gateway IIFE can't be instantiated in-process —
|
|
16
|
+
* same pattern as multitopic-routing-wiring.test) that pin the reset to the
|
|
17
|
+
* model-driven sites so a refactor can't silently reintroduce the regression.
|
|
18
|
+
* The behavioural counterpart (noteProduction resets; STOP producing → fires)
|
|
19
|
+
* lives in silence-poke.test.ts; this guards the WIRING the heartbeat must not
|
|
20
|
+
* cross.
|
|
21
|
+
*/
|
|
22
|
+
import { describe, it, expect } from 'vitest'
|
|
23
|
+
import { readFileSync } from 'node:fs'
|
|
24
|
+
import { resolve } from 'node:path'
|
|
25
|
+
|
|
26
|
+
const gatewaySrc = readFileSync(resolve(__dirname, '..', 'gateway', 'gateway.ts'), 'utf-8')
|
|
27
|
+
|
|
28
|
+
/**
 * Slice a region out of `src` using two textual markers.
 *
 * Takes the text that sits between the first and second occurrence of
 * `startMarker` (or up to the end of `src` when it occurs only once), then
 * truncates it at the first `endMarker`. Yields '' when `startMarker` is absent.
 */
function between(src: string, startMarker: string, endMarker: string): string {
  // Element 1 of the split is whatever follows the first start marker.
  const [, tail = ''] = src.split(startMarker)
  // Element 0 of the second split is everything ahead of the end marker.
  const [head = ''] = tail.split(endMarker)
  return head
}
|
|
32
|
+
|
|
33
|
+
describe('silence-poke production-liveness — heartbeat safety', () => {
  // ── Framework-driven paths: these must never call noteProduction ──────────

  it('drainActivitySummary must NOT reset the silence clock (the framework heartbeat drains here)', () => {
    const drainBody = between(gatewaySrc, 'async function drainActivitySummary', '\nfunction feedHeartbeatTick')
    // Guard against a vacuous pass: an empty slice would trivially lack the call.
    expect(drainBody.length).toBeGreaterThan(100)
    expect(drainBody).not.toMatch(/noteProduction/)
  })

  it('feedHeartbeatTick itself must NOT reset the silence clock (model-independent re-render)', () => {
    const tickBody = between(gatewaySrc, 'function feedHeartbeatTick(): void {', '\n}')
    // Same vacuous-pass guard as above.
    expect(tickBody.length).toBeGreaterThan(50)
    expect(tickBody).not.toMatch(/noteProduction/)
  })

  // ── Model-driven paths: these ARE the reset sites, gated on the live turn ──

  it('the MODEL-driven tool-label append IS the reset site, gated on the live turn', () => {
    // A fresh render from appendActivityLabel means the model emitted a NEW
    // labelled step — a liveness signal the heartbeat cannot produce.
    const appendSite = between(
      gatewaySrc,
      'const rendered = appendActivityLabel(turn.mirrorLines, ev.label)',
      '\n return',
    )
    expect(appendSite).toMatch(/silencePoke\.noteProduction/)
    expect(appendSite).toMatch(/currentTurn === turn/)
  })

  it('the answer-stream draft onMetric reset is model-driven and gated on the live turn', () => {
    const metricHandler = between(gatewaySrc, 'onMetric: (metricEv) => {', '\n },')
    expect(metricHandler).toMatch(/silencePoke\.noteProduction/)
    expect(metricHandler).toMatch(/currentTurn === turn/)
  })

  // ── Kill switch ────────────────────────────────────────────────────────────

  it('production-liveness is behind the default-ON SWITCHROOM_SILENCE_LIVENESS_PRODUCTION kill switch', () => {
    // `!== '0'` is what makes the switch default-ON: only an explicit '0' disables it.
    expect(gatewaySrc).toMatch(/SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'/)
  })
})
|
|
@@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'
|
|
|
2
2
|
import {
|
|
3
3
|
startTurn,
|
|
4
4
|
noteOutbound,
|
|
5
|
+
noteProduction,
|
|
5
6
|
noteThinking,
|
|
6
7
|
noteToolStart,
|
|
7
8
|
noteToolEnd,
|
|
@@ -136,6 +137,47 @@ describe('silence-poke — outbound resets the silence clock', () => {
|
|
|
136
137
|
})
|
|
137
138
|
})
|
|
138
139
|
|
|
140
|
+
// Production-liveness (2026-06-05): a rendered activity-feed line or draft
// update counts as the agent visibly working, so it restarts the silence
// clock and a long tool/composition turn is not torn down mid-work.
describe('silence-poke — noteProduction resets the silence clock', () => {
  it('a feed/draft render at 250s pushes the fallback measurement to it', () => {
    const deps = setupDeps()
    startTurn('k', 0)
    noteProduction('k', 250_000)

    // Only 50s have passed since the render: nothing fires yet.
    __tickForTests(300_000)
    expect(deps.fallbacks).toHaveLength(0)

    // A full 300s after the render: the fallback fires.
    __tickForTests(550_000)
    expect(deps.fallbacks).toHaveLength(1)
  })

  it('repeated production every 60s keeps a long turn alive indefinitely', () => {
    const deps = setupDeps()
    startTurn('k', 0)
    // Ten one-minute steps (60s … 600s), each with a render right before the tick.
    for (let minute = 1; minute <= 10; minute++) {
      const t = minute * 60_000
      noteProduction('k', t)
      __tickForTests(t)
    }
    // Ten minutes of steady renders and the turn was never torn down.
    expect(deps.fallbacks).toHaveLength(0)
  })

  it('production STOPS → the fallback fires 300s after the last render (genuine wedge)', () => {
    const deps = setupDeps()
    startTurn('k', 0)
    // Final render lands at 100s; nothing is produced afterwards.
    noteProduction('k', 100_000)

    // 290s of silence: still under the threshold.
    __tickForTests(390_000)
    expect(deps.fallbacks).toHaveLength(0)

    // 301s of silence: over the threshold, so the fallback fires.
    __tickForTests(401_000)
    expect(deps.fallbacks).toHaveLength(1)
  })

  it('is a no-op for an unknown key (no turn state)', () => {
    setupDeps()
    const pokeUnknownKey = () => noteProduction('nope', 1_000)
    expect(pokeUnknownKey).not.toThrow()
  })
})
|
|
180
|
+
|
|
139
181
|
// Pin the contract the gateway must uphold for ABNORMAL turn-ends:
|
|
140
182
|
// every code path that abandons a turn before turn_end (context-
|
|
141
183
|
// exhaust bail, gateway-side wedge timeout, silent-end recovery)
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Real-work UAT coverage — human-style prompts that trigger actual work
|
|
3
|
+
* (multi-tool, web research, sub-agents, background workers) plus a turn
|
|
4
|
+
* collector + bug detectors for the failure classes the conversational fuzz
|
|
5
|
+
* never exercised.
|
|
6
|
+
*
|
|
7
|
+
* Why this exists: the existing fuzz scenarios send conversational prompts
|
|
8
|
+
* ("hey how's it going", emoji, markdown edge-cases) → trivial fast replies.
|
|
9
|
+
* The status-surface and reply-ordering bugs (live feed going dark mid-work,
|
|
10
|
+
* the orphaned-reply backstop flushing a fragment then the real answer landing
|
|
11
|
+
* late and out of order, late replies misrouting) only manifest when the agent
|
|
12
|
+
* does REAL work — uses tools/MCPs, spawns sub-agents, researches long enough to
|
|
13
|
+
* cross the silence-poke / orphaned-reply thresholds. These prompts provoke that
|
|
14
|
+
* work in a human voice; `collectTurn` captures the whole bot-message sequence;
|
|
15
|
+
* `analyzeTurn` flags the known bug signatures.
|
|
16
|
+
*
|
|
17
|
+
* Harness limits (see CLAUDE.md): mtcute observes real sendMessage/editMessageText
|
|
18
|
+
* (so the activity feed `→/✓` and worker feed `🛠` ARE observable) but NOT drafts
|
|
19
|
+
* or reactions, and has no forum-topic API (channel scenarios use the General
|
|
20
|
+
* topic — they prove DM-vs-channel routing, not correct-topic-among-many, which
|
|
21
|
+
* the gateway unit thread-assertions pin). So work-triggering is probabilistic on
|
|
22
|
+
* a generic agent: the UNIVERSAL invariants (a substantive answer arrives, in the
|
|
23
|
+
* right surface, not as an orphaned fragment) are hard; the work-specific surfaces
|
|
24
|
+
* (feed painted, worker surfaced) are reported and only hard-checked once their
|
|
25
|
+
* precondition is observed.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import type { Driver, ObservedMessage } from "./driver.js";
|
|
29
|
+
import { isWorkerFeedMessage, isActivityFeedMessage } from "./assertions.js";
|
|
30
|
+
|
|
31
|
+
/** The category of real work a prompt is intended to provoke. */
export type WorkKind =
  | "research" // web/multi-source research → multi-tool, long
  | "multitool" // several tool calls, sequential
  | "subagent" // delegates to a foreground sub-agent
  | "bgworker" // dispatches a background worker (the 🛠 feed)
  | "compound" // first X then Y then summarise — ordered multi-step
  | "web"; // current/recent info → forces a web fetch
|
|
38
|
+
|
|
39
|
+
/** One real-work prompt plus the budgets and expectations used to judge its turn. */
export interface RealWorkCase {
  /** Short label used in test titles and log lines. */
  name: string;
  /** Human-style prompt that should provoke real work. */
  prompt: string;
  /** Which category of work the prompt is aiming for. */
  kind: WorkKind;
  /** Generous budget — deep research can run minutes. */
  timeoutMs: number;
  /** The substantive answer must be at least this long; a backstop fragment /
   * bare ack is shorter, so this distinguishes "the answer landed" from "only a
   * stub landed". */
  minAnswerChars: number;
  /** When set, this prompt is expected to reliably trigger the named surface.
   * `analyzeTurn` reports a miss as a soft `surface-missing` warning rather
   * than a hard violation. In REAL_WORK_CASES it is set on the
   * semi-prescriptive but natural-sounding background-worker and step-by-step
   * cases. */
  requireSurface?: "worker" | "activity";
}
|
|
55
|
+
|
|
56
|
+
/**
 * The case set. The first block is fully human-style (probabilistic work); the
 * `requireSurface` block phrases the dispatch naturally but reliably enough
 * that a missing surface is worth flagging (`analyzeTurn` reports it as a
 * `surface-missing` warning, not a hard failure). Keep prompts
 * provider-agnostic so they run on the generic test-harness agent (no
 * marko-specific MCPs).
 */
export const REAL_WORK_CASES: RealWorkCase[] = [
  {
    name: "deep research, take your time",
    prompt:
      "Can you research the current state of WebAssembly outside the browser — " +
      "the main server-side runtimes, who's actually using it in production, and " +
      "the real limitations today? Take your time and give me a proper rundown, " +
      "not a one-liner.",
    kind: "research",
    timeoutMs: 180_000,
    minAnswerChars: 400,
  },
  {
    name: "current info, forces a lookup",
    prompt:
      "What's the latest with the Bun JavaScript runtime — the recent releases " +
      "and whether people consider it production-ready yet? Check, don't guess.",
    kind: "web",
    timeoutMs: 150_000,
    minAnswerChars: 300,
  },
  {
    name: "multi-angle investigation",
    prompt:
      "Dig into Postgres vs SQLite for a small SaaS backend — look at it from a " +
      "few angles (concurrency, ops burden, cost at scale) and tell me which " +
      "you'd actually pick and why.",
    kind: "multitool",
    timeoutMs: 150_000,
    minAnswerChars: 400,
  },
  {
    name: "compound sequential ask",
    prompt:
      "First work out what today's date is, then how many days are left until the " +
      "end of this quarter, then suggest three concrete milestones I could hit " +
      "before then. Do it in that order.",
    kind: "compound",
    timeoutMs: 120_000,
    minAnswerChars: 250,
  },
  {
    name: "invite delegation",
    prompt:
      "I need a proper comparison of Stripe vs Paddle vs Lemon Squeezy for selling " +
      "a digital product — pricing, who handles sales tax, and payout timing. Farm " +
      "it out to a sub-agent if that's faster; just give me the bottom line at the end.",
    kind: "subagent",
    timeoutMs: 180_000,
    minAnswerChars: 350,
  },
  {
    name: "long sourced briefing (crosses thresholds)",
    prompt:
      "Give me a thorough, well-sourced briefing on the EU AI Act — what it covers, " +
      "the risk tiers, the key deadlines, and what a small AI startup actually has to " +
      "do. Be comprehensive; I'd rather wait and get depth.",
    kind: "research",
    // Longest budget in the first block: 6 minutes.
    timeoutMs: 360_000,
    minAnswerChars: 500,
  },
  // ── reliably-triggering, still natural voice ──────────────────────────────
  {
    name: "background worker, ping me when done",
    prompt:
      "Don't answer this inline — actually dispatch a background worker for it " +
      "(Task / Agent with run_in_background: true) so I can keep chatting while it " +
      "runs, and ping me when it's done. The task: go through, ONE step at a time " +
      "with a one-line note on each (run a quick command or jot a note per step so " +
      "there's visible progress), the eight most common email-deliverability " +
      "mistakes a solo founder makes — SPF, DKIM, DMARC, warmup, list hygiene, " +
      "content, sending cadence, monitoring. Pace it over a couple of minutes; do " +
      "all eight, then hand back the summary.",
    kind: "bgworker",
    // Generous: if the agent declines to background it and composes inline, a
    // paced 8-step answer can run past 5 min (and, with no tracked tool in
    // flight, trip the 300s silence-poke — see the 2026-06-05 UAT finding).
    timeoutMs: 360_000,
    minAnswerChars: 250,
    requireSurface: "worker",
  },
  {
    name: "step-by-step so the feed paints",
    prompt:
      "Walk through, ONE step at a time (run a quick command or note for each so I " +
      "can see progress), how you'd debug a Linux box that's suddenly out of disk " +
      "space — six steps: df, du on the big dirs, find large files, check logs, " +
      "check deleted-but-open files, then a cleanup plan. Then give me the recap.",
    kind: "multitool",
    timeoutMs: 180_000,
    minAnswerChars: 300,
    requireSurface: "activity",
  },
];
|
|
156
|
+
|
|
157
|
+
/** What the collector observed across one turn. */
export interface TurnObservation {
  /** Every non-driver message that arrived as an initial send (edits are
   * tracked separately in `edits`). Feed messages are included here. */
  botMessages: ObservedMessage[];
  /** Edit events seen (worker/activity feeds grow via edits). */
  edits: ObservedMessage[];
  /** The first substantive answer — an initial send that is neither a worker
   * nor an activity feed message and whose trimmed text is at least
   * `minAnswerChars` long — or null when none arrived. */
  answer: ObservedMessage | null;
  /** With an answer: the answer's `date` minus the local timestamp taken just
   * before the send. Without one: the time elapsed until collection stopped. */
  answerLatencyMs: number;
  /** Whether an activity feed (`→/✓`) message was seen, as a send or an edit. */
  sawActivityFeed: boolean;
  /** Whether a worker feed (`🛠 Worker`) message was seen, as a send or an edit. */
  sawWorkerFeed: boolean;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Send `prompt` to `chatId` and collect the messages observed in that chat
 * until a substantive answer lands (plus a short settle window to catch
 * trailing/late sends — the very window the orphaned-reply bug lives in) or
 * `timeoutMs` elapses.
 *
 * The message iterator is created BEFORE the send so nothing is missed.
 * NOTE(review): whether creating the iterator is enough to start the
 * subscription depends on how `driver.observeMessages` is implemented (a lazy
 * async generator would not start until the first `next()`) — confirm against
 * the driver.
 *
 * @param driver - Harness driver used to send the prompt and observe the chat.
 * @param chatId - Chat to send to and to observe.
 * @param driverUserId - Sender id of the driver account; its own messages are skipped.
 * @param prompt - Text to send.
 * @param opts - `timeoutMs`: overall budget, measured from just before the send.
 *   `minAnswerChars`: minimum trimmed length for a non-feed send to count as the
 *   answer. `settleMs`: how long to keep collecting after the answer (default 6000).
 * @returns Everything observed during the turn; `answer` is null when no
 *   substantive reply arrived within the budget.
 */
export async function collectTurn(
  driver: Driver,
  chatId: number,
  driverUserId: number,
  prompt: string,
  opts: { timeoutMs: number; minAnswerChars: number; settleMs?: number },
): Promise<TurnObservation> {
  const settleMs = opts.settleMs ?? 6_000;
  const botMessages: ObservedMessage[] = [];
  const edits: ObservedMessage[] = [];
  let answer: ObservedMessage | null = null;
  let sawActivityFeed = false;
  let sawWorkerFeed = false;

  const startedAt = Date.now();
  const iterator = driver.observeMessages(chatId)[Symbol.asyncIterator]();
  // Infinite until the answer lands; then it becomes "answer time + settleMs".
  let settleDeadline = Number.POSITIVE_INFINITY;

  try {
    // Begin observing, then send (observeMessages backfills nothing, but the
    // send round-trips after the iterator is live).
    await driver.sendText(chatId, prompt);

    while (true) {
      const now = Date.now();
      // Whichever runs out first: the overall budget or the settle window.
      const remaining = Math.min(opts.timeoutMs - (now - startedAt), settleDeadline - now);
      if (remaining <= 0) break;

      // Race the next message against a timer. The timer is cleared as soon as
      // the race settles: otherwise every received message would leave a
      // pending timeout (up to `timeoutMs` long) keeping the event loop alive.
      let timer: ReturnType<typeof setTimeout> | undefined;
      const timedOut = new Promise<{ done: true; value: undefined }>((resolve) => {
        timer = setTimeout(() => resolve({ done: true, value: undefined }), remaining);
      });
      const next = await Promise.race([iterator.next(), timedOut]).finally(() =>
        clearTimeout(timer),
      );
      if (next.done || next.value == null) {
        // timed out (either overall or settle), or the stream ended — stop
        break;
      }

      const m = next.value as ObservedMessage;
      if (m.senderUserId === driverUserId) continue; // our own echo
      if (m.edited) {
        edits.push(m);
        if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
        if (isActivityFeedMessage(m)) sawActivityFeed = true;
        continue;
      }
      botMessages.push(m);
      if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
      else if (isActivityFeedMessage(m)) sawActivityFeed = true;
      else if (answer == null && m.text.trim().length >= opts.minAnswerChars) {
        answer = m;
        // Got the answer; keep collecting for `settleMs` to catch a late
        // fragment/duplicate/misrouted trailing send.
        settleDeadline = Date.now() + settleMs;
      }
    }
  } finally {
    // Release the observer even when the send or the loop throws. Not awaited:
    // a `next()` may still be pending after a timeout, and waiting on
    // `return()` could then block behind it.
    void iterator.return?.();
  }

  return {
    botMessages,
    edits,
    answer,
    // NOTE(review): with an answer this subtracts a local clock reading from the
    // message's own timestamp — confirm both share a clock/resolution.
    answerLatencyMs: answer ? answer.date.getTime() - startedAt : Date.now() - startedAt,
    sawActivityFeed,
    sawWorkerFeed,
  };
}
|
|
242
|
+
|
|
243
|
+
/** One finding produced by `analyzeTurn`, used for both hard violations and soft warnings. */
export interface TurnViolation {
  /** Machine-readable finding kind. `analyzeTurn` emits `surface-missing` only
   * as a warning; the other three are emitted as violations. */
  code:
    | "no-answer"
    | "orphaned-fragment"
    | "surface-missing"
    | "wrong-surface";
  /** Human-readable explanation included in logs and failure messages. */
  detail: string;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Run the bug detectors over one collected turn.
 *
 * Findings are split in two: HARD `violations` are universal invariants that
 * must always hold; SOFT `warnings` cover work-specific surfaces that are
 * probabilistic on a generic agent (dispatching a worker / sub-agent is the
 * agent's judgment, so a missing feed is reported rather than failed).
 *
 * Hard violations:
 *  - no-answer: no substantive reply arrived at all.
 *  - orphaned-fragment: a non-empty, non-feed bot text under 150 chars landed
 *    at least 8s BEFORE the answer — the orphaned-reply backstop signature
 *    (fragment flushed, real reply late). A short message that is itself the
 *    only substantive reply, or a brief "on it" ack followed promptly, does
 *    not count.
 *  - wrong-surface: a collected message (send or edit) carries a chat id other
 *    than `expected.chatId`.
 *
 * Soft warnings:
 *  - surface-missing: a `requireSurface` case never showed its feed. For
 *    "activity", a worker feed also satisfies the expectation.
 *
 * @param obs - The observation produced by `collectTurn`.
 * @param expected - The chat the turn belongs to and, optionally, the surface
 *   the case is expected to paint.
 * @returns The hard violations and soft warnings, in detection order.
 */
export function analyzeTurn(
  obs: TurnObservation,
  expected: { requireSurface?: "worker" | "activity"; chatId: number },
): { violations: TurnViolation[]; warnings: TurnViolation[] } {
  const violations: TurnViolation[] = [];
  const warnings: TurnViolation[] = [];
  const { answer } = obs;

  if (answer == null) {
    violations.push({
      code: "no-answer",
      detail: `no substantive reply within budget (saw ${obs.botMessages.length} bot msg(s), ` +
        `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed})`,
    });
  } else {
    // orphaned-fragment: short non-feed texts sent at least 8s ahead of the answer.
    const stubs = obs.botMessages.filter((msg) => {
      if (msg.messageId === answer.messageId) return false;
      if (isWorkerFeedMessage(msg) || isActivityFeedMessage(msg)) return false;
      const trimmedLength = msg.text.trim().length;
      if (trimmedLength === 0 || trimmedLength >= 150) return false;
      return answer.date.getTime() - msg.date.getTime() >= 8_000;
    });
    if (stubs.length > 0) {
      violations.push({
        code: "orphaned-fragment",
        detail: `${stubs.length} stub message(s) landed ≥8s before the answer ` +
          `(e.g. ${JSON.stringify(stubs[0]!.text.slice(0, 60))}) — the orphaned-reply ` +
          `backstop signature.`,
      });
    }
  }

  // Required surface: only ever a warning, never a violation.
  switch (expected.requireSurface) {
    case "worker":
      if (!obs.sawWorkerFeed) {
        warnings.push({ code: "surface-missing", detail: "expected a 🛠 worker feed; agent likely answered inline" });
      }
      break;
    case "activity":
      // A worker feed counts as visible activity as well.
      if (!(obs.sawActivityFeed || obs.sawWorkerFeed)) {
        warnings.push({ code: "surface-missing", detail: "expected a →/✓ activity feed; none appeared" });
      }
      break;
    default:
      break;
  }

  // wrong-surface: any collected send or edit tagged with a different chat.
  const stray = obs.botMessages.concat(obs.edits).filter((msg) => msg.chatId !== expected.chatId);
  if (stray.length > 0) {
    violations.push({
      code: "wrong-surface",
      detail: `${stray.length} bot message(s) landed in chat ${stray[0]!.chatId}, expected ${expected.chatId}`,
    });
  }

  return { violations, warnings };
}
|
|
324
|
+
|
|
325
|
+
/**
 * Render a single-line summary of a turn for the test log (bug-hunt forensics).
 *
 * @param name - Case label to print after the `[real-work]` tag.
 * @param obs - The collected turn.
 * @returns The answer's trimmed length and latency in whole seconds (or `NONE`),
 *   followed by message/edit counts and the two feed flags.
 */
export function summarizeTurn(name: string, obs: TurnObservation): string {
  let answerPart = "NONE";
  if (obs.answer) {
    const chars = obs.answer.text.trim().length;
    const seconds = Math.round(obs.answerLatencyMs / 1000);
    answerPart = `${chars}ch@${seconds}s`;
  }

  const head = `[real-work] ${name}: answer=${answerPart} `;
  const counts = `botMsgs=${obs.botMessages.length} edits=${obs.edits.length} `;
  const feeds = `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed}`;
  return head + counts + feeds;
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Real-work UAT (channel) — the DM real-work suite, in a forum supergroup.
|
|
3
|
+
* Proves the status surface (activity/worker feed) AND the answer land IN the
|
|
4
|
+
* channel under genuine work — not leaked to the owner DM — and that a late
|
|
5
|
+
* reply after a long tool turn doesn't escape the channel. Self-skips green when
|
|
6
|
+
* SWITCHROOM_UAT_CHAT_ID is unset or the chat isn't a resolvable supergroup.
|
|
7
|
+
*
|
|
8
|
+
* mtcute has no forum-topic API, so this uses the supergroup's General topic: it
|
|
9
|
+
* proves DM-vs-channel routing, not correct-topic-among-many (the gateway unit
|
|
10
|
+
* thread-assertions pin that). See real-work-prompts.ts.
|
|
11
|
+
*/
|
|
12
|
+
import { describe, it, expect, beforeAll, afterAll } from "vitest";
|
|
13
|
+
import { spinUp, type Scenario } from "../harness.js";
|
|
14
|
+
import {
|
|
15
|
+
REAL_WORK_CASES,
|
|
16
|
+
collectTurn,
|
|
17
|
+
analyzeTurn,
|
|
18
|
+
summarizeTurn,
|
|
19
|
+
} from "../real-work-prompts.js";
|
|
20
|
+
|
|
21
|
+
const SUPERGROUP_ID = Number.parseInt(process.env.SWITCHROOM_UAT_CHAT_ID ?? "", 10);
|
|
22
|
+
|
|
23
|
+
describe("uat: real-work channel — status + answer land in the supergroup", () => {
  // Shared across the suite; stays null when the suite self-skips.
  let sc: Scenario | null = null;
  // True only once the supergroup has been resolved by the driver.
  let postable = false;

  beforeAll(async () => {
    if (!Number.isFinite(SUPERGROUP_ID)) {
      console.warn("[uat] SWITCHROOM_UAT_CHAT_ID unset — skipping real-work channel suite");
      return;
    }
    sc = await spinUp({ agent: "test-harness" });
    await sc.driver.primeDialogs();
    postable = await sc.driver.canResolve(SUPERGROUP_ID);
    if (!postable) {
      console.warn(`[uat] supergroup ${SUPERGROUP_ID} not resolvable — skipping real-work channel suite`);
    }
  });

  // Release the scenario spun up in beforeAll (also when the supergroup turned
  // out not to be resolvable), mirroring the per-test tearDown in the DM suite.
  afterAll(async () => {
    await sc?.tearDown();
  });

  for (const fc of REAL_WORK_CASES) {
    it(
      `[real-work-sg] ${fc.name} (${fc.kind}) — answer + surface land in the channel`,
      async () => {
        if (sc == null || !postable) return; // self-skip green
        await sc.driver.primeDialogs();
        const obs = await collectTurn(
          sc.driver,
          SUPERGROUP_ID,
          sc.driverUserId,
          fc.prompt,
          { timeoutMs: fc.timeoutMs, minAnswerChars: fc.minAnswerChars },
        );
        // Forensic log: printed for every case, including ones that pass.
        console.log(summarizeTurn(`sg:${fc.name}`, obs));
        if (obs.answer != null) {
          console.log(
            `[real-work-sg] ${fc.name} answer: ${JSON.stringify(obs.answer.text.slice(0, 180))}`,
          );
        }

        const { violations, warnings } = analyzeTurn(obs, {
          requireSurface: fc.requireSurface,
          chatId: SUPERGROUP_ID, // wrong-surface detector = leaked out of the channel
        });
        // Warnings are logged only; they never fail the test.
        for (const w of warnings) {
          console.warn(`[real-work-sg] ${fc.name}: WARN ${w.code}: ${w.detail}`);
        }
        if (violations.length > 0) {
          throw new Error(
            `[real-work-sg] ${fc.name}: ${violations.length} invariant violation(s):\n` +
              violations.map((x) => ` - ${x.code}: ${x.detail}`).join("\n"),
          );
        }
        // Every observed bot message must be in the channel (the routing proof).
        for (const m of [...obs.botMessages, ...obs.edits]) {
          expect(m.chatId).toBe(SUPERGROUP_ID);
        }
        expect(obs.answer).not.toBeNull();
      },
      // Test timeout: the collection budget plus 45s on top.
      fc.timeoutMs + 45_000,
    );
  }
});
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Real-work UAT (DM) — human-style prompts that trigger genuine work
|
|
3
|
+
* (multi-tool / web research / sub-agents / background workers), asserting the
|
|
4
|
+
* status-surface + reply-ordering invariants the conversational fuzz never
|
|
5
|
+
* exercised. The status-dark, orphaned-reply-fragment, and late-reply bugs only
|
|
6
|
+
* appear when the agent actually does work; these prompts provoke it in a human
|
|
7
|
+
* voice, `collectTurn` captures the whole bot-message sequence, and `analyzeTurn`
|
|
8
|
+
* flags the known bug signatures. See real-work-prompts.ts for rationale + the
|
|
9
|
+
* mtcute harness limits.
|
|
10
|
+
*/
|
|
11
|
+
import { describe, it, expect } from "vitest";
|
|
12
|
+
import { spinUp } from "../harness.js";
|
|
13
|
+
import {
|
|
14
|
+
REAL_WORK_CASES,
|
|
15
|
+
collectTurn,
|
|
16
|
+
analyzeTurn,
|
|
17
|
+
summarizeTurn,
|
|
18
|
+
} from "../real-work-prompts.js";
|
|
19
|
+
|
|
20
|
+
describe("uat: real-work DM — status surface + ordering under genuine work", () => {
  for (const fc of REAL_WORK_CASES) {
    const title = `[real-work] ${fc.name} (${fc.kind}) — answer lands, surface holds`;
    // Test timeout: the collection budget plus 45s on top.
    const testBudgetMs = fc.timeoutMs + 45_000;

    it(
      title,
      async () => {
        // A fresh scenario per case, always torn down in `finally`.
        const sc = await spinUp({ agent: "test-harness" });
        try {
          // The bot's user id is used as the chat id for this suite.
          const dmChatId = sc.botUserId;
          const budgets = { timeoutMs: fc.timeoutMs, minAnswerChars: fc.minAnswerChars };
          const obs = await collectTurn(sc.driver, dmChatId, sc.driverUserId, fc.prompt, budgets);

          // Forensic log — read during the bug hunt to spot dark feeds, late
          // fragments, and surface gaps even when the case "passes".
          console.log(summarizeTurn(fc.name, obs));
          if (obs.answer != null) {
            const preview = JSON.stringify(obs.answer.text.slice(0, 180));
            console.log(`[real-work] ${fc.name} answer: ${preview}`);
          }

          const { violations, warnings } = analyzeTurn(obs, {
            requireSurface: fc.requireSurface,
            chatId: dmChatId,
          });
          // Warnings are logged only; they never fail the test.
          for (const w of warnings) {
            console.warn(`[real-work] ${fc.name}: WARN ${w.code}: ${w.detail}`);
          }
          if (violations.length > 0) {
            const bullets = violations.map((x) => ` - ${x.code}: ${x.detail}`).join("\n");
            throw new Error(
              `[real-work] ${fc.name}: ${violations.length} invariant violation(s):\n` + bullets,
            );
          }
          expect(obs.answer).not.toBeNull();
        } finally {
          await sc.tearDown();
        }
      },
      testBudgetMs,
    );
  }
});
|