switchroom 0.14.66 → 0.14.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +453 -325
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +24 -7
- package/telegram-plugin/gateway/gateway.ts +42 -2
- package/telegram-plugin/silence-poke.ts +25 -0
- package/telegram-plugin/tests/silence-liveness-wiring.test.ts +67 -0
- package/telegram-plugin/tests/silence-poke.test.ts +42 -0
- package/telegram-plugin/uat/real-work-prompts.ts +332 -0
- package/telegram-plugin/uat/scenarios/fuzz-real-work-channel.test.ts +82 -0
- package/telegram-plugin/uat/scenarios/fuzz-real-work-dm.test.ts +64 -0
package/package.json
CHANGED
package/telegram-plugin/dist/gateway/gateway.js
CHANGED
|
@@ -39034,6 +39034,13 @@ function noteOutbound2(key, now) {
|
|
|
39034
39034
|
s.lastOutboundAt = now;
|
|
39035
39035
|
s.fallbackFired = false;
|
|
39036
39036
|
}
|
|
39037
|
+
function noteProduction(key, now) {
|
|
39038
|
+
const s = state2.get(key);
|
|
39039
|
+
if (s == null)
|
|
39040
|
+
return;
|
|
39041
|
+
s.lastOutboundAt = now;
|
|
39042
|
+
s.fallbackFired = false;
|
|
39043
|
+
}
|
|
39037
39044
|
function noteThinking(key, now) {
|
|
39038
39045
|
const s = state2.get(key);
|
|
39039
39046
|
if (s == null)
|
|
@@ -52763,11 +52770,11 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
|
|
|
52763
52770
|
}
|
|
52764
52771
|
|
|
52765
52772
|
// ../src/build-info.ts
|
|
52766
|
-
var VERSION = "0.14.66";
|
|
52767
|
-
var COMMIT_SHA = "
|
|
52768
|
-
var COMMIT_DATE = "2026-06-
|
|
52769
|
-
var LATEST_PR =
|
|
52770
|
-
var COMMITS_AHEAD_OF_TAG =
|
|
52773
|
+
var VERSION = "0.14.67";
|
|
52774
|
+
var COMMIT_SHA = "dcade213";
|
|
52775
|
+
var COMMIT_DATE = "2026-06-05T08:22:01Z";
|
|
52776
|
+
var LATEST_PR = 2171;
|
|
52777
|
+
var COMMITS_AHEAD_OF_TAG = 4;
|
|
52771
52778
|
|
|
52772
52779
|
// gateway/boot-version.ts
|
|
52773
52780
|
function formatRelativeAgo(iso) {
|
|
@@ -54075,7 +54082,7 @@ function findLatestEndedTurnForChat(chatId) {
|
|
|
54075
54082
|
return latest;
|
|
54076
54083
|
}
|
|
54077
54084
|
function resolveAnswerThreadWithLog(chatId, explicitThreadId, originTurn, liveTurn, surface) {
|
|
54078
|
-
const recovered = LATE_REPLY_TOPIC_RECOVERY_ENABLED && explicitThreadId == null && originTurn == null && liveTurn?.sessionThreadId == null ? findLatestEndedTurnForChat(chatId) : null;
|
|
54085
|
+
const recovered = LATE_REPLY_TOPIC_RECOVERY_ENABLED && explicitThreadId == null && originTurn == null && liveTurn == null ? findLatestEndedTurnForChat(chatId) : null;
|
|
54079
54086
|
const threadId = resolveAnswerThreadId({
|
|
54080
54087
|
explicitThreadId,
|
|
54081
54088
|
originResolved: originTurn != null,
|
|
@@ -55260,6 +55267,7 @@ function parsePositiveMsEnv(name, fallbackMs) {
|
|
|
55260
55267
|
var SILENCE_FALLBACK_MS = parsePositiveMsEnv("SWITCHROOM_SILENCE_FALLBACK_MS", 300000);
|
|
55261
55268
|
var SILENCE_FALLBACK_HARD_MS = parsePositiveMsEnv("SWITCHROOM_SILENCE_FALLBACK_HARD_MS", 900000);
|
|
55262
55269
|
var SILENCE_DEFER_INFLIGHT_TOOLS = process.env.SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS === "1";
|
|
55270
|
+
var SILENCE_LIVENESS_PRODUCTION = process.env.SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== "0";
|
|
55263
55271
|
startTimer({
|
|
55264
55272
|
thresholdsMs: { fallback: SILENCE_FALLBACK_MS, fallbackHardCeiling: SILENCE_FALLBACK_HARD_MS },
|
|
55265
55273
|
deferFallbackWhileToolInFlight: SILENCE_DEFER_INFLIGHT_TOOLS,
|
|
@@ -55351,8 +55359,11 @@ startTimer({
|
|
|
55351
55359
|
const sib = silenceMsForKey(siblingKey, fbNow);
|
|
55352
55360
|
return sib == null || sib >= DEFAULT_THRESHOLDS.fallback;
|
|
55353
55361
|
});
|
|
55354
|
-
if (turnMatchesFallback && currentTurn === wedgedTurn)
|
|
55362
|
+
if (turnMatchesFallback && currentTurn === wedgedTurn && wedgedTurn != null) {
|
|
55363
|
+
process.stderr.write(`telegram gateway: ${formatTurnLifecycle("clear", "silence_fallback", wedgedTurn, Date.now())}
|
|
55364
|
+
`);
|
|
55355
55365
|
currentTurn = null;
|
|
55366
|
+
}
|
|
55356
55367
|
try {
|
|
55357
55368
|
clearSilentEndState(fbKey);
|
|
55358
55369
|
} catch {}
|
|
@@ -58076,6 +58087,9 @@ function handleSessionEvent(ev) {
|
|
|
58076
58087
|
const rendered = appendActivityLabel(turn.mirrorLines, ev.label);
|
|
58077
58088
|
if (rendered != null) {
|
|
58078
58089
|
turn.lastToolLabelAt = Date.now();
|
|
58090
|
+
if (SILENCE_LIVENESS_PRODUCTION && currentTurn === turn) {
|
|
58091
|
+
noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
|
|
58092
|
+
}
|
|
58079
58093
|
turn.activityPendingRender = composeTurnActivity(turn) ?? rendered;
|
|
58080
58094
|
if (turn.activityInFlight == null) {
|
|
58081
58095
|
turn.activityInFlight = drainActivitySummary(turn);
|
|
@@ -58130,6 +58144,9 @@ function handleSessionEvent(ev) {
|
|
|
58130
58144
|
logStreamingEvent(metricEv);
|
|
58131
58145
|
if (currentTurn === turn) {
|
|
58132
58146
|
noteSignal(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
|
|
58147
|
+
if (SILENCE_LIVENESS_PRODUCTION) {
|
|
58148
|
+
noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now());
|
|
58149
|
+
}
|
|
58133
58150
|
}
|
|
58134
58151
|
},
|
|
58135
58152
|
checkDedup: (text) => {
|
|
package/telegram-plugin/gateway/gateway.ts
CHANGED
|
@@ -1930,11 +1930,17 @@ function resolveAnswerThreadWithLog(
|
|
|
1930
1930
|
liveTurn: CurrentTurn | null,
|
|
1931
1931
|
surface: 'reply' | 'stream_reply',
|
|
1932
1932
|
): number | undefined {
|
|
1933
|
+
// Recover ONLY for a genuinely LATE reply — no live turn at all. Gating on
|
|
1934
|
+
// `liveTurn?.sessionThreadId == null` (the original) also fired for a
|
|
1935
|
+
// threadless DM that still had a live turn, marking every DM reply
|
|
1936
|
+
// `via=recovered`/RECOVERED in the telemetry (routing result unchanged —
|
|
1937
|
+
// DM → undefined — but it drowned the real supergroup recoveries the marker
|
|
1938
|
+
// exists to surface). `liveTurn == null` is the precise late-reply condition.
|
|
1933
1939
|
const recovered =
|
|
1934
1940
|
LATE_REPLY_TOPIC_RECOVERY_ENABLED &&
|
|
1935
1941
|
explicitThreadId == null &&
|
|
1936
1942
|
originTurn == null &&
|
|
1937
|
-
liveTurn?.sessionThreadId == null
|
|
1943
|
+
liveTurn == null
|
|
1938
1944
|
? findLatestEndedTurnForChat(chatId)
|
|
1939
1945
|
: null
|
|
1940
1946
|
const threadId = resolveAnswerThreadId({
|
|
@@ -4673,6 +4679,12 @@ function parsePositiveMsEnv(name: string, fallbackMs: number): number {
|
|
|
4673
4679
|
const SILENCE_FALLBACK_MS = parsePositiveMsEnv('SWITCHROOM_SILENCE_FALLBACK_MS', 300_000)
|
|
4674
4680
|
const SILENCE_FALLBACK_HARD_MS = parsePositiveMsEnv('SWITCHROOM_SILENCE_FALLBACK_HARD_MS', 900_000)
|
|
4675
4681
|
const SILENCE_DEFER_INFLIGHT_TOOLS = process.env.SWITCHROOM_SILENCE_DEFER_INFLIGHT_TOOLS === '1'
|
|
4682
|
+
// Production-liveness (2026-06-05 UAT finding). Count an activity-feed render or
|
|
4683
|
+
// an answer-stream draft update as liveness for the silence clock, so a long
|
|
4684
|
+
// tool/composition turn that's visibly producing doesn't trip the 300s fallback
|
|
4685
|
+
// and null currentTurn mid-work. Default ON; SWITCHROOM_SILENCE_LIVENESS_PRODUCTION=0
|
|
4686
|
+
// restores the legacy "only a real reply resets the clock" behaviour.
|
|
4687
|
+
const SILENCE_LIVENESS_PRODUCTION = process.env.SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'
|
|
4676
4688
|
|
|
4677
4689
|
silencePoke.startTimer({
|
|
4678
4690
|
thresholdsMs: { fallback: SILENCE_FALLBACK_MS, fallbackHardCeiling: SILENCE_FALLBACK_HARD_MS },
|
|
@@ -4889,7 +4901,16 @@ silencePoke.startTimer({
|
|
|
4889
4901
|
// returns null and the regular teardown short-circuits. Without
|
|
4890
4902
|
// this, the late event would re-emit `turn_ended` AND clobber
|
|
4891
4903
|
// whatever fresh turn the next inbound started.
|
|
4892
|
-
if (turnMatchesFallback && currentTurn === wedgedTurn) currentTurn = null
|
|
4904
|
+
if (turnMatchesFallback && currentTurn === wedgedTurn && wedgedTurn != null) {
|
|
4905
|
+
// Status-surface observability: emit the lifecycle CLEAR for the
|
|
4906
|
+
// silence-poke teardown so a fallback-nulled turn has a turn-lifecycle
|
|
4907
|
+
// line like every other clear path (the framework-fallback line below is
|
|
4908
|
+
// its own format — this makes the dark-out greppable in the same shape).
|
|
4909
|
+
process.stderr.write(
|
|
4910
|
+
`telegram gateway: ${formatTurnLifecycle('clear', 'silence_fallback', wedgedTurn, Date.now())}\n`,
|
|
4911
|
+
)
|
|
4912
|
+
currentTurn = null
|
|
4913
|
+
}
|
|
4893
4914
|
// Best-effort: clear any pending silent-end marker so the Stop hook
|
|
4894
4915
|
// doesn't double-block when claude eventually exits the wedged turn.
|
|
4895
4916
|
try {
|
|
@@ -9452,6 +9473,16 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
9452
9473
|
// the " · Ns" elapsed restarts from this step (and the feed itself just
|
|
9453
9474
|
// advanced, so it isn't stale).
|
|
9454
9475
|
turn.lastToolLabelAt = Date.now()
|
|
9476
|
+
// Production-liveness: a NEW model-driven activity label is genuine
|
|
9477
|
+
// liveness (the model emitted a new step), so reset the silence-poke
|
|
9478
|
+
// clock — this is the safe site, NOT drainActivitySummary, because the
|
|
9479
|
+
// framework feedHeartbeatTick also drains (climbing-elapsed re-renders)
|
|
9480
|
+
// and would falsely reset the clock forever on a hung-mid-tool turn,
|
|
9481
|
+
// reintroducing the #1556 dangling-turn wedge. Only the model emitting a
|
|
9482
|
+
// fresh label reaches here.
|
|
9483
|
+
if (SILENCE_LIVENESS_PRODUCTION && currentTurn === turn) {
|
|
9484
|
+
silencePoke.noteProduction(statusKey(turn.sessionChatId, turn.sessionThreadId), Date.now())
|
|
9485
|
+
}
|
|
9455
9486
|
// Recompose so any active foreground sub-agent's nested block (Model A)
|
|
9456
9487
|
// is preserved when the parent appends its own step. composeTurnActivity
|
|
9457
9488
|
// == the flat render when no foreground sub-agent is active.
|
|
@@ -9612,6 +9643,15 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
9612
9643
|
statusKey(turn.sessionChatId, turn.sessionThreadId),
|
|
9613
9644
|
Date.now(),
|
|
9614
9645
|
)
|
|
9646
|
+
// Production-liveness: a draft update is the agent visibly
|
|
9647
|
+
// composing — reset the silence-poke clock so a long
|
|
9648
|
+
// compose-only turn (no tools, no reply yet) isn't torn down.
|
|
9649
|
+
if (SILENCE_LIVENESS_PRODUCTION) {
|
|
9650
|
+
silencePoke.noteProduction(
|
|
9651
|
+
statusKey(turn.sessionChatId, turn.sessionThreadId),
|
|
9652
|
+
Date.now(),
|
|
9653
|
+
)
|
|
9654
|
+
}
|
|
9615
9655
|
}
|
|
9616
9656
|
},
|
|
9617
9657
|
// #646 — wire the shared outboundDedup into the answer-stream
|
|
package/telegram-plugin/silence-poke.ts
CHANGED
|
@@ -196,6 +196,31 @@ export function noteOutbound(key: string, now: number): void {
|
|
|
196
196
|
s.fallbackFired = false
|
|
197
197
|
}
|
|
198
198
|
|
|
199
|
+
/**
|
|
200
|
+
* Record observable PRODUCTION that isn't a final reply — an activity-feed
|
|
201
|
+
* render (`→/✓` edit-in-place message) or an answer-stream draft update. Resets
|
|
202
|
+
* the silence clock exactly like a reply.
|
|
203
|
+
*
|
|
204
|
+
* Why this exists (2026-06-05): the header's "only a real reply counts; tool
|
|
205
|
+
* churn / the model ripping through 20 tool calls is still SILENT to the user"
|
|
206
|
+
* rule predates the live activity feed (#2162) and the compose draft. Those
|
|
207
|
+
* surfaces ARE user-visible now, so a turn actively rendering them is NOT
|
|
208
|
+
* silent — yet the 300s fallback (which nulls `currentTurn` and kills the very
|
|
209
|
+
* feed/draft the user is watching) still fired on a long tool/composition turn,
|
|
210
|
+
* darkening the live status mid-work. Counting production as liveness makes the
|
|
211
|
+
* fallback fire only on GENUINE silence (no reply, no feed, no draft, no tool
|
|
212
|
+
* events for the window) — a real wedge. A wedged agent produces nothing
|
|
213
|
+
* observable, so its clock is never reset and it still recovers.
|
|
214
|
+
*
|
|
215
|
+
* No-op when the kill switch is on or the key has no turn.
|
|
216
|
+
*/
|
|
217
|
+
export function noteProduction(key: string, now: number): void {
|
|
218
|
+
const s = state.get(key)
|
|
219
|
+
if (s == null) return
|
|
220
|
+
s.lastOutboundAt = now
|
|
221
|
+
s.fallbackFired = false
|
|
222
|
+
}
|
|
223
|
+
|
|
199
224
|
/**
|
|
200
225
|
* Record a `thinking` session event. Used to pick "still thinking…" vs
|
|
201
226
|
* "still working…" wording for the 300s framework fallback.
|
|
package/telegram-plugin/tests/silence-liveness-wiring.test.ts
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Silence-poke production-liveness — heartbeat-safety guard (2026-06-05).
|
|
3
|
+
*
|
|
4
|
+
* The production-liveness fix resets the silence clock on observable production
|
|
5
|
+
* so a long WORKING turn doesn't dark out. The load-bearing constraint: the
|
|
6
|
+
* reset must fire ONLY on MODEL-driven production, NEVER from the framework
|
|
7
|
+
* `feedHeartbeatTick` — a model-INDEPENDENT setInterval that re-renders a
|
|
8
|
+
* climbing " · Ns" elapsed every 6s (defeating the feed's content-dedup). If the
|
|
9
|
+
* reset lived in `drainActivitySummary` (which the heartbeat drains), a
|
|
10
|
+
* hung-but-bridge-connected agent would have its 300s silence clock reset every
|
|
11
|
+
* 6s forever, the load-bearing silence-poke unwedge would NEVER fire, and the
|
|
12
|
+
* conversation would be pinned — the #1556 permanent dangling-turn wedge.
|
|
13
|
+
*
|
|
14
|
+
* An adversarial review panel caught exactly this in an earlier revision. These
|
|
15
|
+
* are STRUCTURAL assertions (the gateway IIFE can't be instantiated in-process —
|
|
16
|
+
* same pattern as multitopic-routing-wiring.test) that pin the reset to the
|
|
17
|
+
* model-driven sites so a refactor can't silently reintroduce the regression.
|
|
18
|
+
* The behavioural counterpart (noteProduction resets; STOP producing → fires)
|
|
19
|
+
* lives in silence-poke.test.ts; this guards the WIRING the heartbeat must not
|
|
20
|
+
* cross.
|
|
21
|
+
*/
|
|
22
|
+
import { describe, it, expect } from 'vitest'
|
|
23
|
+
import { readFileSync } from 'node:fs'
|
|
24
|
+
import { resolve } from 'node:path'
|
|
25
|
+
|
|
26
|
+
const gatewaySrc = readFileSync(resolve(__dirname, '..', 'gateway', 'gateway.ts'), 'utf-8')
|
|
27
|
+
|
|
28
|
+
function between(src: string, startMarker: string, endMarker: string): string {
|
|
29
|
+
const after = src.split(startMarker)[1] ?? ''
|
|
30
|
+
return after.split(endMarker)[0] ?? ''
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
describe('silence-poke production-liveness — heartbeat safety', () => {
|
|
34
|
+
it('drainActivitySummary must NOT reset the silence clock (the framework heartbeat drains here)', () => {
|
|
35
|
+
const body = between(gatewaySrc, 'async function drainActivitySummary', '\nfunction feedHeartbeatTick')
|
|
36
|
+
expect(body.length).toBeGreaterThan(100) // sanity: the slice found the function body
|
|
37
|
+
expect(body).not.toMatch(/noteProduction/)
|
|
38
|
+
})
|
|
39
|
+
|
|
40
|
+
it('feedHeartbeatTick itself must NOT reset the silence clock (model-independent re-render)', () => {
|
|
41
|
+
const body = between(gatewaySrc, 'function feedHeartbeatTick(): void {', '\n}')
|
|
42
|
+
expect(body.length).toBeGreaterThan(50)
|
|
43
|
+
expect(body).not.toMatch(/noteProduction/)
|
|
44
|
+
})
|
|
45
|
+
|
|
46
|
+
it('the MODEL-driven tool-label append IS the reset site, gated on the live turn', () => {
|
|
47
|
+
// appendActivityLabel returns a fresh render only when the model emits a NEW
|
|
48
|
+
// labelled step — the genuine liveness signal the heartbeat can never forge.
|
|
49
|
+
const block = between(
|
|
50
|
+
gatewaySrc,
|
|
51
|
+
'const rendered = appendActivityLabel(turn.mirrorLines, ev.label)',
|
|
52
|
+
'\n return',
|
|
53
|
+
)
|
|
54
|
+
expect(block).toMatch(/silencePoke\.noteProduction/)
|
|
55
|
+
expect(block).toMatch(/currentTurn === turn/)
|
|
56
|
+
})
|
|
57
|
+
|
|
58
|
+
it('the answer-stream draft onMetric reset is model-driven and gated on the live turn', () => {
|
|
59
|
+
const block = between(gatewaySrc, 'onMetric: (metricEv) => {', '\n },')
|
|
60
|
+
expect(block).toMatch(/silencePoke\.noteProduction/)
|
|
61
|
+
expect(block).toMatch(/currentTurn === turn/)
|
|
62
|
+
})
|
|
63
|
+
|
|
64
|
+
it('production-liveness is behind the default-ON SWITCHROOM_SILENCE_LIVENESS_PRODUCTION kill switch', () => {
|
|
65
|
+
expect(gatewaySrc).toMatch(/SWITCHROOM_SILENCE_LIVENESS_PRODUCTION !== '0'/)
|
|
66
|
+
})
|
|
67
|
+
})
|
|
package/telegram-plugin/tests/silence-poke.test.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'
|
|
|
2
2
|
import {
|
|
3
3
|
startTurn,
|
|
4
4
|
noteOutbound,
|
|
5
|
+
noteProduction,
|
|
5
6
|
noteThinking,
|
|
6
7
|
noteToolStart,
|
|
7
8
|
noteToolEnd,
|
|
@@ -136,6 +137,47 @@ describe('silence-poke — outbound resets the silence clock', () => {
|
|
|
136
137
|
})
|
|
137
138
|
})
|
|
138
139
|
|
|
140
|
+
// Production-liveness (2026-06-05): an activity-feed render or draft update is
|
|
141
|
+
// the agent visibly working — it resets the silence clock so a long
|
|
142
|
+
// tool/composition turn isn't torn down mid-work.
|
|
143
|
+
describe('silence-poke — noteProduction resets the silence clock', () => {
|
|
144
|
+
it('a feed/draft render at 250s pushes the fallback measurement to it', () => {
|
|
145
|
+
const fx = setupDeps()
|
|
146
|
+
startTurn('k', 0)
|
|
147
|
+
noteProduction('k', 250_000)
|
|
148
|
+
__tickForTests(300_000) // 50s since production — no fire
|
|
149
|
+
expect(fx.fallbacks).toHaveLength(0)
|
|
150
|
+
__tickForTests(550_000) // 300s since production — fires
|
|
151
|
+
expect(fx.fallbacks).toHaveLength(1)
|
|
152
|
+
})
|
|
153
|
+
|
|
154
|
+
it('repeated production every 60s keeps a long turn alive indefinitely', () => {
|
|
155
|
+
const fx = setupDeps()
|
|
156
|
+
startTurn('k', 0)
|
|
157
|
+
for (let t = 60_000; t <= 600_000; t += 60_000) {
|
|
158
|
+
noteProduction('k', t)
|
|
159
|
+
__tickForTests(t)
|
|
160
|
+
}
|
|
161
|
+
// 10 min of steady feed/draft renders — never torn down.
|
|
162
|
+
expect(fx.fallbacks).toHaveLength(0)
|
|
163
|
+
})
|
|
164
|
+
|
|
165
|
+
it('production STOPS → the fallback fires 300s after the last render (genuine wedge)', () => {
|
|
166
|
+
const fx = setupDeps()
|
|
167
|
+
startTurn('k', 0)
|
|
168
|
+
noteProduction('k', 100_000) // last render at 100s, then silence
|
|
169
|
+
__tickForTests(390_000) // 290s since last render — no fire
|
|
170
|
+
expect(fx.fallbacks).toHaveLength(0)
|
|
171
|
+
__tickForTests(401_000) // 301s since last render — fires
|
|
172
|
+
expect(fx.fallbacks).toHaveLength(1)
|
|
173
|
+
})
|
|
174
|
+
|
|
175
|
+
it('is a no-op for an unknown key (no turn state)', () => {
|
|
176
|
+
setupDeps()
|
|
177
|
+
expect(() => noteProduction('nope', 1_000)).not.toThrow()
|
|
178
|
+
})
|
|
179
|
+
})
|
|
180
|
+
|
|
139
181
|
// Pin the contract the gateway must uphold for ABNORMAL turn-ends:
|
|
140
182
|
// every code path that abandons a turn before turn_end (context-
|
|
141
183
|
// exhaust bail, gateway-side wedge timeout, silent-end recovery)
|
|
package/telegram-plugin/uat/real-work-prompts.ts
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Real-work UAT coverage — human-style prompts that trigger actual work
|
|
3
|
+
* (multi-tool, web research, sub-agents, background workers) plus a turn
|
|
4
|
+
* collector + bug detectors for the failure classes the conversational fuzz
|
|
5
|
+
* never exercised.
|
|
6
|
+
*
|
|
7
|
+
* Why this exists: the existing fuzz scenarios send conversational prompts
|
|
8
|
+
* ("hey how's it going", emoji, markdown edge-cases) → trivial fast replies.
|
|
9
|
+
* The status-surface and reply-ordering bugs (live feed going dark mid-work,
|
|
10
|
+
* the orphaned-reply backstop flushing a fragment then the real answer landing
|
|
11
|
+
* late and out of order, late replies misrouting) only manifest when the agent
|
|
12
|
+
* does REAL work — uses tools/MCPs, spawns sub-agents, researches long enough to
|
|
13
|
+
* cross the silence-poke / orphaned-reply thresholds. These prompts provoke that
|
|
14
|
+
* work in a human voice; `collectTurn` captures the whole bot-message sequence;
|
|
15
|
+
* `analyzeTurn` flags the known bug signatures.
|
|
16
|
+
*
|
|
17
|
+
* Harness limits (see CLAUDE.md): mtcute observes real sendMessage/editMessageText
|
|
18
|
+
* (so the activity feed `→/✓` and worker feed `🛠` ARE observable) but NOT drafts
|
|
19
|
+
* or reactions, and has no forum-topic API (channel scenarios use the General
|
|
20
|
+
* topic — they prove DM-vs-channel routing, not correct-topic-among-many, which
|
|
21
|
+
* the gateway unit thread-assertions pin). So work-triggering is probabilistic on
|
|
22
|
+
* a generic agent: the UNIVERSAL invariants (a substantive answer arrives, in the
|
|
23
|
+
* right surface, not as an orphaned fragment) are hard; the work-specific surfaces
|
|
24
|
+
* (feed painted, worker surfaced) are reported and only hard-checked once their
|
|
25
|
+
* precondition is observed.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
import type { Driver, ObservedMessage } from "./driver.js";
|
|
29
|
+
import { isWorkerFeedMessage, isActivityFeedMessage } from "./assertions.js";
|
|
30
|
+
|
|
31
|
+
export type WorkKind =
|
|
32
|
+
| "research" // web/multi-source research → multi-tool, long
|
|
33
|
+
| "multitool" // several tool calls, sequential
|
|
34
|
+
| "subagent" // delegates to a foreground sub-agent
|
|
35
|
+
| "bgworker" // dispatches a background worker (the 🛠 feed)
|
|
36
|
+
| "compound" // first X then Y then summarise — ordered multi-step
|
|
37
|
+
| "web"; // current/recent info → forces a web fetch
|
|
38
|
+
|
|
39
|
+
export interface RealWorkCase {
|
|
40
|
+
name: string;
|
|
41
|
+
/** Human-style prompt that should provoke real work. */
|
|
42
|
+
prompt: string;
|
|
43
|
+
kind: WorkKind;
|
|
44
|
+
/** Generous budget — deep research can run minutes. */
|
|
45
|
+
timeoutMs: number;
|
|
46
|
+
/** The substantive answer must be at least this long; a backstop fragment /
|
|
47
|
+
* bare ack is shorter, so this distinguishes "the answer landed" from "only a
|
|
48
|
+
* stub landed". */
|
|
49
|
+
minAnswerChars: number;
|
|
50
|
+
/** When true, this prompt RELIABLY triggers the named surface, so the scenario
|
|
51
|
+
* hard-asserts it appeared (not just reports it). Used for the semi-prescriptive
|
|
52
|
+
* but natural-sounding bgworker/subagent cases. */
|
|
53
|
+
requireSurface?: "worker" | "activity";
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* The case set. The first block is fully human-style (probabilistic work); the
|
|
58
|
+
* `requireSurface` block phrases the dispatch naturally but reliably enough to
|
|
59
|
+
* hard-assert the surface. Keep prompts provider-agnostic so they run on the
|
|
60
|
+
* generic test-harness agent (no marko-specific MCPs).
|
|
61
|
+
*/
|
|
62
|
+
export const REAL_WORK_CASES: RealWorkCase[] = [
|
|
63
|
+
{
|
|
64
|
+
name: "deep research, take your time",
|
|
65
|
+
prompt:
|
|
66
|
+
"Can you research the current state of WebAssembly outside the browser — " +
|
|
67
|
+
"the main server-side runtimes, who's actually using it in production, and " +
|
|
68
|
+
"the real limitations today? Take your time and give me a proper rundown, " +
|
|
69
|
+
"not a one-liner.",
|
|
70
|
+
kind: "research",
|
|
71
|
+
timeoutMs: 180_000,
|
|
72
|
+
minAnswerChars: 400,
|
|
73
|
+
},
|
|
74
|
+
{
|
|
75
|
+
name: "current info, forces a lookup",
|
|
76
|
+
prompt:
|
|
77
|
+
"What's the latest with the Bun JavaScript runtime — the recent releases " +
|
|
78
|
+
"and whether people consider it production-ready yet? Check, don't guess.",
|
|
79
|
+
kind: "web",
|
|
80
|
+
timeoutMs: 150_000,
|
|
81
|
+
minAnswerChars: 300,
|
|
82
|
+
},
|
|
83
|
+
{
|
|
84
|
+
name: "multi-angle investigation",
|
|
85
|
+
prompt:
|
|
86
|
+
"Dig into Postgres vs SQLite for a small SaaS backend — look at it from a " +
|
|
87
|
+
"few angles (concurrency, ops burden, cost at scale) and tell me which " +
|
|
88
|
+
"you'd actually pick and why.",
|
|
89
|
+
kind: "multitool",
|
|
90
|
+
timeoutMs: 150_000,
|
|
91
|
+
minAnswerChars: 400,
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
name: "compound sequential ask",
|
|
95
|
+
prompt:
|
|
96
|
+
"First work out what today's date is, then how many days are left until the " +
|
|
97
|
+
"end of this quarter, then suggest three concrete milestones I could hit " +
|
|
98
|
+
"before then. Do it in that order.",
|
|
99
|
+
kind: "compound",
|
|
100
|
+
timeoutMs: 120_000,
|
|
101
|
+
minAnswerChars: 250,
|
|
102
|
+
},
|
|
103
|
+
{
|
|
104
|
+
name: "invite delegation",
|
|
105
|
+
prompt:
|
|
106
|
+
"I need a proper comparison of Stripe vs Paddle vs Lemon Squeezy for selling " +
|
|
107
|
+
"a digital product — pricing, who handles sales tax, and payout timing. Farm " +
|
|
108
|
+
"it out to a sub-agent if that's faster; just give me the bottom line at the end.",
|
|
109
|
+
kind: "subagent",
|
|
110
|
+
timeoutMs: 180_000,
|
|
111
|
+
minAnswerChars: 350,
|
|
112
|
+
},
|
|
113
|
+
{
|
|
114
|
+
name: "long sourced briefing (crosses thresholds)",
|
|
115
|
+
prompt:
|
|
116
|
+
"Give me a thorough, well-sourced briefing on the EU AI Act — what it covers, " +
|
|
117
|
+
"the risk tiers, the key deadlines, and what a small AI startup actually has to " +
|
|
118
|
+
"do. Be comprehensive; I'd rather wait and get depth.",
|
|
119
|
+
kind: "research",
|
|
120
|
+
timeoutMs: 360_000,
|
|
121
|
+
minAnswerChars: 500,
|
|
122
|
+
},
|
|
123
|
+
// ── reliably-triggering, still natural voice ──────────────────────────────
|
|
124
|
+
{
|
|
125
|
+
name: "background worker, ping me when done",
|
|
126
|
+
prompt:
|
|
127
|
+
"Don't answer this inline — actually dispatch a background worker for it " +
|
|
128
|
+
"(Task / Agent with run_in_background: true) so I can keep chatting while it " +
|
|
129
|
+
"runs, and ping me when it's done. The task: go through, ONE step at a time " +
|
|
130
|
+
"with a one-line note on each (run a quick command or jot a note per step so " +
|
|
131
|
+
"there's visible progress), the eight most common email-deliverability " +
|
|
132
|
+
"mistakes a solo founder makes — SPF, DKIM, DMARC, warmup, list hygiene, " +
|
|
133
|
+
"content, sending cadence, monitoring. Pace it over a couple of minutes; do " +
|
|
134
|
+
"all eight, then hand back the summary.",
|
|
135
|
+
kind: "bgworker",
|
|
136
|
+
// Generous: if the agent declines to background it and composes inline, a
|
|
137
|
+
// paced 8-step answer can run past 5 min (and, with no tracked tool in
|
|
138
|
+
// flight, trip the 300s silence-poke — see the 2026-06-05 UAT finding).
|
|
139
|
+
timeoutMs: 360_000,
|
|
140
|
+
minAnswerChars: 250,
|
|
141
|
+
requireSurface: "worker",
|
|
142
|
+
},
|
|
143
|
+
{
|
|
144
|
+
name: "step-by-step so the feed paints",
|
|
145
|
+
prompt:
|
|
146
|
+
"Walk through, ONE step at a time (run a quick command or note for each so I " +
|
|
147
|
+
"can see progress), how you'd debug a Linux box that's suddenly out of disk " +
|
|
148
|
+
"space — six steps: df, du on the big dirs, find large files, check logs, " +
|
|
149
|
+
"check deleted-but-open files, then a cleanup plan. Then give me the recap.",
|
|
150
|
+
kind: "multitool",
|
|
151
|
+
timeoutMs: 180_000,
|
|
152
|
+
minAnswerChars: 300,
|
|
153
|
+
requireSurface: "activity",
|
|
154
|
+
},
|
|
155
|
+
];
|
|
156
|
+
|
|
157
|
+
/** What the collector observed across one turn. */
|
|
158
|
+
export interface TurnObservation {
|
|
159
|
+
/** Every bot message (initial sends only; edits tracked separately). */
|
|
160
|
+
botMessages: ObservedMessage[];
|
|
161
|
+
/** Edit events seen (worker/activity feeds grow via edits). */
|
|
162
|
+
edits: ObservedMessage[];
|
|
163
|
+
/** The first substantive answer (non-feed, >= minAnswerChars), or null. */
|
|
164
|
+
answer: ObservedMessage | null;
|
|
165
|
+
/** ms from send to the answer (or to timeout). */
|
|
166
|
+
answerLatencyMs: number;
|
|
167
|
+
/** Whether an activity feed (`→/✓`) message was seen. */
|
|
168
|
+
sawActivityFeed: boolean;
|
|
169
|
+
/** Whether a worker feed (`🛠 Worker`) message was seen. */
|
|
170
|
+
sawWorkerFeed: boolean;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
/**
|
|
174
|
+
* Send `prompt` and collect the bot's message sequence until a substantive
|
|
175
|
+
* answer lands (+ a short settle to catch trailing/late sends — the very window
|
|
176
|
+
* the orphaned-reply bug lives in) or `timeoutMs` elapses. Observing starts
|
|
177
|
+
* BEFORE the send so nothing is missed.
|
|
178
|
+
*/
|
|
179
|
+
export async function collectTurn(
|
|
180
|
+
driver: Driver,
|
|
181
|
+
chatId: number,
|
|
182
|
+
driverUserId: number,
|
|
183
|
+
prompt: string,
|
|
184
|
+
opts: { timeoutMs: number; minAnswerChars: number; settleMs?: number },
|
|
185
|
+
): Promise<TurnObservation> {
|
|
186
|
+
const settleMs = opts.settleMs ?? 6_000;
|
|
187
|
+
const botMessages: ObservedMessage[] = [];
|
|
188
|
+
const edits: ObservedMessage[] = [];
|
|
189
|
+
let answer: ObservedMessage | null = null;
|
|
190
|
+
let sawActivityFeed = false;
|
|
191
|
+
let sawWorkerFeed = false;
|
|
192
|
+
|
|
193
|
+
const startedAt = Date.now();
|
|
194
|
+
const iterator = driver.observeMessages(chatId)[Symbol.asyncIterator]();
|
|
195
|
+
// Begin observing, then send (observeMessages backfills nothing, but the send
|
|
196
|
+
// round-trips after the iterator is live).
|
|
197
|
+
await driver.sendText(chatId, prompt);
|
|
198
|
+
|
|
199
|
+
let settleDeadline = Number.POSITIVE_INFINITY;
|
|
200
|
+
while (true) {
|
|
201
|
+
const remaining =
|
|
202
|
+
Math.min(opts.timeoutMs - (Date.now() - startedAt), settleDeadline - Date.now());
|
|
203
|
+
if (remaining <= 0) break;
|
|
204
|
+
const next = await Promise.race([
|
|
205
|
+
iterator.next(),
|
|
206
|
+
new Promise<{ done: true; value: undefined }>((r) =>
|
|
207
|
+
setTimeout(() => r({ done: true, value: undefined }), Math.max(0, remaining)),
|
|
208
|
+
),
|
|
209
|
+
]);
|
|
210
|
+
if (next.done || next.value == null) {
|
|
211
|
+
// timed out (either overall or settle) — stop
|
|
212
|
+
break;
|
|
213
|
+
}
|
|
214
|
+
const m = next.value as ObservedMessage;
|
|
215
|
+
if (m.senderUserId === driverUserId) continue; // our own echo
|
|
216
|
+
if (m.edited) {
|
|
217
|
+
edits.push(m);
|
|
218
|
+
if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
|
|
219
|
+
if (isActivityFeedMessage(m)) sawActivityFeed = true;
|
|
220
|
+
continue;
|
|
221
|
+
}
|
|
222
|
+
botMessages.push(m);
|
|
223
|
+
if (isWorkerFeedMessage(m)) sawWorkerFeed = true;
|
|
224
|
+
else if (isActivityFeedMessage(m)) sawActivityFeed = true;
|
|
225
|
+
else if (answer == null && m.text.trim().length >= opts.minAnswerChars) {
|
|
226
|
+
answer = m;
|
|
227
|
+
// Got the answer; keep collecting for `settleMs` to catch a late
|
|
228
|
+
// fragment/duplicate/misrouted trailing send.
|
|
229
|
+
settleDeadline = Date.now() + settleMs;
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
void iterator.return?.();
|
|
233
|
+
return {
|
|
234
|
+
botMessages,
|
|
235
|
+
edits,
|
|
236
|
+
answer,
|
|
237
|
+
answerLatencyMs: answer ? answer.date.getTime() - startedAt : Date.now() - startedAt,
|
|
238
|
+
sawActivityFeed,
|
|
239
|
+
sawWorkerFeed,
|
|
240
|
+
};
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/**
 * One finding produced by a turn bug detector. The same shape is used for
 * both hard violations and soft warnings.
 */
export interface TurnViolation {
  /** Machine-readable identifier of the detector that raised the finding. */
  code:
    // no substantive reply was collected for the turn
    | "no-answer"
    // a short stub message landed well ahead of the real answer
    | "orphaned-fragment"
    // an expected worker / activity feed never showed up
    | "surface-missing"
    // a bot message landed outside the expected chat
    | "wrong-surface";
  /** Human-readable explanation intended for the test log. */
  detail: string;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Runs the bug detectors over a collected turn and splits the findings into
 * HARD violations (universal invariants that must always hold) and SOFT
 * warnings (work-specific surfaces that are probabilistic on a generic agent —
 * whether it dispatches a worker / sub-agent is its judgment, so a missing
 * feed is reported, not failed).
 *
 * Hard violations:
 * - no-answer: `obs.answer` is unset, i.e. no substantive reply was collected.
 * - orphaned-fragment: a non-empty, non-feed bot text shorter than 150 chars
 *   is dated ≥8s before the answer — the orphaned-reply backstop signature
 *   (fragment flushed, real reply late). A brief "on it" ack dated less than
 *   8s before the answer does not count, and neither does the answer itself.
 * - wrong-surface: a collected bot message or edit carries a `chatId` other
 *   than `expected.chatId`.
 *
 * Soft warnings:
 * - surface-missing: a `requireSurface` case never showed its feed. The agent
 *   may have answered inline (a legitimate choice) — reported for the bug
 *   hunt, not a hard fail. For `"activity"` a worker feed also satisfies the
 *   requirement.
 *
 * @param obs - The collected turn (bot messages, edits, answer, feed flags).
 * @param expected - The chat the turn must stay in, plus an optional feed
 *   surface the case hopes to see.
 * @returns The hard `violations` and soft `warnings`, each possibly empty.
 */
export function analyzeTurn(
  obs: TurnObservation,
  expected: { requireSurface?: "worker" | "activity"; chatId: number },
): { violations: TurnViolation[]; warnings: TurnViolation[] } {
  // Orphaned-fragment thresholds. Named once so the predicate and the
  // human-readable detail below cannot drift apart.
  const FRAGMENT_MAX_CHARS = 150;
  const FRAGMENT_MIN_LEAD_MS = 8_000;

  const violations: TurnViolation[] = [];
  const warnings: TurnViolation[] = [];

  // Narrow the answer once; the closure below then needs no `!` assertions.
  const answer = obs.answer;
  if (answer == null) {
    violations.push({
      code: "no-answer",
      detail:
        `no substantive reply within budget (saw ${obs.botMessages.length} bot msg(s), ` +
        `activityFeed=${obs.sawActivityFeed} workerFeed=${obs.sawWorkerFeed})`,
    });
  } else {
    // orphaned-fragment: a short non-feed text sent well before the answer.
    // A quick ack right before the answer falls under the lead threshold.
    const answeredAt = answer.date.getTime();
    const fragments = obs.botMessages.filter((m) => {
      if (m.messageId === answer.messageId) return false;
      if (isWorkerFeedMessage(m) || isActivityFeedMessage(m)) return false;
      const chars = m.text.trim().length;
      return (
        chars > 0 &&
        chars < FRAGMENT_MAX_CHARS &&
        answeredAt - m.date.getTime() >= FRAGMENT_MIN_LEAD_MS
      );
    });
    const [firstFragment] = fragments;
    if (firstFragment != null) {
      violations.push({
        code: "orphaned-fragment",
        detail:
          `${fragments.length} stub message(s) landed ≥${FRAGMENT_MIN_LEAD_MS / 1000}s before the answer ` +
          `(e.g. ${JSON.stringify(firstFragment.text.slice(0, 60))}) — the orphaned-reply ` +
          `backstop signature.`,
      });
    }
  }

  if (expected.requireSurface === "worker" && !obs.sawWorkerFeed) {
    warnings.push({ code: "surface-missing", detail: "expected a 🛠 worker feed; agent likely answered inline" });
  }
  if (expected.requireSurface === "activity" && !obs.sawActivityFeed && !obs.sawWorkerFeed) {
    warnings.push({ code: "surface-missing", detail: "expected a →/✓ activity feed; none appeared" });
  }

  // wrong-surface: fresh sends and edits are both checked against the chat.
  const stray = [...obs.botMessages, ...obs.edits].filter((m) => m.chatId !== expected.chatId);
  const [firstStray] = stray;
  if (firstStray != null) {
    violations.push({
      code: "wrong-surface",
      detail: `${stray.length} bot message(s) landed in chat ${firstStray.chatId}, expected ${expected.chatId}`,
    });
  }
  return { violations, warnings };
}
|
|
324
|
+
|
|
325
|
+
/**
 * Renders a single-line digest of a turn for the test log (bug-hunt
 * forensics).
 *
 * @param name - Label of the case, echoed after the `[real-work]` prefix.
 * @param obs - The collected turn to summarize.
 * @returns `answer=<chars>ch@<seconds>s` (or `answer=NONE`), followed by the
 *   bot-message count, edit count and the two feed flags.
 */
export function summarizeTurn(name: string, obs: TurnObservation): string {
  let answerPart = "NONE";
  if (obs.answer) {
    const chars = obs.answer.text.trim().length;
    const seconds = Math.round(obs.answerLatencyMs / 1000);
    answerPart = `${chars}ch@${seconds}s`;
  }
  const segments = [
    `[real-work] ${name}: answer=${answerPart}`,
    `botMsgs=${obs.botMessages.length}`,
    `edits=${obs.edits.length}`,
    `activityFeed=${obs.sawActivityFeed}`,
    `workerFeed=${obs.sawWorkerFeed}`,
  ];
  return segments.join(" ");
}
|