switchroom 0.13.12 → 0.13.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +60 -5
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +290 -88
- package/telegram-plugin/final-answer-detect.ts +83 -0
- package/telegram-plugin/gateway/gateway.ts +213 -11
- package/telegram-plugin/hooks/silent-end-interrupt-stop.mjs +17 -5
- package/telegram-plugin/pending-work-progress.ts +377 -0
- package/telegram-plugin/runtime-metrics.ts +20 -0
- package/telegram-plugin/silent-end.ts +37 -11
- package/telegram-plugin/tests/final-answer-detect.test.ts +89 -0
- package/telegram-plugin/tests/pending-work-progress.test.ts +354 -0
- package/telegram-plugin/tests/silent-end.test.ts +118 -0
- package/telegram-plugin/uat/scenarios/cross-turn-pending-progress-dm.test.ts +237 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-turn pending-async progress — UAT regression gate for #1445.
|
|
3
|
+
*
|
|
4
|
+
* Verifies the post-fix behaviour shipped in `pending-work-progress.ts`:
|
|
5
|
+
* when a turn ends with the model having dispatched async background
|
|
6
|
+
* work (here `Bash` with `run_in_background:true`) and the model has
|
|
7
|
+
* stopped speaking, the framework keeps editing the model's last reply
|
|
8
|
+
* *in place* at ~60s intervals so the user sees ambient liveness during
|
|
9
|
+
* the wait.
|
|
10
|
+
*
|
|
11
|
+
* ## Pre-fix behaviour (what the user complained about)
|
|
12
|
+
*
|
|
13
|
+
* 1. User sends a long-task prompt at t=0.
|
|
14
|
+
* 2. Model runs the bash command with `run_in_background:true` and
|
|
15
|
+
* sends one PING reply at ~+20s ("Background sleep running…").
|
|
16
|
+
* 3. Turn ends.
|
|
17
|
+
* 4. Silence-poke ladder is per-turn — it stops the moment endTurn()
|
|
18
|
+
* fires. There is no cross-turn ambient surface.
|
|
19
|
+
* 5. The user sees NOTHING for ~5 min until the framework's 300s
|
|
20
|
+
* silence-poke fallback fires (or — as observed in the UAT that
|
|
21
|
+
* drove the fix — does not fire at all, because the turn already
|
|
22
|
+
* ended). Production data confirms: silence-poke succeeded/fired
|
|
23
|
+
* rate is 0–7% across hundreds of fires.
|
|
24
|
+
*
|
|
25
|
+
* ## Post-fix behaviour (this scenario asserts)
|
|
26
|
+
*
|
|
27
|
+
* 1. Model sends one fresh reply at ~+20s — the anchor.
|
|
28
|
+
* 2. Turn ends.
|
|
29
|
+
* 3. Framework edits the anchor in place at ~+80s, ~+140s, ~+200s,
|
|
30
|
+
* ~+260s, ~+320s with the suffix `\n\n— still working (Nm)`.
|
|
31
|
+
* 4. All edits are SILENT (`disable_notification: true` on the edit
|
|
32
|
+
* or, equivalently, an edit which never pushes a notification).
|
|
33
|
+
* The user sees ambient liveness without any added pings.
|
|
34
|
+
* 5. Sleep completes ~+350s; the model wakes (`BashOutput` /
|
|
35
|
+
* background-task notification path), turn re-starts,
|
|
36
|
+
* `clearPending` fires — no further edits.
|
|
37
|
+
*
|
|
38
|
+
* ## What this scenario asserts
|
|
39
|
+
*
|
|
40
|
+
* 1. At least one FRESH bot message lands (the initial anchor).
|
|
41
|
+
* 2. At least one EDIT to the anchor lands AFTER the initial reply,
|
|
42
|
+
* whose text contains the framework suffix `— still working (\d+m)`.
|
|
43
|
+
* 3. Every observed EDIT message is `silent === true` (no push
|
|
44
|
+
* notification fired by the in-place edit).
|
|
45
|
+
* 4. Edits are anchored to the SAME `messageId` as the initial
|
|
46
|
+
* fresh reply (single in-place surface, not spammy new sends).
|
|
47
|
+
*
|
|
48
|
+
* The full per-message trail is dumped to console for forensic
|
|
49
|
+
* inspection regardless of pass / fail.
|
|
50
|
+
*
|
|
51
|
+
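 * Wall-clock budget: ~6 min on the happy path (350s sleep + wind-down); hard cap ≈ 10 min (`OVERALL_DEADLINE_MS`), plus 60s of test-timeout headroom.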
|
|
52
|
+
*/
|
|
53
|
+
|
|
54
|
+
import { describe, expect, it } from "vitest";
|
|
55
|
+
import { spinUp } from "../harness.js";
|
|
56
|
+
import type { ObservedMessage } from "../driver.js";
|
|
57
|
+
|
|
58
|
+
/** Duration, in seconds, of the background `sleep` the agent is asked to run. */
const SLEEP_SECONDS = 350;

/**
 * DM sent to the agent. Asks for a single background Bash call running
 * `sleep ${SLEEP_SECONDS}` and a final one-word "done" reply afterwards.
 * Assembled from fragments; the fragments are concatenated with no separator.
 */
const PROMPT = [
  "This is an instrumented stress test of cross-turn pending-async ",
  "progress. Please run exactly this command via the Bash tool, and ",
  "ONLY this command, as a SINGLE call with run_in_background=true ",
  "(do not break it up, do not send any further reply until it ",
  "completes):\n\n",
  "```bash\n",
  `sleep ${SLEEP_SECONDS}\n`,
  "```\n\n",
  "After the bash command returns, send exactly the single word ",
  '"done" as your final reply.',
].join("");

/** Hard stop for the observation loop: the sleep duration plus 240s, in ms. */
const OVERALL_DEADLINE_MS = SLEEP_SECONDS * 1000 + 240_000;
|
|
73
|
+
|
|
74
|
+
/**
 * One observed bot message, as recorded by the observation loop for the
 * forensic trail dump and the regression assertions.
 */
interface TrailEntry {
  /** Milliseconds between sending the prompt and observing this message. */
  relMs: number;
  /** "edit" when the observed message was flagged as edited, otherwise "fresh". */
  kind: "fresh" | "edit";
  /** Copy of the observed message's `silent` flag. */
  silent: boolean;
  /** Copy of the observed message's `messageId`. */
  messageId: number;
  /** Full text of the observed message. */
  text: string;
}
|
|
81
|
+
|
|
82
|
+
// Matches text that ENDS with a blank line followed by the progress suffix
// `— still working (Nm)`, where N is one or more digits. Used by the
// assertions to pick out edits that carry the suffix.
const SUFFIX_RE = /\n\n— still working \(\d+m\)$/;
|
|
83
|
+
|
|
84
|
+
/**
 * Right-pads `s` with spaces until it is at least `n` characters long.
 *
 * Strings that already have `n` or more characters are returned unchanged
 * (never truncated). Delegates to the built-in `String.prototype.padEnd`
 * instead of hand-rolling the length arithmetic.
 *
 * @param s - Text to pad.
 * @param n - Minimum width of the result, in UTF-16 code units.
 * @returns `s` followed by as many spaces as needed to reach width `n`.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n, " ");
}
|
|
87
|
+
|
|
88
|
+
/**
 * UAT scenario: sends PROMPT to the test-harness agent, records every bot
 * message (fresh sends and in-place edits) until the conversation goes
 * quiet or OVERALL_DEADLINE_MS elapses, dumps the trail to the console and
 * then asserts the four regression conditions for #1445.
 */
describe("uat: cross-turn pending-async ambient progress (#1445)", () => {
  it(
    "framework edits the anchor in place during a 350s background bash",
    async () => {
      // Observation-window tuning, all in milliseconds.
      const initialWaitMs = 90_000; // time allowed for the first anchor reply
      const rearmMs = 120_000; // quiet period tolerated after any non-final message
      const windDownMs = 10_000; // quiet period tolerated after the final "done"
      const doneRe = /\bdone\b/;

      // Single-line preview of a message body for console output.
      const preview = (text: string, max: number): string =>
        text.slice(0, max).replace(/\n/g, " ⏎ ");

      // One line of the trail table. Header, separator and data rows all go
      // through here so the columns cannot drift apart.
      const tableRow = (
        rel: string,
        kind: string,
        silent: string,
        msg: string,
        text: string,
      ): string =>
        ` ${pad(rel, 8)} ${pad(kind, 6)} ${pad(silent, 7)} ${pad(msg, 12)} ${text}`;

      const sc = await spinUp({ agent: "test-harness" });
      try {
        const startedAt = Date.now();
        await sc.sendDM(PROMPT);
        console.log(`[cross-turn-pending] t=0 prompt sent`);

        const trail: TrailEntry[] = [];

        // Initial wait window — give the model 90s to send its first
        // anchor reply. After that, we observe edits for the full
        // sleep duration plus headroom; once the model's final fresh
        // "done" lands we wind down within 10s.
        let quiescenceDeadline = startedAt + initialWaitMs;
        const overallDeadline = startedAt + OVERALL_DEADLINE_MS;
        let firstAnchorMsgId: number | null = null;
        let sawDone = false;

        while (Date.now() < overallDeadline) {
          const remaining = Math.min(
            quiescenceDeadline - Date.now(),
            overallDeadline - Date.now(),
          );
          if (remaining <= 0) break;
          try {
            const msg = await sc.expectMessage(
              (m: ObservedMessage) => m.fromBot,
              { from: "bot", timeout: remaining },
            );
            const rel = Date.now() - startedAt;
            const entry: TrailEntry = {
              relMs: rel,
              kind: msg.edited ? "edit" : "fresh",
              silent: msg.silent,
              messageId: msg.messageId,
              text: msg.text,
            };
            trail.push(entry);
            console.log(
              `[cross-turn-pending] +${(rel / 1000).toFixed(1)}s ` +
                `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
                `silent=${entry.silent} text=${JSON.stringify(
                  preview(entry.text, 120),
                )}`,
            );

            // The first fresh bot message is the anchor that later edits
            // must target.
            if (firstAnchorMsgId == null && entry.kind === "fresh") {
              firstAnchorMsgId = entry.messageId;
            }

            // A later fresh message (not the anchor) containing the word
            // "done" is treated as the model's final reply. The word-boundary
            // regex also covers the exact string "done".
            const trimmedFinal = entry.text.trim().toLowerCase();
            const looksLikeDone =
              entry.kind === "fresh" &&
              entry.messageId !== firstAnchorMsgId &&
              doneRe.test(trimmedFinal);
            if (looksLikeDone) {
              sawDone = true;
              quiescenceDeadline = Date.now() + windDownMs;
            } else {
              // Any other bot message re-arms a 120s quiet window: twice
              // the ~60s edit cadence described in the file header, which
              // also leaves slack after the last edit for the model's
              // wake + final reply.
              quiescenceDeadline = Date.now() + rearmMs;
            }
          } catch (err) {
            // Presumably the expectMessage timeout, i.e. quiescence reached.
            // Log the reason so a harness failure is not mistaken for
            // silence when reading the output, then stop observing.
            const reason = err instanceof Error ? err.message : String(err);
            console.log(`[cross-turn-pending] observation stopped: ${reason}`);
            break;
          }
        }

        // Dump full trail.
        console.log(
          "\n========== CROSS-TURN PENDING-PROGRESS TRAIL ==========",
        );
        console.log(`prompt: ${SLEEP_SECONDS}s background bash`);
        console.log(`total bot messages observed: ${trail.length}`);
        console.log(`anchor messageId: ${firstAnchorMsgId}`);
        console.log(`saw_done: ${sawDone}`);
        console.log("");
        console.log(tableRow("rel(s)", "kind", "silent", "msg", "text"));
        console.log(
          tableRow("-------", "-----", "------", "-----------", "----"),
        );
        for (const e of trail) {
          console.log(
            tableRow(
              (e.relMs / 1000).toFixed(1) + "s",
              e.kind,
              String(e.silent),
              String(e.messageId),
              preview(e.text, 80),
            ),
          );
        }
        console.log(
          "=======================================================\n",
        );

        // ── Regression assertions ─────────────────────────────────

        // (1) at least one fresh anchor reply landed
        const fresh = trail.filter((e) => e.kind === "fresh");
        expect(
          fresh.length,
          `no fresh bot replies observed — agent isn't responding`,
        ).toBeGreaterThanOrEqual(1);
        expect(firstAnchorMsgId).not.toBeNull();

        // (2) at least one edit landed AFTER the initial anchor, and
        // its text carries the framework's "— still working (Nm)"
        // suffix. This is THE regression gate for the fix.
        const edits = trail.filter((e) => e.kind === "edit");
        const editsWithSuffix = edits.filter((e) => SUFFIX_RE.test(e.text));
        expect(
          editsWithSuffix.length,
          `no in-place edits with the "— still working (Nm)" suffix ` +
            `landed during the ${SLEEP_SECONDS}s background bash. ` +
            `Total edits observed: ${edits.length}. The cross-turn ` +
            `pending-progress fix is not active — see ` +
            `\`pending-work-progress.ts\` and the gateway hooks at ` +
            `\`noteAsyncDispatch\` / \`noteOutbound\` / \`noteTurnEnd\`. ` +
            `Pre-fix this number is zero by construction.`,
        ).toBeGreaterThanOrEqual(1);

        // (3) every observed edit is silent (an edit never pings per
        // Telegram semantics; we double-check via the receiving-side
        // flag so any framework regression that switched to fresh
        // sends fails loudly).
        const loudEdits = edits.filter((e) => !e.silent);
        expect(
          loudEdits.length,
          `${loudEdits.length} edit(s) pinged the device — edits ` +
            `should never fire a notification.`,
        ).toBe(0);

        // (4) every edit is anchored to the same messageId as the
        // initial fresh anchor — the framework is editing ONE
        // surface, not spamming.
        const offAnchorEdits = edits.filter(
          (e) => e.messageId !== firstAnchorMsgId,
        );
        expect(
          offAnchorEdits.length,
          `${offAnchorEdits.length} edit(s) were anchored to a ` +
            `different message id than the initial reply ` +
            `(${firstAnchorMsgId}). The framework should edit a ` +
            `single anchor in place, not a chain of messages.`,
        ).toBe(0);
      } finally {
        // Always release the harness, even when an assertion throws.
        await sc.tearDown();
      }
    },
    // Per-test timeout: the observation cap plus 60s for spin-up/tear-down.
    OVERALL_DEADLINE_MS + 60_000,
  );
});
|