switchroom 0.13.13 → 0.13.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +293 -92
- package/telegram-plugin/gateway/gateway.ts +223 -17
- package/telegram-plugin/pending-work-progress.ts +377 -0
- package/telegram-plugin/runtime-metrics.ts +20 -0
- package/telegram-plugin/tests/pending-work-progress.test.ts +354 -0
- package/telegram-plugin/uat/scenarios/cross-turn-pending-progress-dm.test.ts +239 -0
- package/telegram-plugin/uat/scenarios/visible-answer-stream-dm.test.ts +219 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Visible answer-stream — UAT for the openclaw-pattern TTFO fix
|
|
3
|
+
* (#869 Phase 1 narrow scope).
|
|
4
|
+
*
|
|
5
|
+
* Validates that when `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is set on
|
|
6
|
+
* the target agent, the framework auto-renders the model's transcript
|
|
7
|
+
* text as a user-visible edit-in-place message starting within ~5s of
|
|
8
|
+
* inbound — instead of writing to Telegram's invisible compose-box
|
|
9
|
+
* draft (the default #1664 behaviour).
|
|
10
|
+
*
|
|
11
|
+
* ## Required setup
|
|
12
|
+
*
|
|
13
|
+
* The target agent (default `test-harness`) MUST have
|
|
14
|
+
* `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` in its container environment.
|
|
15
|
+
* Without that env var the scenario will (correctly) fail — the
|
|
16
|
+
* default behaviour writes to a draft the mtcute driver cannot see.
|
|
17
|
+
*
|
|
18
|
+
* ## What this asserts
|
|
19
|
+
*
|
|
20
|
+
* 1. The first user-visible bot output (fresh `sendMessage`) lands
|
|
21
|
+
* within `VISIBLE_TTFO_BUDGET_MS` (default 8 s) of the inbound.
|
|
22
|
+
* Today's median TTFO across the fleet is 17–69 s; the visible
|
|
23
|
+
* lane should drop it well under 10 s for any reply long enough
|
|
24
|
+
* to emit a text chunk.
|
|
25
|
+
* 2. The initial fresh message is silent (the answer-stream emits
|
|
26
|
+
* with `disable_notification: true` so mid-turn edits never ping).
|
|
27
|
+
* 3. Subsequent edits land on the SAME message_id — single in-place
|
|
28
|
+
* surface, not a chain of pinged sends.
|
|
29
|
+
* 4. At least one edit growth event happens between first send and
|
|
30
|
+
* turn-end (the streaming property — TTFO is fast, then content
|
|
31
|
+
* grows live).
|
|
32
|
+
*
|
|
33
|
+
* The captured trail is dumped to console for forensic inspection
|
|
34
|
+
* regardless of pass/fail.
|
|
35
|
+
*
|
|
36
|
+
* Wall-clock budget: ~90 s.
|
|
37
|
+
*/
|
|
38
|
+
|
|
39
|
+
import { describe, expect, it } from "vitest";
|
|
40
|
+
import { spinUp } from "../harness.js";
|
|
41
|
+
import type { ObservedMessage } from "../driver.js";
|
|
42
|
+
|
|
43
|
+
// Upper bound (ms), measured from just before the prompt is sent, within
// which the first fresh bot message must be observed.
const VISIBLE_TTFO_BUDGET_MS = 8 * 1_000;

// Hard cap (ms) on the whole observation loop. The vitest timeout for the
// scenario is this value plus a 30 s margin.
const OVERALL_DEADLINE_MS = 90 * 1_000;

// After each observed bot message, how long (ms) to keep waiting for another
// one before treating the stream as finished.
const QUIESCENCE_MS = 8 * 1_000;
|
|
46
|
+
|
|
47
|
+
// The prompt asks for a multi-sentence prose answer so that the reply is
// emitted over a few seconds: long enough for streaming to be observable,
// short enough that turn-flush isn't tempted to fire. It intentionally does
// NOT tell the model to call `reply`, because the scenario targets the
// transcript-only path that the visible-answer-stream covers.
const PROMPT = [
  "Please give a four-sentence overview of how Linux page-cache",
  "interacts with mmap on a typical x86_64 server. Reply in a single",
  "message, with substantive prose. No code blocks.",
].join(" ");
|
|
57
|
+
|
|
58
|
+
/**
 * One observed bot event (a fresh send or an edit) captured during the run.
 * Entries are built once when the event is observed and only read afterwards,
 * hence every field is `readonly`.
 */
interface TrailEntry {
  /** Milliseconds between the scenario's start timestamp and the observation. */
  readonly relMs: number;
  /** `"edit"` when the observed message was flagged as edited, else `"fresh"`. */
  readonly kind: "fresh" | "edit";
  /** The observed message's `silent` flag, copied as-is. */
  readonly silent: boolean;
  /** Identifier of the observed message; edits share the id of their anchor. */
  readonly messageId: number;
  /** First 120 characters of the text, with newlines rendered as " ⏎ ". */
  readonly textPreview: string;
  /** Length of the full (untruncated) message text. */
  readonly textLength: number;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Right-pads `s` with spaces so the result is at least `n` characters long.
 * Used to align the columns of the console trail table.
 *
 * @param s - Text to pad.
 * @param n - Minimum width of the result, in UTF-16 code units.
 * @returns `s` unchanged when it is already `n` or longer (never truncated),
 *   otherwise `s` followed by enough spaces to reach length `n`.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n, " ");
}
|
|
70
|
+
|
|
71
|
+
/**
 * UAT scenario for the visible answer-stream.
 *
 * Flow: spin up the harness against the `test-harness` agent, send one DM,
 * record every bot message/edit observed until the stream goes quiet (or the
 * overall deadline passes), dump the captured trail to the console, then run
 * the regression assertions. The harness is always torn down.
 */
describe("uat: visible answer-stream — model transcript renders live (#869 Phase 1)", () => {
  it(
    "first fresh message lands within VISIBLE_TTFO_BUDGET_MS; subsequent edits grow it in place",
    async () => {
      const sc = await spinUp({ agent: "test-harness" });
      try {
        // The clock starts BEFORE the send, so every relative timestamp
        // (including the TTFO) also covers the time taken to deliver the prompt.
        const startedAt = Date.now();
        await sc.sendDM(PROMPT);
        console.log(`[visible-answer-stream] t=0 prompt sent`);

        const trail: TrailEntry[] = [];
        let firstAnchorMsgId: number | null = null;
        // Allow up to 30 s for the very first message; after that the window
        // shrinks to QUIESCENCE_MS past the most recent observation.
        let quiescenceDeadline = startedAt + 30_000;
        const overallDeadline = startedAt + OVERALL_DEADLINE_MS;

        for (;;) {
          // Read the clock once so both deadlines are compared against the
          // same instant.
          const remaining =
            Math.min(quiescenceDeadline, overallDeadline) - Date.now();
          if (remaining <= 0) break;
          try {
            const msg = await sc.expectMessage(
              (m: ObservedMessage) => m.fromBot,
              { from: "bot", timeout: remaining },
            );
            const rel = Date.now() - startedAt;
            const entry: TrailEntry = {
              relMs: rel,
              kind: msg.edited ? "edit" : "fresh",
              silent: msg.silent,
              messageId: msg.messageId,
              textPreview: msg.text
                .slice(0, 120)
                .replace(/\n/g, " ⏎ "),
              textLength: msg.text.length,
            };
            trail.push(entry);
            // The first fresh send becomes the anchor that later edits are
            // expected to target.
            if (firstAnchorMsgId == null && entry.kind === "fresh") {
              firstAnchorMsgId = entry.messageId;
            }
            console.log(
              `[visible-answer-stream] +${(rel / 1000).toFixed(1)}s ` +
                `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
                `silent=${entry.silent} len=${entry.textLength} ` +
                `text=${JSON.stringify(entry.textPreview)}`,
            );
            quiescenceDeadline = Date.now() + QUIESCENCE_MS;
          } catch (err) {
            // Any failure ends observation (best-effort), but record why so a
            // harness/driver error is distinguishable from plain quiescence.
            // NOTE(review): presumably expectMessage rejects on timeout —
            // confirm against the harness.
            const reason = err instanceof Error ? err.message : String(err);
            console.log(`[visible-answer-stream] observation ended: ${reason}`);
            break;
          }
        }

        console.log("\n========== VISIBLE-ANSWER-STREAM TRAIL ==========");
        console.log(`total bot messages observed: ${trail.length}`);
        console.log(`first anchor messageId: ${firstAnchorMsgId}`);
        console.log("");
        console.log(" rel(s) kind silent msg len text");
        console.log(" ------- ----- ------ ----------- ---- ----");
        for (const e of trail) {
          console.log(
            ` ${pad((e.relMs / 1000).toFixed(1) + "s", 8)} ` +
              `${pad(e.kind, 6)} ${pad(String(e.silent), 7)} ` +
              `${pad(String(e.messageId), 12)} ${pad(String(e.textLength), 5)} ` +
              `${e.textPreview}`,
          );
        }
        console.log("=================================================\n");

        // ── Regression assertions ─────────────────────────────────

        const fresh = trail.filter((e) => e.kind === "fresh");
        const edits = trail.filter((e) => e.kind === "edit");

        // (1) at least one fresh message landed
        expect(
          fresh.length,
          `no fresh bot replies observed — either the agent isn't ` +
            `responding OR the visible-answer-stream flag is OFF ` +
            `(SWITCHROOM_VISIBLE_ANSWER_STREAM not set on the target ` +
            `agent's container env). Re-check the agent's compose ` +
            `environment.`,
        ).toBeGreaterThanOrEqual(1);

        // Assertion (1) already guarantees this element exists; the explicit
        // guard only narrows the type so no unchecked index access follows.
        const firstFresh = fresh[0];
        if (firstFresh === undefined) {
          throw new Error("unreachable: at least one fresh message was asserted above");
        }

        // (2) first fresh landed within the TTFO budget
        const ttfoMs = firstFresh.relMs;
        expect(
          ttfoMs,
          `TTFO ${ttfoMs}ms exceeded the visible-answer-stream ` +
            `budget of ${VISIBLE_TTFO_BUDGET_MS}ms. Either the model ` +
            `was unusually slow to emit its first text chunk, OR the ` +
            `visible answer-stream is not active. Default behaviour ` +
            `(invisible draft) would never have surfaced a fresh ` +
            `message at all, so the most likely cause is model latency.`,
        ).toBeLessThanOrEqual(VISIBLE_TTFO_BUDGET_MS);

        // (3) first fresh message was silent (mid-turn edits don't ping)
        expect(
          firstFresh.silent,
          `the first fresh message pinged the user — answer-stream ` +
            `should send silently (disable_notification:true). A ping ` +
            `here means an explicit \`reply\` tool may have fired instead.`,
        ).toBe(true);

        // (4) at least one in-place EDIT landed on the same messageId
        // (this is the "live streaming" assertion — TTFO is fast AND
        // content grows on the same surface, not a chain of new sends).
        const sameAnchorEdits = edits.filter(
          (e) => e.messageId === firstAnchorMsgId,
        );
        expect(
          sameAnchorEdits.length,
          `no in-place edits to the anchor message landed — the model ` +
            `either replied in a single shot (very short answer) or ` +
            `the streaming path isn't running. Edits observed: ` +
            `${edits.length}, on anchor: ${sameAnchorEdits.length}.`,
        ).toBeGreaterThanOrEqual(1);

        // (5) every edit is silent (Telegram edits don't push, but
        // we double-check via mtcute's flag in case the framework
        // ever swaps to a fresh-send pattern by accident)
        const loudEdits = edits.filter((e) => !e.silent);
        expect(
          loudEdits.length,
          `${loudEdits.length} edit(s) pinged the device.`,
        ).toBe(0);

        // (6) text length grows monotonically on the anchor (streaming
        // by construction — once content is on the anchor, it only
        // accumulates). Walk previous/current pairs instead of indexing.
        const anchorTrail = trail.filter(
          (e) => e.messageId === firstAnchorMsgId,
        );
        let previous: TrailEntry | undefined;
        let position = 0;
        for (const current of anchorTrail) {
          if (previous !== undefined) {
            expect(
              current.textLength,
              `anchor message #${firstAnchorMsgId} text shrank between ` +
                `events ${position - 1} (len=${previous.textLength}) ` +
                `and ${position} (len=${current.textLength}) — ` +
                `streaming text should only grow.`,
            ).toBeGreaterThanOrEqual(previous.textLength);
          }
          previous = current;
          position += 1;
        }
      } finally {
        await sc.tearDown();
      }
    },
    // Give vitest headroom beyond the observation deadline for spin-up,
    // assertions and tear-down.
    OVERALL_DEADLINE_MS + 30_000,
  );
});
|