switchroom 0.13.13 → 0.13.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
1
+ /**
2
+ * Visible answer-stream — UAT for the openclaw-pattern TTFO fix
3
+ * (#869 Phase 1 narrow scope).
4
+ *
5
+ * Validates that when `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is set on
6
+ * the target agent, the framework auto-renders the model's transcript
7
+ * text as a user-visible edit-in-place message starting within ~5s of
8
+ * inbound — instead of writing to Telegram's invisible compose-box
9
+ * draft (the default #1664 behaviour).
10
+ *
11
+ * ## Required setup
12
+ *
13
+ * The target agent (default `test-harness`) MUST have
14
+ * `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` in its container environment.
15
+ * Without that env var the scenario will (correctly) fail — the
16
+ * default behaviour writes to a draft the mtcute driver cannot see.
17
+ *
18
+ * ## What this asserts
19
+ *
20
+ * 1. The first user-visible bot output (fresh `sendMessage`) lands
21
+ * within `VISIBLE_TTFO_BUDGET_MS` (currently 8 s) of the inbound.
22
+ * Today's median TTFO across the fleet is 17–69 s; the visible
23
+ * lane should drop it well under 10 s for any reply long enough
24
+ * to emit a text chunk.
25
+ * 2. The initial fresh message is silent (the answer-stream emits
26
+ * with `disable_notification: true` so mid-turn edits never ping).
27
+ * 3. Subsequent edits land on the SAME message_id — single in-place
28
+ * surface, not a chain of pinged sends.
29
+ * 4. At least one in-place edit lands on that message between first
30
+ * send and turn-end, and its text never shrinks (the streaming
31
+ * property — TTFO is fast, then content grows live).
32
+ *
33
+ * The captured trail is dumped to console for forensic inspection
34
+ * regardless of pass/fail.
35
+ *
36
+ * Wall-clock budget: ~90 s.
37
+ */
38
+
39
+ import { describe, expect, it } from "vitest";
40
+ import { spinUp } from "../harness.js";
41
+ import type { ObservedMessage } from "../driver.js";
42
+
43
/** Upper bound, in ms since the scenario start timestamp, for the first fresh bot message. */
const VISIBLE_TTFO_BUDGET_MS = 8 * 1_000;

/** Hard cap, in ms since the scenario start timestamp, on how long bot messages are observed. */
const OVERALL_DEADLINE_MS = 90 * 1_000;

/** After each observed bot message, how long (ms) to keep waiting for another one. */
const QUIESCENCE_MS = 8 * 1_000;
46
+
47
// The DM sent to the agent. It asks for a multi-sentence answer so the
// reply takes a few seconds to produce: long enough for streaming to be
// observable, short enough that turn-flush isn't tempted to fire.
// It intentionally does NOT tell the model to call `reply`, because the
// scenario targets the transcript-only path that the
// visible-answer-stream covers.
const PROMPT = [
  "Please give a four-sentence overview of how Linux page-cache ",
  "interacts with mmap on a typical x86_64 server. Reply in a single ",
  "message, with substantive prose. No code blocks.",
].join("");
57
+
58
/**
 * One bot message (fresh send or edit) captured while observing the chat.
 *
 * Entries are written once when the message is observed and only read
 * afterwards, so every field is `readonly`.
 */
interface TrailEntry {
  /** Milliseconds between the scenario start timestamp and the moment the message was observed. */
  readonly relMs: number;
  /** `"edit"` when the observed message was flagged as edited, otherwise `"fresh"`. */
  readonly kind: "fresh" | "edit";
  /** The observed message's `silent` flag. */
  readonly silent: boolean;
  /** Identifier of the observed message. */
  readonly messageId: number;
  /** At most the first 120 characters of the text, with newlines rendered as " ⏎ ". */
  readonly textPreview: string;
  /** Length of the full (untruncated) message text. */
  readonly textLength: number;
}
66
+
67
/**
 * Right-pads `s` with spaces until it is at least `n` characters long.
 *
 * A string that is already `n` characters or longer is returned unchanged;
 * it is never truncated.
 *
 * @param s - Text to pad.
 * @param n - Minimum length of the result, in UTF-16 code units.
 * @returns `s` followed by however many spaces are needed to reach `n`.
 */
function pad(s: string, n: number): string {
  return s.padEnd(n, " ");
}
70
+
71
/**
 * UAT scenario for the visible answer-stream.
 *
 * Sends PROMPT as a DM to the `test-harness` agent, records every bot
 * message observed until the chat goes quiet (or the overall deadline
 * passes), dumps the recorded trail to the console, and only then runs the
 * assertions — so the trail is printed whether the scenario passes or fails.
 */
describe("uat: visible answer-stream — model transcript renders live (#869 Phase 1)", () => {
  it(
    "first fresh message lands within VISIBLE_TTFO_BUDGET_MS; subsequent edits grow it in place",
    async () => {
      const sc = await spinUp({ agent: "test-harness" });
      try {
        const startedAt = Date.now();
        await sc.sendDM(PROMPT);
        console.log(`[visible-answer-stream] t=0 prompt sent`);

        const trail: TrailEntry[] = [];
        let firstAnchorMsgId: number | null = null;
        // The very first bot message gets a longer window than the
        // quiescence gap that applies between subsequent messages.
        const initialWaitMs = 30_000;
        let quiescenceDeadline = startedAt + initialWaitMs;
        const overallDeadline = startedAt + OVERALL_DEADLINE_MS;

        // ── Observation loop ──────────────────────────────────────
        while (Date.now() < overallDeadline) {
          // Wait no longer than whichever deadline comes first.
          const remaining =
            Math.min(quiescenceDeadline, overallDeadline) - Date.now();
          if (remaining <= 0) break;
          try {
            const msg = await sc.expectMessage(
              (m: ObservedMessage) => m.fromBot,
              { from: "bot", timeout: remaining },
            );
            const rel = Date.now() - startedAt;
            const entry: TrailEntry = {
              relMs: rel,
              kind: msg.edited ? "edit" : "fresh",
              silent: msg.silent,
              messageId: msg.messageId,
              textPreview: msg.text
                .slice(0, 120)
                .replace(/\n/g, " ⏎ "),
              textLength: msg.text.length,
            };
            trail.push(entry);
            // The first fresh send is the anchor that later edits are
            // expected to land on.
            if (firstAnchorMsgId == null && entry.kind === "fresh") {
              firstAnchorMsgId = entry.messageId;
            }
            console.log(
              `[visible-answer-stream] +${(rel / 1000).toFixed(1)}s ` +
                `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
                `silent=${entry.silent} len=${entry.textLength} ` +
                `text=${JSON.stringify(entry.textPreview)}`,
            );
            // Every observed message restarts the quiescence window.
            quiescenceDeadline = Date.now() + QUIESCENCE_MS;
          } catch (err: unknown) {
            // expectMessage rejected — presumably its timeout, i.e. the
            // chat went quiet. Record the reason so that a driver failure
            // is not mistaken for quiescence, then stop observing.
            const reason = err instanceof Error ? err.message : String(err);
            console.log(`[visible-answer-stream] stopped observing: ${reason}`);
            break;
          }
        }

        // ── Trail dump ────────────────────────────────────────────
        // Header, rule and data rows share one formatter so the columns
        // always line up.
        const formatRow = (
          rel: string,
          kind: string,
          silent: string,
          msg: string,
          len: string,
          text: string,
        ): string =>
          ` ${pad(rel, 8)} ${pad(kind, 6)} ${pad(silent, 7)} ` +
          `${pad(msg, 12)} ${pad(len, 5)} ${text}`;

        console.log("\n========== VISIBLE-ANSWER-STREAM TRAIL ==========");
        console.log(`total bot messages observed: ${trail.length}`);
        console.log(`first anchor messageId: ${firstAnchorMsgId}`);
        console.log("");
        console.log(formatRow("rel(s)", "kind", "silent", "msg", "len", "text"));
        console.log(
          formatRow("-------", "-----", "------", "-----------", "----", "----"),
        );
        for (const e of trail) {
          console.log(
            formatRow(
              (e.relMs / 1000).toFixed(1) + "s",
              e.kind,
              String(e.silent),
              String(e.messageId),
              String(e.textLength),
              e.textPreview,
            ),
          );
        }
        console.log("=================================================\n");

        // ── Regression assertions ─────────────────────────────────

        const fresh = trail.filter((e) => e.kind === "fresh");
        const edits = trail.filter((e) => e.kind === "edit");

        // (1) at least one fresh message landed
        expect(
          fresh.length,
          `no fresh bot replies observed — either the agent isn't ` +
            `responding OR the visible-answer-stream flag is OFF ` +
            `(SWITCHROOM_VISIBLE_ANSWER_STREAM not set on the target ` +
            `agent's container env). Re-check the agent's compose ` +
            `environment.`,
        ).toBeGreaterThanOrEqual(1);

        // Assertion (1) guarantees the list is non-empty from here on.
        const [firstFresh] = fresh;

        // (2) first fresh landed within the TTFO budget
        const ttfoMs = firstFresh.relMs;
        expect(
          ttfoMs,
          `TTFO ${ttfoMs}ms exceeded the visible-answer-stream ` +
            `budget of ${VISIBLE_TTFO_BUDGET_MS}ms. Either the model ` +
            `was unusually slow to emit its first text chunk, OR the ` +
            `visible answer-stream is not active. Default behaviour ` +
            `(invisible draft) would never have surfaced a fresh ` +
            `message at all, so the most likely cause is model latency.`,
        ).toBeLessThanOrEqual(VISIBLE_TTFO_BUDGET_MS);

        // (3) first fresh message was silent (mid-turn edits don't ping)
        expect(
          firstFresh.silent,
          `the first fresh message pinged the user — answer-stream ` +
            `should send silently (disable_notification:true). A ping ` +
            `here means an explicit \`reply\` tool may have fired instead.`,
        ).toBe(true);

        // (4) at least one in-place EDIT landed on the same messageId
        // (this is the "live streaming" assertion — TTFO is fast AND
        // content grows on the same surface, not a chain of new sends).
        const sameAnchorEdits = edits.filter(
          (e) => e.messageId === firstAnchorMsgId,
        );
        expect(
          sameAnchorEdits.length,
          `no in-place edits to the anchor message landed — the model ` +
            `either replied in a single shot (very short answer) or ` +
            `the streaming path isn't running. Edits observed: ` +
            `${edits.length}, on anchor: ${sameAnchorEdits.length}.`,
        ).toBeGreaterThanOrEqual(1);

        // (5) every edit is silent (Telegram edits don't push, but
        // we double-check via mtcute's flag in case the framework
        // ever swaps to a fresh-send pattern by accident)
        const loudEdits = edits.filter((e) => !e.silent);
        expect(
          loudEdits.length,
          `${loudEdits.length} edit(s) pinged the device.`,
        ).toBe(0);

        // (6) text length never decreases on the anchor (streaming
        // by construction — once content is on the anchor, it only
        // accumulates)
        const anchorTrail = trail.filter(
          (e) => e.messageId === firstAnchorMsgId,
        );
        for (let i = 1; i < anchorTrail.length; i++) {
          const previous = anchorTrail[i - 1];
          const current = anchorTrail[i];
          expect(
            current.textLength,
            `anchor message #${firstAnchorMsgId} text shrank between ` +
              `events ${i - 1} (len=${previous.textLength}) ` +
              `and ${i} (len=${current.textLength}) — ` +
              `streaming text should only grow.`,
          ).toBeGreaterThanOrEqual(previous.textLength);
        }
      } finally {
        // Always release the harness, even when an assertion throws.
        await sc.tearDown();
      }
    },
    // Test timeout: the observation deadline plus 30 s of slack
    // (presumably for spin-up and tear-down — confirm against the harness).
    OVERALL_DEADLINE_MS + 30_000,
  );
});