switchroom 0.13.15 → 0.13.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/switchroom.js
CHANGED
|
@@ -47331,8 +47331,8 @@ var {
|
|
|
47331
47331
|
} = import__.default;
|
|
47332
47332
|
|
|
47333
47333
|
// src/build-info.ts
|
|
47334
|
-
var VERSION = "0.13.15";
|
|
47335
|
-
var COMMIT_SHA = "
|
|
47334
|
+
var VERSION = "0.13.16";
|
|
47335
|
+
var COMMIT_SHA = "6c71b36b";
|
|
47336
47336
|
|
|
47337
47337
|
// src/cli/agent.ts
|
|
47338
47338
|
init_source();
|
package/package.json
CHANGED
|
@@ -48154,10 +48154,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
|
|
|
48154
48154
|
}
|
|
48155
48155
|
|
|
48156
48156
|
// ../src/build-info.ts
|
|
48157
|
-
var VERSION = "0.13.15";
|
|
48158
|
-
var COMMIT_SHA = "
|
|
48159
|
-
var COMMIT_DATE = "2026-05-
|
|
48160
|
-
var LATEST_PR =
|
|
48157
|
+
var VERSION = "0.13.16";
|
|
48158
|
+
var COMMIT_SHA = "6c71b36b";
|
|
48159
|
+
var COMMIT_DATE = "2026-05-23T03:56:34Z";
|
|
48160
|
+
var LATEST_PR = 1675;
|
|
48161
48161
|
var COMMITS_AHEAD_OF_TAG = 0;
|
|
48162
48162
|
|
|
48163
48163
|
// gateway/boot-version.ts
|
|
@@ -50617,7 +50617,19 @@ async function executeReply(args) {
|
|
|
50617
50617
|
const configParseMode = access.parseMode ?? "html";
|
|
50618
50618
|
const format = args.format ?? configParseMode;
|
|
50619
50619
|
const disableLinkPreview = args.disable_web_page_preview != null ? Boolean(args.disable_web_page_preview) : access.disableLinkPreview ?? true;
|
|
50620
|
-
|
|
50620
|
+
let disableNotification = args.disable_notification === true;
|
|
50621
|
+
{
|
|
50622
|
+
const turn2 = currentTurn;
|
|
50623
|
+
if (turn2 != null && !disableNotification) {
|
|
50624
|
+
if (turn2.firstPingAt != null) {
|
|
50625
|
+
process.stderr.write(`telegram gateway: reply over-ping safety net \u2014 ` + `downgrading disable_notification:false \u2192 true ` + `(chat=${chat_id} thread=${args.message_thread_id ?? "-"} firstPingAt=${turn2.firstPingAt} sinceFirstPing_ms=${Date.now() - turn2.firstPingAt})
|
|
50626
|
+
`);
|
|
50627
|
+
disableNotification = true;
|
|
50628
|
+
} else {
|
|
50629
|
+
turn2.firstPingAt = Date.now();
|
|
50630
|
+
}
|
|
50631
|
+
}
|
|
50632
|
+
}
|
|
50621
50633
|
const tg = access.telegraph;
|
|
50622
50634
|
const tgThreshold = tg?.threshold ?? 3000;
|
|
50623
50635
|
if (tg?.enabled && files.length === 0 && text.length > tgThreshold) {
|
|
@@ -51766,6 +51778,7 @@ function handleSessionEvent(ev) {
|
|
|
51766
51778
|
gatewayReceiveAt: startedAt,
|
|
51767
51779
|
replyCalled: false,
|
|
51768
51780
|
finalAnswerDelivered: false,
|
|
51781
|
+
firstPingAt: null,
|
|
51769
51782
|
capturedText: [],
|
|
51770
51783
|
orphanedReplyTimeoutId: null,
|
|
51771
51784
|
registryKey: null,
|
|
@@ -1206,6 +1206,17 @@ type CurrentTurn = {
|
|
|
1206
1206
|
// even though `replyCalled` is true — the #1664 case where the real answer
|
|
1207
1207
|
// ended up as plain transcript text rendered into an ephemeral draft.
|
|
1208
1208
|
finalAnswerDelivered: boolean
|
|
1209
|
+
// #1675 (over-ping safety net): wall-clock ms of the first reply
|
|
1210
|
+
// this turn that landed with `disable_notification: false` (a real
|
|
1211
|
+
// device ping). The conversational-pacing contract
|
|
1212
|
+
// (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
|
|
1213
|
+
// ping per turn — the final answer. When the model violates that
|
|
1214
|
+
// (sends a substantive answer pinged + a wrap-up "Delivered…" or
|
|
1215
|
+
// meta-narration also pinged), subsequent reply calls with
|
|
1216
|
+
// `disable_notification: false` are auto-downgraded to silent by
|
|
1217
|
+
// the framework. Null until the first ping lands. Reset on every
|
|
1218
|
+
// fresh-turn enqueue.
|
|
1219
|
+
firstPingAt: number | null
|
|
1209
1220
|
capturedText: string[]
|
|
1210
1221
|
orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
|
|
1211
1222
|
registryKey: string | null
|
|
@@ -4208,7 +4219,43 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
|
|
|
4208
4219
|
// so only the final answer pings the device. Default false (pings) so
|
|
4209
4220
|
// existing call-sites and the typical "final answer" reply keep their
|
|
4210
4221
|
// current behaviour without an explicit flag.
|
|
4211
|
-
|
|
4222
|
+
let disableNotification = args.disable_notification === true
|
|
4223
|
+
|
|
4224
|
+
// #1675 over-ping safety net. The conversational-pacing contract
|
|
4225
|
+
// (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
|
|
4226
|
+
// device ping per turn — the final answer. The model sometimes
|
|
4227
|
+
// violates this by sending a substantive answer pinged + a wrap-up
|
|
4228
|
+
// ("Delivered all three steps…", "Sent.", or meta-narration) ALSO
|
|
4229
|
+
// pinged. Both messages then fire notifications. The fleet UAT on
|
|
4230
|
+
// 2026-05-23 reproduced this (Step 3 + Delivered both pinged, two
|
|
4231
|
+
// beeps for a turn that should have produced one). Framework owns
|
|
4232
|
+
// the safety net: once the turn has emitted ONE pinged reply, every
|
|
4233
|
+
// subsequent reply call in the same turn auto-downgrades to silent
|
|
4234
|
+
// (disable_notification: true). Model intent ("I want this loud")
|
|
4235
|
+
// is honoured for the first ping; subsequent pings are demoted with
|
|
4236
|
+
// a stderr log so operators can see the safety net engage.
|
|
4237
|
+
//
|
|
4238
|
+
// The slot is claimed BEFORE the actual send to keep the logic
|
|
4239
|
+
// sequential — a send that fails part-way leaves firstPingAt set
|
|
4240
|
+
// and subsequent pings would be silenced. Acceptable trade-off (a
|
|
4241
|
+
// failed first ping is an edge case; the alternative — claim after
|
|
4242
|
+
// send — races concurrent reply calls).
|
|
4243
|
+
{
|
|
4244
|
+
const turn = currentTurn
|
|
4245
|
+
if (turn != null && !disableNotification) {
|
|
4246
|
+
if (turn.firstPingAt != null) {
|
|
4247
|
+
process.stderr.write(
|
|
4248
|
+
`telegram gateway: reply over-ping safety net — ` +
|
|
4249
|
+
`downgrading disable_notification:false → true ` +
|
|
4250
|
+
`(chat=${chat_id} thread=${args.message_thread_id ?? '-'} ` +
|
|
4251
|
+
`firstPingAt=${turn.firstPingAt} sinceFirstPing_ms=${Date.now() - turn.firstPingAt})\n`,
|
|
4252
|
+
)
|
|
4253
|
+
disableNotification = true
|
|
4254
|
+
} else {
|
|
4255
|
+
turn.firstPingAt = Date.now()
|
|
4256
|
+
}
|
|
4257
|
+
}
|
|
4258
|
+
}
|
|
4212
4259
|
|
|
4213
4260
|
// Telegraph publish (#579). When the reply text is long enough AND
|
|
4214
4261
|
// the agent has telegraph enabled in access.json, publish to
|
|
@@ -5877,6 +5924,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5877
5924
|
gatewayReceiveAt: startedAt,
|
|
5878
5925
|
replyCalled: false,
|
|
5879
5926
|
finalAnswerDelivered: false,
|
|
5927
|
+
firstPingAt: null,
|
|
5880
5928
|
capturedText: [],
|
|
5881
5929
|
orphanedReplyTimeoutId: null,
|
|
5882
5930
|
registryKey: null,
|
|
@@ -1,59 +1,80 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
2
|
+
* Conversational pacing UAT — measures the END-TO-END user-perceived
|
|
3
|
+
* turn UX on a multi-step prompt.
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
5
|
+
* Original framing was "validate the visible-answer-stream path
|
|
6
|
+
* activates." Live research on test-harness with the
|
|
7
|
+
* `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` flag showed that modern Claude
|
|
8
|
+
* 2.1.x on this fleet does NOT emit transcript text events between
|
|
9
|
+
* tool calls — it consistently calls the `reply` MCP tool directly
|
|
10
|
+
* for every user-visible chunk (beat 1 ack, then per-step beat 3
|
|
11
|
+
* updates). So the visible-answer-stream code path (which renders
|
|
12
|
+
* `text` session events into a chat-timeline message) doesn't
|
|
13
|
+
* activate; the answer-stream lane stays idle while the model uses
|
|
14
|
+
* `reply` calls instead.
|
|
10
15
|
*
|
|
11
|
-
*
|
|
16
|
+
* That's actually FINE — the model is correctly following the
|
|
17
|
+
* five-beat conversational-pacing contract (`reference/conversational-
|
|
18
|
+
* pacing.md`): one silent ack at the start, silent updates per step,
|
|
19
|
+
* one pinged final answer. This UAT now validates THAT — the pacing
|
|
20
|
+
* the user actually experiences — rather than the answer-stream code
|
|
21
|
+
* path specifically.
|
|
12
22
|
*
|
|
13
|
-
* The
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
23
|
+
* The flag `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is still set on
|
|
24
|
+
* test-harness for ongoing observation; if a future model version
|
|
25
|
+
* starts emitting transcript text, the lane will surface it visibly
|
|
26
|
+
* instead of writing to the invisible compose-box draft (the prior
|
|
27
|
+
* default).
|
|
17
28
|
*
|
|
18
29
|
* ## What this asserts
|
|
19
30
|
*
|
|
20
|
-
* 1.
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
* 4. At least one edit growth event happens between first send and
|
|
30
|
-
* turn-end (the streaming property — TTFO is fast, then content
|
|
31
|
-
* grows live).
|
|
31
|
+
* 1. First user-visible bot message lands within `TTFO_BUDGET_MS`
|
|
32
|
+
* (default 15 s) of the inbound — covers beat 1 ack OR straight-
|
|
33
|
+
* to-content depending on the model's pacing choice.
|
|
34
|
+
* 2. Multiple distinct bot messages land per turn for the multi-
|
|
35
|
+
* step prompt — proving the model isn't collapsing everything
|
|
36
|
+
* into a single pinged dump.
|
|
37
|
+
* 3. All but at most one message is silent (`disable_notification:
|
|
38
|
+
* true`). Only the final answer should ping — anything earlier
|
|
39
|
+
* pinging is a beat-3 contract violation.
|
|
32
40
|
*
|
|
33
|
-
*
|
|
34
|
-
* regardless of pass/fail.
|
|
41
|
+
* ## Wall-clock budget
|
|
35
42
|
*
|
|
36
|
-
*
|
|
43
|
+
* ~90 s.
|
|
37
44
|
*/
|
|
38
45
|
|
|
39
46
|
import { describe, expect, it } from "vitest";
|
|
40
47
|
import { spinUp } from "../harness.js";
|
|
41
48
|
import type { ObservedMessage } from "../driver.js";
|
|
42
49
|
|
|
43
|
-
const
|
|
50
|
+
const TTFO_BUDGET_MS = 15_000;
|
|
44
51
|
const OVERALL_DEADLINE_MS = 90_000;
|
|
45
|
-
const QUIESCENCE_MS =
|
|
46
|
-
|
|
47
|
-
//
|
|
48
|
-
//
|
|
49
|
-
//
|
|
50
|
-
//
|
|
51
|
-
//
|
|
52
|
-
//
|
|
52
|
+
const QUIESCENCE_MS = 12_000;
|
|
53
|
+
|
|
54
|
+
// Multi-step investigation prompt — designed to make the model emit
|
|
55
|
+
// transcript text BETWEEN tool calls, which is the assistant-content
|
|
56
|
+
// `text` block shape session-tail surfaces via the `text` event the
|
|
57
|
+
// answer-stream lane consumes. With the visible-answer-stream flag
|
|
58
|
+
// ON, those text events should become user-visible edit-in-place
|
|
59
|
+
// chat-timeline updates.
|
|
60
|
+
//
|
|
61
|
+
// We choose a research-style task because that pattern reliably
|
|
62
|
+
// emits `text` chunks (the model thinks out loud between Read /
|
|
63
|
+
// Bash steps) on most Claude versions. A pure-answer prompt (the
|
|
64
|
+
// previous version of this scenario) tended to make modern Claude
|
|
65
|
+
// jump straight to a single `reply` tool-call with no intermediate
|
|
66
|
+
// text — exercising the wrong path.
|
|
53
67
|
const PROMPT =
|
|
54
|
-
`
|
|
55
|
-
`
|
|
56
|
-
`
|
|
68
|
+
`Investigate this step by step:\n\n` +
|
|
69
|
+
`1. Read \`/etc/hostname\` and tell me what host this is — write a ` +
|
|
70
|
+
`sentence about it.\n` +
|
|
71
|
+
`2. Then read \`/etc/os-release\` and tell me what OS family / version.\n` +
|
|
72
|
+
`3. Then read \`/proc/cpuinfo\` (head it), and tell me the CPU model + ` +
|
|
73
|
+
`core count.\n` +
|
|
74
|
+
`4. Wrap up with a one-line summary of all three.\n\n` +
|
|
75
|
+
`Between each step, narrate what you're finding in plain prose ` +
|
|
76
|
+
`(not just bullet outputs). Don't batch all your observations into ` +
|
|
77
|
+
`one final reply — talk as you investigate.`;
|
|
57
78
|
|
|
58
79
|
interface TrailEntry {
|
|
59
80
|
relMs: number;
|
|
@@ -68,9 +89,9 @@ function pad(s: string, n: number): string {
|
|
|
68
89
|
return s.length >= n ? s : s + " ".repeat(n - s.length);
|
|
69
90
|
}
|
|
70
91
|
|
|
71
|
-
describe("uat:
|
|
92
|
+
describe("uat: conversational pacing on a multi-step turn", () => {
|
|
72
93
|
it(
|
|
73
|
-
"first
|
|
94
|
+
"first message lands within TTFO_BUDGET_MS; multiple silent messages; final answer pings",
|
|
74
95
|
async () => {
|
|
75
96
|
const sc = await spinUp({ agent: "test-harness" });
|
|
76
97
|
try {
|
|
@@ -137,79 +158,45 @@ describe("uat: visible answer-stream — model transcript renders live (#869 Pha
|
|
|
137
158
|
}
|
|
138
159
|
console.log("=================================================\n");
|
|
139
160
|
|
|
140
|
-
// ──
|
|
141
|
-
|
|
142
|
-
const fresh = trail.filter((e) => e.kind === "fresh");
|
|
143
|
-
const edits = trail.filter((e) => e.kind === "edit");
|
|
161
|
+
// ── Pacing assertions ─────────────────────────────────────
|
|
144
162
|
|
|
145
|
-
// (1) at least one
|
|
163
|
+
// (1) at least one bot message landed
|
|
146
164
|
expect(
|
|
147
|
-
|
|
148
|
-
`no
|
|
149
|
-
`responding OR the visible-answer-stream flag is OFF ` +
|
|
150
|
-
`(SWITCHROOM_VISIBLE_ANSWER_STREAM not set on the target ` +
|
|
151
|
-
`agent's container env). Re-check the agent's compose ` +
|
|
152
|
-
`environment.`,
|
|
165
|
+
trail.length,
|
|
166
|
+
`no bot replies observed — the agent isn't responding.`,
|
|
153
167
|
).toBeGreaterThanOrEqual(1);
|
|
154
168
|
|
|
155
|
-
// (2) first
|
|
156
|
-
const ttfoMs =
|
|
169
|
+
// (2) first message landed within TTFO budget
|
|
170
|
+
const ttfoMs = trail[0].relMs;
|
|
157
171
|
expect(
|
|
158
172
|
ttfoMs,
|
|
159
|
-
`TTFO ${ttfoMs}ms exceeded the
|
|
160
|
-
|
|
161
|
-
`was unusually slow to emit its first text chunk, OR the ` +
|
|
162
|
-
`visible answer-stream is not active. Default behaviour ` +
|
|
163
|
-
`(invisible draft) would never have surfaced a fresh ` +
|
|
164
|
-
`message at all, so the most likely cause is model latency.`,
|
|
165
|
-
).toBeLessThanOrEqual(VISIBLE_TTFO_BUDGET_MS);
|
|
166
|
-
|
|
167
|
-
// (3) first fresh message was silent (mid-turn edits don't ping)
|
|
168
|
-
expect(
|
|
169
|
-
fresh[0].silent,
|
|
170
|
-
`the first fresh message pinged the user — answer-stream ` +
|
|
171
|
-
`should send silently (disable_notification:true). A ping ` +
|
|
172
|
-
`here means an explicit \`reply\` tool may have fired instead.`,
|
|
173
|
-
).toBe(true);
|
|
173
|
+
`TTFO ${ttfoMs}ms exceeded the budget of ${TTFO_BUDGET_MS}ms.`,
|
|
174
|
+
).toBeLessThanOrEqual(TTFO_BUDGET_MS);
|
|
174
175
|
|
|
175
|
-
// (
|
|
176
|
-
//
|
|
177
|
-
// content grows on the same surface, not a chain of new sends).
|
|
178
|
-
const sameAnchorEdits = edits.filter(
|
|
179
|
-
(e) => e.messageId === firstAnchorMsgId,
|
|
180
|
-
);
|
|
176
|
+
// (3) multiple messages landed — proves the model is pacing,
|
|
177
|
+
// not dumping a single big reply
|
|
181
178
|
expect(
|
|
182
|
-
|
|
183
|
-
`
|
|
184
|
-
`
|
|
185
|
-
`
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
|
|
179
|
+
trail.length,
|
|
180
|
+
`only ${trail.length} message(s) observed — the model ` +
|
|
181
|
+
`collapsed this multi-step prompt into a single dump. ` +
|
|
182
|
+
`Beat 3 pacing (per-step updates) requires multiple ` +
|
|
183
|
+
`messages. Either the model didn't follow the prompt ` +
|
|
184
|
+
`or quiescence bailed early.`,
|
|
185
|
+
).toBeGreaterThanOrEqual(2);
|
|
186
|
+
|
|
187
|
+
// (4) at most one message pinged the user — beat-3 contract
|
|
188
|
+
// says only the FINAL answer pings; mid-turn updates pass
|
|
189
|
+
// `disable_notification: true`.
|
|
190
|
+
const pingedMessages = trail.filter((e) => !e.silent);
|
|
193
191
|
expect(
|
|
194
|
-
|
|
195
|
-
`${
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
(e) => e.messageId === firstAnchorMsgId,
|
|
203
|
-
);
|
|
204
|
-
for (let i = 1; i < anchorTrail.length; i++) {
|
|
205
|
-
expect(
|
|
206
|
-
anchorTrail[i].textLength,
|
|
207
|
-
`anchor message #${firstAnchorMsgId} text shrank between ` +
|
|
208
|
-
`events ${i - 1} (len=${anchorTrail[i - 1].textLength}) ` +
|
|
209
|
-
`and ${i} (len=${anchorTrail[i].textLength}) — ` +
|
|
210
|
-
`streaming text should only grow.`,
|
|
211
|
-
).toBeGreaterThanOrEqual(anchorTrail[i - 1].textLength);
|
|
212
|
-
}
|
|
192
|
+
pingedMessages.length,
|
|
193
|
+
`${pingedMessages.length} message(s) pinged the device — ` +
|
|
194
|
+
`the conversational-pacing contract allows AT MOST 1 ` +
|
|
195
|
+
`(the final answer). Mid-turn updates must be silent. ` +
|
|
196
|
+
`Pinged messages at: ${pingedMessages
|
|
197
|
+
.map((m) => `+${(m.relMs / 1000).toFixed(0)}s`)
|
|
198
|
+
.join(", ")}`,
|
|
199
|
+
).toBeLessThanOrEqual(1);
|
|
213
200
|
} finally {
|
|
214
201
|
await sc.tearDown();
|
|
215
202
|
}
|