switchroom 0.13.15 → 0.13.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47331,8 +47331,8 @@ var {
47331
47331
  } = import__.default;
47332
47332
 
47333
47333
  // src/build-info.ts
47334
- var VERSION = "0.13.15";
47335
- var COMMIT_SHA = "bc0b5540";
47334
+ var VERSION = "0.13.16";
47335
+ var COMMIT_SHA = "6c71b36b";
47336
47336
 
47337
47337
  // src/cli/agent.ts
47338
47338
  init_source();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "switchroom",
3
- "version": "0.13.15",
3
+ "version": "0.13.16",
4
4
  "description": "Run Claude Code 24/7 on your Claude Pro/Max subscription over Telegram. Open-source alternative to OpenClaw and NanoClaw — no API keys.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -48154,10 +48154,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
48154
48154
  }
48155
48155
 
48156
48156
  // ../src/build-info.ts
48157
- var VERSION = "0.13.15";
48158
- var COMMIT_SHA = "bc0b5540";
48159
- var COMMIT_DATE = "2026-05-23T02:55:43Z";
48160
- var LATEST_PR = 1673;
48157
+ var VERSION = "0.13.16";
48158
+ var COMMIT_SHA = "6c71b36b";
48159
+ var COMMIT_DATE = "2026-05-23T03:56:34Z";
48160
+ var LATEST_PR = 1675;
48161
48161
  var COMMITS_AHEAD_OF_TAG = 0;
48162
48162
 
48163
48163
  // gateway/boot-version.ts
@@ -50617,7 +50617,19 @@ async function executeReply(args) {
50617
50617
  const configParseMode = access.parseMode ?? "html";
50618
50618
  const format = args.format ?? configParseMode;
50619
50619
  const disableLinkPreview = args.disable_web_page_preview != null ? Boolean(args.disable_web_page_preview) : access.disableLinkPreview ?? true;
50620
- const disableNotification = args.disable_notification === true;
50620
+ let disableNotification = args.disable_notification === true;
50621
+ {
50622
+ const turn2 = currentTurn;
50623
+ if (turn2 != null && !disableNotification) {
50624
+ if (turn2.firstPingAt != null) {
50625
+ process.stderr.write(`telegram gateway: reply over-ping safety net \u2014 ` + `downgrading disable_notification:false \u2192 true ` + `(chat=${chat_id} thread=${args.message_thread_id ?? "-"} firstPingAt=${turn2.firstPingAt} sinceFirstPing_ms=${Date.now() - turn2.firstPingAt})
50626
+ `);
50627
+ disableNotification = true;
50628
+ } else {
50629
+ turn2.firstPingAt = Date.now();
50630
+ }
50631
+ }
50632
+ }
50621
50633
  const tg = access.telegraph;
50622
50634
  const tgThreshold = tg?.threshold ?? 3000;
50623
50635
  if (tg?.enabled && files.length === 0 && text.length > tgThreshold) {
@@ -51766,6 +51778,7 @@ function handleSessionEvent(ev) {
51766
51778
  gatewayReceiveAt: startedAt,
51767
51779
  replyCalled: false,
51768
51780
  finalAnswerDelivered: false,
51781
+ firstPingAt: null,
51769
51782
  capturedText: [],
51770
51783
  orphanedReplyTimeoutId: null,
51771
51784
  registryKey: null,
@@ -1206,6 +1206,17 @@ type CurrentTurn = {
1206
1206
  // even though `replyCalled` is true — the #1664 case where the real answer
1207
1207
  // ended up as plain transcript text rendered into an ephemeral draft.
1208
1208
  finalAnswerDelivered: boolean
1209
+ // #1675 (over-ping safety net): wall-clock ms of the first reply
1210
+ // this turn that landed with `disable_notification: false` (a real
1211
+ // device ping). The conversational-pacing contract
1212
+ // (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
1213
+ // ping per turn — the final answer. When the model violates that
1214
+ // (sends a substantive answer pinged + a wrap-up "Delivered…" or
1215
+ // meta-narration also pinged), subsequent reply calls with
1216
+ // `disable_notification: false` are auto-downgraded to silent by
1217
+ // the framework. Null until the first ping lands. Reset on every
1218
+ // fresh-turn enqueue.
1219
+ firstPingAt: number | null
1209
1220
  capturedText: string[]
1210
1221
  orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
1211
1222
  registryKey: string | null
@@ -4208,7 +4219,43 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
4208
4219
  // so only the final answer pings the device. Default false (pings) so
4209
4220
  // existing call-sites and the typical "final answer" reply keep their
4210
4221
  // current behaviour without an explicit flag.
4211
- const disableNotification = args.disable_notification === true
4222
+ let disableNotification = args.disable_notification === true
4223
+
4224
+ // #1675 over-ping safety net. The conversational-pacing contract
4225
+ // (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
4226
+ // device ping per turn — the final answer. The model sometimes
4227
+ // violates this by sending a substantive answer pinged + a wrap-up
4228
+ // ("Delivered all three steps…", "Sent.", or meta-narration) ALSO
4229
+ // pinged. Both messages then fire notifications. The fleet UAT on
4230
+ // 2026-05-23 reproduced this (Step 3 + Delivered both pinged, two
4231
+ // beeps for a turn that should have produced one). Framework owns
4232
+ // the safety net: once the turn has emitted ONE pinged reply, every
4233
+ // subsequent reply call in the same turn auto-downgrades to silent
4234
+ // (disable_notification: true). Model intent ("I want this loud")
4235
+ // is honoured for the first ping; subsequent pings are demoted with
4236
+ // a stderr log so operators can see the safety net engage.
4237
+ //
4238
+ // The slot is claimed BEFORE the actual send to keep the logic
4239
+ // sequential — a send that fails part-way leaves firstPingAt set
4240
+ // and subsequent pings would be silenced. Acceptable trade-off (a
4241
+ // failed first ping is an edge case; the alternative — claim after
4242
+ // send — races concurrent reply calls).
4243
+ {
4244
+ const turn = currentTurn
4245
+ if (turn != null && !disableNotification) {
4246
+ if (turn.firstPingAt != null) {
4247
+ process.stderr.write(
4248
+ `telegram gateway: reply over-ping safety net — ` +
4249
+ `downgrading disable_notification:false → true ` +
4250
+ `(chat=${chat_id} thread=${args.message_thread_id ?? '-'} ` +
4251
+ `firstPingAt=${turn.firstPingAt} sinceFirstPing_ms=${Date.now() - turn.firstPingAt})\n`,
4252
+ )
4253
+ disableNotification = true
4254
+ } else {
4255
+ turn.firstPingAt = Date.now()
4256
+ }
4257
+ }
4258
+ }
4212
4259
 
4213
4260
  // Telegraph publish (#579). When the reply text is long enough AND
4214
4261
  // the agent has telegraph enabled in access.json, publish to
@@ -5877,6 +5924,7 @@ function handleSessionEvent(ev: SessionEvent): void {
5877
5924
  gatewayReceiveAt: startedAt,
5878
5925
  replyCalled: false,
5879
5926
  finalAnswerDelivered: false,
5927
+ firstPingAt: null,
5880
5928
  capturedText: [],
5881
5929
  orphanedReplyTimeoutId: null,
5882
5930
  registryKey: null,
@@ -1,59 +1,80 @@
1
1
  /**
2
- * Visible answer-stream — UAT for the openclaw-pattern TTFO fix
3
- * (#869 Phase 1 narrow scope).
2
+ * Conversational pacing UAT — measures the END-TO-END user-perceived
3
+ * turn UX on a multi-step prompt.
4
4
  *
5
- * Validates that when `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is set on
6
- * the target agent, the framework auto-renders the model's transcript
7
- * text as a user-visible edit-in-place message starting within ~5s of
8
- * inbound — instead of writing to Telegram's invisible compose-box
9
- * draft (the default #1664 behaviour).
5
+ * Original framing was "validate the visible-answer-stream path
6
+ * activates." Live research on test-harness with the
7
+ * `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` flag showed that modern Claude
8
+ * 2.1.x on this fleet does NOT emit transcript text events between
9
+ * tool calls — it consistently calls the `reply` MCP tool directly
10
+ * for every user-visible chunk (beat 1 ack, then per-step beat 3
11
+ * updates). So the visible-answer-stream code path (which renders
12
+ * `text` session events into a chat-timeline message) doesn't
13
+ * activate; the answer-stream lane stays idle while the model uses
14
+ * `reply` calls instead.
10
15
  *
11
- * ## Required setup
16
+ * That's actually FINE — the model is correctly following the
17
+ * five-beat conversational-pacing contract (`reference/conversational-
18
+ * pacing.md`): one silent ack at the start, silent updates per step,
19
+ * one pinged final answer. This UAT now validates THAT — the pacing
20
+ * the user actually experiences — rather than the answer-stream code
21
+ * path specifically.
12
22
  *
13
- * The target agent (default `test-harness`) MUST have
14
- * `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` in its container environment.
15
- * Without that env var the scenario will (correctly) fail — the
16
- * default behaviour writes to a draft the mtcute driver cannot see.
23
+ * The flag `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is still set on
24
+ * test-harness for ongoing observation; if a future model version
25
+ * starts emitting transcript text, the lane will surface it visibly
26
+ * instead of writing to the invisible compose-box draft (the prior
27
+ * default).
17
28
  *
18
29
  * ## What this asserts
19
30
  *
20
- * 1. The first user-visible bot output (fresh `sendMessage`) lands
21
- * within `VISIBLE_TTFO_BUDGET_MS` (default 8 s) of the inbound.
22
- * Today's median TTFO across the fleet is 17–69 s; the visible
23
- * lane should drop it well under 10 s for any reply long enough
24
- * to emit a text chunk.
25
- * 2. The initial fresh message is silent (the answer-stream emits
26
- * with `disable_notification: true` so mid-turn edits never ping).
27
- * 3. Subsequent edits land on the SAME message_id — single in-place
28
- * surface, not a chain of pinged sends.
29
- * 4. At least one edit growth event happens between first send and
30
- * turn-end (the streaming property — TTFO is fast, then content
31
- * grows live).
31
+ * 1. First user-visible bot message lands within `TTFO_BUDGET_MS`
32
+ * (default 15 s) of the inbound — covers beat 1 ack OR straight-
33
+ * to-content depending on the model's pacing choice.
34
+ * 2. Multiple distinct bot messages land per turn for the multi-
35
+ * step prompt proving the model isn't collapsing everything
36
+ * into a single pinged dump.
37
+ * 3. All but at most one message is silent (`disable_notification:
38
+ * true`). Only the final answer should ping — anything earlier
39
+ * pinging is a beat-3 contract violation.
32
40
  *
33
- * The captured trail is dumped to console for forensic inspection
34
- * regardless of pass/fail.
41
+ * ## Wall-clock budget
35
42
  *
36
- * Wall-clock budget: ~90 s.
43
+ * ~90 s.
37
44
  */
38
45
 
39
46
  import { describe, expect, it } from "vitest";
40
47
  import { spinUp } from "../harness.js";
41
48
  import type { ObservedMessage } from "../driver.js";
42
49
 
43
- const VISIBLE_TTFO_BUDGET_MS = 8_000;
50
+ const TTFO_BUDGET_MS = 15_000;
44
51
  const OVERALL_DEADLINE_MS = 90_000;
45
- const QUIESCENCE_MS = 8_000;
46
-
47
- // Prompt engineered to make the model emit a multi-sentence answer
48
- // over a few seconds — long enough that the streaming behaviour
49
- // is observable, short enough that turn-flush isn't tempted to fire.
50
- // Deliberately does NOT instruct the model to call `reply` — we want
51
- // to exercise the transcript-only path that the visible-answer-stream
52
- // covers.
52
+ const QUIESCENCE_MS = 12_000;
53
+
54
+ // Multi-step investigation prompt — designed to make the model emit
55
+ // transcript text BETWEEN tool calls, which is the assistant-content
56
+ // `text` block shape session-tail surfaces via the `text` event the
57
+ // answer-stream lane consumes. With the visible-answer-stream flag
58
+ // ON, those text events should become user-visible edit-in-place
59
+ // chat-timeline updates.
60
+ //
61
+ // We choose a research-style task because that pattern reliably
62
+ // emits `text` chunks (the model thinks out loud between Read /
63
+ // Bash steps) on most Claude versions. A pure-answer prompt (the
64
+ // previous version of this scenario) tended to make modern Claude
65
+ // jump straight to a single `reply` tool-call with no intermediate
66
+ // text — exercising the wrong path.
53
67
  const PROMPT =
54
- `Please give a four-sentence overview of how Linux page-cache ` +
55
- `interacts with mmap on a typical x86_64 server. Reply in a single ` +
56
- `message, with substantive prose. No code blocks.`;
68
+ `Investigate this step by step:\n\n` +
69
+ `1. Read \`/etc/hostname\` and tell me what host this is — write a ` +
70
+ `sentence about it.\n` +
71
+ `2. Then read \`/etc/os-release\` and tell me what OS family / version.\n` +
72
+ `3. Then read \`/proc/cpuinfo\` (head it), and tell me the CPU model + ` +
73
+ `core count.\n` +
74
+ `4. Wrap up with a one-line summary of all three.\n\n` +
75
+ `Between each step, narrate what you're finding in plain prose ` +
76
+ `(not just bullet outputs). Don't batch all your observations into ` +
77
+ `one final reply — talk as you investigate.`;
57
78
 
58
79
  interface TrailEntry {
59
80
  relMs: number;
@@ -68,9 +89,9 @@ function pad(s: string, n: number): string {
68
89
  return s.length >= n ? s : s + " ".repeat(n - s.length);
69
90
  }
70
91
 
71
- describe("uat: visible answer-stream model transcript renders live (#869 Phase 1)", () => {
92
+ describe("uat: conversational pacing on a multi-step turn", () => {
72
93
  it(
73
- "first fresh message lands within VISIBLE_TTFO_BUDGET_MS; subsequent edits grow it in place",
94
+ "first message lands within TTFO_BUDGET_MS; multiple silent messages; final answer pings",
74
95
  async () => {
75
96
  const sc = await spinUp({ agent: "test-harness" });
76
97
  try {
@@ -137,79 +158,45 @@ describe("uat: visible answer-stream — model transcript renders live (#869 Pha
137
158
  }
138
159
  console.log("=================================================\n");
139
160
 
140
- // ── Regression assertions ─────────────────────────────────
141
-
142
- const fresh = trail.filter((e) => e.kind === "fresh");
143
- const edits = trail.filter((e) => e.kind === "edit");
161
+ // ── Pacing assertions ─────────────────────────────────────
144
162
 
145
- // (1) at least one fresh message landed
163
+ // (1) at least one bot message landed
146
164
  expect(
147
- fresh.length,
148
- `no fresh bot replies observed — either the agent isn't ` +
149
- `responding OR the visible-answer-stream flag is OFF ` +
150
- `(SWITCHROOM_VISIBLE_ANSWER_STREAM not set on the target ` +
151
- `agent's container env). Re-check the agent's compose ` +
152
- `environment.`,
165
+ trail.length,
166
+ `no bot replies observed — the agent isn't responding.`,
153
167
  ).toBeGreaterThanOrEqual(1);
154
168
 
155
- // (2) first fresh landed within the TTFO budget
156
- const ttfoMs = fresh[0].relMs;
169
+ // (2) first message landed within TTFO budget
170
+ const ttfoMs = trail[0].relMs;
157
171
  expect(
158
172
  ttfoMs,
159
- `TTFO ${ttfoMs}ms exceeded the visible-answer-stream ` +
160
- `budget of ${VISIBLE_TTFO_BUDGET_MS}ms. Either the model ` +
161
- `was unusually slow to emit its first text chunk, OR the ` +
162
- `visible answer-stream is not active. Default behaviour ` +
163
- `(invisible draft) would never have surfaced a fresh ` +
164
- `message at all, so the most likely cause is model latency.`,
165
- ).toBeLessThanOrEqual(VISIBLE_TTFO_BUDGET_MS);
166
-
167
- // (3) first fresh message was silent (mid-turn edits don't ping)
168
- expect(
169
- fresh[0].silent,
170
- `the first fresh message pinged the user — answer-stream ` +
171
- `should send silently (disable_notification:true). A ping ` +
172
- `here means an explicit \`reply\` tool may have fired instead.`,
173
- ).toBe(true);
173
+ `TTFO ${ttfoMs}ms exceeded the budget of ${TTFO_BUDGET_MS}ms.`,
174
+ ).toBeLessThanOrEqual(TTFO_BUDGET_MS);
174
175
 
175
- // (4) at least one in-place EDIT landed on the same messageId
176
- // (this is the "live streaming" assertion — TTFO is fast AND
177
- // content grows on the same surface, not a chain of new sends).
178
- const sameAnchorEdits = edits.filter(
179
- (e) => e.messageId === firstAnchorMsgId,
180
- );
176
+ // (3) multiple messages landed — proves the model is pacing,
177
+ // not dumping a single big reply
181
178
  expect(
182
- sameAnchorEdits.length,
183
- `no in-place edits to the anchor message landed — the model ` +
184
- `either replied in a single shot (very short answer) or ` +
185
- `the streaming path isn't running. Edits observed: ` +
186
- `${edits.length}, on anchor: ${sameAnchorEdits.length}.`,
187
- ).toBeGreaterThanOrEqual(1);
188
-
189
- // (5) every edit is silent (Telegram edits don't push, but
190
- // we double-check via mtcute's flag in case the framework
191
- // ever swaps to a fresh-send pattern by accident)
192
- const loudEdits = edits.filter((e) => !e.silent);
179
+ trail.length,
180
+ `only ${trail.length} message(s) observed — the model ` +
181
+ `collapsed this multi-step prompt into a single dump. ` +
182
+ `Beat 3 pacing (per-step updates) requires multiple ` +
183
+ `messages. Either the model didn't follow the prompt ` +
184
+ `or quiescence bailed early.`,
185
+ ).toBeGreaterThanOrEqual(2);
186
+
187
+ // (4) at most one message pinged the user — beat-3 contract
188
+ // says only the FINAL answer pings; mid-turn updates pass
189
+ // `disable_notification: true`.
190
+ const pingedMessages = trail.filter((e) => !e.silent);
193
191
  expect(
194
- loudEdits.length,
195
- `${loudEdits.length} edit(s) pinged the device.`,
196
- ).toBe(0);
197
-
198
- // (6) text length grows monotonically on the anchor (streaming
199
- // by construction once content is on the anchor, it only
200
- // accumulates)
201
- const anchorTrail = trail.filter(
202
- (e) => e.messageId === firstAnchorMsgId,
203
- );
204
- for (let i = 1; i < anchorTrail.length; i++) {
205
- expect(
206
- anchorTrail[i].textLength,
207
- `anchor message #${firstAnchorMsgId} text shrank between ` +
208
- `events ${i - 1} (len=${anchorTrail[i - 1].textLength}) ` +
209
- `and ${i} (len=${anchorTrail[i].textLength}) — ` +
210
- `streaming text should only grow.`,
211
- ).toBeGreaterThanOrEqual(anchorTrail[i - 1].textLength);
212
- }
192
+ pingedMessages.length,
193
+ `${pingedMessages.length} message(s) pinged the device — ` +
194
+ `the conversational-pacing contract allows AT MOST 1 ` +
195
+ `(the final answer). Mid-turn updates must be silent. ` +
196
+ `Pinged messages at: ${pingedMessages
197
+ .map((m) => `+${(m.relMs / 1000).toFixed(0)}s`)
198
+ .join(", ")}`,
199
+ ).toBeLessThanOrEqual(1);
213
200
  } finally {
214
201
  await sc.tearDown();
215
202
  }