switchroom 0.13.14 → 0.13.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47331,8 +47331,8 @@ var {
47331
47331
  } = import__.default;
47332
47332
 
47333
47333
  // src/build-info.ts
47334
- var VERSION = "0.13.14";
47335
- var COMMIT_SHA = "0cf961a6";
47334
+ var VERSION = "0.13.16";
47335
+ var COMMIT_SHA = "6c71b36b";
47336
47336
 
47337
47337
  // src/cli/agent.ts
47338
47338
  init_source();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "switchroom",
3
- "version": "0.13.14",
3
+ "version": "0.13.16",
4
4
  "description": "Run Claude Code 24/7 on your Claude Pro/Max subscription over Telegram. Open-source alternative to OpenClaw and NanoClaw — no API keys.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -48154,10 +48154,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
48154
48154
  }
48155
48155
 
48156
48156
  // ../src/build-info.ts
48157
- var VERSION = "0.13.14";
48158
- var COMMIT_SHA = "0cf961a6";
48159
- var COMMIT_DATE = "2026-05-23T01:15:10Z";
48160
- var LATEST_PR = 1670;
48157
+ var VERSION = "0.13.16";
48158
+ var COMMIT_SHA = "6c71b36b";
48159
+ var COMMIT_DATE = "2026-05-23T03:56:34Z";
48160
+ var LATEST_PR = 1675;
48161
48161
  var COMMITS_AHEAD_OF_TAG = 0;
48162
48162
 
48163
48163
  // gateway/boot-version.ts
@@ -49857,6 +49857,7 @@ var STREAM_THROTTLE_MS_OVERRIDE = (() => {
49857
49857
  return Number.isFinite(n) && n >= 0 ? n : undefined;
49858
49858
  })();
49859
49859
  var TURN_FLUSH_SAFETY_ENABLED = isTurnFlushSafetyEnabled();
49860
+ var ANSWER_STREAM_VISIBLE_ENABLED = process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === "1" || process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === "true";
49860
49861
  var progressDriver = null;
49861
49862
  var unpinProgressCardForChat = null;
49862
49863
  var getPinnedProgressCardMessageId = null;
@@ -50616,7 +50617,19 @@ async function executeReply(args) {
50616
50617
  const configParseMode = access.parseMode ?? "html";
50617
50618
  const format = args.format ?? configParseMode;
50618
50619
  const disableLinkPreview = args.disable_web_page_preview != null ? Boolean(args.disable_web_page_preview) : access.disableLinkPreview ?? true;
50619
- const disableNotification = args.disable_notification === true;
50620
+ let disableNotification = args.disable_notification === true;
50621
+ {
50622
+ const turn2 = currentTurn;
50623
+ if (turn2 != null && !disableNotification) {
50624
+ if (turn2.firstPingAt != null) {
50625
+ process.stderr.write(`telegram gateway: reply over-ping safety net \u2014 ` + `downgrading disable_notification:false \u2192 true ` + `(chat=${chat_id} thread=${args.message_thread_id ?? "-"} firstPingAt=${turn2.firstPingAt} sinceFirstPing_ms=${Date.now() - turn2.firstPingAt})
50626
+ `);
50627
+ disableNotification = true;
50628
+ } else {
50629
+ turn2.firstPingAt = Date.now();
50630
+ }
50631
+ }
50632
+ }
50620
50633
  const tg = access.telegraph;
50621
50634
  const tgThreshold = tg?.threshold ?? 3000;
50622
50635
  if (tg?.enabled && files.length === 0 && text.length > tgThreshold) {
@@ -51765,6 +51778,7 @@ function handleSessionEvent(ev) {
51765
51778
  gatewayReceiveAt: startedAt,
51766
51779
  replyCalled: false,
51767
51780
  finalAnswerDelivered: false,
51781
+ firstPingAt: null,
51768
51782
  capturedText: [],
51769
51783
  orphanedReplyTimeoutId: null,
51770
51784
  registryKey: null,
@@ -51854,7 +51868,7 @@ function handleSessionEvent(ev) {
51854
51868
  chatId: turn.sessionChatId,
51855
51869
  isPrivateChat: turn.isDm,
51856
51870
  threadId: turn.sessionThreadId,
51857
- sendMessageDraft: sendMessageDraftFn,
51871
+ ...ANSWER_STREAM_VISIBLE_ENABLED ? { minInitialChars: 1 } : { sendMessageDraft: sendMessageDraftFn },
51858
51872
  sendMessage: async (chatId, text, params) => {
51859
51873
  const tid = params?.message_thread_id;
51860
51874
  const msg = await robustApiCall(() => bot.api.sendMessage(chatId, text, {
@@ -51976,20 +51990,45 @@ function handleSessionEvent(ev) {
51976
51990
  turn.orphanedReplyTimeoutId = null;
51977
51991
  }
51978
51992
  preambleSuppressor.flushNow();
51993
+ let streamFinalizedAsAnswer = false;
51979
51994
  if (turn?.answerStream != null) {
51980
51995
  const stream = turn.answerStream;
51981
- turn.answerStream = null;
51982
- stream.retract().catch((err) => {
51983
- process.stderr.write(`telegram gateway: answer-stream retract failed: ${err instanceof Error ? err.message : String(err)}
51996
+ const streamedMsgId = stream.messageId();
51997
+ const streamedFinalText = turn.capturedText.join("").trim();
51998
+ if (ANSWER_STREAM_VISIBLE_ENABLED && !turn.replyCalled && streamedMsgId != null && streamedFinalText.length > 0) {
51999
+ turn.answerStream = null;
52000
+ stream.stop();
52001
+ streamFinalizedAsAnswer = true;
52002
+ turn.finalAnswerDelivered = true;
52003
+ try {
52004
+ outboundDedup.record(turn.sessionChatId, turn.sessionThreadId, streamedFinalText, Date.now());
52005
+ } catch {}
52006
+ if (HISTORY_ENABLED) {
52007
+ try {
52008
+ recordOutbound({
52009
+ chat_id: turn.sessionChatId,
52010
+ thread_id: turn.sessionThreadId ?? null,
52011
+ message_ids: [streamedMsgId],
52012
+ texts: [streamedFinalText]
52013
+ });
52014
+ } catch {}
52015
+ }
52016
+ process.stderr.write(`telegram gateway: answer-stream finalized as answer chat=${turn.sessionChatId} msg=${streamedMsgId} chars=${streamedFinalText.length}
51984
52017
  `);
51985
- });
52018
+ } else {
52019
+ turn.answerStream = null;
52020
+ stream.retract().catch((err) => {
52021
+ process.stderr.write(`telegram gateway: answer-stream retract failed: ${err instanceof Error ? err.message : String(err)}
52022
+ `);
52023
+ });
52024
+ }
51986
52025
  }
51987
52026
  if (turn == null)
51988
52027
  return;
51989
52028
  const chatId = turn.sessionChatId;
51990
52029
  const threadId = turn.sessionThreadId;
51991
52030
  const ctrl = activeStatusReactions.get(statusKey(chatId, threadId));
51992
- const flushDecision = decideTurnFlush({
52031
+ const flushDecision = streamFinalizedAsAnswer ? { kind: "skip", reason: "reply-called" } : decideTurnFlush({
51993
52032
  chatId: turn.sessionChatId,
51994
52033
  replyCalled: turn.replyCalled,
51995
52034
  capturedText: turn.capturedText,
@@ -1206,6 +1206,17 @@ type CurrentTurn = {
1206
1206
  // even though `replyCalled` is true — the #1664 case where the real answer
1207
1207
  // ended up as plain transcript text rendered into an ephemeral draft.
1208
1208
  finalAnswerDelivered: boolean
1209
+ // #1675 (over-ping safety net): wall-clock ms of the first reply
1210
+ // this turn that landed with `disable_notification: false` (a real
1211
+ // device ping). The conversational-pacing contract
1212
+ // (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
1213
+ // ping per turn — the final answer. When the model violates that
1214
+ // (sends a substantive answer pinged + a wrap-up "Delivered…" or
1215
+ // meta-narration also pinged), subsequent reply calls with
1216
+ // `disable_notification: false` are auto-downgraded to silent by
1217
+ // the framework. Null until the first ping lands. Reset on every
1218
+ // fresh-turn enqueue.
1219
+ firstPingAt: number | null
1209
1220
  capturedText: string[]
1210
1221
  orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
1211
1222
  registryKey: string | null
@@ -2855,6 +2866,42 @@ const STREAM_THROTTLE_MS_OVERRIDE: number | undefined = (() => {
2855
2866
  return Number.isFinite(n) && n >= 0 ? n : undefined
2856
2867
  })()
2857
2868
  const TURN_FLUSH_SAFETY_ENABLED = isTurnFlushSafetyEnabled()
2869
+
2870
+ // #869-Phase1 / openclaw-pattern. When SWITCHROOM_VISIBLE_ANSWER_STREAM is '1' or 'true', the answer-lane stream
2871
+ // (telegram-plugin/answer-stream.ts) renders the model's transcript
2872
+ // text as a USER-VISIBLE edit-in-place message instead of writing to
2873
+ // Telegram's invisible compose-box draft (which is the default and
2874
+ // supports the #1664 "retract + re-prompt" contract). With this flag
2875
+ // on:
2876
+ // 1. createAnswerStream is instantiated without `sendMessageDraft`,
2877
+ // so it falls back to `sendMessage` + `editMessageText` for a
2878
+ // real chat-timeline message (`answer-stream.ts:212-214`).
2879
+ // 2. minInitialChars is set to 1 — the first text chunk pushes a
2880
+ // visible message immediately (TTFO under 5s for short turns).
2881
+ // 3. At turn_end, if the model never called reply / stream_reply
2882
+ // AND the streamed message has substantive captured text, the
2883
+ // gateway DOES NOT retract (which would delete a user-visible
2884
+ // message the user has been reading live); it calls
2885
+ // `stream.stop()` to freeze the current text as the final
2886
+ // answer, records the message in dedup + history, and marks
2887
+ // `turn.finalAnswerDelivered = true` so the #1664 silent-end
2888
+ // re-prompt does not fire. Turn-flush is suppressed for this
2889
+ // branch — its job (deliver captured text) is structurally
2890
+ // already done by the visible stream.
2891
+ // 4. The reply-tool / stream_reply path is unchanged — when the
2892
+ // model uses an explicit reply tool the prior streamed message
2893
+ // is retracted (delete) and the reply takes over as before.
2894
+ // Trade-off: a stream-as-final-answer turn does NOT push a device
2895
+ // notification (Telegram does not notify on edits, and we choose
2896
+ // not to send a duplicate fresh message for the ping). For short
2897
+ // turns where the user is actively watching, this is the right
2898
+ // shape — they see the answer materialise live. For longer waits,
2899
+ // the cross-turn pending-progress system (#1445/#1669) is the
2900
+ // canonical surface and DOES ping at the appropriate boundaries.
2901
+ // Default OFF; flip per-agent via env to canary the new behaviour.
2902
+ const ANSWER_STREAM_VISIBLE_ENABLED =
2903
+ process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === '1'
2904
+ || process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === 'true'
2858
2905
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
2859
2906
  const progressDriver: any = null
2860
2907
  const unpinProgressCardForChat: ((chatId: string, threadId: number | undefined) => void) | null = null
@@ -4172,7 +4219,43 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
4172
4219
  // so only the final answer pings the device. Default false (pings) so
4173
4220
  // existing call-sites and the typical "final answer" reply keep their
4174
4221
  // current behaviour without an explicit flag.
4175
- const disableNotification = args.disable_notification === true
4222
+ let disableNotification = args.disable_notification === true
4223
+
4224
+ // #1675 over-ping safety net. The conversational-pacing contract
4225
+ // (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
4226
+ // device ping per turn — the final answer. The model sometimes
4227
+ // violates this by sending a substantive answer pinged + a wrap-up
4228
+ // ("Delivered all three steps…", "Sent.", or meta-narration) ALSO
4229
+ // pinged. Both messages then fire notifications. The fleet UAT on
4230
+ // 2026-05-23 reproduced this (Step 3 + Delivered both pinged, two
4231
+ // beeps for a turn that should have produced one). Framework owns
4232
+ // the safety net: once the turn has emitted ONE pinged reply, every
4233
+ // subsequent reply call in the same turn auto-downgrades to silent
4234
+ // (disable_notification: true). Model intent ("I want this loud")
4235
+ // is honoured for the first ping; subsequent pings are demoted with
4236
+ // a stderr log so operators can see the safety net engage.
4237
+ //
4238
+ // The slot is claimed BEFORE the actual send to keep the logic
4239
+ // sequential — a send that fails part-way leaves firstPingAt set
4240
+ // and subsequent pings would be silenced. Acceptable trade-off (a
4241
+ // failed first ping is an edge case; the alternative — claim after
4242
+ // send — races concurrent reply calls).
4243
+ {
4244
+ const turn = currentTurn
4245
+ if (turn != null && !disableNotification) {
4246
+ if (turn.firstPingAt != null) {
4247
+ process.stderr.write(
4248
+ `telegram gateway: reply over-ping safety net — ` +
4249
+ `downgrading disable_notification:false → true ` +
4250
+ `(chat=${chat_id} thread=${args.message_thread_id ?? '-'} ` +
4251
+ `firstPingAt=${turn.firstPingAt} sinceFirstPing_ms=${Date.now() - turn.firstPingAt})\n`,
4252
+ )
4253
+ disableNotification = true
4254
+ } else {
4255
+ turn.firstPingAt = Date.now()
4256
+ }
4257
+ }
4258
+ }
4176
4259
 
4177
4260
  // Telegraph publish (#579). When the reply text is long enough AND
4178
4261
  // the agent has telegraph enabled in access.json, publish to
@@ -5841,6 +5924,7 @@ function handleSessionEvent(ev: SessionEvent): void {
5841
5924
  gatewayReceiveAt: startedAt,
5842
5925
  replyCalled: false,
5843
5926
  finalAnswerDelivered: false,
5927
+ firstPingAt: null,
5844
5928
  capturedText: [],
5845
5929
  orphanedReplyTimeoutId: null,
5846
5930
  registryKey: null,
@@ -5986,7 +6070,13 @@ function handleSessionEvent(ev: SessionEvent): void {
5986
6070
  chatId: turn.sessionChatId,
5987
6071
  isPrivateChat: turn.isDm,
5988
6072
  threadId: turn.sessionThreadId,
5989
- sendMessageDraft: sendMessageDraftFn,
6073
+ // #869-Phase1 visible-answer-stream: omit the draft API so
6074
+ // the lane uses the real sendMessage / editMessageText path
6075
+ // and edits a user-visible chat-timeline message instead
6076
+ // of the invisible compose-box draft.
6077
+ ...(ANSWER_STREAM_VISIBLE_ENABLED
6078
+ ? { minInitialChars: 1 }
6079
+ : { sendMessageDraft: sendMessageDraftFn }),
5990
6080
  // #1075: route through robustApiCall so flood-wait,
5991
6081
  // benign-400, and THREAD_NOT_FOUND are handled uniformly
5992
6082
  // instead of crashing the answer-stream loop on a deleted
@@ -6189,20 +6279,71 @@ function handleSessionEvent(ev: SessionEvent): void {
6189
6279
  // (regression for short no-tool replies). Order matters here: this
6190
6280
  // call must come before the retract/null block.
6191
6281
  preambleSuppressor.flushNow()
6192
- // #656: always retract the answer-lane stream at turn_end. Turn-flush
6193
- // (gateway.ts ~3475) is the sole canonical emitter for no-reply turns —
6194
- // it runs markdownToHtml and records to outboundDedup. Materializing
6195
- // here would race turn-flush and post raw model text (no HTML conv).
6282
+ // #656: by default we ALWAYS retract the answer-lane stream at
6283
+ // turn_end. Turn-flush is the canonical emitter for no-reply
6284
+ // turns; materialising here would race it and post raw model
6285
+ // text (no HTML conv).
6286
+ //
6287
+ // #869-Phase1 override: when `ANSWER_STREAM_VISIBLE_ENABLED` is
6288
+ // on, the stream is rendering a USER-VISIBLE message in the
6289
+ // chat timeline. Retracting (delete) destroys content the user
6290
+ // has been reading live — the worst possible UX flicker. So
6291
+ // when the stream is the de-facto final answer (model never
6292
+ // called reply, captured text is substantive) we instead call
6293
+ // `stream.stop()` to freeze it as the final state, record the
6294
+ // outbound for history + dedup, mark the turn answered, and
6295
+ // suppress the turn-flush IIFE downstream.
6296
+ let streamFinalizedAsAnswer = false
6196
6297
  if (turn?.answerStream != null) {
6197
6298
  const stream = turn.answerStream
6198
- turn.answerStream = null
6199
- void stream.retract().catch((err) => {
6299
+ const streamedMsgId = stream.messageId()
6300
+ const streamedFinalText = turn.capturedText.join('').trim()
6301
+ if (
6302
+ ANSWER_STREAM_VISIBLE_ENABLED
6303
+ && !turn.replyCalled
6304
+ && streamedMsgId != null
6305
+ && streamedFinalText.length > 0
6306
+ ) {
6307
+ turn.answerStream = null
6308
+ stream.stop()
6309
+ streamFinalizedAsAnswer = true
6310
+ turn.finalAnswerDelivered = true
6311
+ // Record as canonical outbound so retries dedup against it
6312
+ // and the SQLite history can surface it. Mirrors the
6313
+ // hooks that turn-flush and reply both run.
6314
+ try {
6315
+ outboundDedup.record(
6316
+ turn.sessionChatId,
6317
+ turn.sessionThreadId,
6318
+ streamedFinalText,
6319
+ Date.now(),
6320
+ )
6321
+ } catch { /* best-effort */ }
6322
+ if (HISTORY_ENABLED) {
6323
+ try {
6324
+ recordOutbound({
6325
+ chat_id: turn.sessionChatId,
6326
+ thread_id: turn.sessionThreadId ?? null,
6327
+ message_ids: [streamedMsgId],
6328
+ texts: [streamedFinalText],
6329
+ })
6330
+ } catch { /* best-effort */ }
6331
+ }
6200
6332
  process.stderr.write(
6201
- `telegram gateway: answer-stream retract failed: ${
6202
- err instanceof Error ? err.message : String(err)
6203
- }\n`,
6333
+ `telegram gateway: answer-stream finalized as answer ` +
6334
+ `chat=${turn.sessionChatId} msg=${streamedMsgId} ` +
6335
+ `chars=${streamedFinalText.length}\n`,
6204
6336
  )
6205
- })
6337
+ } else {
6338
+ turn.answerStream = null
6339
+ void stream.retract().catch((err) => {
6340
+ process.stderr.write(
6341
+ `telegram gateway: answer-stream retract failed: ${
6342
+ err instanceof Error ? err.message : String(err)
6343
+ }\n`,
6344
+ )
6345
+ })
6346
+ }
6206
6347
  }
6207
6348
  if (turn == null) return
6208
6349
  const chatId = turn.sessionChatId
@@ -6214,12 +6355,19 @@ function handleSessionEvent(ev: SessionEvent): void {
6214
6355
  // surface to recover from. The decideTurnFlush 'empty-text'
6215
6356
  // path now relies on capturedText alone.
6216
6357
 
6217
- const flushDecision = decideTurnFlush({
6218
- chatId: turn.sessionChatId,
6219
- replyCalled: turn.replyCalled,
6220
- capturedText: turn.capturedText,
6221
- flushEnabled: TURN_FLUSH_SAFETY_ENABLED,
6222
- })
6358
+ // #869-Phase1: when the answer-stream finalised as the answer
6359
+ // above, skip the turn-flush IIFE entirely — its job (deliver
6360
+ // captured text) is already done by the visible stream, and
6361
+ // running it would race a duplicate fresh-sendMessage against
6362
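+ // the user-visible edited message. Note: the 'reply-called' reason is reused here although no reply tool ran, which also keeps the "turn-flush skipped" log below from firing.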
6363
+ const flushDecision = streamFinalizedAsAnswer
6364
+ ? ({ kind: 'skip', reason: 'reply-called' } as ReturnType<typeof decideTurnFlush>)
6365
+ : decideTurnFlush({
6366
+ chatId: turn.sessionChatId,
6367
+ replyCalled: turn.replyCalled,
6368
+ capturedText: turn.capturedText,
6369
+ flushEnabled: TURN_FLUSH_SAFETY_ENABLED,
6370
+ })
6223
6371
  if (flushDecision.kind === 'skip' && flushDecision.reason !== 'reply-called') {
6224
6372
  process.stderr.write(
6225
6373
  `telegram gateway: turn-flush skipped — reason=${flushDecision.reason}\n`,
@@ -57,17 +57,19 @@ import type { ObservedMessage } from "../driver.js";
57
57
 
58
58
  const SLEEP_SECONDS = 350;
59
59
 
60
+ // Engineered to elicit the natural production pattern: the model
61
+ // sends a quick ack reply ("on it — background sleep running"),
62
+ // dispatches the sleep as a background Bash, ends its turn, then
63
+ // returns with "done" once the sleep completes. The framework
64
+ // fix-under-test owns the in-between ambient progress surface.
60
65
  const PROMPT =
61
- `This is an instrumented stress test of cross-turn pending-async ` +
62
- `progress. Please run exactly this command via the Bash tool, and ` +
63
- `ONLY this command, as a SINGLE call with run_in_background=true ` +
64
- `(do not break it up, do not send any further reply until it ` +
65
- `completes):\n\n` +
66
- "```bash\n" +
67
- `sleep ${SLEEP_SECONDS}\n` +
68
- "```\n\n" +
69
- `After the bash command returns, send exactly the single word ` +
70
- `"done" as your final reply.`;
66
+ `Please run \`sleep ${SLEEP_SECONDS}\` in the background using the ` +
67
+ `Bash tool with \`run_in_background: true\` this is a stress ` +
68
+ `test of the cross-turn ambient progress surface, so the sleep ` +
69
+ `duration matters. Send a brief one-line acknowledgement that ` +
70
+ `you've dispatched it (your natural beat-1 ack is fine), then ` +
71
+ `wait for it to complete. When it finishes, reply with exactly ` +
72
+ `the single word "done".`;
71
73
 
72
74
  const OVERALL_DEADLINE_MS = (SLEEP_SECONDS + 240) * 1000;
73
75
 
@@ -0,0 +1,206 @@
1
+ /**
2
+ * Conversational pacing UAT — measures the END-TO-END user-perceived
3
+ * turn UX on a multi-step prompt.
4
+ *
5
+ * Original framing was "validate the visible-answer-stream path
6
+ * activates." Live research on test-harness with the
7
+ * `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` flag showed that modern Claude
8
+ * 2.1.x on this fleet does NOT emit transcript text events between
9
+ * tool calls — it consistently calls the `reply` MCP tool directly
10
+ * for every user-visible chunk (beat 1 ack, then per-step beat 3
11
+ * updates). So the visible-answer-stream code path (which renders
12
+ * `text` session events into a chat-timeline message) doesn't
13
+ * activate; the answer-stream lane stays idle while the model uses
14
+ * `reply` calls instead.
15
+ *
16
+ * That's actually FINE — the model is correctly following the
17
+ * five-beat conversational-pacing contract
18
+ * (`reference/conversational-pacing.md`): one silent ack at the start, silent updates per step,
19
+ * one pinged final answer. This UAT now validates THAT — the pacing
20
+ * the user actually experiences — rather than the answer-stream code
21
+ * path specifically.
22
+ *
23
+ * The flag `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is still set on
24
+ * test-harness for ongoing observation; if a future model version
25
+ * starts emitting transcript text, the lane will surface it visibly
26
+ * instead of writing to the invisible compose-box draft (the prior
27
+ * default).
28
+ *
29
+ * ## What this asserts
30
+ *
31
+ * 1. First user-visible bot message lands within `TTFO_BUDGET_MS`
32
+ * (default 15 s) of the inbound — covers beat 1 ack OR straight-
33
+ * to-content depending on the model's pacing choice.
34
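+ * 2. At least two bot message observations land per turn for the multi-step
35
+ * prompt — evidence the model isn't collapsing everything into a single pinged
36
+ * dump. The check counts trail entries (fresh sends and any edits the driver reports), not distinct message ids.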
37
+ * 3. All but at most one message is silent (`disable_notification:
38
+ * true`). Only the final answer should ping — anything earlier
39
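+ * pinging is a beat-3 contract violation. (The check only bounds the number of pinged messages; it does not verify that the pinged one is the last.)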
40
+ *
41
+ * ## Wall-clock budget
42
+ *
43
+ * ~90 s.
44
+ */
45
+
46
+ import { describe, expect, it } from "vitest";
47
+ import { spinUp } from "../harness.js";
48
+ import type { ObservedMessage } from "../driver.js";
49
+
50
+ const TTFO_BUDGET_MS = 15_000;
51
+ const OVERALL_DEADLINE_MS = 90_000;
52
+ const QUIESCENCE_MS = 12_000;
53
+
54
+ // Multi-step investigation prompt — designed to make the model emit
55
+ // transcript text BETWEEN tool calls, which is the assistant-content
56
+ // `text` block shape session-tail surfaces via the `text` event the
57
+ // answer-stream lane consumes. With the visible-answer-stream flag
58
+ // ON, those text events should become user-visible edit-in-place
59
+ // chat-timeline updates.
60
+ //
61
+ // We chose a research-style task expecting that pattern to emit
62
+ // `text` chunks (the model thinks out loud between Read / Bash
63
+ // steps). A pure-answer prompt (the previous version of this
64
+ // scenario) tended to make modern Claude jump straight to a single
65
+ // `reply` tool-call with no intermediate text — exercising the wrong
66
+ // path. As the file header notes, the model still uses `reply` for every chunk, so the assertions below check pacing.
67
+ const PROMPT =
68
+ `Investigate this step by step:\n\n` +
69
+ `1. Read \`/etc/hostname\` and tell me what host this is — write a ` +
70
+ `sentence about it.\n` +
71
+ `2. Then read \`/etc/os-release\` and tell me what OS family / version.\n` +
72
+ `3. Then read \`/proc/cpuinfo\` (head it), and tell me the CPU model + ` +
73
+ `core count.\n` +
74
+ `4. Wrap up with a one-line summary of all three.\n\n` +
75
+ `Between each step, narrate what you're finding in plain prose ` +
76
+ `(not just bullet outputs). Don't batch all your observations into ` +
77
+ `one final reply — talk as you investigate.`;
78
+
79
+ interface TrailEntry {
80
+ relMs: number;
81
+ kind: "fresh" | "edit";
82
+ silent: boolean;
83
+ messageId: number;
84
+ textPreview: string;
85
+ textLength: number;
86
+ }
87
+
88
+ function pad(s: string, n: number): string {
89
+ return s.length >= n ? s : s + " ".repeat(n - s.length);
90
+ }
91
+
92
+ describe("uat: conversational pacing on a multi-step turn", () => {
93
+ it(
94
+ "first message lands within TTFO_BUDGET_MS; multiple silent messages; final answer pings",
95
+ async () => {
96
+ const sc = await spinUp({ agent: "test-harness" });
97
+ try {
98
+ const startedAt = Date.now();
99
+ await sc.sendDM(PROMPT);
100
+ console.log(`[visible-answer-stream] t=0 prompt sent`);
101
+
102
+ const trail: TrailEntry[] = [];
103
+ let firstAnchorMsgId: number | null = null;
104
+ let quiescenceDeadline = startedAt + 30_000;
105
+ const overallDeadline = startedAt + OVERALL_DEADLINE_MS;
106
+
107
+ while (Date.now() < overallDeadline) {
108
+ const remaining = Math.min(
109
+ quiescenceDeadline - Date.now(),
110
+ overallDeadline - Date.now(),
111
+ );
112
+ if (remaining <= 0) break;
113
+ try {
114
+ const msg = await sc.expectMessage(
115
+ (m: ObservedMessage) => m.fromBot,
116
+ { from: "bot", timeout: remaining },
117
+ );
118
+ const rel = Date.now() - startedAt;
119
+ const entry: TrailEntry = {
120
+ relMs: rel,
121
+ kind: msg.edited ? "edit" : "fresh",
122
+ silent: msg.silent,
123
+ messageId: msg.messageId,
124
+ textPreview: msg.text
125
+ .slice(0, 120)
126
+ .replace(/\n/g, " ⏎ "),
127
+ textLength: msg.text.length,
128
+ };
129
+ trail.push(entry);
130
+ if (firstAnchorMsgId == null && entry.kind === "fresh") {
131
+ firstAnchorMsgId = entry.messageId;
132
+ }
133
+ console.log(
134
+ `[visible-answer-stream] +${(rel / 1000).toFixed(1)}s ` +
135
+ `${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
136
+ `silent=${entry.silent} len=${entry.textLength} ` +
137
+ `text=${JSON.stringify(entry.textPreview)}`,
138
+ );
139
+ quiescenceDeadline = Date.now() + QUIESCENCE_MS;
140
+ } catch {
141
+ break;
142
+ }
143
+ }
144
+
145
+ console.log("\n========== VISIBLE-ANSWER-STREAM TRAIL ==========");
146
+ console.log(`total bot messages observed: ${trail.length}`);
147
+ console.log(`first anchor messageId: ${firstAnchorMsgId}`);
148
+ console.log("");
149
+ console.log(" rel(s) kind silent msg len text");
150
+ console.log(" ------- ----- ------ ----------- ---- ----");
151
+ for (const e of trail) {
152
+ console.log(
153
+ ` ${pad((e.relMs / 1000).toFixed(1) + "s", 8)} ` +
154
+ `${pad(e.kind, 6)} ${pad(String(e.silent), 7)} ` +
155
+ `${pad(String(e.messageId), 12)} ${pad(String(e.textLength), 5)} ` +
156
+ `${e.textPreview}`,
157
+ );
158
+ }
159
+ console.log("=================================================\n");
160
+
161
+ // ── Pacing assertions ─────────────────────────────────────
162
+
163
+ // (1) at least one bot message landed
164
+ expect(
165
+ trail.length,
166
+ `no bot replies observed — the agent isn't responding.`,
167
+ ).toBeGreaterThanOrEqual(1);
168
+
169
+ // (2) first message landed within TTFO budget
170
+ const ttfoMs = trail[0].relMs;
171
+ expect(
172
+ ttfoMs,
173
+ `TTFO ${ttfoMs}ms exceeded the budget of ${TTFO_BUDGET_MS}ms.`,
174
+ ).toBeLessThanOrEqual(TTFO_BUDGET_MS);
175
+
176
+ // (3) multiple messages landed — proves the model is pacing,
177
+ // not dumping a single big reply
178
+ expect(
179
+ trail.length,
180
+ `only ${trail.length} message(s) observed — the model ` +
181
+ `collapsed this multi-step prompt into a single dump. ` +
182
+ `Beat 3 pacing (per-step updates) requires multiple ` +
183
+ `messages. Either the model didn't follow the prompt ` +
184
+ `or quiescence bailed early.`,
185
+ ).toBeGreaterThanOrEqual(2);
186
+
187
+ // (4) at most one message pinged the user — beat-3 contract
188
+ // says only the FINAL answer pings; mid-turn updates pass
189
+ // `disable_notification: true`.
190
+ const pingedMessages = trail.filter((e) => !e.silent);
191
+ expect(
192
+ pingedMessages.length,
193
+ `${pingedMessages.length} message(s) pinged the device — ` +
194
+ `the conversational-pacing contract allows AT MOST 1 ` +
195
+ `(the final answer). Mid-turn updates must be silent. ` +
196
+ `Pinged messages at: ${pingedMessages
197
+ .map((m) => `+${(m.relMs / 1000).toFixed(0)}s`)
198
+ .join(", ")}`,
199
+ ).toBeLessThanOrEqual(1);
200
+ } finally {
201
+ await sc.tearDown();
202
+ }
203
+ },
204
+ OVERALL_DEADLINE_MS + 30_000,
205
+ );
206
+ });