switchroom 0.13.14 → 0.13.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +50 -11
- package/telegram-plugin/gateway/gateway.ts +166 -18
- package/telegram-plugin/uat/scenarios/cross-turn-pending-progress-dm.test.ts +12 -10
- package/telegram-plugin/uat/scenarios/visible-answer-stream-dm.test.ts +206 -0
package/dist/cli/switchroom.js
CHANGED
|
@@ -47331,8 +47331,8 @@ var {
|
|
|
47331
47331
|
} = import__.default;
|
|
47332
47332
|
|
|
47333
47333
|
// src/build-info.ts
|
|
47334
|
-
var VERSION = "0.13.14";
|
|
47335
|
-
var COMMIT_SHA = "
|
|
47334
|
+
var VERSION = "0.13.16";
|
|
47335
|
+
var COMMIT_SHA = "6c71b36b";
|
|
47336
47336
|
|
|
47337
47337
|
// src/cli/agent.ts
|
|
47338
47338
|
init_source();
|
package/package.json
CHANGED
package/telegram-plugin/dist/gateway/gateway.js
CHANGED
|
@@ -48154,10 +48154,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
|
|
|
48154
48154
|
}
|
|
48155
48155
|
|
|
48156
48156
|
// ../src/build-info.ts
|
|
48157
|
-
var VERSION = "0.13.14";
|
|
48158
|
-
var COMMIT_SHA = "
|
|
48159
|
-
var COMMIT_DATE = "2026-05-
|
|
48160
|
-
var LATEST_PR =
|
|
48157
|
+
var VERSION = "0.13.16";
|
|
48158
|
+
var COMMIT_SHA = "6c71b36b";
|
|
48159
|
+
var COMMIT_DATE = "2026-05-23T03:56:34Z";
|
|
48160
|
+
var LATEST_PR = 1675;
|
|
48161
48161
|
var COMMITS_AHEAD_OF_TAG = 0;
|
|
48162
48162
|
|
|
48163
48163
|
// gateway/boot-version.ts
|
|
@@ -49857,6 +49857,7 @@ var STREAM_THROTTLE_MS_OVERRIDE = (() => {
|
|
|
49857
49857
|
return Number.isFinite(n) && n >= 0 ? n : undefined;
|
|
49858
49858
|
})();
|
|
49859
49859
|
var TURN_FLUSH_SAFETY_ENABLED = isTurnFlushSafetyEnabled();
|
|
49860
|
+
var ANSWER_STREAM_VISIBLE_ENABLED = process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === "1" || process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === "true";
|
|
49860
49861
|
var progressDriver = null;
|
|
49861
49862
|
var unpinProgressCardForChat = null;
|
|
49862
49863
|
var getPinnedProgressCardMessageId = null;
|
|
@@ -50616,7 +50617,19 @@ async function executeReply(args) {
|
|
|
50616
50617
|
const configParseMode = access.parseMode ?? "html";
|
|
50617
50618
|
const format = args.format ?? configParseMode;
|
|
50618
50619
|
const disableLinkPreview = args.disable_web_page_preview != null ? Boolean(args.disable_web_page_preview) : access.disableLinkPreview ?? true;
|
|
50619
|
-
|
|
50620
|
+
let disableNotification = args.disable_notification === true;
|
|
50621
|
+
{
|
|
50622
|
+
const turn2 = currentTurn;
|
|
50623
|
+
if (turn2 != null && !disableNotification) {
|
|
50624
|
+
if (turn2.firstPingAt != null) {
|
|
50625
|
+
process.stderr.write(`telegram gateway: reply over-ping safety net \u2014 ` + `downgrading disable_notification:false \u2192 true ` + `(chat=${chat_id} thread=${args.message_thread_id ?? "-"} firstPingAt=${turn2.firstPingAt} sinceFirstPing_ms=${Date.now() - turn2.firstPingAt})
|
|
50626
|
+
`);
|
|
50627
|
+
disableNotification = true;
|
|
50628
|
+
} else {
|
|
50629
|
+
turn2.firstPingAt = Date.now();
|
|
50630
|
+
}
|
|
50631
|
+
}
|
|
50632
|
+
}
|
|
50620
50633
|
const tg = access.telegraph;
|
|
50621
50634
|
const tgThreshold = tg?.threshold ?? 3000;
|
|
50622
50635
|
if (tg?.enabled && files.length === 0 && text.length > tgThreshold) {
|
|
@@ -51765,6 +51778,7 @@ function handleSessionEvent(ev) {
|
|
|
51765
51778
|
gatewayReceiveAt: startedAt,
|
|
51766
51779
|
replyCalled: false,
|
|
51767
51780
|
finalAnswerDelivered: false,
|
|
51781
|
+
firstPingAt: null,
|
|
51768
51782
|
capturedText: [],
|
|
51769
51783
|
orphanedReplyTimeoutId: null,
|
|
51770
51784
|
registryKey: null,
|
|
@@ -51854,7 +51868,7 @@ function handleSessionEvent(ev) {
|
|
|
51854
51868
|
chatId: turn.sessionChatId,
|
|
51855
51869
|
isPrivateChat: turn.isDm,
|
|
51856
51870
|
threadId: turn.sessionThreadId,
|
|
51857
|
-
sendMessageDraft: sendMessageDraftFn,
|
|
51871
|
+
...ANSWER_STREAM_VISIBLE_ENABLED ? { minInitialChars: 1 } : { sendMessageDraft: sendMessageDraftFn },
|
|
51858
51872
|
sendMessage: async (chatId, text, params) => {
|
|
51859
51873
|
const tid = params?.message_thread_id;
|
|
51860
51874
|
const msg = await robustApiCall(() => bot.api.sendMessage(chatId, text, {
|
|
@@ -51976,20 +51990,45 @@ function handleSessionEvent(ev) {
|
|
|
51976
51990
|
turn.orphanedReplyTimeoutId = null;
|
|
51977
51991
|
}
|
|
51978
51992
|
preambleSuppressor.flushNow();
|
|
51993
|
+
let streamFinalizedAsAnswer = false;
|
|
51979
51994
|
if (turn?.answerStream != null) {
|
|
51980
51995
|
const stream = turn.answerStream;
|
|
51981
|
-
|
|
51982
|
-
|
|
51983
|
-
|
|
51996
|
+
const streamedMsgId = stream.messageId();
|
|
51997
|
+
const streamedFinalText = turn.capturedText.join("").trim();
|
|
51998
|
+
if (ANSWER_STREAM_VISIBLE_ENABLED && !turn.replyCalled && streamedMsgId != null && streamedFinalText.length > 0) {
|
|
51999
|
+
turn.answerStream = null;
|
|
52000
|
+
stream.stop();
|
|
52001
|
+
streamFinalizedAsAnswer = true;
|
|
52002
|
+
turn.finalAnswerDelivered = true;
|
|
52003
|
+
try {
|
|
52004
|
+
outboundDedup.record(turn.sessionChatId, turn.sessionThreadId, streamedFinalText, Date.now());
|
|
52005
|
+
} catch {}
|
|
52006
|
+
if (HISTORY_ENABLED) {
|
|
52007
|
+
try {
|
|
52008
|
+
recordOutbound({
|
|
52009
|
+
chat_id: turn.sessionChatId,
|
|
52010
|
+
thread_id: turn.sessionThreadId ?? null,
|
|
52011
|
+
message_ids: [streamedMsgId],
|
|
52012
|
+
texts: [streamedFinalText]
|
|
52013
|
+
});
|
|
52014
|
+
} catch {}
|
|
52015
|
+
}
|
|
52016
|
+
process.stderr.write(`telegram gateway: answer-stream finalized as answer chat=${turn.sessionChatId} msg=${streamedMsgId} chars=${streamedFinalText.length}
|
|
51984
52017
|
`);
|
|
51985
|
-
}
|
|
52018
|
+
} else {
|
|
52019
|
+
turn.answerStream = null;
|
|
52020
|
+
stream.retract().catch((err) => {
|
|
52021
|
+
process.stderr.write(`telegram gateway: answer-stream retract failed: ${err instanceof Error ? err.message : String(err)}
|
|
52022
|
+
`);
|
|
52023
|
+
});
|
|
52024
|
+
}
|
|
51986
52025
|
}
|
|
51987
52026
|
if (turn == null)
|
|
51988
52027
|
return;
|
|
51989
52028
|
const chatId = turn.sessionChatId;
|
|
51990
52029
|
const threadId = turn.sessionThreadId;
|
|
51991
52030
|
const ctrl = activeStatusReactions.get(statusKey(chatId, threadId));
|
|
51992
|
-
const flushDecision = decideTurnFlush({
|
|
52031
|
+
const flushDecision = streamFinalizedAsAnswer ? { kind: "skip", reason: "reply-called" } : decideTurnFlush({
|
|
51993
52032
|
chatId: turn.sessionChatId,
|
|
51994
52033
|
replyCalled: turn.replyCalled,
|
|
51995
52034
|
capturedText: turn.capturedText,
|
|
package/telegram-plugin/gateway/gateway.ts
CHANGED
|
@@ -1206,6 +1206,17 @@ type CurrentTurn = {
|
|
|
1206
1206
|
// even though `replyCalled` is true — the #1664 case where the real answer
|
|
1207
1207
|
// ended up as plain transcript text rendered into an ephemeral draft.
|
|
1208
1208
|
finalAnswerDelivered: boolean
|
|
1209
|
+
// #1675 (over-ping safety net): wall-clock ms of the first reply
|
|
1210
|
+
// this turn that landed with `disable_notification: false` (a real
|
|
1211
|
+
// device ping). The conversational-pacing contract
|
|
1212
|
+
// (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
|
|
1213
|
+
// ping per turn — the final answer. When the model violates that
|
|
1214
|
+
// (sends a substantive answer pinged + a wrap-up "Delivered…" or
|
|
1215
|
+
// meta-narration also pinged), subsequent reply calls with
|
|
1216
|
+
// `disable_notification: false` are auto-downgraded to silent by
|
|
1217
|
+
// the framework. Null until the first ping lands. Reset on every
|
|
1218
|
+
// fresh-turn enqueue.
|
|
1219
|
+
firstPingAt: number | null
|
|
1209
1220
|
capturedText: string[]
|
|
1210
1221
|
orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
|
|
1211
1222
|
registryKey: string | null
|
|
@@ -2855,6 +2866,42 @@ const STREAM_THROTTLE_MS_OVERRIDE: number | undefined = (() => {
|
|
|
2855
2866
|
return Number.isFinite(n) && n >= 0 ? n : undefined
|
|
2856
2867
|
})()
|
|
2857
2868
|
const TURN_FLUSH_SAFETY_ENABLED = isTurnFlushSafetyEnabled()
|
|
2869
|
+
|
|
2870
|
+
// #869-Phase1 / openclaw-pattern. When SET, the answer-lane stream
|
|
2871
|
+
// (telegram-plugin/answer-stream.ts) renders the model's transcript
|
|
2872
|
+
// text as a USER-VISIBLE edit-in-place message instead of writing to
|
|
2873
|
+
// Telegram's invisible compose-box draft (which is the default and
|
|
2874
|
+
// supports the #1664 "retract + re-prompt" contract). With this flag
|
|
2875
|
+
// on:
|
|
2876
|
+
// 1. createAnswerStream is instantiated without `sendMessageDraft`,
|
|
2877
|
+
// so it falls back to `sendMessage` + `editMessageText` for a
|
|
2878
|
+
// real chat-timeline message (`answer-stream.ts:212-214`).
|
|
2879
|
+
// 2. minInitialChars is set to 1 — the first text chunk pushes a
|
|
2880
|
+
// visible message immediately (TTFO under 5s for short turns).
|
|
2881
|
+
// 3. At turn_end, if the model never called reply / stream_reply
|
|
2882
|
+
// AND the streamed message has substantive captured text, the
|
|
2883
|
+
// gateway DOES NOT retract (which would delete a user-visible
|
|
2884
|
+
// message the user has been reading live); it calls
|
|
2885
|
+
// `stream.stop()` to freeze the current text as the final
|
|
2886
|
+
// answer, records the message in dedup + history, and marks
|
|
2887
|
+
// `turn.finalAnswerDelivered = true` so the #1664 silent-end
|
|
2888
|
+
// re-prompt does not fire. Turn-flush is suppressed for this
|
|
2889
|
+
// branch — its job (deliver captured text) is structurally
|
|
2890
|
+
// already done by the visible stream.
|
|
2891
|
+
// 4. The reply-tool / stream_reply path is unchanged — when the
|
|
2892
|
+
// model uses an explicit reply tool the prior streamed message
|
|
2893
|
+
// is retracted (delete) and the reply takes over as before.
|
|
2894
|
+
// Trade-off: a stream-as-final-answer turn does NOT push a device
|
|
2895
|
+
// notification (Telegram does not notify on edits, and we choose
|
|
2896
|
+
// not to send a duplicate fresh message for the ping). For short
|
|
2897
|
+
// turns where the user is actively watching, this is the right
|
|
2898
|
+
// shape — they see the answer materialise live. For longer waits,
|
|
2899
|
+
// the cross-turn pending-progress system (#1445/#1669) is the
|
|
2900
|
+
// canonical surface and DOES ping at the appropriate boundaries.
|
|
2901
|
+
// Default OFF; flip per-agent via env to canary the new behaviour.
|
|
2902
|
+
const ANSWER_STREAM_VISIBLE_ENABLED =
|
|
2903
|
+
process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === '1'
|
|
2904
|
+
|| process.env.SWITCHROOM_VISIBLE_ANSWER_STREAM === 'true'
|
|
2858
2905
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
2859
2906
|
const progressDriver: any = null
|
|
2860
2907
|
const unpinProgressCardForChat: ((chatId: string, threadId: number | undefined) => void) | null = null
|
|
@@ -4172,7 +4219,43 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
|
|
|
4172
4219
|
// so only the final answer pings the device. Default false (pings) so
|
|
4173
4220
|
// existing call-sites and the typical "final answer" reply keep their
|
|
4174
4221
|
// current behaviour without an explicit flag.
|
|
4175
|
-
|
|
4222
|
+
let disableNotification = args.disable_notification === true
|
|
4223
|
+
|
|
4224
|
+
// #1675 over-ping safety net. The conversational-pacing contract
|
|
4225
|
+
// (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
|
|
4226
|
+
// device ping per turn — the final answer. The model sometimes
|
|
4227
|
+
// violates this by sending a substantive answer pinged + a wrap-up
|
|
4228
|
+
// ("Delivered all three steps…", "Sent.", or meta-narration) ALSO
|
|
4229
|
+
// pinged. Both messages then fire notifications. The fleet UAT on
|
|
4230
|
+
// 2026-05-23 reproduced this (Step 3 + Delivered both pinged, two
|
|
4231
|
+
// beeps for a turn that should have produced one). Framework owns
|
|
4232
|
+
// the safety net: once the turn has emitted ONE pinged reply, every
|
|
4233
|
+
// subsequent reply call in the same turn auto-downgrades to silent
|
|
4234
|
+
// (disable_notification: true). Model intent ("I want this loud")
|
|
4235
|
+
// is honoured for the first ping; subsequent pings are demoted with
|
|
4236
|
+
// a stderr log so operators can see the safety net engage.
|
|
4237
|
+
//
|
|
4238
|
+
// The slot is claimed BEFORE the actual send to keep the logic
|
|
4239
|
+
// sequential — a send that fails part-way leaves firstPingAt set
|
|
4240
|
+
// and subsequent pings would be silenced. Acceptable trade-off (a
|
|
4241
|
+
// failed first ping is an edge case; the alternative — claim after
|
|
4242
|
+
// send — races concurrent reply calls).
|
|
4243
|
+
{
|
|
4244
|
+
const turn = currentTurn
|
|
4245
|
+
if (turn != null && !disableNotification) {
|
|
4246
|
+
if (turn.firstPingAt != null) {
|
|
4247
|
+
process.stderr.write(
|
|
4248
|
+
`telegram gateway: reply over-ping safety net — ` +
|
|
4249
|
+
`downgrading disable_notification:false → true ` +
|
|
4250
|
+
`(chat=${chat_id} thread=${args.message_thread_id ?? '-'} ` +
|
|
4251
|
+
`firstPingAt=${turn.firstPingAt} sinceFirstPing_ms=${Date.now() - turn.firstPingAt})\n`,
|
|
4252
|
+
)
|
|
4253
|
+
disableNotification = true
|
|
4254
|
+
} else {
|
|
4255
|
+
turn.firstPingAt = Date.now()
|
|
4256
|
+
}
|
|
4257
|
+
}
|
|
4258
|
+
}
|
|
4176
4259
|
|
|
4177
4260
|
// Telegraph publish (#579). When the reply text is long enough AND
|
|
4178
4261
|
// the agent has telegraph enabled in access.json, publish to
|
|
@@ -5841,6 +5924,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5841
5924
|
gatewayReceiveAt: startedAt,
|
|
5842
5925
|
replyCalled: false,
|
|
5843
5926
|
finalAnswerDelivered: false,
|
|
5927
|
+
firstPingAt: null,
|
|
5844
5928
|
capturedText: [],
|
|
5845
5929
|
orphanedReplyTimeoutId: null,
|
|
5846
5930
|
registryKey: null,
|
|
@@ -5986,7 +6070,13 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5986
6070
|
chatId: turn.sessionChatId,
|
|
5987
6071
|
isPrivateChat: turn.isDm,
|
|
5988
6072
|
threadId: turn.sessionThreadId,
|
|
5989
|
-
|
|
6073
|
+
// #869-Phase1 visible-answer-stream: omit the draft API so
|
|
6074
|
+
// the lane uses the real sendMessage / editMessageText path
|
|
6075
|
+
// and edits a user-visible chat-timeline message instead
|
|
6076
|
+
// of the invisible compose-box draft.
|
|
6077
|
+
...(ANSWER_STREAM_VISIBLE_ENABLED
|
|
6078
|
+
? { minInitialChars: 1 }
|
|
6079
|
+
: { sendMessageDraft: sendMessageDraftFn }),
|
|
5990
6080
|
// #1075: route through robustApiCall so flood-wait,
|
|
5991
6081
|
// benign-400, and THREAD_NOT_FOUND are handled uniformly
|
|
5992
6082
|
// instead of crashing the answer-stream loop on a deleted
|
|
@@ -6189,20 +6279,71 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
6189
6279
|
// (regression for short no-tool replies). Order matters here: this
|
|
6190
6280
|
// call must come before the retract/null block.
|
|
6191
6281
|
preambleSuppressor.flushNow()
|
|
6192
|
-
// #656:
|
|
6193
|
-
//
|
|
6194
|
-
//
|
|
6195
|
-
//
|
|
6282
|
+
// #656: by default we ALWAYS retract the answer-lane stream at
|
|
6283
|
+
// turn_end. Turn-flush is the canonical emitter for no-reply
|
|
6284
|
+
// turns; materialising here would race it and post raw model
|
|
6285
|
+
// text (no HTML conv).
|
|
6286
|
+
//
|
|
6287
|
+
// #869-Phase1 override: when `ANSWER_STREAM_VISIBLE_ENABLED` is
|
|
6288
|
+
// on, the stream is rendering a USER-VISIBLE message in the
|
|
6289
|
+
// chat timeline. Retracting (delete) destroys content the user
|
|
6290
|
+
// has been reading live — the worst possible UX flicker. So
|
|
6291
|
+
// when the stream is the de-facto final answer (model never
|
|
6292
|
+
// called reply, captured text is substantive) we instead call
|
|
6293
|
+
// `stream.stop()` to freeze it as the final state, record the
|
|
6294
|
+
// outbound for history + dedup, mark the turn answered, and
|
|
6295
|
+
// suppress the turn-flush IIFE downstream.
|
|
6296
|
+
let streamFinalizedAsAnswer = false
|
|
6196
6297
|
if (turn?.answerStream != null) {
|
|
6197
6298
|
const stream = turn.answerStream
|
|
6198
|
-
|
|
6199
|
-
|
|
6299
|
+
const streamedMsgId = stream.messageId()
|
|
6300
|
+
const streamedFinalText = turn.capturedText.join('').trim()
|
|
6301
|
+
if (
|
|
6302
|
+
ANSWER_STREAM_VISIBLE_ENABLED
|
|
6303
|
+
&& !turn.replyCalled
|
|
6304
|
+
&& streamedMsgId != null
|
|
6305
|
+
&& streamedFinalText.length > 0
|
|
6306
|
+
) {
|
|
6307
|
+
turn.answerStream = null
|
|
6308
|
+
stream.stop()
|
|
6309
|
+
streamFinalizedAsAnswer = true
|
|
6310
|
+
turn.finalAnswerDelivered = true
|
|
6311
|
+
// Record as canonical outbound so retries dedup against it
|
|
6312
|
+
// and the SQLite history can surface it. Mirrors the
|
|
6313
|
+
// hooks turn-flush + reply both run.
|
|
6314
|
+
try {
|
|
6315
|
+
outboundDedup.record(
|
|
6316
|
+
turn.sessionChatId,
|
|
6317
|
+
turn.sessionThreadId,
|
|
6318
|
+
streamedFinalText,
|
|
6319
|
+
Date.now(),
|
|
6320
|
+
)
|
|
6321
|
+
} catch { /* best-effort */ }
|
|
6322
|
+
if (HISTORY_ENABLED) {
|
|
6323
|
+
try {
|
|
6324
|
+
recordOutbound({
|
|
6325
|
+
chat_id: turn.sessionChatId,
|
|
6326
|
+
thread_id: turn.sessionThreadId ?? null,
|
|
6327
|
+
message_ids: [streamedMsgId],
|
|
6328
|
+
texts: [streamedFinalText],
|
|
6329
|
+
})
|
|
6330
|
+
} catch { /* best-effort */ }
|
|
6331
|
+
}
|
|
6200
6332
|
process.stderr.write(
|
|
6201
|
-
`telegram gateway: answer-stream
|
|
6202
|
-
|
|
6203
|
-
}\n`,
|
|
6333
|
+
`telegram gateway: answer-stream finalized as answer ` +
|
|
6334
|
+
`chat=${turn.sessionChatId} msg=${streamedMsgId} ` +
|
|
6335
|
+
`chars=${streamedFinalText.length}\n`,
|
|
6204
6336
|
)
|
|
6205
|
-
}
|
|
6337
|
+
} else {
|
|
6338
|
+
turn.answerStream = null
|
|
6339
|
+
void stream.retract().catch((err) => {
|
|
6340
|
+
process.stderr.write(
|
|
6341
|
+
`telegram gateway: answer-stream retract failed: ${
|
|
6342
|
+
err instanceof Error ? err.message : String(err)
|
|
6343
|
+
}\n`,
|
|
6344
|
+
)
|
|
6345
|
+
})
|
|
6346
|
+
}
|
|
6206
6347
|
}
|
|
6207
6348
|
if (turn == null) return
|
|
6208
6349
|
const chatId = turn.sessionChatId
|
|
@@ -6214,12 +6355,19 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
6214
6355
|
// surface to recover from. The decideTurnFlush 'empty-text'
|
|
6215
6356
|
// path now relies on capturedText alone.
|
|
6216
6357
|
|
|
6217
|
-
|
|
6218
|
-
|
|
6219
|
-
|
|
6220
|
-
|
|
6221
|
-
|
|
6222
|
-
|
|
6358
|
+
// #869-Phase1: when the answer-stream finalised as the answer
|
|
6359
|
+
// above, skip the turn-flush IIFE entirely — its job (deliver
|
|
6360
|
+
// captured text) is already done by the visible stream, and
|
|
6361
|
+
// running it would race a duplicate fresh-sendMessage against
|
|
6362
|
+
// the user-visible edited message.
|
|
6363
|
+
const flushDecision = streamFinalizedAsAnswer
|
|
6364
|
+
? ({ kind: 'skip', reason: 'reply-called' } as ReturnType<typeof decideTurnFlush>)
|
|
6365
|
+
: decideTurnFlush({
|
|
6366
|
+
chatId: turn.sessionChatId,
|
|
6367
|
+
replyCalled: turn.replyCalled,
|
|
6368
|
+
capturedText: turn.capturedText,
|
|
6369
|
+
flushEnabled: TURN_FLUSH_SAFETY_ENABLED,
|
|
6370
|
+
})
|
|
6223
6371
|
if (flushDecision.kind === 'skip' && flushDecision.reason !== 'reply-called') {
|
|
6224
6372
|
process.stderr.write(
|
|
6225
6373
|
`telegram gateway: turn-flush skipped — reason=${flushDecision.reason}\n`,
|
|
package/telegram-plugin/uat/scenarios/cross-turn-pending-progress-dm.test.ts
CHANGED
|
@@ -57,17 +57,19 @@ import type { ObservedMessage } from "../driver.js";
|
|
|
57
57
|
|
|
58
58
|
const SLEEP_SECONDS = 350;
|
|
59
59
|
|
|
60
|
+
// Engineered to elicit the natural production pattern: the model
|
|
61
|
+
// sends a quick ack reply ("on it — background sleep running"),
|
|
62
|
+
// dispatches the sleep as a background Bash, ends its turn, then
|
|
63
|
+
// returns with "done" once the sleep completes. The framework
|
|
64
|
+
// fix-under-test owns the in-between ambient.
|
|
60
65
|
const PROMPT =
|
|
61
|
-
`
|
|
62
|
-
`
|
|
63
|
-
`
|
|
64
|
-
`
|
|
65
|
-
`
|
|
66
|
-
|
|
67
|
-
`
|
|
68
|
-
"```\n\n" +
|
|
69
|
-
`After the bash command returns, send exactly the single word ` +
|
|
70
|
-
`"done" as your final reply.`;
|
|
66
|
+
`Please run \`sleep ${SLEEP_SECONDS}\` in the background using the ` +
|
|
67
|
+
`Bash tool with \`run_in_background: true\` — this is a stress ` +
|
|
68
|
+
`test of the cross-turn ambient progress surface, so the sleep ` +
|
|
69
|
+
`duration matters. Send a brief one-line acknowledgement that ` +
|
|
70
|
+
`you've dispatched it (your natural beat-1 ack is fine), then ` +
|
|
71
|
+
`wait for it to complete. When it finishes, reply with exactly ` +
|
|
72
|
+
`the single word "done".`;
|
|
71
73
|
|
|
72
74
|
const OVERALL_DEADLINE_MS = (SLEEP_SECONDS + 240) * 1000;
|
|
73
75
|
|
|
package/telegram-plugin/uat/scenarios/visible-answer-stream-dm.test.ts
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Conversational pacing UAT — measures the END-TO-END user-perceived
|
|
3
|
+
* turn UX on a multi-step prompt.
|
|
4
|
+
*
|
|
5
|
+
* Original framing was "validate the visible-answer-stream path
|
|
6
|
+
* activates." Live research on test-harness with the
|
|
7
|
+
* `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` flag showed that modern Claude
|
|
8
|
+
* 2.1.x on this fleet does NOT emit transcript text events between
|
|
9
|
+
* tool calls — it consistently calls the `reply` MCP tool directly
|
|
10
|
+
* for every user-visible chunk (beat 1 ack, then per-step beat 3
|
|
11
|
+
* updates). So the visible-answer-stream code path (which renders
|
|
12
|
+
* `text` session events into a chat-timeline message) doesn't
|
|
13
|
+
* activate; the answer-stream lane stays idle while the model uses
|
|
14
|
+
* `reply` calls instead.
|
|
15
|
+
*
|
|
16
|
+
* That's actually FINE — the model is correctly following the
|
|
17
|
+
* five-beat conversational-pacing contract (`reference/conversational-
|
|
18
|
+
* pacing.md`): one silent ack at the start, silent updates per step,
|
|
19
|
+
* one pinged final answer. This UAT now validates THAT — the pacing
|
|
20
|
+
* the user actually experiences — rather than the answer-stream code
|
|
21
|
+
* path specifically.
|
|
22
|
+
*
|
|
23
|
+
* The flag `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is still set on
|
|
24
|
+
* test-harness for ongoing observation; if a future model version
|
|
25
|
+
* starts emitting transcript text, the lane will surface it visibly
|
|
26
|
+
* instead of writing to the invisible compose-box draft (the prior
|
|
27
|
+
* default).
|
|
28
|
+
*
|
|
29
|
+
* ## What this asserts
|
|
30
|
+
*
|
|
31
|
+
* 1. First user-visible bot message lands within `TTFO_BUDGET_MS`
|
|
32
|
+
* (default 15 s) of the inbound — covers beat 1 ack OR straight-
|
|
33
|
+
* to-content depending on the model's pacing choice.
|
|
34
|
+
* 2. Multiple distinct bot messages land per turn for the multi-
|
|
35
|
+
* step prompt — proving the model isn't collapsing everything
|
|
36
|
+
* into a single pinged dump.
|
|
37
|
+
* 3. All but at most one message is silent (`disable_notification:
|
|
38
|
+
* true`). Only the final answer should ping — anything earlier
|
|
39
|
+
* pinging is a beat-3 contract violation.
|
|
40
|
+
*
|
|
41
|
+
* ## Wall-clock budget
|
|
42
|
+
*
|
|
43
|
+
* ~90 s.
|
|
44
|
+
*/
|
|
45
|
+
|
|
46
|
+
import { describe, expect, it } from "vitest";
|
|
47
|
+
import { spinUp } from "../harness.js";
|
|
48
|
+
import type { ObservedMessage } from "../driver.js";
|
|
49
|
+
|
|
50
|
+
const TTFO_BUDGET_MS = 15_000;
|
|
51
|
+
const OVERALL_DEADLINE_MS = 90_000;
|
|
52
|
+
const QUIESCENCE_MS = 12_000;
|
|
53
|
+
|
|
54
|
+
// Multi-step investigation prompt — designed to make the model emit
|
|
55
|
+
// transcript text BETWEEN tool calls, which is the assistant-content
|
|
56
|
+
// `text` block shape session-tail surfaces via the `text` event the
|
|
57
|
+
// answer-stream lane consumes. With the visible-answer-stream flag
|
|
58
|
+
// ON, those text events should become user-visible edit-in-place
|
|
59
|
+
// chat-timeline updates.
|
|
60
|
+
//
|
|
61
|
+
// We choose a research-style task because that pattern reliably
|
|
62
|
+
// emits `text` chunks (the model thinks out loud between Read /
|
|
63
|
+
// Bash steps) on most Claude versions. A pure-answer prompt (the
|
|
64
|
+
// previous version of this scenario) tended to make modern Claude
|
|
65
|
+
// jump straight to a single `reply` tool-call with no intermediate
|
|
66
|
+
// text — exercising the wrong path.
|
|
67
|
+
const PROMPT =
|
|
68
|
+
`Investigate this step by step:\n\n` +
|
|
69
|
+
`1. Read \`/etc/hostname\` and tell me what host this is — write a ` +
|
|
70
|
+
`sentence about it.\n` +
|
|
71
|
+
`2. Then read \`/etc/os-release\` and tell me what OS family / version.\n` +
|
|
72
|
+
`3. Then read \`/proc/cpuinfo\` (head it), and tell me the CPU model + ` +
|
|
73
|
+
`core count.\n` +
|
|
74
|
+
`4. Wrap up with a one-line summary of all three.\n\n` +
|
|
75
|
+
`Between each step, narrate what you're finding in plain prose ` +
|
|
76
|
+
`(not just bullet outputs). Don't batch all your observations into ` +
|
|
77
|
+
`one final reply — talk as you investigate.`;
|
|
78
|
+
|
|
79
|
+
interface TrailEntry {
|
|
80
|
+
relMs: number;
|
|
81
|
+
kind: "fresh" | "edit";
|
|
82
|
+
silent: boolean;
|
|
83
|
+
messageId: number;
|
|
84
|
+
textPreview: string;
|
|
85
|
+
textLength: number;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
function pad(s: string, n: number): string {
|
|
89
|
+
return s.length >= n ? s : s + " ".repeat(n - s.length);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
describe("uat: conversational pacing on a multi-step turn", () => {
|
|
93
|
+
it(
|
|
94
|
+
"first message lands within TTFO_BUDGET_MS; multiple silent messages; final answer pings",
|
|
95
|
+
async () => {
|
|
96
|
+
const sc = await spinUp({ agent: "test-harness" });
|
|
97
|
+
try {
|
|
98
|
+
const startedAt = Date.now();
|
|
99
|
+
await sc.sendDM(PROMPT);
|
|
100
|
+
console.log(`[visible-answer-stream] t=0 prompt sent`);
|
|
101
|
+
|
|
102
|
+
const trail: TrailEntry[] = [];
|
|
103
|
+
let firstAnchorMsgId: number | null = null;
|
|
104
|
+
let quiescenceDeadline = startedAt + 30_000;
|
|
105
|
+
const overallDeadline = startedAt + OVERALL_DEADLINE_MS;
|
|
106
|
+
|
|
107
|
+
while (Date.now() < overallDeadline) {
|
|
108
|
+
const remaining = Math.min(
|
|
109
|
+
quiescenceDeadline - Date.now(),
|
|
110
|
+
overallDeadline - Date.now(),
|
|
111
|
+
);
|
|
112
|
+
if (remaining <= 0) break;
|
|
113
|
+
try {
|
|
114
|
+
const msg = await sc.expectMessage(
|
|
115
|
+
(m: ObservedMessage) => m.fromBot,
|
|
116
|
+
{ from: "bot", timeout: remaining },
|
|
117
|
+
);
|
|
118
|
+
const rel = Date.now() - startedAt;
|
|
119
|
+
const entry: TrailEntry = {
|
|
120
|
+
relMs: rel,
|
|
121
|
+
kind: msg.edited ? "edit" : "fresh",
|
|
122
|
+
silent: msg.silent,
|
|
123
|
+
messageId: msg.messageId,
|
|
124
|
+
textPreview: msg.text
|
|
125
|
+
.slice(0, 120)
|
|
126
|
+
.replace(/\n/g, " ⏎ "),
|
|
127
|
+
textLength: msg.text.length,
|
|
128
|
+
};
|
|
129
|
+
trail.push(entry);
|
|
130
|
+
if (firstAnchorMsgId == null && entry.kind === "fresh") {
|
|
131
|
+
firstAnchorMsgId = entry.messageId;
|
|
132
|
+
}
|
|
133
|
+
console.log(
|
|
134
|
+
`[visible-answer-stream] +${(rel / 1000).toFixed(1)}s ` +
|
|
135
|
+
`${entry.kind.toUpperCase()} msg=${entry.messageId} ` +
|
|
136
|
+
`silent=${entry.silent} len=${entry.textLength} ` +
|
|
137
|
+
`text=${JSON.stringify(entry.textPreview)}`,
|
|
138
|
+
);
|
|
139
|
+
quiescenceDeadline = Date.now() + QUIESCENCE_MS;
|
|
140
|
+
} catch {
|
|
141
|
+
break;
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
console.log("\n========== VISIBLE-ANSWER-STREAM TRAIL ==========");
|
|
146
|
+
console.log(`total bot messages observed: ${trail.length}`);
|
|
147
|
+
console.log(`first anchor messageId: ${firstAnchorMsgId}`);
|
|
148
|
+
console.log("");
|
|
149
|
+
console.log(" rel(s) kind silent msg len text");
|
|
150
|
+
console.log(" ------- ----- ------ ----------- ---- ----");
|
|
151
|
+
for (const e of trail) {
|
|
152
|
+
console.log(
|
|
153
|
+
` ${pad((e.relMs / 1000).toFixed(1) + "s", 8)} ` +
|
|
154
|
+
`${pad(e.kind, 6)} ${pad(String(e.silent), 7)} ` +
|
|
155
|
+
`${pad(String(e.messageId), 12)} ${pad(String(e.textLength), 5)} ` +
|
|
156
|
+
`${e.textPreview}`,
|
|
157
|
+
);
|
|
158
|
+
}
|
|
159
|
+
console.log("=================================================\n");
|
|
160
|
+
|
|
161
|
+
// ── Pacing assertions ─────────────────────────────────────
|
|
162
|
+
|
|
163
|
+
// (1) at least one bot message landed
|
|
164
|
+
expect(
|
|
165
|
+
trail.length,
|
|
166
|
+
`no bot replies observed — the agent isn't responding.`,
|
|
167
|
+
).toBeGreaterThanOrEqual(1);
|
|
168
|
+
|
|
169
|
+
// (2) first message landed within TTFO budget
|
|
170
|
+
const ttfoMs = trail[0].relMs;
|
|
171
|
+
expect(
|
|
172
|
+
ttfoMs,
|
|
173
|
+
`TTFO ${ttfoMs}ms exceeded the budget of ${TTFO_BUDGET_MS}ms.`,
|
|
174
|
+
).toBeLessThanOrEqual(TTFO_BUDGET_MS);
|
|
175
|
+
|
|
176
|
+
// (3) multiple messages landed — proves the model is pacing,
|
|
177
|
+
// not dumping a single big reply
|
|
178
|
+
expect(
|
|
179
|
+
trail.length,
|
|
180
|
+
`only ${trail.length} message(s) observed — the model ` +
|
|
181
|
+
`collapsed this multi-step prompt into a single dump. ` +
|
|
182
|
+
`Beat 3 pacing (per-step updates) requires multiple ` +
|
|
183
|
+
`messages. Either the model didn't follow the prompt ` +
|
|
184
|
+
`or quiescence bailed early.`,
|
|
185
|
+
).toBeGreaterThanOrEqual(2);
|
|
186
|
+
|
|
187
|
+
// (4) at most one message pinged the user — beat-3 contract
|
|
188
|
+
// says only the FINAL answer pings; mid-turn updates pass
|
|
189
|
+
// `disable_notification: true`.
|
|
190
|
+
const pingedMessages = trail.filter((e) => !e.silent);
|
|
191
|
+
expect(
|
|
192
|
+
pingedMessages.length,
|
|
193
|
+
`${pingedMessages.length} message(s) pinged the device — ` +
|
|
194
|
+
`the conversational-pacing contract allows AT MOST 1 ` +
|
|
195
|
+
`(the final answer). Mid-turn updates must be silent. ` +
|
|
196
|
+
`Pinged messages at: ${pingedMessages
|
|
197
|
+
.map((m) => `+${(m.relMs / 1000).toFixed(0)}s`)
|
|
198
|
+
.join(", ")}`,
|
|
199
|
+
).toBeLessThanOrEqual(1);
|
|
200
|
+
} finally {
|
|
201
|
+
await sc.tearDown();
|
|
202
|
+
}
|
|
203
|
+
},
|
|
204
|
+
OVERALL_DEADLINE_MS + 30_000,
|
|
205
|
+
);
|
|
206
|
+
});
|