switchroom 0.13.15 → 0.13.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +150 -17
- package/telegram-plugin/gateway/gateway.ts +184 -1
- package/telegram-plugin/over-ping-safety-net.ts +80 -0
- package/telegram-plugin/runtime-metrics.ts +18 -0
- package/telegram-plugin/silent-reply-anchor.ts +142 -0
- package/telegram-plugin/tests/over-ping-safety-net.test.ts +96 -0
- package/telegram-plugin/tests/silent-reply-anchor.test.ts +178 -0
- package/telegram-plugin/uat/scenarios/visible-answer-stream-dm.test.ts +92 -105
package/dist/cli/switchroom.js
CHANGED
|
@@ -47331,8 +47331,8 @@ var {
|
|
|
47331
47331
|
} = import__.default;
|
|
47332
47332
|
|
|
47333
47333
|
// src/build-info.ts
|
|
47334
|
-
var VERSION = "0.13.
|
|
47335
|
-
var COMMIT_SHA = "
|
|
47334
|
+
var VERSION = "0.13.17";
|
|
47335
|
+
var COMMIT_SHA = "84eb8ad9";
|
|
47336
47336
|
|
|
47337
47337
|
// src/cli/agent.ts
|
|
47338
47338
|
init_source();
|
package/package.json
CHANGED
|
@@ -37034,6 +37034,56 @@ function emitRuntimeMetric(event) {
|
|
|
37034
37034
|
captureEvent(event.kind, { ...event, ts: wrapped.ts });
|
|
37035
37035
|
}
|
|
37036
37036
|
|
|
37037
|
+
// over-ping-safety-net.ts
|
|
37038
|
+
/**
 * Decide whether this reply's device ping must be suppressed for the
 * current turn. Pure: reads only `input` and returns a fresh decision
 * object; the caller applies the result.
 *
 * @param {{ modelRequestedPing: boolean, firstPingAt: number | null, nowMs: number }} input
 * @returns {{ suppress: boolean, claimSlot: boolean, sinceFirstPingMs: number | null }}
 *   `suppress` and `claimSlot` are never both true; `sinceFirstPingMs`
 *   is non-null only when `suppress` is true.
 */
function decideOverPing(input) {
|
|
37039
|
+
// Reply is already silent: nothing to suppress and no slot to claim.
if (!input.modelRequestedPing) {
|
|
37040
|
+
return { suppress: false, claimSlot: false, sinceFirstPingMs: null };
|
|
37041
|
+
}
|
|
37042
|
+
// A ping was already recorded this turn: suppress and report elapsed ms.
if (input.firstPingAt != null) {
|
|
37043
|
+
return {
|
|
37044
|
+
suppress: true,
|
|
37045
|
+
claimSlot: false,
|
|
37046
|
+
sinceFirstPingMs: input.nowMs - input.firstPingAt
|
|
37047
|
+
};
|
|
37048
|
+
}
|
|
37049
|
+
// First ping of the turn: let it through and tell the caller to claim the slot.
return { suppress: false, claimSlot: true, sinceFirstPingMs: null };
|
|
37050
|
+
}
|
|
37051
|
+
|
|
37052
|
+
// silent-reply-anchor.ts
|
|
37053
|
+
var TELEGRAM_MSG_CAP = 4000;
|
|
37054
|
+
/**
 * Report whether silent-reply auto-edit is enabled.
 * Reads the SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT environment variable
 * on every call.
 *
 * @returns {boolean} false only when the variable is exactly "1" or
 *   "true"; true for any other value, including unset.
 */
function enabled2() {
|
|
37055
|
+
const v = process.env.SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT;
|
|
37056
|
+
return !(v === "1" || v === "true");
|
|
37057
|
+
}
|
|
37058
|
+
/**
 * Decide whether a reply should be sent as a fresh message or merged
 * into the turn's existing silent anchor message by editing it.
 * Performs no sends or edits itself; it only returns a decision.
 *
 * @param {{ effectivelySilent: boolean, anchorMessageId: number | null,
 *   anchorText: string, newReplyText: string, hasFiles: boolean,
 *   hasButtons: boolean }} input
 * @returns {{ kind: "fresh", becomesAnchor: boolean } |
 *   { kind: "edit-anchor", messageId: number, mergedText: string }}
 */
function decideSilentReplyAnchor(input) {
|
|
37059
|
+
// Kill switch engaged: always fresh, never an anchor.
if (!enabled2()) {
|
|
37060
|
+
return { kind: "fresh", becomesAnchor: false };
|
|
37061
|
+
}
|
|
37062
|
+
// Replies that are not silent are always sent fresh and never anchor.
if (!input.effectivelySilent) {
|
|
37063
|
+
return { kind: "fresh", becomesAnchor: false };
|
|
37064
|
+
}
|
|
37065
|
+
// Replies with files or buttons bypass the anchor entirely.
if (input.hasFiles || input.hasButtons) {
|
|
37066
|
+
return { kind: "fresh", becomesAnchor: false };
|
|
37067
|
+
}
|
|
37068
|
+
// Whitespace-only text is neither merged nor used as an anchor.
if (input.newReplyText.trim().length === 0) {
|
|
37069
|
+
return { kind: "fresh", becomesAnchor: false };
|
|
37070
|
+
}
|
|
37071
|
+
// No anchor yet: send fresh and mark this message as the new anchor.
if (input.anchorMessageId == null) {
|
|
37072
|
+
return { kind: "fresh", becomesAnchor: true };
|
|
37073
|
+
}
|
|
37074
|
+
// Merged text is the anchor text, a blank line, then the new reply text.
const merged = `${input.anchorText}
|
|
37075
|
+
|
|
37076
|
+
${input.newReplyText}`;
|
|
37077
|
+
// Merged text over the cap: start a new anchor with a fresh message instead.
if (merged.length > TELEGRAM_MSG_CAP) {
|
|
37078
|
+
return { kind: "fresh", becomesAnchor: true };
|
|
37079
|
+
}
|
|
37080
|
+
// Otherwise edit the existing anchor in place with the merged text.
return {
|
|
37081
|
+
kind: "edit-anchor",
|
|
37082
|
+
messageId: input.anchorMessageId,
|
|
37083
|
+
mergedText: merged
|
|
37084
|
+
};
|
|
37085
|
+
}
|
|
37086
|
+
|
|
37037
37087
|
// inbound-classifier.ts
|
|
37038
37088
|
var STATUS_QUERY_PATTERNS = [
|
|
37039
37089
|
/^\?+$/,
|
|
@@ -37300,12 +37350,12 @@ function startTimer(deps) {
|
|
|
37300
37350
|
var EDIT_INTERVAL_MS = 60000;
|
|
37301
37351
|
var POLL_INTERVAL_MS = 5000;
|
|
37302
37352
|
var MAX_LIFETIME_MS = 30 * 60000;
|
|
37303
|
-
var
|
|
37353
|
+
var TELEGRAM_MSG_CAP2 = 4000;
|
|
37304
37354
|
var SUFFIX_RE = /\n\n\u2014 still working \(\d+m\)$/;
|
|
37305
37355
|
var stateByKey = new Map;
|
|
37306
37356
|
var timer2 = null;
|
|
37307
37357
|
var activeDeps2 = null;
|
|
37308
|
-
function
|
|
37358
|
+
function enabled3() {
|
|
37309
37359
|
const v = process.env.SWITCHROOM_DISABLE_PENDING_PROGRESS;
|
|
37310
37360
|
return !(v === "1" || v === "true");
|
|
37311
37361
|
}
|
|
@@ -37327,19 +37377,19 @@ function ensure(key) {
|
|
|
37327
37377
|
return s;
|
|
37328
37378
|
}
|
|
37329
37379
|
function noteAsyncDispatch(key) {
|
|
37330
|
-
if (!
|
|
37380
|
+
if (!enabled3())
|
|
37331
37381
|
return;
|
|
37332
37382
|
ensure(key).pending = true;
|
|
37333
37383
|
}
|
|
37334
37384
|
function noteOutbound3(key, opts) {
|
|
37335
|
-
if (!
|
|
37385
|
+
if (!enabled3())
|
|
37336
37386
|
return;
|
|
37337
37387
|
const s = ensure(key);
|
|
37338
37388
|
s.anchorMessageId = opts.messageId;
|
|
37339
37389
|
s.anchorOriginalText = opts.text.replace(SUFFIX_RE, "");
|
|
37340
37390
|
}
|
|
37341
37391
|
function noteTurnEnd(key) {
|
|
37342
|
-
if (!
|
|
37392
|
+
if (!enabled3())
|
|
37343
37393
|
return;
|
|
37344
37394
|
const s = stateByKey.get(key);
|
|
37345
37395
|
if (s == null)
|
|
@@ -37369,7 +37419,7 @@ function clearPending(key, reason) {
|
|
|
37369
37419
|
});
|
|
37370
37420
|
}
|
|
37371
37421
|
function startTimer2(deps) {
|
|
37372
|
-
if (!
|
|
37422
|
+
if (!enabled3())
|
|
37373
37423
|
return;
|
|
37374
37424
|
if (timer2 != null)
|
|
37375
37425
|
return;
|
|
@@ -37409,7 +37459,7 @@ function tick2(now) {
|
|
|
37409
37459
|
|
|
37410
37460
|
\u2014 still working (${minutes}m)`;
|
|
37411
37461
|
const newText = s.anchorOriginalText + suffix;
|
|
37412
|
-
if (newText.length >
|
|
37462
|
+
if (newText.length > TELEGRAM_MSG_CAP2) {
|
|
37413
37463
|
s.lastEditAt = now;
|
|
37414
37464
|
continue;
|
|
37415
37465
|
}
|
|
@@ -44586,9 +44636,9 @@ function transition(state3, event) {
|
|
|
44586
44636
|
|
|
44587
44637
|
// gateway/inbound-delivery-machine-shadow.ts
|
|
44588
44638
|
var state3 = initialState();
|
|
44589
|
-
var
|
|
44639
|
+
var enabled4 = process.env.SWITCHROOM_DELIVERY_MACHINE_SHADOW !== "0";
|
|
44590
44640
|
function shadowEmit(event) {
|
|
44591
|
-
if (!
|
|
44641
|
+
if (!enabled4)
|
|
44592
44642
|
return [];
|
|
44593
44643
|
try {
|
|
44594
44644
|
const result = transition(state3, event);
|
|
@@ -44646,12 +44696,12 @@ function redeliverBufferedInbound2(buffer, agent, send, spool) {
|
|
|
44646
44696
|
}
|
|
44647
44697
|
|
|
44648
44698
|
// gateway/inbound-delivery-machine-dispatch.ts
|
|
44649
|
-
var
|
|
44699
|
+
var enabled5 = process.env.SWITCHROOM_DELIVERY_MACHINE_CUTOVER !== "0";
|
|
44650
44700
|
function isDispatchEnabled() {
|
|
44651
|
-
return
|
|
44701
|
+
return enabled5;
|
|
44652
44702
|
}
|
|
44653
44703
|
function dispatchEffects(effects, ctx) {
|
|
44654
|
-
if (!
|
|
44704
|
+
if (!enabled5)
|
|
44655
44705
|
return;
|
|
44656
44706
|
for (const effect of effects) {
|
|
44657
44707
|
dispatchOne(effect, ctx);
|
|
@@ -48154,10 +48204,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
|
|
|
48154
48204
|
}
|
|
48155
48205
|
|
|
48156
48206
|
// ../src/build-info.ts
|
|
48157
|
-
var VERSION = "0.13.
|
|
48158
|
-
var COMMIT_SHA = "
|
|
48159
|
-
var COMMIT_DATE = "2026-05-
|
|
48160
|
-
var LATEST_PR =
|
|
48207
|
+
var VERSION = "0.13.17";
|
|
48208
|
+
var COMMIT_SHA = "84eb8ad9";
|
|
48209
|
+
var COMMIT_DATE = "2026-05-23T05:24:12Z";
|
|
48210
|
+
var LATEST_PR = 1678;
|
|
48161
48211
|
var COMMITS_AHEAD_OF_TAG = 0;
|
|
48162
48212
|
|
|
48163
48213
|
// gateway/boot-version.ts
|
|
@@ -50617,7 +50667,30 @@ async function executeReply(args) {
|
|
|
50617
50667
|
const configParseMode = access.parseMode ?? "html";
|
|
50618
50668
|
const format = args.format ?? configParseMode;
|
|
50619
50669
|
const disableLinkPreview = args.disable_web_page_preview != null ? Boolean(args.disable_web_page_preview) : access.disableLinkPreview ?? true;
|
|
50620
|
-
|
|
50670
|
+
let disableNotification = args.disable_notification === true;
|
|
50671
|
+
{
|
|
50672
|
+
const turn2 = currentTurn;
|
|
50673
|
+
if (turn2 != null) {
|
|
50674
|
+
const now = Date.now();
|
|
50675
|
+
const decision = decideOverPing({
|
|
50676
|
+
modelRequestedPing: !disableNotification,
|
|
50677
|
+
firstPingAt: turn2.firstPingAt,
|
|
50678
|
+
nowMs: now
|
|
50679
|
+
});
|
|
50680
|
+
if (decision.suppress) {
|
|
50681
|
+
process.stderr.write(`telegram gateway: reply over-ping safety net \u2014 ` + `downgrading disable_notification:false \u2192 true ` + `(chat=${chat_id} thread=${args.message_thread_id ?? "-"} firstPingAt=${turn2.firstPingAt} sinceFirstPing_ms=${decision.sinceFirstPingMs})
|
|
50682
|
+
`);
|
|
50683
|
+
emitRuntimeMetric({
|
|
50684
|
+
kind: "over_ping_suppressed",
|
|
50685
|
+
key: statusKey(chat_id, args.message_thread_id != null ? Number(args.message_thread_id) : undefined),
|
|
50686
|
+
sinceFirstPingMs: decision.sinceFirstPingMs ?? 0
|
|
50687
|
+
});
|
|
50688
|
+
disableNotification = true;
|
|
50689
|
+
} else if (decision.claimSlot) {
|
|
50690
|
+
turn2.firstPingAt = now;
|
|
50691
|
+
}
|
|
50692
|
+
}
|
|
50693
|
+
}
|
|
50621
50694
|
const tg = access.telegraph;
|
|
50622
50695
|
const tgThreshold = tg?.threshold ?? 3000;
|
|
50623
50696
|
if (tg?.enabled && files.length === 0 && text.length > tgThreshold) {
|
|
@@ -50711,6 +50784,56 @@ ${url}`;
|
|
|
50711
50784
|
previewMessageId = null;
|
|
50712
50785
|
}
|
|
50713
50786
|
startTypingLoop(chat_id);
|
|
50787
|
+
let silentAnchorEditDone = false;
|
|
50788
|
+
{
|
|
50789
|
+
const turn2 = currentTurn;
|
|
50790
|
+
if (turn2 != null && chunks.length === 1) {
|
|
50791
|
+
const decision = decideSilentReplyAnchor({
|
|
50792
|
+
effectivelySilent: disableNotification,
|
|
50793
|
+
anchorMessageId: turn2.silentAnchorMessageId,
|
|
50794
|
+
anchorText: turn2.silentAnchorText,
|
|
50795
|
+
newReplyText: effectiveText,
|
|
50796
|
+
hasFiles: files.length > 0,
|
|
50797
|
+
hasButtons: replyMarkup != null
|
|
50798
|
+
});
|
|
50799
|
+
if (decision.kind === "edit-anchor") {
|
|
50800
|
+
const editParams = {
|
|
50801
|
+
link_preview_options: { is_disabled: disableLinkPreview }
|
|
50802
|
+
};
|
|
50803
|
+
if (parseMode != null)
|
|
50804
|
+
editParams.parse_mode = parseMode;
|
|
50805
|
+
if (threadId != null)
|
|
50806
|
+
editParams.message_thread_id = threadId;
|
|
50807
|
+
try {
|
|
50808
|
+
await robustApiCall(() => lockedBot.api.editMessageText(chat_id, decision.messageId, decision.mergedText, editParams), {
|
|
50809
|
+
chat_id,
|
|
50810
|
+
verb: "reply.silent-anchor-edit",
|
|
50811
|
+
...threadId != null ? { threadId } : {}
|
|
50812
|
+
});
|
|
50813
|
+
turn2.silentAnchorText = decision.mergedText;
|
|
50814
|
+
sentIds.push(decision.messageId);
|
|
50815
|
+
logOutbound("edit", chat_id, decision.messageId, decision.mergedText.length, "silent-anchor-merge");
|
|
50816
|
+
process.stderr.write(`telegram gateway: silent-reply auto-edit \u2014 ` + `chat=${chat_id} anchor=${decision.messageId} merged_len=${decision.mergedText.length}
|
|
50817
|
+
`);
|
|
50818
|
+
silentAnchorEditDone = true;
|
|
50819
|
+
} catch (err) {
|
|
50820
|
+
process.stderr.write(`telegram gateway: silent-reply auto-edit failed, falling back to fresh send: ${err instanceof Error ? err.message : String(err)}
|
|
50821
|
+
`);
|
|
50822
|
+
}
|
|
50823
|
+
}
|
|
50824
|
+
}
|
|
50825
|
+
}
|
|
50826
|
+
if (silentAnchorEditDone) {
|
|
50827
|
+
stopTypingLoop(chat_id);
|
|
50828
|
+
return {
|
|
50829
|
+
content: [
|
|
50830
|
+
{
|
|
50831
|
+
type: "text",
|
|
50832
|
+
text: `edited (id: ${sentIds[0]})`
|
|
50833
|
+
}
|
|
50834
|
+
]
|
|
50835
|
+
};
|
|
50836
|
+
}
|
|
50714
50837
|
try {
|
|
50715
50838
|
for (let i = 0;i < chunks.length; i++) {
|
|
50716
50839
|
const shouldReplyTo = reply_to != null && replyMode !== "off" && (replyMode === "all" || i === 0);
|
|
@@ -50812,6 +50935,13 @@ ${url}`;
|
|
|
50812
50935
|
});
|
|
50813
50936
|
}
|
|
50814
50937
|
}
|
|
50938
|
+
if (chunks.length === 1 && disableNotification && files.length === 0 && replyMarkup == null && sentIds.length === 1) {
|
|
50939
|
+
const turn2 = currentTurn;
|
|
50940
|
+
if (turn2 != null) {
|
|
50941
|
+
turn2.silentAnchorMessageId = sentIds[0];
|
|
50942
|
+
turn2.silentAnchorText = effectiveText;
|
|
50943
|
+
}
|
|
50944
|
+
}
|
|
50815
50945
|
const allPhotos = files.length >= 2 && files.length <= 10 && files.every((f) => PHOTO_EXTS.has(extname(f).toLowerCase()));
|
|
50816
50946
|
const replyParams = reply_to != null && replyMode !== "off" ? { reply_parameters: { message_id: reply_to } } : {};
|
|
50817
50947
|
if (allPhotos) {
|
|
@@ -51766,6 +51896,9 @@ function handleSessionEvent(ev) {
|
|
|
51766
51896
|
gatewayReceiveAt: startedAt,
|
|
51767
51897
|
replyCalled: false,
|
|
51768
51898
|
finalAnswerDelivered: false,
|
|
51899
|
+
firstPingAt: null,
|
|
51900
|
+
silentAnchorMessageId: null,
|
|
51901
|
+
silentAnchorText: "",
|
|
51769
51902
|
capturedText: [],
|
|
51770
51903
|
orphanedReplyTimeoutId: null,
|
|
51771
51904
|
registryKey: null,
|
|
@@ -74,6 +74,8 @@ import {
|
|
|
74
74
|
shutdownAnalytics,
|
|
75
75
|
} from '../analytics-posthog.js'
|
|
76
76
|
import { emitRuntimeMetric } from '../runtime-metrics.js'
|
|
77
|
+
import { decideOverPing } from '../over-ping-safety-net.js'
|
|
78
|
+
import { decideSilentReplyAnchor } from '../silent-reply-anchor.js'
|
|
77
79
|
import { classifyInbound } from '../inbound-classifier.js'
|
|
78
80
|
import * as silencePoke from '../silence-poke.js'
|
|
79
81
|
import * as pendingProgress from '../pending-work-progress.js'
|
|
@@ -1206,6 +1208,27 @@ type CurrentTurn = {
|
|
|
1206
1208
|
// even though `replyCalled` is true — the #1664 case where the real answer
|
|
1207
1209
|
// ended up as plain transcript text rendered into an ephemeral draft.
|
|
1208
1210
|
finalAnswerDelivered: boolean
|
|
1211
|
+
// #1675 (over-ping safety net): wall-clock ms of the first reply
|
|
1212
|
+
// this turn that landed with `disable_notification: false` (a real
|
|
1213
|
+
// device ping). The conversational-pacing contract
|
|
1214
|
+
// (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
|
|
1215
|
+
// ping per turn — the final answer. When the model violates that
|
|
1216
|
+
// (sends a substantive answer pinged + a wrap-up "Delivered…" or
|
|
1217
|
+
// meta-narration also pinged), subsequent reply calls with
|
|
1218
|
+
// `disable_notification: false` are auto-downgraded to silent by
|
|
1219
|
+
// the framework. Null until the first ping lands. Reset on every
|
|
1220
|
+
// fresh-turn enqueue.
|
|
1221
|
+
firstPingAt: number | null
|
|
1222
|
+
// #1677 silent-reply auto-edit. The first silent reply of a turn
|
|
1223
|
+
// captures `silentAnchorMessageId` + `silentAnchorText`; subsequent
|
|
1224
|
+
// silent replies in the SAME turn editMessageText that anchor
|
|
1225
|
+
// (appending with paragraph-break separator). Net visual: one
|
|
1226
|
+
// growing silent bubble instead of N stacked silent bubbles.
|
|
1227
|
+
// Cleared by turn-atom replacement on enqueue. See
|
|
1228
|
+
// `telegram-plugin/silent-reply-anchor.ts` for the pure
|
|
1229
|
+
// `decideSilentReplyAnchor` predicate.
|
|
1230
|
+
silentAnchorMessageId: number | null
|
|
1231
|
+
silentAnchorText: string
|
|
1209
1232
|
capturedText: string[]
|
|
1210
1233
|
orphanedReplyTimeoutId: ReturnType<typeof setTimeout> | null
|
|
1211
1234
|
registryKey: string | null
|
|
@@ -4208,7 +4231,58 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
|
|
|
4208
4231
|
// so only the final answer pings the device. Default false (pings) so
|
|
4209
4232
|
// existing call-sites and the typical "final answer" reply keep their
|
|
4210
4233
|
// current behaviour without an explicit flag.
|
|
4211
|
-
|
|
4234
|
+
let disableNotification = args.disable_notification === true
|
|
4235
|
+
|
|
4236
|
+
// #1675 over-ping safety net. The conversational-pacing contract
|
|
4237
|
+
// (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
|
|
4238
|
+
// device ping per turn — the final answer. The model sometimes
|
|
4239
|
+
// violates this by sending a substantive answer pinged + a wrap-up
|
|
4240
|
+
// ("Delivered all three steps…", "Sent.", or meta-narration) ALSO
|
|
4241
|
+
// pinged. Both messages then fire notifications. The fleet UAT on
|
|
4242
|
+
// 2026-05-23 reproduced this (Step 3 + Delivered both pinged, two
|
|
4243
|
+
// beeps for a turn that should have produced one). Framework owns
|
|
4244
|
+
// the safety net: once the turn has emitted ONE pinged reply, every
|
|
4245
|
+
// subsequent reply call in the same turn auto-downgrades to silent
|
|
4246
|
+
// (disable_notification: true). Model intent ("I want this loud")
|
|
4247
|
+
// is honoured for the first ping; subsequent pings are demoted with
|
|
4248
|
+
// a stderr log so operators can see the safety net engage.
|
|
4249
|
+
//
|
|
4250
|
+
// The slot is claimed BEFORE the actual send to keep the logic
|
|
4251
|
+
// sequential — a send that fails part-way leaves firstPingAt set
|
|
4252
|
+
// and subsequent pings would be silenced. Acceptable trade-off (a
|
|
4253
|
+
// failed first ping is an edge case; the alternative — claim after
|
|
4254
|
+
// send — races concurrent reply calls).
|
|
4255
|
+
{
|
|
4256
|
+
const turn = currentTurn
|
|
4257
|
+
if (turn != null) {
|
|
4258
|
+
const now = Date.now()
|
|
4259
|
+
const decision = decideOverPing({
|
|
4260
|
+
modelRequestedPing: !disableNotification,
|
|
4261
|
+
firstPingAt: turn.firstPingAt,
|
|
4262
|
+
nowMs: now,
|
|
4263
|
+
})
|
|
4264
|
+
if (decision.suppress) {
|
|
4265
|
+
process.stderr.write(
|
|
4266
|
+
`telegram gateway: reply over-ping safety net — ` +
|
|
4267
|
+
`downgrading disable_notification:false → true ` +
|
|
4268
|
+
`(chat=${chat_id} thread=${args.message_thread_id ?? '-'} ` +
|
|
4269
|
+
`firstPingAt=${turn.firstPingAt} sinceFirstPing_ms=${decision.sinceFirstPingMs})\n`,
|
|
4270
|
+
)
|
|
4271
|
+
// Observability: surface to the unified runtime-metrics
|
|
4272
|
+
// fan-out so the cadence dashboard can track fleet-wide
|
|
4273
|
+
// over-ping rate (leading indicator of model pacing drift).
|
|
4274
|
+
emitRuntimeMetric({
|
|
4275
|
+
kind: 'over_ping_suppressed',
|
|
4276
|
+
key: statusKey(chat_id, args.message_thread_id != null
|
|
4277
|
+
? Number(args.message_thread_id) : undefined),
|
|
4278
|
+
sinceFirstPingMs: decision.sinceFirstPingMs ?? 0,
|
|
4279
|
+
})
|
|
4280
|
+
disableNotification = true
|
|
4281
|
+
} else if (decision.claimSlot) {
|
|
4282
|
+
turn.firstPingAt = now
|
|
4283
|
+
}
|
|
4284
|
+
}
|
|
4285
|
+
}
|
|
4212
4286
|
|
|
4213
4287
|
// Telegraph publish (#579). When the reply text is long enough AND
|
|
4214
4288
|
// the agent has telegraph enabled in access.json, publish to
|
|
@@ -4354,6 +4428,91 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
|
|
|
4354
4428
|
|
|
4355
4429
|
startTypingLoop(chat_id)
|
|
4356
4430
|
|
|
4431
|
+
// #1677 silent-reply auto-edit. Consecutive silent replies within
|
|
4432
|
+
// a turn edit a single anchor message instead of stacking new
|
|
4433
|
+
// bubbles. We branch BEFORE the chunk loop so the single-chunk
|
|
4434
|
+
// common case takes an editMessageText path; everything else
|
|
4435
|
+
// (multi-chunk, ping, files, buttons) falls through to fresh send
|
|
4436
|
+
// and either captures a new anchor or doesn't, per the predicate.
|
|
4437
|
+
let silentAnchorEditDone = false
|
|
4438
|
+
{
|
|
4439
|
+
const turn = currentTurn
|
|
4440
|
+
if (turn != null && chunks.length === 1) {
|
|
4441
|
+
const decision = decideSilentReplyAnchor({
|
|
4442
|
+
effectivelySilent: disableNotification,
|
|
4443
|
+
anchorMessageId: turn.silentAnchorMessageId,
|
|
4444
|
+
anchorText: turn.silentAnchorText,
|
|
4445
|
+
newReplyText: effectiveText,
|
|
4446
|
+
hasFiles: files.length > 0,
|
|
4447
|
+
hasButtons: replyMarkup != null,
|
|
4448
|
+
})
|
|
4449
|
+
if (decision.kind === 'edit-anchor') {
|
|
4450
|
+
const editParams: {
|
|
4451
|
+
parse_mode?: 'HTML' | 'MarkdownV2'
|
|
4452
|
+
message_thread_id?: number
|
|
4453
|
+
link_preview_options?: { is_disabled: boolean }
|
|
4454
|
+
} = {
|
|
4455
|
+
link_preview_options: { is_disabled: disableLinkPreview },
|
|
4456
|
+
}
|
|
4457
|
+
if (parseMode != null) editParams.parse_mode = parseMode
|
|
4458
|
+
if (threadId != null) editParams.message_thread_id = threadId
|
|
4459
|
+
try {
|
|
4460
|
+
await robustApiCall(
|
|
4461
|
+
() =>
|
|
4462
|
+
lockedBot.api.editMessageText(
|
|
4463
|
+
chat_id,
|
|
4464
|
+
decision.messageId,
|
|
4465
|
+
decision.mergedText,
|
|
4466
|
+
editParams,
|
|
4467
|
+
),
|
|
4468
|
+
{
|
|
4469
|
+
chat_id,
|
|
4470
|
+
verb: 'reply.silent-anchor-edit',
|
|
4471
|
+
...(threadId != null ? { threadId } : {}),
|
|
4472
|
+
},
|
|
4473
|
+
)
|
|
4474
|
+
turn.silentAnchorText = decision.mergedText
|
|
4475
|
+
sentIds.push(decision.messageId)
|
|
4476
|
+
logOutbound(
|
|
4477
|
+
'edit',
|
|
4478
|
+
chat_id,
|
|
4479
|
+
decision.messageId,
|
|
4480
|
+
decision.mergedText.length,
|
|
4481
|
+
'silent-anchor-merge',
|
|
4482
|
+
)
|
|
4483
|
+
process.stderr.write(
|
|
4484
|
+
`telegram gateway: silent-reply auto-edit — ` +
|
|
4485
|
+
`chat=${chat_id} anchor=${decision.messageId} ` +
|
|
4486
|
+
`merged_len=${decision.mergedText.length}\n`,
|
|
4487
|
+
)
|
|
4488
|
+
silentAnchorEditDone = true
|
|
4489
|
+
} catch (err) {
|
|
4490
|
+
// Edit failed (e.g. message deleted, rate limit exhausted,
|
|
4491
|
+
// parse error). Fall through to fresh-send below — the
|
|
4492
|
+
// anchor will be overwritten by whatever lands.
|
|
4493
|
+
process.stderr.write(
|
|
4494
|
+
`telegram gateway: silent-reply auto-edit failed, ` +
|
|
4495
|
+
`falling back to fresh send: ${err instanceof Error ? err.message : String(err)}\n`,
|
|
4496
|
+
)
|
|
4497
|
+
}
|
|
4498
|
+
}
|
|
4499
|
+
}
|
|
4500
|
+
}
|
|
4501
|
+
|
|
4502
|
+
if (silentAnchorEditDone) {
|
|
4503
|
+
// Skip the chunk loop entirely — the anchor edit IS the send.
|
|
4504
|
+
// Match the normal exit path: stop typing, then return.
|
|
4505
|
+
stopTypingLoop(chat_id)
|
|
4506
|
+
return {
|
|
4507
|
+
content: [
|
|
4508
|
+
{
|
|
4509
|
+
type: 'text',
|
|
4510
|
+
text: `edited (id: ${sentIds[0]})`,
|
|
4511
|
+
},
|
|
4512
|
+
],
|
|
4513
|
+
}
|
|
4514
|
+
}
|
|
4515
|
+
|
|
4357
4516
|
try {
|
|
4358
4517
|
for (let i = 0; i < chunks.length; i++) {
|
|
4359
4518
|
const shouldReplyTo =
|
|
@@ -4489,6 +4648,27 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
|
|
|
4489
4648
|
}
|
|
4490
4649
|
}
|
|
4491
4650
|
|
|
4651
|
+
// #1677 silent-reply auto-edit — anchor capture for the FIRST
|
|
4652
|
+
// silent reply of a turn (or the silent reply that replaced the
|
|
4653
|
+
// anchor on overflow). Only captures for the single-chunk,
|
|
4654
|
+
// silent, no-files, no-buttons happy path; the edit-anchor path
|
|
4655
|
+
// earlier in this function handles SUBSEQUENT silent replies by
|
|
4656
|
+
// editing. The next silent reply this turn will see the captured
|
|
4657
|
+
// anchor and edit it in place.
|
|
4658
|
+
if (
|
|
4659
|
+
chunks.length === 1
|
|
4660
|
+
&& disableNotification
|
|
4661
|
+
&& files.length === 0
|
|
4662
|
+
&& replyMarkup == null
|
|
4663
|
+
&& sentIds.length === 1
|
|
4664
|
+
) {
|
|
4665
|
+
const turn = currentTurn
|
|
4666
|
+
if (turn != null) {
|
|
4667
|
+
turn.silentAnchorMessageId = sentIds[0]!
|
|
4668
|
+
turn.silentAnchorText = effectiveText
|
|
4669
|
+
}
|
|
4670
|
+
}
|
|
4671
|
+
|
|
4492
4672
|
// #273: when files is 2-10 photos, batch them into a single
|
|
4493
4673
|
// sendMediaGroup album rather than N separate sendPhoto calls. The
|
|
4494
4674
|
// user's device fires one notification for the album instead of N
|
|
@@ -5877,6 +6057,9 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
5877
6057
|
gatewayReceiveAt: startedAt,
|
|
5878
6058
|
replyCalled: false,
|
|
5879
6059
|
finalAnswerDelivered: false,
|
|
6060
|
+
firstPingAt: null,
|
|
6061
|
+
silentAnchorMessageId: null,
|
|
6062
|
+
silentAnchorText: '',
|
|
5880
6063
|
capturedText: [],
|
|
5881
6064
|
orphanedReplyTimeoutId: null,
|
|
5882
6065
|
registryKey: null,
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* over-ping-safety-net.ts — pure decision predicate for #1674's
|
|
3
|
+
* "at-most-one device-ping per turn" framework safety net.
|
|
4
|
+
*
|
|
5
|
+
* Background. `reference/conversational-pacing.md` beat 5 is
|
|
6
|
+
* explicit: the model should deliver the answer as a fresh `reply`
|
|
7
|
+
* omitting `disable_notification` (i.e. pinging the device once).
|
|
8
|
+
* EXACTLY ONE ping per turn. The model occasionally violates this
|
|
9
|
+
* — fleet UAT 2026-05-23 reproduced a substantive Step 3 answer
|
|
10
|
+
* pinged + a wrap-up "Delivered all three steps with a wrap-up
|
|
11
|
+
* summary." ALSO pinged, two device beeps for a turn that should
|
|
12
|
+
* have produced one.
|
|
13
|
+
*
|
|
14
|
+
* This module is the framework safety net. The IO lives in the
|
|
15
|
+
* gateway's `executeReply` (mutate `turn.firstPingAt`, emit log +
|
|
16
|
+
* runtime-metric, override `disableNotification`); keeping the
|
|
17
|
+
* *decision* pure makes the predicate unit-testable without
|
|
18
|
+
* standing up a gateway.
|
|
19
|
+
*
|
|
20
|
+
* Contract:
|
|
21
|
+
* - When the model requested a ping (`!disable_notification`) AND
|
|
22
|
+
* the current turn already had a ping land (`firstPingAt != null`),
|
|
23
|
+
* the decision says SUPPRESS — the caller downgrades to silent.
|
|
24
|
+
* - When the model requested a ping AND no prior ping this turn,
|
|
25
|
+
* the decision says CLAIM the slot — caller sets `firstPingAt`.
|
|
26
|
+
* - When the model requested silent, this module is a no-op.
|
|
27
|
+
*
|
|
28
|
+
* The slot is claimed BEFORE the actual send (caller responsibility).
|
|
29
|
+
* Trade-off documented inline in `gateway.ts:executeReply`.
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
export interface OverPingDecisionInput {
|
|
33
|
+
/** True iff the model requested a device ping
|
|
34
|
+
* (`disable_notification:false` or omitted, since the default is to
|
|
35
|
+
* ping per Telegram Bot API). The caller computes this from the
|
|
36
|
+
* negation of the inbound `args.disable_notification === true` check. */
|
|
37
|
+
modelRequestedPing: boolean
|
|
38
|
+
/** Wall-clock ms of the FIRST ping this turn, or null if no ping
|
|
39
|
+
* has landed yet. Caller threads this through from
|
|
40
|
+
* `CurrentTurn.firstPingAt`. */
|
|
41
|
+
firstPingAt: number | null
|
|
42
|
+
/** Deterministic clock for tests; defaults to Date.now() in callers. */
|
|
43
|
+
nowMs: number
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
export interface OverPingDecision {
|
|
47
|
+
/** True iff the caller should override `disableNotification` to
|
|
48
|
+
* `true` (i.e. send this reply silently). Implies a contract
|
|
49
|
+
* violation by the model — caller should log + emit a metric. */
|
|
50
|
+
suppress: boolean
|
|
51
|
+
/** True iff the caller should claim the slot —
|
|
52
|
+
* `turn.firstPingAt = nowMs`. Mutually exclusive with `suppress`. */
|
|
53
|
+
claimSlot: boolean
|
|
54
|
+
/** When `suppress` is true, how long the first ping has been
|
|
55
|
+
* "active" (ms since `firstPingAt`). Caller surfaces this in the
|
|
56
|
+
* log + metric for forensic analysis (e.g. tight rapid double-pings
|
|
57
|
+
* vs delayed wrap-ups). Null otherwise. */
|
|
58
|
+
sinceFirstPingMs: number | null
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/**
|
|
62
|
+
* Pure decision: should the framework suppress this reply's ping?
|
|
63
|
+
* No mutation, no IO, deterministic under a fixed `nowMs`.
*
* Outcomes:
* - silent reply requested → `suppress: false`, `claimSlot: false`
* - ping requested, `firstPingAt` already set → `suppress: true`,
* `sinceFirstPingMs = nowMs - firstPingAt`
* - ping requested, `firstPingAt` is null → `claimSlot: true`
*
* @param input - ping request flag, the turn's first-ping timestamp, and the current clock value.
* @returns the decision for the caller to apply; `sinceFirstPingMs` is non-null only when `suppress` is true.
|
|
64
|
+
*/
|
|
65
|
+
export function decideOverPing(input: OverPingDecisionInput): OverPingDecision {
|
|
66
|
+
if (!input.modelRequestedPing) {
|
|
67
|
+
// Model already chose silent — nothing for the safety net to do.
|
|
68
|
+
return { suppress: false, claimSlot: false, sinceFirstPingMs: null }
|
|
69
|
+
}
|
|
70
|
+
if (input.firstPingAt != null) {
|
|
71
|
+
// Slot already claimed by an earlier ping this turn — suppress.
|
|
72
|
+
return {
|
|
73
|
+
suppress: true,
|
|
74
|
+
claimSlot: false,
|
|
75
|
+
sinceFirstPingMs: input.nowMs - input.firstPingAt,
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
// First ping this turn — let it through and claim the slot.
|
|
79
|
+
return { suppress: false, claimSlot: true, sinceFirstPingMs: null }
|
|
80
|
+
}
|
|
@@ -124,6 +124,24 @@ export type RuntimeMetricEvent =
|
|
|
124
124
|
elapsedMs?: number
|
|
125
125
|
reason?: string
|
|
126
126
|
}
|
|
127
|
+
/**
|
|
128
|
+
* #1674 over-ping safety net engaged. Fires when a `reply` call
|
|
129
|
+
* arrived with `disable_notification: false` AND the current turn
|
|
130
|
+
* already had a pinged reply land — the framework downgraded this
|
|
131
|
+
* call to silent to honour beat 5's "EXACTLY ONE ping per turn"
|
|
132
|
+
* contract. Each event is a model contract violation the safety
|
|
133
|
+
* net caught. A high rate per agent means the model is
|
|
134
|
+
* systematically over-pinging — prompt drift or training
|
|
135
|
+
* regression worth investigating.
|
|
136
|
+
*
|
|
137
|
+
* key → `<chatId>:<threadIdOrEmpty>` (the statusKey shape)
|
|
138
|
+
* sinceFirstPingMs → time since the FIRST ping landed this turn
|
|
139
|
+
*/
|
|
140
|
+
| {
|
|
141
|
+
kind: 'over_ping_suppressed'
|
|
142
|
+
key: string
|
|
143
|
+
sinceFirstPingMs: number
|
|
144
|
+
}
|
|
127
145
|
|
|
128
146
|
/**
|
|
129
147
|
* The JSONL sink lives under the runtime state dir so it's per-agent
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* silent-reply-anchor.ts — pure decision predicate for the
|
|
3
|
+
* "consecutive silent replies edit one growing message" UX fix.
|
|
4
|
+
*
|
|
5
|
+
* Background. Modern Claude 2.1.x on this fleet implements
|
|
6
|
+
* conversational pacing (`reference/conversational-pacing.md` beats
|
|
7
|
+
* 1 + 3 + 5) by calling the `reply` MCP tool multiple times in a
|
|
8
|
+
* turn — a silent ack, silent per-step updates, and one pinged
|
|
9
|
+
* final answer. The over-ping safety net (#1674) caps the
|
|
10
|
+
* notifications at one. But the user still SEES N separate chat
|
|
11
|
+
* bubbles for the silent replies, which reads as visual spam even
|
|
12
|
+
* when no device pings. The operator's original complaint was
|
|
13
|
+
* exactly this shape:
|
|
14
|
+
*
|
|
15
|
+
* "I would like more regular process updates, where it edits a
|
|
16
|
+
* status message in place vs spamming multiple messages."
|
|
17
|
+
*
|
|
18
|
+
* Fix: consecutive silent replies within a turn EDIT a single
|
|
19
|
+
* anchor message instead of each sending a fresh bubble. The
|
|
20
|
+
* model's intent (silent mid-turn updates) is honoured; the
|
|
21
|
+
* framework controls the visual placement (one growing bubble,
|
|
22
|
+
* not many). Final pinged reply lands as a separate fresh bubble
|
|
23
|
+
* (it's the final answer; the silent anchor is the preamble).
|
|
24
|
+
*
|
|
25
|
+
* Net visual for a multi-step turn:
|
|
26
|
+
* pre-fix: 4 bubbles (silent ack + 2 silent steps + 1 pinged final)
|
|
27
|
+
* post-fix: 2 bubbles (1 silent anchor with all 3 thoughts + 1 pinged final)
|
|
28
|
+
*
|
|
29
|
+
* Pinged replies always fresh-send. Reply-tool calls with files
|
|
30
|
+
* or button keyboards bypass the anchor (fresh send) because the
|
|
31
|
+
* edit path can't merge those cleanly.
|
|
32
|
+
*
|
|
33
|
+
* Accumulation format: `${anchorText}\n\n${newReplyText}` —
|
|
34
|
+
* blank-line paragraph separator. Reads naturally as the model
|
|
35
|
+
* "thinking out loud" with paragraph breaks per thought.
|
|
36
|
+
*
|
|
37
|
+
* Kill switch: `SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT=1` — turns
|
|
38
|
+
* the auto-edit off; reverts to per-reply fresh send.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
/** Cap on merged anchor text, kept below Telegram's 4096-character message text limit. A merge that would exceed it triggers a fresh send instead. */
|
|
42
|
+
export const TELEGRAM_MSG_CAP = 4000
|
|
43
|
+
|
|
44
|
+
export interface SilentReplyAnchorDecisionInput {
|
|
45
|
+
/** True when `disable_notification: true` is in effect for
|
|
46
|
+
* this reply (i.e. the model intends this to be silent — a
|
|
47
|
+
* beat 1/3 update). The over-ping safety net coerces other
|
|
48
|
+
* pings to silent; this predicate sees the EFFECTIVE flag, not
|
|
49
|
+
* the raw model intent. */
|
|
50
|
+
effectivelySilent: boolean
|
|
51
|
+
/** Telegram message_id of the current silent anchor, or null when
|
|
52
|
+
* no silent anchor has been set this turn. */
|
|
53
|
+
anchorMessageId: number | null
|
|
54
|
+
/** Text content of the current anchor (accumulated). Empty when
|
|
55
|
+
* no anchor exists. */
|
|
56
|
+
anchorText: string
|
|
57
|
+
/** Text content of the incoming reply, BEFORE any anchor merge. */
|
|
58
|
+
newReplyText: string
|
|
59
|
+
/** True if the incoming reply has attached files (photos,
|
|
60
|
+
* documents, etc). Anchor merge bypassed when true — edits
|
|
61
|
+
* can't add media to an existing text message. */
|
|
62
|
+
hasFiles: boolean
|
|
63
|
+
/** True if the incoming reply has an inline keyboard. Anchor
|
|
64
|
+
* merge bypassed when true — keyboard semantics across edits
|
|
65
|
+
* are too easy to get wrong, and the markup is rare enough
|
|
66
|
+
* that fresh-send is the safer default. */
|
|
67
|
+
hasButtons: boolean
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* What the caller should do with this reply.
|
|
72
|
+
*
|
|
73
|
+
* - `kind: 'fresh'` — send a normal new message; if it should
|
|
74
|
+
* become the next anchor (silent + no attachments), the caller
|
|
75
|
+
* captures its message_id after send and sets the anchor.
|
|
76
|
+
*
|
|
77
|
+
* - `kind: 'edit-anchor'` — DO NOT send; edit the existing
|
|
78
|
+
* anchor message with `mergedText` as the new content. The
|
|
79
|
+
* caller updates `anchor.text = mergedText` after a successful
|
|
80
|
+
* edit. messageId is the anchor's existing id.
|
|
81
|
+
*/
|
|
82
|
+
export type SilentReplyAnchorDecision =
|
|
83
|
+
| { kind: 'fresh'; becomesAnchor: boolean }
|
|
84
|
+
| { kind: 'edit-anchor'; messageId: number; mergedText: string }
|
|
85
|
+
|
|
86
|
+
function enabled(): boolean {
|
|
87
|
+
const v = process.env.SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT
|
|
88
|
+
return !(v === '1' || v === 'true')
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* Decide whether to merge this reply into an existing silent
|
|
93
|
+
* anchor or fresh-send. Pure: no IO, no mutation, kill-switch
|
|
94
|
+
* checked per call.
|
|
95
|
+
*/
|
|
96
|
+
export function decideSilentReplyAnchor(
|
|
97
|
+
input: SilentReplyAnchorDecisionInput,
|
|
98
|
+
): SilentReplyAnchorDecision {
|
|
99
|
+
// Kill switch disengages the whole mechanism — every reply
|
|
100
|
+
// falls through to fresh-send with no anchor capture.
|
|
101
|
+
if (!enabled()) {
|
|
102
|
+
return { kind: 'fresh', becomesAnchor: false }
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// Pinged replies never merge — they're the final answer bubble,
|
|
106
|
+
// semantically distinct from the silent preamble.
|
|
107
|
+
if (!input.effectivelySilent) {
|
|
108
|
+
return { kind: 'fresh', becomesAnchor: false }
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// Files / buttons bypass the anchor — edit-text can't merge
|
|
112
|
+
// media, and keyboards across edits are a foot-gun.
|
|
113
|
+
if (input.hasFiles || input.hasButtons) {
|
|
114
|
+
return { kind: 'fresh', becomesAnchor: false }
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// Empty body — let the caller's existing validation handle it.
|
|
118
|
+
// We treat as fresh-but-don't-anchor so a downstream "drop empty"
|
|
119
|
+
// doesn't leave a stale anchor pointer.
|
|
120
|
+
if (input.newReplyText.trim().length === 0) {
|
|
121
|
+
return { kind: 'fresh', becomesAnchor: false }
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// No anchor yet this turn → this reply BECOMES the anchor.
|
|
125
|
+
if (input.anchorMessageId == null) {
|
|
126
|
+
return { kind: 'fresh', becomesAnchor: true }
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Anchor exists → try to merge. The merge format is paragraph-
|
|
130
|
+
// break separation. If the merged result would exceed the
|
|
131
|
+
// Telegram text cap, give up on the anchor and start fresh —
|
|
132
|
+
// the new reply becomes a new anchor.
|
|
133
|
+
const merged = `${input.anchorText}\n\n${input.newReplyText}`
|
|
134
|
+
if (merged.length > TELEGRAM_MSG_CAP) {
|
|
135
|
+
return { kind: 'fresh', becomesAnchor: true }
|
|
136
|
+
}
|
|
137
|
+
return {
|
|
138
|
+
kind: 'edit-anchor',
|
|
139
|
+
messageId: input.anchorMessageId,
|
|
140
|
+
mergedText: merged,
|
|
141
|
+
}
|
|
142
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit suite for #1674's over-ping safety net predicate.
|
|
3
|
+
* Pins the decision logic in isolation from the gateway's
|
|
4
|
+
* `executeReply` IO so a future refactor can't silently regress.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { describe, expect, it } from 'vitest'
|
|
8
|
+
|
|
9
|
+
import { decideOverPing } from '../over-ping-safety-net.js'
|
|
10
|
+
|
|
11
|
+
describe('decideOverPing — at-most-one-ping-per-turn safety net', () => {
|
|
12
|
+
it('lets the FIRST ping through and tells caller to claim the slot', () => {
|
|
13
|
+
const d = decideOverPing({
|
|
14
|
+
modelRequestedPing: true,
|
|
15
|
+
firstPingAt: null,
|
|
16
|
+
nowMs: 1_000,
|
|
17
|
+
})
|
|
18
|
+
expect(d.suppress).toBe(false)
|
|
19
|
+
expect(d.claimSlot).toBe(true)
|
|
20
|
+
expect(d.sinceFirstPingMs).toBeNull()
|
|
21
|
+
})
|
|
22
|
+
|
|
23
|
+
it('SUPPRESSES subsequent ping in the same turn and reports elapsed', () => {
|
|
24
|
+
const d = decideOverPing({
|
|
25
|
+
modelRequestedPing: true,
|
|
26
|
+
firstPingAt: 1_000,
|
|
27
|
+
nowMs: 4_500,
|
|
28
|
+
})
|
|
29
|
+
expect(d.suppress).toBe(true)
|
|
30
|
+
expect(d.claimSlot).toBe(false)
|
|
31
|
+
expect(d.sinceFirstPingMs).toBe(3_500)
|
|
32
|
+
})
|
|
33
|
+
|
|
34
|
+
it('is a no-op when the model already requested silent (regardless of slot state)', () => {
|
|
35
|
+
// No prior ping
|
|
36
|
+
const d1 = decideOverPing({
|
|
37
|
+
modelRequestedPing: false,
|
|
38
|
+
firstPingAt: null,
|
|
39
|
+
nowMs: 1_000,
|
|
40
|
+
})
|
|
41
|
+
expect(d1).toEqual({ suppress: false, claimSlot: false, sinceFirstPingMs: null })
|
|
42
|
+
|
|
43
|
+
// Prior ping already landed — silent reply still no-op, NOT claimed
|
|
44
|
+
const d2 = decideOverPing({
|
|
45
|
+
modelRequestedPing: false,
|
|
46
|
+
firstPingAt: 1_000,
|
|
47
|
+
nowMs: 5_000,
|
|
48
|
+
})
|
|
49
|
+
expect(d2).toEqual({ suppress: false, claimSlot: false, sinceFirstPingMs: null })
|
|
50
|
+
})
|
|
51
|
+
|
|
52
|
+
it('handles the edge case where firstPingAt equals nowMs (instant double-call)', () => {
|
|
53
|
+
// Same-tick double-fire: the second call comes in with firstPingAt
|
|
54
|
+
// exactly at nowMs. Elapsed is 0; suppress fires.
|
|
55
|
+
const d = decideOverPing({
|
|
56
|
+
modelRequestedPing: true,
|
|
57
|
+
firstPingAt: 1_000,
|
|
58
|
+
nowMs: 1_000,
|
|
59
|
+
})
|
|
60
|
+
expect(d.suppress).toBe(true)
|
|
61
|
+
expect(d.claimSlot).toBe(false)
|
|
62
|
+
expect(d.sinceFirstPingMs).toBe(0)
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
it('reports large elapsed deltas honestly (late wrap-up after long work)', () => {
|
|
66
|
+
// Real-world reproducer pattern: substantive answer pings at +30s,
|
|
67
|
+
// wrap-up "Delivered all three steps…" pings at +36s. The safety
|
|
68
|
+
// net catches the second; sinceFirstPingMs reflects the 6s gap.
|
|
69
|
+
const d = decideOverPing({
|
|
70
|
+
modelRequestedPing: true,
|
|
71
|
+
firstPingAt: 30_000,
|
|
72
|
+
nowMs: 36_000,
|
|
73
|
+
})
|
|
74
|
+
expect(d.suppress).toBe(true)
|
|
75
|
+
expect(d.sinceFirstPingMs).toBe(6_000)
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
it('claim-vs-suppress is mutually exclusive', () => {
|
|
79
|
+
// Defensive invariant — no caller path should ever see both flags
|
|
80
|
+
// true at once.
|
|
81
|
+
const cases: Array<{
|
|
82
|
+
modelRequestedPing: boolean
|
|
83
|
+
firstPingAt: number | null
|
|
84
|
+
nowMs: number
|
|
85
|
+
}> = [
|
|
86
|
+
{ modelRequestedPing: true, firstPingAt: null, nowMs: 100 },
|
|
87
|
+
{ modelRequestedPing: true, firstPingAt: 50, nowMs: 100 },
|
|
88
|
+
{ modelRequestedPing: false, firstPingAt: null, nowMs: 100 },
|
|
89
|
+
{ modelRequestedPing: false, firstPingAt: 50, nowMs: 100 },
|
|
90
|
+
]
|
|
91
|
+
for (const c of cases) {
|
|
92
|
+
const d = decideOverPing(c)
|
|
93
|
+
expect(d.suppress && d.claimSlot).toBe(false)
|
|
94
|
+
}
|
|
95
|
+
})
|
|
96
|
+
})
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit suite for #1677 silent-reply auto-edit predicate.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { afterEach, beforeEach, describe, expect, it } from 'vitest'
|
|
6
|
+
|
|
7
|
+
import {
|
|
8
|
+
TELEGRAM_MSG_CAP,
|
|
9
|
+
decideSilentReplyAnchor,
|
|
10
|
+
} from '../silent-reply-anchor.js'
|
|
11
|
+
|
|
12
|
+
describe('decideSilentReplyAnchor — silent replies edit a single growing anchor', () => {
|
|
13
|
+
beforeEach(() => {
|
|
14
|
+
delete process.env.SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT
|
|
15
|
+
})
|
|
16
|
+
afterEach(() => {
|
|
17
|
+
delete process.env.SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT
|
|
18
|
+
})
|
|
19
|
+
|
|
20
|
+
it('first silent reply this turn becomes the anchor (fresh send + capture)', () => {
|
|
21
|
+
const d = decideSilentReplyAnchor({
|
|
22
|
+
effectivelySilent: true,
|
|
23
|
+
anchorMessageId: null,
|
|
24
|
+
anchorText: '',
|
|
25
|
+
newReplyText: 'on it — checking the calendar',
|
|
26
|
+
hasFiles: false,
|
|
27
|
+
hasButtons: false,
|
|
28
|
+
})
|
|
29
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: true })
|
|
30
|
+
})
|
|
31
|
+
|
|
32
|
+
it('subsequent silent reply edits the anchor with paragraph-break merge', () => {
|
|
33
|
+
const d = decideSilentReplyAnchor({
|
|
34
|
+
effectivelySilent: true,
|
|
35
|
+
anchorMessageId: 12345,
|
|
36
|
+
anchorText: 'on it — checking the calendar',
|
|
37
|
+
newReplyText: 'Step 1: hostname is example-host',
|
|
38
|
+
hasFiles: false,
|
|
39
|
+
hasButtons: false,
|
|
40
|
+
})
|
|
41
|
+
expect(d).toEqual({
|
|
42
|
+
kind: 'edit-anchor',
|
|
43
|
+
messageId: 12345,
|
|
44
|
+
mergedText:
|
|
45
|
+
'on it — checking the calendar\n\nStep 1: hostname is example-host',
|
|
46
|
+
})
|
|
47
|
+
})
|
|
48
|
+
|
|
49
|
+
it('third and beyond silent replies keep accumulating onto the same anchor', () => {
|
|
50
|
+
// Simulate the multi-step pattern: ack → step1 → step2 → step3.
|
|
51
|
+
// After two prior accumulations the anchor reads as three paragraphs.
|
|
52
|
+
const d = decideSilentReplyAnchor({
|
|
53
|
+
effectivelySilent: true,
|
|
54
|
+
anchorMessageId: 12345,
|
|
55
|
+
anchorText: 'on it\n\nStep 1: hostname\n\nStep 2: OS family',
|
|
56
|
+
newReplyText: 'Step 3: CPU',
|
|
57
|
+
hasFiles: false,
|
|
58
|
+
hasButtons: false,
|
|
59
|
+
})
|
|
60
|
+
expect(d.kind).toBe('edit-anchor')
|
|
61
|
+
if (d.kind === 'edit-anchor') {
|
|
62
|
+
expect(d.mergedText).toBe(
|
|
63
|
+
'on it\n\nStep 1: hostname\n\nStep 2: OS family\n\nStep 3: CPU',
|
|
64
|
+
)
|
|
65
|
+
}
|
|
66
|
+
})
|
|
67
|
+
|
|
68
|
+
it('pinged (effectivelySilent=false) reply NEVER merges — fresh send', () => {
|
|
69
|
+
const d = decideSilentReplyAnchor({
|
|
70
|
+
effectivelySilent: false,
|
|
71
|
+
anchorMessageId: 12345,
|
|
72
|
+
anchorText: 'on it\n\nSteps done',
|
|
73
|
+
newReplyText: 'Final answer here',
|
|
74
|
+
hasFiles: false,
|
|
75
|
+
hasButtons: false,
|
|
76
|
+
})
|
|
77
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: false })
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
it('files attached → fresh send (anchor cannot absorb media)', () => {
|
|
81
|
+
const d = decideSilentReplyAnchor({
|
|
82
|
+
effectivelySilent: true,
|
|
83
|
+
anchorMessageId: 12345,
|
|
84
|
+
anchorText: 'on it',
|
|
85
|
+
newReplyText: 'here is the chart',
|
|
86
|
+
hasFiles: true,
|
|
87
|
+
hasButtons: false,
|
|
88
|
+
})
|
|
89
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: false })
|
|
90
|
+
})
|
|
91
|
+
|
|
92
|
+
it('button keyboard → fresh send (keyboard semantics across edits is a foot-gun)', () => {
|
|
93
|
+
const d = decideSilentReplyAnchor({
|
|
94
|
+
effectivelySilent: true,
|
|
95
|
+
anchorMessageId: 12345,
|
|
96
|
+
anchorText: 'on it',
|
|
97
|
+
newReplyText: 'choose one:',
|
|
98
|
+
hasFiles: false,
|
|
99
|
+
hasButtons: true,
|
|
100
|
+
})
|
|
101
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: false })
|
|
102
|
+
})
|
|
103
|
+
|
|
104
|
+
it('empty reply body → fresh send + DO NOT become anchor', () => {
|
|
105
|
+
// The caller has its own empty-text validation; we just avoid
|
|
106
|
+
// leaving a dangling anchor pointer if the empty reply
|
|
107
|
+
// accidentally goes through.
|
|
108
|
+
const d = decideSilentReplyAnchor({
|
|
109
|
+
effectivelySilent: true,
|
|
110
|
+
anchorMessageId: null,
|
|
111
|
+
anchorText: '',
|
|
112
|
+
newReplyText: ' ',
|
|
113
|
+
hasFiles: false,
|
|
114
|
+
hasButtons: false,
|
|
115
|
+
})
|
|
116
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: false })
|
|
117
|
+
})
|
|
118
|
+
|
|
119
|
+
it('overflow: merged text > TELEGRAM_MSG_CAP → fresh send + start new anchor', () => {
|
|
120
|
+
const huge = 'x'.repeat(TELEGRAM_MSG_CAP - 10)
|
|
121
|
+
const d = decideSilentReplyAnchor({
|
|
122
|
+
effectivelySilent: true,
|
|
123
|
+
anchorMessageId: 12345,
|
|
124
|
+
anchorText: huge,
|
|
125
|
+
newReplyText: 'short tail',
|
|
126
|
+
hasFiles: false,
|
|
127
|
+
hasButtons: false,
|
|
128
|
+
})
|
|
129
|
+
// Merged would be huge + "\n\n" + "short tail" → exceeds cap.
|
|
130
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: true })
|
|
131
|
+
})
|
|
132
|
+
|
|
133
|
+
it('kill switch — `SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT=1` short-circuits to fresh send for every reply', () => {
|
|
134
|
+
process.env.SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT = '1'
|
|
135
|
+
const d = decideSilentReplyAnchor({
|
|
136
|
+
effectivelySilent: true,
|
|
137
|
+
anchorMessageId: 12345,
|
|
138
|
+
anchorText: 'on it',
|
|
139
|
+
newReplyText: 'Step 1',
|
|
140
|
+
hasFiles: false,
|
|
141
|
+
hasButtons: false,
|
|
142
|
+
})
|
|
143
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: false })
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
it('kill switch accepts string "true" too', () => {
|
|
147
|
+
process.env.SWITCHROOM_DISABLE_SILENT_REPLY_AUTOEDIT = 'true'
|
|
148
|
+
const d = decideSilentReplyAnchor({
|
|
149
|
+
effectivelySilent: true,
|
|
150
|
+
anchorMessageId: null,
|
|
151
|
+
anchorText: '',
|
|
152
|
+
newReplyText: 'on it',
|
|
153
|
+
hasFiles: false,
|
|
154
|
+
hasButtons: false,
|
|
155
|
+
})
|
|
156
|
+
expect(d).toEqual({ kind: 'fresh', becomesAnchor: false })
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
it('borderline merge — exactly at the cap is accepted (boundary inclusive)', () => {
|
|
160
|
+
// Aim merged.length === TELEGRAM_MSG_CAP exactly.
|
|
161
|
+
// separator is "\n\n" (2 chars). anchor + separator + new === cap.
|
|
162
|
+
const newReplyText = 'tail'
|
|
163
|
+
const anchorLen = TELEGRAM_MSG_CAP - newReplyText.length - 2
|
|
164
|
+
const anchor = 'a'.repeat(anchorLen)
|
|
165
|
+
const d = decideSilentReplyAnchor({
|
|
166
|
+
effectivelySilent: true,
|
|
167
|
+
anchorMessageId: 12345,
|
|
168
|
+
anchorText: anchor,
|
|
169
|
+
newReplyText,
|
|
170
|
+
hasFiles: false,
|
|
171
|
+
hasButtons: false,
|
|
172
|
+
})
|
|
173
|
+
expect(d.kind).toBe('edit-anchor')
|
|
174
|
+
if (d.kind === 'edit-anchor') {
|
|
175
|
+
expect(d.mergedText.length).toBe(TELEGRAM_MSG_CAP)
|
|
176
|
+
}
|
|
177
|
+
})
|
|
178
|
+
})
|
|
@@ -1,59 +1,80 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
2
|
+
* Conversational pacing UAT — measures the END-TO-END user-perceived
|
|
3
|
+
* turn UX on a multi-step prompt.
|
|
4
4
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
5
|
+
* Original framing was "validate the visible-answer-stream path
|
|
6
|
+
* activates." Live research on test-harness with the
|
|
7
|
+
* `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` flag showed that modern Claude
|
|
8
|
+
* 2.1.x on this fleet does NOT emit transcript text events between
|
|
9
|
+
* tool calls — it consistently calls the `reply` MCP tool directly
|
|
10
|
+
* for every user-visible chunk (beat 1 ack, then per-step beat 3
|
|
11
|
+
* updates). So the visible-answer-stream code path (which renders
|
|
12
|
+
* `text` session events into a chat-timeline message) doesn't
|
|
13
|
+
* activate; the answer-stream lane stays idle while the model uses
|
|
14
|
+
* `reply` calls instead.
|
|
10
15
|
*
|
|
11
|
-
*
|
|
16
|
+
* That's actually FINE — the model is correctly following the
|
|
17
|
+
* five-beat conversational-pacing contract (`reference/conversational-
|
|
18
|
+
* pacing.md`): one silent ack at the start, silent updates per step,
|
|
19
|
+
* one pinged final answer. This UAT now validates THAT — the pacing
|
|
20
|
+
* the user actually experiences — rather than the answer-stream code
|
|
21
|
+
* path specifically.
|
|
12
22
|
*
|
|
13
|
-
* The
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
23
|
+
* The flag `SWITCHROOM_VISIBLE_ANSWER_STREAM=1` is still set on
|
|
24
|
+
* test-harness for ongoing observation; if a future model version
|
|
25
|
+
* starts emitting transcript text, the lane will surface it visibly
|
|
26
|
+
* instead of writing to the invisible compose-box draft (the prior
|
|
27
|
+
* default).
|
|
17
28
|
*
|
|
18
29
|
* ## What this asserts
|
|
19
30
|
*
|
|
20
|
-
* 1.
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
* 4. At least one edit growth event happens between first send and
|
|
30
|
-
* turn-end (the streaming property — TTFO is fast, then content
|
|
31
|
-
* grows live).
|
|
31
|
+
* 1. First user-visible bot message lands within `TTFO_BUDGET_MS`
|
|
32
|
+
* (default 15 s) of the inbound — covers beat 1 ack OR straight-
|
|
33
|
+
* to-content depending on the model's pacing choice.
|
|
34
|
+
* 2. Multiple distinct bot messages land per turn for the multi-
|
|
35
|
+
* step prompt — proving the model isn't collapsing everything
|
|
36
|
+
* into a single pinged dump.
|
|
37
|
+
* 3. All but at most one message is silent (`disable_notification:
|
|
38
|
+
* true`). Only the final answer should ping — anything earlier
|
|
39
|
+
* pinging is a beat-3 contract violation.
|
|
32
40
|
*
|
|
33
|
-
*
|
|
34
|
-
* regardless of pass/fail.
|
|
41
|
+
* ## Wall-clock budget
|
|
35
42
|
*
|
|
36
|
-
*
|
|
43
|
+
* ~90 s.
|
|
37
44
|
*/
|
|
38
45
|
|
|
39
46
|
import { describe, expect, it } from "vitest";
|
|
40
47
|
import { spinUp } from "../harness.js";
|
|
41
48
|
import type { ObservedMessage } from "../driver.js";
|
|
42
49
|
|
|
43
|
-
const
|
|
50
|
+
const TTFO_BUDGET_MS = 15_000;
|
|
44
51
|
const OVERALL_DEADLINE_MS = 90_000;
|
|
45
|
-
const QUIESCENCE_MS =
|
|
46
|
-
|
|
47
|
-
//
|
|
48
|
-
//
|
|
49
|
-
//
|
|
50
|
-
//
|
|
51
|
-
//
|
|
52
|
-
//
|
|
52
|
+
const QUIESCENCE_MS = 12_000;
|
|
53
|
+
|
|
54
|
+
// Multi-step investigation prompt — designed to make the model emit
|
|
55
|
+
// transcript text BETWEEN tool calls, which is the assistant-content
|
|
56
|
+
// `text` block shape session-tail surfaces via the `text` event the
|
|
57
|
+
// answer-stream lane consumes. With the visible-answer-stream flag
|
|
58
|
+
// ON, those text events should become user-visible edit-in-place
|
|
59
|
+
// chat-timeline updates.
|
|
60
|
+
//
|
|
61
|
+
// We choose a research-style task because that pattern reliably
|
|
62
|
+
// emits `text` chunks (the model thinks out loud between Read /
|
|
63
|
+
// Bash steps) on most Claude versions. A pure-answer prompt (the
|
|
64
|
+
// previous version of this scenario) tended to make modern Claude
|
|
65
|
+
// jump straight to a single `reply` tool-call with no intermediate
|
|
66
|
+
// text — exercising the wrong path.
|
|
53
67
|
const PROMPT =
|
|
54
|
-
`
|
|
55
|
-
`
|
|
56
|
-
`
|
|
68
|
+
`Investigate this step by step:\n\n` +
|
|
69
|
+
`1. Read \`/etc/hostname\` and tell me what host this is — write a ` +
|
|
70
|
+
`sentence about it.\n` +
|
|
71
|
+
`2. Then read \`/etc/os-release\` and tell me what OS family / version.\n` +
|
|
72
|
+
`3. Then read \`/proc/cpuinfo\` (head it), and tell me the CPU model + ` +
|
|
73
|
+
`core count.\n` +
|
|
74
|
+
`4. Wrap up with a one-line summary of all three.\n\n` +
|
|
75
|
+
`Between each step, narrate what you're finding in plain prose ` +
|
|
76
|
+
`(not just bullet outputs). Don't batch all your observations into ` +
|
|
77
|
+
`one final reply — talk as you investigate.`;
|
|
57
78
|
|
|
58
79
|
interface TrailEntry {
|
|
59
80
|
relMs: number;
|
|
@@ -68,9 +89,9 @@ function pad(s: string, n: number): string {
|
|
|
68
89
|
return s.length >= n ? s : s + " ".repeat(n - s.length);
|
|
69
90
|
}
|
|
70
91
|
|
|
71
|
-
describe("uat:
|
|
92
|
+
describe("uat: conversational pacing on a multi-step turn", () => {
|
|
72
93
|
it(
|
|
73
|
-
"first
|
|
94
|
+
"first message lands within TTFO_BUDGET_MS; multiple silent messages; final answer pings",
|
|
74
95
|
async () => {
|
|
75
96
|
const sc = await spinUp({ agent: "test-harness" });
|
|
76
97
|
try {
|
|
@@ -137,79 +158,45 @@ describe("uat: visible answer-stream — model transcript renders live (#869 Pha
|
|
|
137
158
|
}
|
|
138
159
|
console.log("=================================================\n");
|
|
139
160
|
|
|
140
|
-
// ──
|
|
141
|
-
|
|
142
|
-
const fresh = trail.filter((e) => e.kind === "fresh");
|
|
143
|
-
const edits = trail.filter((e) => e.kind === "edit");
|
|
161
|
+
// ── Pacing assertions ─────────────────────────────────────
|
|
144
162
|
|
|
145
|
-
// (1) at least one
|
|
163
|
+
// (1) at least one bot message landed
|
|
146
164
|
expect(
|
|
147
|
-
|
|
148
|
-
`no
|
|
149
|
-
`responding OR the visible-answer-stream flag is OFF ` +
|
|
150
|
-
`(SWITCHROOM_VISIBLE_ANSWER_STREAM not set on the target ` +
|
|
151
|
-
`agent's container env). Re-check the agent's compose ` +
|
|
152
|
-
`environment.`,
|
|
165
|
+
trail.length,
|
|
166
|
+
`no bot replies observed — the agent isn't responding.`,
|
|
153
167
|
).toBeGreaterThanOrEqual(1);
|
|
154
168
|
|
|
155
|
-
// (2) first
|
|
156
|
-
const ttfoMs =
|
|
169
|
+
// (2) first message landed within TTFO budget
|
|
170
|
+
const ttfoMs = trail[0].relMs;
|
|
157
171
|
expect(
|
|
158
172
|
ttfoMs,
|
|
159
|
-
`TTFO ${ttfoMs}ms exceeded the
|
|
160
|
-
|
|
161
|
-
`was unusually slow to emit its first text chunk, OR the ` +
|
|
162
|
-
`visible answer-stream is not active. Default behaviour ` +
|
|
163
|
-
`(invisible draft) would never have surfaced a fresh ` +
|
|
164
|
-
`message at all, so the most likely cause is model latency.`,
|
|
165
|
-
).toBeLessThanOrEqual(VISIBLE_TTFO_BUDGET_MS);
|
|
166
|
-
|
|
167
|
-
// (3) first fresh message was silent (mid-turn edits don't ping)
|
|
168
|
-
expect(
|
|
169
|
-
fresh[0].silent,
|
|
170
|
-
`the first fresh message pinged the user — answer-stream ` +
|
|
171
|
-
`should send silently (disable_notification:true). A ping ` +
|
|
172
|
-
`here means an explicit \`reply\` tool may have fired instead.`,
|
|
173
|
-
).toBe(true);
|
|
173
|
+
`TTFO ${ttfoMs}ms exceeded the budget of ${TTFO_BUDGET_MS}ms.`,
|
|
174
|
+
).toBeLessThanOrEqual(TTFO_BUDGET_MS);
|
|
174
175
|
|
|
175
|
-
// (
|
|
176
|
-
//
|
|
177
|
-
// content grows on the same surface, not a chain of new sends).
|
|
178
|
-
const sameAnchorEdits = edits.filter(
|
|
179
|
-
(e) => e.messageId === firstAnchorMsgId,
|
|
180
|
-
);
|
|
176
|
+
// (3) multiple messages landed — proves the model is pacing,
|
|
177
|
+
// not dumping a single big reply
|
|
181
178
|
expect(
|
|
182
|
-
|
|
183
|
-
`
|
|
184
|
-
`
|
|
185
|
-
`
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
//
|
|
191
|
-
//
|
|
192
|
-
|
|
179
|
+
trail.length,
|
|
180
|
+
`only ${trail.length} message(s) observed — the model ` +
|
|
181
|
+
`collapsed this multi-step prompt into a single dump. ` +
|
|
182
|
+
`Beat 3 pacing (per-step updates) requires multiple ` +
|
|
183
|
+
`messages. Either the model didn't follow the prompt ` +
|
|
184
|
+
`or quiescence bailed early.`,
|
|
185
|
+
).toBeGreaterThanOrEqual(2);
|
|
186
|
+
|
|
187
|
+
// (4) at most one message pinged the user — beat-3 contract
|
|
188
|
+
// says only the FINAL answer pings; mid-turn updates pass
|
|
189
|
+
// `disable_notification: true`.
|
|
190
|
+
const pingedMessages = trail.filter((e) => !e.silent);
|
|
193
191
|
expect(
|
|
194
|
-
|
|
195
|
-
`${
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
(e) => e.messageId === firstAnchorMsgId,
|
|
203
|
-
);
|
|
204
|
-
for (let i = 1; i < anchorTrail.length; i++) {
|
|
205
|
-
expect(
|
|
206
|
-
anchorTrail[i].textLength,
|
|
207
|
-
`anchor message #${firstAnchorMsgId} text shrank between ` +
|
|
208
|
-
`events ${i - 1} (len=${anchorTrail[i - 1].textLength}) ` +
|
|
209
|
-
`and ${i} (len=${anchorTrail[i].textLength}) — ` +
|
|
210
|
-
`streaming text should only grow.`,
|
|
211
|
-
).toBeGreaterThanOrEqual(anchorTrail[i - 1].textLength);
|
|
212
|
-
}
|
|
192
|
+
pingedMessages.length,
|
|
193
|
+
`${pingedMessages.length} message(s) pinged the device — ` +
|
|
194
|
+
`the conversational-pacing contract allows AT MOST 1 ` +
|
|
195
|
+
`(the final answer). Mid-turn updates must be silent. ` +
|
|
196
|
+
`Pinged messages at: ${pingedMessages
|
|
197
|
+
.map((m) => `+${(m.relMs / 1000).toFixed(0)}s`)
|
|
198
|
+
.join(", ")}`,
|
|
199
|
+
).toBeLessThanOrEqual(1);
|
|
213
200
|
} finally {
|
|
214
201
|
await sc.tearDown();
|
|
215
202
|
}
|