switchroom 0.14.19 → 0.14.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +6 -1
- package/dist/auth-broker/index.js +6 -1
- package/dist/cli/notion-write-pretool.mjs +6 -1
- package/dist/cli/switchroom.js +17 -3
- package/dist/host-control/main.js +6 -1
- package/dist/vault/approvals/kernel-server.js +6 -1
- package/dist/vault/broker/server.js +6 -1
- package/package.json +2 -2
- package/telegram-plugin/README.md +7 -3
- package/telegram-plugin/bridge/bridge.ts +1 -1
- package/telegram-plugin/dist/bridge/bridge.js +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +368 -153
- package/telegram-plugin/dist/server.js +1 -1
- package/telegram-plugin/gateway/coalesce-attachments.ts +79 -0
- package/telegram-plugin/gateway/gateway.ts +257 -39
- package/telegram-plugin/gateway/interrupt-defer.ts +106 -0
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +21 -4
- package/telegram-plugin/tests/coalesce-attachments.test.ts +170 -0
- package/telegram-plugin/tests/interrupt-defer.test.ts +160 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +36 -0
- package/telegram-plugin/tests/permission-verdict-resume-guard.test.ts +86 -0
- package/telegram-plugin/tests/worker-activity-feed.test.ts +127 -0
- package/telegram-plugin/uat/assertions.ts +53 -0
- package/telegram-plugin/uat/driver.ts +28 -0
- package/telegram-plugin/uat/feed-matcher.test.ts +80 -0
- package/telegram-plugin/uat/fixtures/album/blue.jpg +0 -0
- package/telegram-plugin/uat/fixtures/album/green.jpg +0 -0
- package/telegram-plugin/uat/fixtures/album/red.jpg +0 -0
- package/telegram-plugin/uat/scenarios/jtbd-album-coalescing-dm.test.ts +136 -0
- package/telegram-plugin/uat/scenarios/jtbd-forwarded-burst-dm.test.ts +158 -0
- package/telegram-plugin/uat/scenarios/jtbd-memory-survives-restart-dm.test.ts +17 -2
- package/telegram-plugin/worker-activity-feed.ts +65 -9
|
@@ -11,6 +11,59 @@
|
|
|
11
11
|
|
|
12
12
|
import type { Driver, ObservedMessage, ObservedReaction } from "./driver.js";
|
|
13
13
|
|
|
14
|
+
/**
|
|
15
|
+
* Canonical shape of a worker-activity-feed message (#2000) as rendered
|
|
16
|
+
* in Telegram: a running header `🔧 Worker · …` that edits in place and
|
|
17
|
+
* finalizes to `✅ Worker done · …` / `⚠️ Worker failed · …`. The feed is
|
|
18
|
+
* default-on fleet-wide as of v0.14.19, so background sub-agent activity
|
|
19
|
+
* now surfaces as its own bot message in any chat — including DMs whose
|
|
20
|
+
* scenario only cares about the agent's conversational reply.
|
|
21
|
+
*
|
|
22
|
+
* Single source of truth; the worker-feed scenario asserts against this,
|
|
23
|
+
* and recall/reply scenarios exclude it via {@link isWorkerFeedMessage}.
|
|
24
|
+
*/
|
|
25
|
+
// Alternatives, in order: the running header, the done recap, the failed
// recap, and the emoji-less done/failed fallback. They are OR-ed into one
// case-insensitive pattern.
export const WORKER_FEED_RE = new RegExp(
  [
    "🔧\\s*Worker",
    "✅\\s*Worker done",
    "⚠️\\s*Worker failed",
    "Worker (?:done|failed)",
  ].join("|"),
  "i",
);
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* True when `m` is a worker-activity-feed message rather than the agent's
|
|
29
|
+
* own reply. Use it to skip feed noise when matching for a turn's actual
|
|
30
|
+
* answer — without it, an `expectMessage(/\S/)` can latch onto the feed's
|
|
31
|
+
* first paint and miss (or mis-time) the real reply. See #2000 / the
|
|
32
|
+
* memory-survives-restart recall scenario.
|
|
33
|
+
*/
|
|
34
|
+
export function isWorkerFeedMessage(m: ObservedMessage): boolean {
|
|
35
|
+
return WORKER_FEED_RE.test(m.text);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* A single tool-activity-feed line as rendered by
|
|
40
|
+
* `renderActivityFeed` (telegram-plugin/tool-activity-summary.ts): the
|
|
41
|
+
* in-progress step is `→ <label>`, finished steps are `✓ <label>`, and a
|
|
42
|
+
* long turn gets a `✓ +N earlier…` header. Telegram strips the bold/italic
|
|
43
|
+
* wrapping, so the observed text is just the marker glyph + label.
|
|
44
|
+
*/
|
|
45
|
+
// A feed line opens with exactly one marker glyph (`→` or `✓`) followed by
// a whitespace character.
const ACTIVITY_FEED_LINE_RE = /^(?:→|✓)\s/u;
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* True when `m` is the live tool-activity feed (the one-message list of
|
|
49
|
+
* "what the agent is doing this turn") rather than the agent's reply. A
|
|
50
|
+
* message qualifies only when EVERY non-empty line is an activity line —
|
|
51
|
+
* so a real reply that merely contains an arrow is never misclassified.
|
|
52
|
+
*
|
|
53
|
+
* Recall/reply scenarios must skip this in addition to
|
|
54
|
+
* {@link isWorkerFeedMessage}: on a turn that uses tools, the feed paints
|
|
55
|
+
* `→ Finding the right tool` as its own bot message before the real answer
|
|
56
|
+
* lands, and an `expectMessage(/\S/)` would otherwise latch onto it.
|
|
57
|
+
*/
|
|
58
|
+
export function isActivityFeedMessage(m: ObservedMessage): boolean {
|
|
59
|
+
const lines = m.text
|
|
60
|
+
.split("\n")
|
|
61
|
+
.map((l) => l.trim())
|
|
62
|
+
.filter((l) => l.length > 0);
|
|
63
|
+
if (lines.length === 0) return false;
|
|
64
|
+
return lines.every((l) => ACTIVITY_FEED_LINE_RE.test(l));
|
|
65
|
+
}
|
|
66
|
+
|
|
14
67
|
export interface PollOptions {
|
|
15
68
|
/** Hard deadline; the predicate must resolve truthy before this. */
|
|
16
69
|
timeout: number;
|
|
@@ -646,6 +646,34 @@ export class Driver {
|
|
|
646
646
|
return { messageId: sent.id };
|
|
647
647
|
}
|
|
648
648
|
|
|
649
|
+
/**
|
|
650
|
+
* Send a photo album (Telegram media_group) — multiple photos posted as
|
|
651
|
+
* one group, the way a forwarded album or a multi-image paste arrives.
|
|
652
|
+
* Exercises the gateway's A2 multi-attachment coalescing: with
|
|
653
|
+
* coalesce.max_attachments default 10, the whole album folds into ONE
|
|
654
|
+
* Claude turn (the agent sees image_path, image_path_2, …). The optional
|
|
655
|
+
* caption rides on the first item, matching Telegram client behaviour.
|
|
656
|
+
* Returns every sent message id (one per album item).
|
|
657
|
+
*/
|
|
658
|
+
async sendAlbum(
|
|
659
|
+
chatId: number,
|
|
660
|
+
photoPaths: string[],
|
|
661
|
+
caption?: string,
|
|
662
|
+
opts?: SendTextOptions,
|
|
663
|
+
): Promise<{ messageIds: number[] }> {
|
|
664
|
+
const c = this.requireClient();
|
|
665
|
+
const replyTo = opts?.replyTo ?? opts?.messageThreadId;
|
|
666
|
+
const medias = photoPaths.map((p, i) =>
|
|
667
|
+
InputMedia.photo(p, i === 0 && caption ? { caption } : undefined),
|
|
668
|
+
);
|
|
669
|
+
const sent = await c.sendMediaGroup(
|
|
670
|
+
chatId,
|
|
671
|
+
medias,
|
|
672
|
+
replyTo ? { replyTo } : undefined,
|
|
673
|
+
);
|
|
674
|
+
return { messageIds: sent.map((m) => m.id) };
|
|
675
|
+
}
|
|
676
|
+
|
|
649
677
|
/**
|
|
650
678
|
* Send or remove an emoji reaction on a target message. Used by the
|
|
651
679
|
* UAT reaction-trigger scenario (#1074) to exercise the gateway's
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { describe, expect, it } from "bun:test";
|
|
2
|
+
import {
|
|
3
|
+
isActivityFeedMessage,
|
|
4
|
+
isWorkerFeedMessage,
|
|
5
|
+
WORKER_FEED_RE,
|
|
6
|
+
} from "./assertions.js";
|
|
7
|
+
|
|
8
|
+
// Pins the worker-activity-feed detector (#2000) used by recall/reply
|
|
9
|
+
// scenarios to skip feed noise. The live UAT it guards can't run in CI
|
|
10
|
+
// (needs sudo + a real Telegram session), so this is the CI-verifiable
|
|
11
|
+
// floor for the matcher's behavior.
|
|
12
|
+
const feed = (text: string) => ({ text }) as Parameters<typeof isWorkerFeedMessage>[0];
|
|
13
|
+
|
|
14
|
+
describe("isWorkerFeedMessage", () => {
|
|
15
|
+
it("matches the running feed header", () => {
|
|
16
|
+
expect(isWorkerFeedMessage(feed("🔧 Worker · crawling changelog · 0:12"))).toBe(true);
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
it("matches the terminal done/failed recaps", () => {
|
|
20
|
+
expect(isWorkerFeedMessage(feed("✅ Worker done · 10 tools · 1:03"))).toBe(true);
|
|
21
|
+
expect(isWorkerFeedMessage(feed("⚠️ Worker failed · 3 tools"))).toBe(true);
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
it("matches a done/failed header even without the leading emoji", () => {
|
|
25
|
+
expect(isWorkerFeedMessage(feed("Worker done · 2 tools"))).toBe(true);
|
|
26
|
+
expect(isWorkerFeedMessage(feed("Worker failed mid-step"))).toBe(true);
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
it("does NOT match an ordinary agent reply", () => {
|
|
30
|
+
expect(isWorkerFeedMessage(feed("on it, pulling the logs now"))).toBe(false);
|
|
31
|
+
expect(
|
|
32
|
+
isWorkerFeedMessage(feed("SWITCHROOM_UAT_MEM_DEADBEEFCAFE1234")),
|
|
33
|
+
).toBe(false);
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it("does NOT match a reply that merely mentions the word worker", () => {
|
|
37
|
+
expect(
|
|
38
|
+
isWorkerFeedMessage(feed("I'll dispatch a worker to handle the crawl.")),
|
|
39
|
+
).toBe(false);
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
it("exposes the regex for scenarios that assert on the feed directly", () => {
|
|
43
|
+
expect(WORKER_FEED_RE.test("🔧 Worker · x")).toBe(true);
|
|
44
|
+
});
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
describe("isActivityFeedMessage", () => {
|
|
48
|
+
it("matches the in-progress step line", () => {
|
|
49
|
+
expect(isActivityFeedMessage(feed("→ Finding the right tool"))).toBe(true);
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
it("matches a multi-line feed (done steps + in-progress)", () => {
|
|
53
|
+
expect(
|
|
54
|
+
isActivityFeedMessage(feed("✓ Reading CLAUDE.md\n→ Searching memory")),
|
|
55
|
+
).toBe(true);
|
|
56
|
+
});
|
|
57
|
+
|
|
58
|
+
it("matches the +N earlier header", () => {
|
|
59
|
+
expect(
|
|
60
|
+
isActivityFeedMessage(feed("✓ +3 earlier…\n✓ Reading CLAUDE.md\n→ Searching memory")),
|
|
61
|
+
).toBe(true);
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it("does NOT match an ordinary agent reply", () => {
|
|
65
|
+
expect(isActivityFeedMessage(feed("on it, pulling the logs now"))).toBe(false);
|
|
66
|
+
expect(
|
|
67
|
+
isActivityFeedMessage(feed("SWITCHROOM_UAT_MEM_DEADBEEFCAFE1234")),
|
|
68
|
+
).toBe(false);
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
it("does NOT match a reply that merely contains an arrow mid-text", () => {
|
|
72
|
+
expect(
|
|
73
|
+
isActivityFeedMessage(feed("The flow is request → response → render.")),
|
|
74
|
+
).toBe(false);
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
it("does NOT match an empty message", () => {
|
|
78
|
+
expect(isActivityFeedMessage(feed(" "))).toBe(false);
|
|
79
|
+
});
|
|
80
|
+
});
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Album-coalescing scenario — driver sends a 3-photo Telegram album
|
|
3
|
+
* (media_group) in one shot; the gateway's A2 multi-attachment
|
|
4
|
+
* coalescing (coalesce.max_attachments, default 10 since v0.14.21)
|
|
5
|
+
* MUST fold all three into a SINGLE Claude turn, so the agent sees
|
|
6
|
+
* image_path + image_path_2 + image_path_3 together and can report a
|
|
7
|
+
* count of 3.
|
|
8
|
+
*
|
|
9
|
+
* Regression gate for the default-on flip (#2021): before max_attachments
|
|
10
|
+
* defaulted to 10, an album bypassed coalescing (each part its own turn),
|
|
11
|
+
* so the agent would only ever see ONE image per turn and answer "1".
|
|
12
|
+
* A reply of "3" proves the album coalesced.
|
|
13
|
+
*
|
|
14
|
+
* Part of: https://github.com/switchroom/switchroom/issues/865
|
|
15
|
+
*
|
|
16
|
+
* ## How the signal is read robustly
|
|
17
|
+
*
|
|
18
|
+
* The agent's answer is a bare count, which would collide with incidental
|
|
19
|
+
* digits the chat is now full of by default: the pinned progress card's
|
|
20
|
+
* timer (`00:03`) and — since the worker feed went default-on fleet-wide
|
|
21
|
+
* (#2009 / v0.14.19) — worker-feed lines like `🔧 Worker · … · 0:12`.
|
|
22
|
+
* A "first bot message containing a digit" matcher would latch onto one of
|
|
23
|
+
* those and false-fail even when coalescing is healthy (see the
|
|
24
|
+
* memory-survives-restart matcher-flake note + `isWorkerFeedMessage`).
|
|
25
|
+
*
|
|
26
|
+
* Two defences, mirroring the sibling `jtbd-forwarded-burst-dm` gate:
|
|
27
|
+
* 1. Anchor the answer on a distinctive token — the agent is told to
|
|
28
|
+
* reply `IMAGECOUNT=<n>`. "IMAGECOUNT" never appears in a card or a
|
|
29
|
+
* worker-feed line, so the matcher cannot collide with their digits.
|
|
30
|
+
* 2. Drain observed messages into a per-id map (collapsing streamed
|
|
31
|
+
* edits to latest text), skip our own sends + worker-feed noise, and
|
|
32
|
+
* poll until the ANSWER token appears — rather than returning the
|
|
33
|
+
* first message that happens to match.
|
|
34
|
+
*
|
|
35
|
+
* - Coalesced → one turn sees 3 images → `IMAGECOUNT=3`.
|
|
36
|
+
* - Non-coalesced → the turn carrying the caption/question sees only its
|
|
37
|
+
* own (first) image → `IMAGECOUNT=1`. Surfaced as an explicit failure.
|
|
38
|
+
*
|
|
39
|
+
* Fixtures: three tiny solid-colour JPEGs under fixtures/album/, committed
|
|
40
|
+
* so the gate runs without a generation step. (Regenerate with
|
|
41
|
+
* `ffmpeg -f lavfi -i color=c=red:s=320x240 -frames:v 1 red.jpg`.)
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
import path from "node:path";
|
|
45
|
+
import { existsSync } from "node:fs";
|
|
46
|
+
import { describe, expect, it } from "vitest";
|
|
47
|
+
import { spinUp } from "../harness.js";
|
|
48
|
+
import { pollUntil, isWorkerFeedMessage } from "../assertions.js";
|
|
49
|
+
import type { ObservedMessage } from "../driver.js";
|
|
50
|
+
|
|
51
|
+
const AGENT = "test-harness";
|
|
52
|
+
|
|
53
|
+
const FIXTURE_DIR = path.resolve(__dirname, "..", "fixtures", "album");
|
|
54
|
+
const PHOTOS = ["red.jpg", "green.jpg", "blue.jpg"].map((f) =>
|
|
55
|
+
path.join(FIXTURE_DIR, f),
|
|
56
|
+
);
|
|
57
|
+
|
|
58
|
+
const CAPTION =
|
|
59
|
+
"I just sent you a photo album in a single message. Count the separate " +
|
|
60
|
+
"image files you received in THIS ONE incoming message and reply with " +
|
|
61
|
+
"ONLY the token IMAGECOUNT=<n> (e.g. IMAGECOUNT=3). Nothing else.";
|
|
62
|
+
|
|
63
|
+
// Warm TTFO on test-harness is ~7s; an album adds the (sub-second)
|
|
64
|
+
// coalesce window plus the model looking at three images.
|
|
65
|
+
const ANSWER_TIMEOUT_MS = 90_000;
|
|
66
|
+
|
|
67
|
+
// Pull the count out of an `IMAGECOUNT=<n>` token, tolerating "= : whitespace".
|
|
68
|
+
/**
 * Extract the reported count from an `IMAGECOUNT=<n>` answer token.
 *
 * Matching is case-insensitive and tolerates optional whitespace and an
 * optional `=` or `:` separator between the keyword and the digits. The
 * first occurrence in `text` that is followed by digits wins.
 *
 * @param text - Observed message text to scan.
 * @returns The parsed base-10 count, or `undefined` when no token is present.
 */
function imageCountIn(text: string): number | undefined {
  // Optional chaining keeps this sound under `noUncheckedIndexedAccess`,
  // where the capture group is typed `string | undefined`.
  const digits = /IMAGECOUNT\s*[:=]?\s*(\d+)/i.exec(text)?.[1];
  return digits === undefined ? undefined : Number.parseInt(digits, 10);
}
|
|
72
|
+
|
|
73
|
+
describe("uat: album-coalescing DM round-trip", () => {
|
|
74
|
+
it(
|
|
75
|
+
"a 3-photo album folds into ONE turn — agent reports seeing 3 images",
|
|
76
|
+
async () => {
|
|
77
|
+
for (const p of PHOTOS) {
|
|
78
|
+
if (!existsSync(p)) {
|
|
79
|
+
throw new Error(
|
|
80
|
+
`album fixture missing at ${p} — see scenario header to regenerate`,
|
|
81
|
+
);
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
const sc = await spinUp({ agent: AGENT });
|
|
85
|
+
try {
|
|
86
|
+
// Observe BEFORE sending — observeMessages only sees live updates.
|
|
87
|
+
// Drain into a per-id map so streamed edits collapse to latest text;
|
|
88
|
+
// skip our own sends and worker-feed noise so neither can satisfy
|
|
89
|
+
// the count matcher.
|
|
90
|
+
const latestById = new Map<number, ObservedMessage>();
|
|
91
|
+
const stream = sc.driver.observeMessages(sc.botUserId);
|
|
92
|
+
const consume = (async () => {
|
|
93
|
+
for await (const m of stream) {
|
|
94
|
+
if (m.senderUserId === sc.driverUserId) continue;
|
|
95
|
+
if (isWorkerFeedMessage(m)) continue;
|
|
96
|
+
latestById.set(m.messageId, m);
|
|
97
|
+
}
|
|
98
|
+
})();
|
|
99
|
+
|
|
100
|
+
await sc.driver.sendAlbum(sc.botUserId, PHOTOS, CAPTION);
|
|
101
|
+
|
|
102
|
+
// Poll until a bot message carries an IMAGECOUNT token.
|
|
103
|
+
const answer = await pollUntil(
|
|
104
|
+
() => {
|
|
105
|
+
for (const m of latestById.values()) {
|
|
106
|
+
if (imageCountIn(m.text) !== undefined) return m;
|
|
107
|
+
}
|
|
108
|
+
return undefined;
|
|
109
|
+
},
|
|
110
|
+
{ timeout: ANSWER_TIMEOUT_MS, interval: 500 },
|
|
111
|
+
).catch(() => undefined);
|
|
112
|
+
|
|
113
|
+
await stream[Symbol.asyncIterator]().return?.(undefined as never);
|
|
114
|
+
await consume;
|
|
115
|
+
|
|
116
|
+
if (!answer) {
|
|
117
|
+
const seen = [...latestById.values()]
|
|
118
|
+
.map((m) => `#${m.messageId}=${JSON.stringify(m.text.slice(0, 60))}`)
|
|
119
|
+
.join(" ");
|
|
120
|
+
throw new Error(
|
|
121
|
+
`[album-coalescing] No bot reply carried an IMAGECOUNT token ` +
|
|
122
|
+
`within ${ANSWER_TIMEOUT_MS}ms. Bot messages seen: ${seen || "(none)"}.`,
|
|
123
|
+
);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
const count = imageCountIn(answer.text);
|
|
127
|
+
// Coalesced => 3. A non-coalescing gateway answers 1 (the question
|
|
128
|
+
// rides photo #1; the other two parts spill to their own turns).
|
|
129
|
+
expect(count).toBe(3);
|
|
130
|
+
} finally {
|
|
131
|
+
await sc.tearDown();
|
|
132
|
+
}
|
|
133
|
+
},
|
|
134
|
+
ANSWER_TIMEOUT_MS + 30_000,
|
|
135
|
+
);
|
|
136
|
+
});
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD scenario — forwarded burst / split paste coalesces into ONE turn.
|
|
3
|
+
*
|
|
4
|
+
* Serves: `reference/steer-or-queue-mid-flight.md` — the "Forwarded
|
|
5
|
+
* burst / split paste" UAT prompt. When several messages land in quick
|
|
6
|
+
* succession from the same sender (a forward of 3-4 messages, or a long
|
|
7
|
+
* paste Telegram split into chunks), inbound coalescing must merge them
|
|
8
|
+
* into a SINGLE Claude turn with shared context — not reply to each
|
|
9
|
+
* fragment in isolation.
|
|
10
|
+
*
|
|
11
|
+
* This is the end-to-end gate for the A1 coalescing work shipped in
|
|
12
|
+
* v0.14.18 (#2007). The merge logic itself is covered by unit + fuzz
|
|
13
|
+
* tests (`inbound-coalesce.test.ts`, `pending-inbound-buffer.test.ts`),
|
|
14
|
+
* but only this scenario exercises the real inbound → gateway coalescer
|
|
15
|
+
* → claude → outbound path over a live Telegram chat.
|
|
16
|
+
*
|
|
17
|
+
* ## How the signal is constructed
|
|
18
|
+
*
|
|
19
|
+
* Naively asking the agent to "combine facts from several messages"
|
|
20
|
+
* does NOT distinguish coalesced from fanned-out: even when each
|
|
21
|
+
* message becomes its own turn, every later turn carries the PRIOR
|
|
22
|
+
* turns in its conversation history, so the model could still answer
|
|
23
|
+
* from history. History bleed makes a content-combination assertion
|
|
24
|
+
* useless as a coalescing probe.
|
|
25
|
+
*
|
|
26
|
+
* The distinguishing fact is whether a SINGLE turn saw all the parts in
|
|
27
|
+
* ONE incoming message. So we anchor the instruction on "this single
|
|
28
|
+
* message": three messages are fired near-simultaneously (so they land
|
|
29
|
+
* inside the default 500ms coalesce window), each carrying a distinct
|
|
30
|
+
* code token, and the instruction (in the last part) asks the agent to
|
|
31
|
+
* echo every token it received *in this one incoming message*.
|
|
32
|
+
*
|
|
33
|
+
* - Coalesced → the merged turn's single message contains ALPHA,
|
|
34
|
+
* BRAVO and CHARLIE → the reply names all three.
|
|
35
|
+
* - Fanned out → the message carrying the instruction contains only
|
|
36
|
+
* its own token → the reply names just that one. (And the other
|
|
37
|
+
* two tokens arrive as their own separate turns.)
|
|
38
|
+
*
|
|
39
|
+
* Tokens are deliberately odd uppercase strings so a substring match is
|
|
40
|
+
* unambiguous and won't collide with incidental words in the reply.
|
|
41
|
+
*
|
|
42
|
+
* Order-independence: the three sends are dispatched concurrently to
|
|
43
|
+
* guarantee they share a coalesce window, which means Telegram may
|
|
44
|
+
* deliver them in any order. We assert SET membership (all three
|
|
45
|
+
* present), never order.
|
|
46
|
+
*/
|
|
47
|
+
|
|
48
|
+
import { describe, it, expect } from "vitest";
|
|
49
|
+
import { spinUp } from "../harness.js";
|
|
50
|
+
import { pollUntil } from "../assertions.js";
|
|
51
|
+
import type { ObservedMessage } from "../driver.js";
|
|
52
|
+
|
|
53
|
+
const AGENT = "test-harness";
|
|
54
|
+
|
|
55
|
+
// Distinctive code tokens — unlikely to appear incidentally in a reply.
|
|
56
|
+
const TOKENS = ["ALPHA", "BRAVO", "CHARLIE"] as const;
|
|
57
|
+
|
|
58
|
+
const BURST: string[] = [
|
|
59
|
+
`BURST-PROBE part 1 of 3. Code token: ${TOKENS[0]}.`,
|
|
60
|
+
`BURST-PROBE part 2 of 3. Code token: ${TOKENS[1]}.`,
|
|
61
|
+
`BURST-PROBE part 3 of 3. Code token: ${TOKENS[2]}. ` +
|
|
62
|
+
`Reply with ONLY the BURST-PROBE code tokens contained in THIS SINGLE ` +
|
|
63
|
+
`incoming message, slash-separated. If this message contains just one ` +
|
|
64
|
+
`token, reply with only that token.`,
|
|
65
|
+
];
|
|
66
|
+
|
|
67
|
+
// Generous budget: TTFO on the test-harness is ~7s warm; coalescing
|
|
68
|
+
// adds the (sub-second) window plus normal model latency.
|
|
69
|
+
const ANSWER_TIMEOUT_MS = 40_000;
|
|
70
|
+
|
|
71
|
+
function tokensIn(text: string): string[] {
|
|
72
|
+
const upper = text.toUpperCase();
|
|
73
|
+
return TOKENS.filter((t) => upper.includes(t));
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
describe("uat: forwarded burst / split paste coalesces into one turn", () => {
|
|
77
|
+
it(
|
|
78
|
+
"a 3-message burst is answered as ONE shared-context turn",
|
|
79
|
+
async () => {
|
|
80
|
+
const sc = await spinUp({ agent: AGENT });
|
|
81
|
+
try {
|
|
82
|
+
// Start observing BEFORE the burst — observeMessages only sees
|
|
83
|
+
// live updates, not history. Drain into a per-message-id map so
|
|
84
|
+
// streamed edits collapse to the message's latest text.
|
|
85
|
+
const latestById = new Map<number, ObservedMessage>();
|
|
86
|
+
const stream = sc.driver.observeMessages(sc.botUserId);
|
|
87
|
+
const consume = (async () => {
|
|
88
|
+
for await (const m of stream) {
|
|
89
|
+
if (m.senderUserId === sc.driverUserId) continue; // skip our own sends
|
|
90
|
+
latestById.set(m.messageId, m);
|
|
91
|
+
}
|
|
92
|
+
})();
|
|
93
|
+
|
|
94
|
+
// Fire all three concurrently so they share one coalesce window.
|
|
95
|
+
// Concurrency (not serial awaits) is what keeps inter-arrival
|
|
96
|
+
// under the 500ms default — three serial round-trips could blow
|
|
97
|
+
// the window on a slow link.
|
|
98
|
+
await Promise.all(BURST.map((t) => sc.sendDM(t)));
|
|
99
|
+
|
|
100
|
+
// Wait until a single bot message names all three tokens — the
|
|
101
|
+
// proof that one turn saw the whole burst in one incoming
|
|
102
|
+
// message.
|
|
103
|
+
const allThree = await pollUntil(
|
|
104
|
+
() => {
|
|
105
|
+
for (const m of latestById.values()) {
|
|
106
|
+
if (tokensIn(m.text).length === TOKENS.length) return m;
|
|
107
|
+
}
|
|
108
|
+
return undefined;
|
|
109
|
+
},
|
|
110
|
+
{ timeout: ANSWER_TIMEOUT_MS, interval: 500 },
|
|
111
|
+
).catch(() => undefined);
|
|
112
|
+
|
|
113
|
+
// Close the observer stream.
|
|
114
|
+
await stream[Symbol.asyncIterator]().return?.(undefined as never);
|
|
115
|
+
await consume;
|
|
116
|
+
|
|
117
|
+
const botMsgs = [...latestById.values()];
|
|
118
|
+
const tokenBearing = botMsgs.filter((m) => tokensIn(m.text).length > 0);
|
|
119
|
+
|
|
120
|
+
if (!allThree) {
|
|
121
|
+
const seen = tokenBearing
|
|
122
|
+
.map((m) => `#${m.messageId}=[${tokensIn(m.text).join(",")}]`)
|
|
123
|
+
.join(" ");
|
|
124
|
+
throw new Error(
|
|
125
|
+
`[forwarded-burst] No single bot reply named all of ` +
|
|
126
|
+
`${TOKENS.join("/")}. This is the coalescing regression: the ` +
|
|
127
|
+
`burst fanned out into separate turns so no turn saw the full ` +
|
|
128
|
+
`message. Token-bearing replies: ${seen || "(none)"}.`,
|
|
129
|
+
);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
expect(tokensIn(allThree.text).sort()).toEqual([...TOKENS].sort());
|
|
133
|
+
|
|
134
|
+
// Forensic: a coalesced burst should produce ONE answer that
|
|
135
|
+
// names the tokens. Several token-bearing replies hint at a
|
|
136
|
+
// partial fan-out even though one of them happened to be
|
|
137
|
+
// complete — worth a warning before it becomes a hard failure.
|
|
138
|
+
if (tokenBearing.length > 1) {
|
|
139
|
+
console.warn(
|
|
140
|
+
`[forwarded-burst] ${tokenBearing.length} token-bearing replies ` +
|
|
141
|
+
`observed (expected 1). Possible partial fan-out: ` +
|
|
142
|
+
tokenBearing
|
|
143
|
+
.map((m) => `#${m.messageId}=[${tokensIn(m.text).join(",")}]`)
|
|
144
|
+
.join(" "),
|
|
145
|
+
);
|
|
146
|
+
} else {
|
|
147
|
+
console.log(
|
|
148
|
+
`[forwarded-burst] One shared-context reply named all tokens ` +
|
|
149
|
+
`(#${allThree.messageId}). Coalescing healthy.`,
|
|
150
|
+
);
|
|
151
|
+
}
|
|
152
|
+
} finally {
|
|
153
|
+
await sc.tearDown();
|
|
154
|
+
}
|
|
155
|
+
},
|
|
156
|
+
ANSWER_TIMEOUT_MS + 20_000,
|
|
157
|
+
);
|
|
158
|
+
});
|
|
@@ -53,9 +53,24 @@ import { describe, it, expect, beforeAll } from "vitest";
|
|
|
53
53
|
import { execSync } from "node:child_process";
|
|
54
54
|
import { randomBytes } from "node:crypto";
|
|
55
55
|
import { spinUp } from "../harness.js";
|
|
56
|
+
import { isActivityFeedMessage, isWorkerFeedMessage } from "../assertions.js";
|
|
57
|
+
import type { ObservedMessage } from "../driver.js";
|
|
56
58
|
|
|
57
59
|
const AGENT = "test-harness";
|
|
58
60
|
|
|
61
|
+
// Two classes of bot message are NOT the agent's reply and must be
|
|
62
|
+
// skipped, or a bare `/\S/` matcher latches onto them and the
|
|
63
|
+
// verbatim-token check fails against noise instead of catching a real
|
|
64
|
+
// memory miss:
|
|
65
|
+
// 1. The worker-activity feed (#2000, default-on since v0.14.19) — a
|
|
66
|
+
// stray background sub-agent posts `🔧 Worker · …` into this DM.
|
|
67
|
+
// 2. The tool-activity feed — on a turn that uses tools (memory recall
|
|
68
|
+
// does), `→ Finding the right tool` paints as its own message before
|
|
69
|
+
// the real answer lands.
|
|
70
|
+
// Match the first non-empty bot reply that is neither.
|
|
71
|
+
const isReply = (m: ObservedMessage): boolean =>
|
|
72
|
+
/\S/.test(m.text) && !isWorkerFeedMessage(m) && !isActivityFeedMessage(m);
|
|
73
|
+
|
|
59
74
|
const RESTART_BUDGET_MS = 90_000;
|
|
60
75
|
const CAPTURE_REPLY_BUDGET_MS = 60_000;
|
|
61
76
|
const RECALL_REPLY_BUDGET_MS = 120_000;
|
|
@@ -139,7 +154,7 @@ const sudoOk = canShellSudo();
|
|
|
139
154
|
`(This is a memory-survival UAT — store it via hindsight.)`,
|
|
140
155
|
);
|
|
141
156
|
|
|
142
|
-
const captureReply = await sc1.expectMessage(
|
|
157
|
+
const captureReply = await sc1.expectMessage(isReply, {
|
|
143
158
|
from: "bot",
|
|
144
159
|
timeout: CAPTURE_REPLY_BUDGET_MS,
|
|
145
160
|
});
|
|
@@ -180,7 +195,7 @@ const sudoOk = canShellSudo();
|
|
|
180
195
|
`Reply with the token only, no extra text.`,
|
|
181
196
|
);
|
|
182
197
|
|
|
183
|
-
const recallReply = await sc2.expectMessage(
|
|
198
|
+
const recallReply = await sc2.expectMessage(isReply, {
|
|
184
199
|
from: "bot",
|
|
185
200
|
timeout: RECALL_REPLY_BUDGET_MS + 5_000,
|
|
186
201
|
});
|
|
@@ -25,15 +25,21 @@
|
|
|
25
25
|
* - 429 cooldown + message_id drift resilience (re-post on stale edit),
|
|
26
26
|
* - a forced terminal edit on `finish` regardless of throttle.
|
|
27
27
|
*
|
|
28
|
-
* The feed is gated to BACKGROUND workers and
|
|
29
|
-
* `SWITCHROOM_WORKER_ACTIVITY_FEED`
|
|
30
|
-
* watcher already drives the cues (it polls the worker jsonl
|
|
31
|
-
* so it keeps firing after the parent turn ends), which is why
|
|
32
|
-
* is fed from watcher callbacks rather than the bridge event stream.
|
|
28
|
+
* The feed is gated to BACKGROUND workers and is ON by default; set
|
|
29
|
+
* `SWITCHROOM_WORKER_ACTIVITY_FEED=0` to disable it — see the gateway
|
|
30
|
+
* wiring. The watcher already drives the cues (it polls the worker jsonl
|
|
31
|
+
* directly, so it keeps firing after the parent turn ends), which is why
|
|
32
|
+
* the feed is fed from watcher callbacks rather than the bridge event stream.
|
|
33
33
|
*/
|
|
34
34
|
|
|
35
35
|
import { escapeHtml, formatDuration, truncate } from './card-format.js'
|
|
36
36
|
|
|
37
|
+
/** Worker-activity feed is ON by default; an operator opts out with
|
|
38
|
+
* SWITCHROOM_WORKER_ACTIVITY_FEED=0. */
|
|
39
|
+
export function isWorkerActivityFeedEnabled(envVal: string | undefined): boolean {
|
|
40
|
+
return envVal !== '0'
|
|
41
|
+
}
|
|
42
|
+
|
|
37
43
|
export type WorkerActivityState = 'running' | 'done' | 'failed'
|
|
38
44
|
|
|
39
45
|
/** The render-relevant snapshot of a worker at one instant. */
|
|
@@ -46,6 +52,14 @@ export interface WorkerActivityView {
|
|
|
46
52
|
toolCount: number
|
|
47
53
|
/** The worker's latest narrative line, if any (already capped upstream). */
|
|
48
54
|
latestSummary: string
|
|
55
|
+
/**
|
|
56
|
+
* Accumulated narrative lines, oldest→newest, already deduped + capped by
|
|
57
|
+
* the feed manager. When present and non-empty, the render grows a block
|
|
58
|
+
* of `↳` lines (mirroring the main agent's live answer) instead of
|
|
59
|
+
* collapsing to the single `latestSummary` line. Absent/empty → the
|
|
60
|
+
* single-line fallback (back-compat for direct render callers).
|
|
61
|
+
*/
|
|
62
|
+
narrativeLines?: string[]
|
|
49
63
|
/** Wall-clock since dispatch, ms. */
|
|
50
64
|
elapsedMs: number
|
|
51
65
|
state: WorkerActivityState
|
|
@@ -68,6 +82,13 @@ export interface BotApiForWorkerFeed {
|
|
|
68
82
|
const DESC_MAX = 80
|
|
69
83
|
const TOOL_ARG_MAX = 64
|
|
70
84
|
const SUMMARY_MAX = 100
|
|
85
|
+
/**
|
|
86
|
+
* How many trailing narrative lines the live feed keeps visible. The feed
|
|
87
|
+
* grows like the main agent's answer but can't grow unbounded — Telegram
|
|
88
|
+
* caps message length and a wall of stale lines buries the live one. Six
|
|
89
|
+
* keeps recent context without dominating the chat.
|
|
90
|
+
*/
|
|
91
|
+
const NARRATIVE_MAX_LINES = 6
|
|
71
92
|
|
|
72
93
|
/**
|
|
73
94
|
* Render the worker-activity message body as Telegram HTML.
|
|
@@ -105,10 +126,23 @@ export function renderWorkerActivity(v: WorkerActivityView): string {
|
|
|
105
126
|
activity = `<i>starting… (${elapsed})</i>`
|
|
106
127
|
}
|
|
107
128
|
|
|
108
|
-
const summary = v.latestSummary.trim()
|
|
109
129
|
const lines = [header, activity]
|
|
110
|
-
|
|
111
|
-
|
|
130
|
+
|
|
131
|
+
// Growing narrative block when the manager has accumulated lines; the feed
|
|
132
|
+
// reads like the main agent's live answer rather than a single replaced
|
|
133
|
+
// status line. Fall back to the single latestSummary line otherwise.
|
|
134
|
+
const narrative = (v.narrativeLines ?? [])
|
|
135
|
+
.map((s) => s.trim())
|
|
136
|
+
.filter((s) => s.length > 0)
|
|
137
|
+
if (narrative.length > 0) {
|
|
138
|
+
for (const line of narrative) {
|
|
139
|
+
lines.push(` ↳ <i>${escapeHtml(truncate(line, SUMMARY_MAX))}</i>`)
|
|
140
|
+
}
|
|
141
|
+
} else {
|
|
142
|
+
const summary = v.latestSummary.trim()
|
|
143
|
+
if (summary.length > 0) {
|
|
144
|
+
lines.push(` ↳ <i>${escapeHtml(truncate(summary, SUMMARY_MAX))}</i>`)
|
|
145
|
+
}
|
|
112
146
|
}
|
|
113
147
|
return lines.join('\n')
|
|
114
148
|
}
|
|
@@ -142,6 +176,12 @@ interface WorkerHandle {
|
|
|
142
176
|
lastBody: string | null
|
|
143
177
|
lastEditAt: number
|
|
144
178
|
cooldownUntil: number
|
|
179
|
+
/**
|
|
180
|
+
* Accumulated narrative lines (oldest→newest), deduped against the
|
|
181
|
+
* immediately-preceding line and capped to NARRATIVE_MAX_LINES. Grows the
|
|
182
|
+
* live render so the feed reads like the main agent's answer.
|
|
183
|
+
*/
|
|
184
|
+
narrative: string[]
|
|
145
185
|
/** Per-worker serialization chain so ticks can't interleave sends. */
|
|
146
186
|
chain: Promise<void>
|
|
147
187
|
}
|
|
@@ -203,9 +243,24 @@ export function createWorkerActivityFeed(opts: WorkerActivityFeedOpts): WorkerAc
|
|
|
203
243
|
log(`worker-feed: ${label} 429 — backing off ${retryAfter}s`)
|
|
204
244
|
}
|
|
205
245
|
|
|
246
|
+
function accumulateNarrative(h: WorkerHandle, view: WorkerActivityView): void {
|
|
247
|
+
const line = view.latestSummary.trim()
|
|
248
|
+
if (line.length === 0) return
|
|
249
|
+
// Dedup against the immediately-preceding line — the watcher re-emits the
|
|
250
|
+
// same narrative across ticks while a tool runs; we only grow on change.
|
|
251
|
+
if (h.narrative[h.narrative.length - 1] === line) return
|
|
252
|
+
h.narrative.push(line)
|
|
253
|
+
if (h.narrative.length > NARRATIVE_MAX_LINES) {
|
|
254
|
+
h.narrative.splice(0, h.narrative.length - NARRATIVE_MAX_LINES)
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
206
258
|
async function doUpdate(h: WorkerHandle, view: WorkerActivityView): Promise<void> {
|
|
259
|
+
// Accumulate before any gate so a throttled/cooled-down tick still grows
|
|
260
|
+
// the narrative — the line surfaces on the next edit that does fire.
|
|
261
|
+
accumulateNarrative(h, view)
|
|
207
262
|
if (nowFn() < h.cooldownUntil) return
|
|
208
|
-
const body = renderWorkerActivity(view)
|
|
263
|
+
const body = renderWorkerActivity({ ...view, narrativeLines: h.narrative })
|
|
209
264
|
|
|
210
265
|
// First paint: hold off until the worker has run long enough to be
|
|
211
266
|
// worth a message; trivial workers stay silent (handback covers them).
|
|
@@ -284,6 +339,7 @@ export function createWorkerActivityFeed(opts: WorkerActivityFeedOpts): WorkerAc
|
|
|
284
339
|
lastBody: null,
|
|
285
340
|
lastEditAt: 0,
|
|
286
341
|
cooldownUntil: 0,
|
|
342
|
+
narrative: [],
|
|
287
343
|
chain: Promise.resolve(),
|
|
288
344
|
}
|
|
289
345
|
handles.set(agentId, h)
|