switchroom 0.15.44 → 0.16.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +122 -88
- package/dist/auth-broker/index.js +463 -177
- package/dist/cli/autoaccept-poll.js +4842 -35
- package/dist/cli/drive-write-pretool.mjs +17 -14
- package/dist/cli/notion-write-pretool.mjs +117 -86
- package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
- package/dist/cli/self-improve-stop.mjs +428 -0
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +3249 -1241
- package/dist/cli/ui/index.html +1 -1
- package/dist/host-control/main.js +2833 -355
- package/dist/vault/approvals/kernel-server.js +7482 -7439
- package/dist/vault/broker/server.js +11315 -11272
- package/examples/minimal.yaml +1 -0
- package/examples/switchroom.yaml +1 -0
- package/package.json +3 -3
- package/profiles/_base/start.sh.hbs +88 -1
- package/profiles/_shared/execution-discipline.md.hbs +18 -0
- package/profiles/default/CLAUDE.md.hbs +3 -22
- package/telegram-plugin/.claude-plugin/plugin.json +2 -2
- package/telegram-plugin/answer-stream-flag.ts +12 -49
- package/telegram-plugin/answer-stream.ts +5 -150
- package/telegram-plugin/auth-snapshot-format.ts +280 -48
- package/telegram-plugin/auto-fallback-fleet.ts +44 -1
- package/telegram-plugin/context-exhaustion.ts +12 -0
- package/telegram-plugin/demo-mask.ts +154 -0
- package/telegram-plugin/dist/bridge/bridge.js +167 -124
- package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
- package/telegram-plugin/dist/server.js +215 -172
- package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
- package/telegram-plugin/draft-stream.ts +47 -410
- package/telegram-plugin/final-answer-detect.ts +17 -12
- package/telegram-plugin/fleet-fallback-resume.ts +131 -0
- package/telegram-plugin/format.ts +56 -19
- package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
- package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
- package/telegram-plugin/gateway/auth-command.ts +70 -14
- package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
- package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
- package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
- package/telegram-plugin/gateway/current-turn-map.ts +188 -0
- package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
- package/telegram-plugin/gateway/effort-command.ts +8 -3
- package/telegram-plugin/gateway/emission-authority.ts +369 -0
- package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
- package/telegram-plugin/gateway/gateway.ts +1837 -291
- package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
- package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
- package/telegram-plugin/gateway/represent-guard.ts +72 -0
- package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
- package/telegram-plugin/gateway/status-surface-log.ts +14 -3
- package/telegram-plugin/history.ts +33 -11
- package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
- package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
- package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
- package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
- package/telegram-plugin/issues-card.ts +4 -0
- package/telegram-plugin/model-unavailable.ts +124 -0
- package/telegram-plugin/narrative-dedup.ts +69 -0
- package/telegram-plugin/over-ping-safety-net.ts +70 -4
- package/telegram-plugin/package.json +3 -3
- package/telegram-plugin/pending-work-progress.ts +12 -0
- package/telegram-plugin/permission-rule.ts +32 -5
- package/telegram-plugin/permission-title.ts +152 -9
- package/telegram-plugin/quota-check.ts +13 -0
- package/telegram-plugin/quota-watch.ts +135 -7
- package/telegram-plugin/registry/turns-schema.test.ts +24 -0
- package/telegram-plugin/registry/turns-schema.ts +9 -0
- package/telegram-plugin/runtime-metrics.ts +13 -0
- package/telegram-plugin/session-tail.ts +96 -11
- package/telegram-plugin/silence-poke.ts +170 -24
- package/telegram-plugin/slot-banner-driver.ts +3 -0
- package/telegram-plugin/status-no-truncate.ts +44 -0
- package/telegram-plugin/status-reactions.ts +20 -3
- package/telegram-plugin/stream-controller.ts +4 -23
- package/telegram-plugin/stream-reply-handler.ts +6 -24
- package/telegram-plugin/streaming-metrics.ts +91 -0
- package/telegram-plugin/subagent-watcher.ts +212 -66
- package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
- package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
- package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
- package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
- package/telegram-plugin/tests/answer-stream.test.ts +2 -411
- package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
- package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
- package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
- package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
- package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
- package/telegram-plugin/tests/demo-mask.test.ts +127 -0
- package/telegram-plugin/tests/draft-stream.test.ts +0 -827
- package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
- package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
- package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
- package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
- package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
- package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
- package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
- package/telegram-plugin/tests/feed-survival.test.ts +526 -0
- package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
- package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
- package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
- package/telegram-plugin/tests/history.test.ts +60 -0
- package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
- package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
- package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
- package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
- package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
- package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
- package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
- package/telegram-plugin/tests/permission-rule.test.ts +17 -0
- package/telegram-plugin/tests/permission-title.test.ts +206 -17
- package/telegram-plugin/tests/quota-watch.test.ts +252 -9
- package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
- package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
- package/telegram-plugin/tests/represent-guard.test.ts +162 -0
- package/telegram-plugin/tests/session-tail.test.ts +147 -3
- package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
- package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
- package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
- package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
- package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
- package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
- package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
- package/telegram-plugin/tests/telegram-format.test.ts +101 -6
- package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
- package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
- package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
- package/telegram-plugin/tests/tool-labels.test.ts +67 -0
- package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
- package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
- package/telegram-plugin/tests/welcome-text.test.ts +32 -3
- package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
- package/telegram-plugin/tool-activity-summary.ts +375 -58
- package/telegram-plugin/turn-liveness-floor.ts +240 -0
- package/telegram-plugin/uat/assertions.ts +115 -0
- package/telegram-plugin/uat/driver.ts +68 -0
- package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
- package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
- package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
- package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
- package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
- package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
- package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
- package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
- package/telegram-plugin/welcome-text.ts +13 -1
- package/telegram-plugin/worker-activity-feed.ts +157 -82
- package/telegram-plugin/draft-transport.ts +0 -122
- package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
- package/telegram-plugin/tests/draft-transport.test.ts +0 -211
|
@@ -5,187 +5,173 @@
|
|
|
5
5
|
* Verifies three acceptance criteria from the RFC in a single run because
|
|
6
6
|
* they share setup:
|
|
7
7
|
*
|
|
8
|
-
* AC-1 — Background-dispatch-and-continue:
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
* subagent-watcher
|
|
8
|
+
* AC-1 — Background-dispatch-and-continue: worker-feed message appears
|
|
9
|
+
* while the background sub-agent runs; persists past parent
|
|
10
|
+
* `turn_end` so the user can watch the worker in flight.
|
|
11
|
+
* AC-2 — Done semantics: feed message reads `running ·` while the bg
|
|
12
|
+
* sub-agent runs; flips to `finished · completed` (or `failed`)
|
|
13
|
+
* after it terminates.
|
|
14
|
+
* AC-3 — Live activity: feed body materially changes across a 6s window
|
|
15
|
+
* while bg work is in flight (elapsed counter or narrative step
|
|
16
|
+
* advances) — proves the subagent-watcher is actually feeding the
|
|
17
|
+
* renderer.
|
|
17
18
|
*
|
|
18
19
|
* Prompt strategy: **Option 1 (explicit tool-naming)** per the RFC §
|
|
19
20
|
* "Background-dispatch prompt". An earlier Option-2 (naturalistic)
|
|
20
21
|
* attempt produced exactly the failure mode the RFC predicted —
|
|
21
|
-
* model ran the sleeps inline via Bash,
|
|
22
|
+
* model ran the sleeps inline via Bash, feed never reached Background
|
|
22
23
|
* phase. This test verifies the *visibility infra*, not the LLM's
|
|
23
24
|
* delegation judgment; pinning the tool name and arg keeps the
|
|
24
25
|
* scenario deterministic.
|
|
25
26
|
*
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
27
|
+
* Architecture note (post-#1122 PR3): the pinned progress card was
|
|
28
|
+
* deleted. Background sub-agent visibility is now surfaced via the
|
|
29
|
+
* worker-activity-feed (`SWITCHROOM_WORKER_ACTIVITY_FEED=1`): a regular
|
|
30
|
+
* Telegram message that posts once the worker has been running for
|
|
31
|
+
* `firstPaintMin` (8s default on test-harness) and edits in-place as
|
|
32
|
+
* activity arrives. This test drives assertions against that feed.
|
|
29
33
|
*
|
|
30
|
-
*
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
* `
|
|
34
|
+
* Requires the same env as the other DM scenarios (see SETUP.md §6).
|
|
35
|
+
*
|
|
36
|
+
* Root causes fixed in #2501 (this PR):
|
|
37
|
+
* Bug 1 — orphan correlation. `backfillJsonlAgentId` used a fuzzy
|
|
38
|
+
* (agentType, description) match to link a newly-discovered JSONL
|
|
39
|
+
* to its registry row. When the match failed (description null,
|
|
40
|
+
* or race), `jsonl_agent_id` stayed NULL, so
|
|
41
|
+
* `resolveWorkerFeedDispatch(getSubagentByJsonlId(db, id), …)`
|
|
42
|
+
* returned `{ isBackground: false }` — routing the worker as a
|
|
43
|
+
* foreground sub-agent and suppressing the worker-feed. Fix:
|
|
44
|
+
* prefer the direct `toolUseId` PK lookup that Claude Code already
|
|
45
|
+
* writes to `agent-<id>.meta.json`.
|
|
46
|
+
* Bug 2 — liveness writes silently skipped. With `jsonl_agent_id = NULL`
|
|
47
|
+
* (Bug 1 not fixed), `bumpSubagentActivity` queries by
|
|
48
|
+
* `jsonl_agent_id` and finds nothing — every liveness tick is a
|
|
49
|
+
* no-op and the last_activity_at column never updates. Fixed as a
|
|
50
|
+
* consequence of Bug 1 (once the row is linked, liveness writes
|
|
51
|
+
* land).
|
|
52
|
+
*
|
|
53
|
+
* Runtime budget is generous — the inner deadlines sum to ~312s
|
|
54
|
+
* worst-case (45s parent-ack + 75s feed-first-paint + 12s delta + 180s
|
|
55
|
+
* done) plus ~12s spinUp overhead. The outer `it()` timeout absorbs the lot.
|
|
56
|
+
* The 180s done-window accommodates the stall-detection path: the watcher
|
|
57
|
+
* fires `onFinish` 60s after the last JSONL event, because background
|
|
58
|
+
* workers don't reliably emit `sub_agent_turn_end`.
|
|
34
59
|
*/
|
|
35
60
|
|
|
36
61
|
import { describe, expect, it } from "vitest";
|
|
37
62
|
import { spinUp } from "../harness.js";
|
|
63
|
+
import { WORKER_FEED_RE } from "../assertions.js";
|
|
38
64
|
|
|
39
65
|
// Explicit dispatch prompt (Option 1 per the RFC §"Background-dispatch
|
|
40
66
|
// prompt"). The naturalistic Option-2 version didn't reliably get the
|
|
41
67
|
// model to use the Agent tool with run_in_background:true — first
|
|
42
68
|
// attempt produced the failure mode the RFC predicted (parent ran the
|
|
43
|
-
// sleeps inline via Bash;
|
|
69
|
+
// sleeps inline via Bash; feed never surfaced Background-phase activity).
|
|
44
70
|
//
|
|
45
71
|
// This test asserts the VISIBILITY INFRA works, not that the model
|
|
46
72
|
// makes good delegation judgments. Naming the tool + the arg lets the
|
|
47
|
-
// scenario be deterministic.
|
|
48
|
-
// Agent tool even with this prompt, that's an unrelated bug (model
|
|
49
|
-
// alignment / tool registration) and the scenario fails distinctly
|
|
50
|
-
// from the visibility-infra failure modes we're trying to catch.
|
|
73
|
+
// scenario be deterministic.
|
|
51
74
|
//
|
|
52
|
-
// Time profile: ~60s of bg work, paced with
|
|
53
|
-
//
|
|
54
|
-
// surface as fresh
|
|
55
|
-
//
|
|
56
|
-
// heartbeat tick (5s default), and snapshot again.
|
|
75
|
+
// Time profile: ~60s of bg work, paced with ten short steps so the
|
|
76
|
+
// worker emits multiple tool_use + narrative events the subagent-watcher
|
|
77
|
+
// can surface as fresh edits. We need the Background phase to last long
|
|
78
|
+
// enough to clear the 8s first-paint threshold and take a snapshot.
|
|
57
79
|
const BG_DISPATCH_PROMPT =
|
|
58
80
|
`Use the Agent tool with subagent_type "general-purpose" and ` +
|
|
59
81
|
`run_in_background: true to dispatch a worker with this exact task: ` +
|
|
60
|
-
`"
|
|
61
|
-
|
|
62
|
-
`
|
|
63
|
-
`
|
|
64
|
-
`
|
|
65
|
-
`progress
|
|
82
|
+
`"Do ten steps, ONE AT A TIME, k = 1 through 10. Before each step ` +
|
|
83
|
+
`write a brief one-sentence narration of what you are about to do, ` +
|
|
84
|
+
`then run \`sleep 2\` via the Bash tool, then run \`echo step-k\` via ` +
|
|
85
|
+
`the Bash tool (substitute the real number for k). Run every sleep and ` +
|
|
86
|
+
`every echo as its OWN separate Bash call — never batch or chain them ` +
|
|
87
|
+
`with && — and narrate before each so progress surfaces incrementally. ` +
|
|
88
|
+
`Do not stop early; complete all ten steps." After dispatching, send a ` +
|
|
89
|
+
`brief reply saying you've kicked off the background worker so I can ` +
|
|
90
|
+
`watch the progress feed.`;
|
|
91
|
+
|
|
92
|
+
const WORKER_RUNNING_RE = /running\s*·/i;
|
|
93
|
+
const WORKER_DONE_RE = /finished\s*·\s*(completed|failed)/i;
|
|
66
94
|
|
|
67
|
-
/**
|
|
68
|
-
* STATUS: currently red — surfaces two real production bugs the
|
|
69
|
-
* RFC §Risks predicted as possible-but-unverified. Marked `it.fails`
|
|
70
|
-
* so a future fix flips it green and a regression flips it red again.
|
|
71
|
-
*
|
|
72
|
-
* Bug 1 — orphan correlation. The parent's `Agent` tool_use_id
|
|
73
|
-
* doesn't get matched to the spawned `sub_agent_started`
|
|
74
|
-
* event. Gateway log: `pendingSpawns=0 correlated=orphan`.
|
|
75
|
-
* Result: `isBackgroundDispatch` is never set on the fleet
|
|
76
|
-
* member; the card's header phase transitions to Background
|
|
77
|
-
* only by accident (orphans defer too, but they don't carry
|
|
78
|
-
* the bg flag).
|
|
79
|
-
*
|
|
80
|
-
* Bug 2 — subagent-watcher can't track the worker. Gateway log:
|
|
81
|
-
* `subagent-watcher: liveness skip <agentId> — row not in
|
|
82
|
-
* DB yet (Phase 2 Pre hook pending)`. Result: no
|
|
83
|
-
* sub_agent_tool_use events reach the fleet member; the
|
|
84
|
-
* fleet row's `last activity` field never updates with the
|
|
85
|
-
* worker's actual tool calls. The card edits we see are
|
|
86
|
-
* just elapsed-counter ticks from the heartbeat.
|
|
87
|
-
*
|
|
88
|
-
* Both bugs are real and live on `main`. The scenario above passes
|
|
89
|
-
* AC-1 (card stays pinned), partially passes AC-2 (Background phase
|
|
90
|
-
* fires) and AC-3 (card body changes — from heartbeat alone), and
|
|
91
|
-
* fails AC-2's closing half (card never reaches Done in 120s because
|
|
92
|
-
* the orphan never terminates from the gateway's view).
|
|
93
|
-
*
|
|
94
|
-
* When Bug 1 + Bug 2 are fixed, change `describe.skip` to `describe`
|
|
95
|
-
* below — the assertions are correct; only the production code is
|
|
96
|
-
* wrong.
|
|
97
|
-
*
|
|
98
|
-
* Update post-#1105: all five RFC bugs (1–5 in earlier PRs, 6–7 in
|
|
99
|
-
* #1105) merged. Unskipped here for the next UAT re-run. If 6/6 ACs
|
|
100
|
-
* pass, close #709 / #776 / #782 / #788.
|
|
101
|
-
*/
|
|
102
95
|
describe("uat: background sub-agent visibility (#709/#776/#782/#788)", () => {
|
|
103
96
|
it(
|
|
104
|
-
"
|
|
97
|
+
"worker-feed appears with running status then flips to finished once the sub-agent completes",
|
|
105
98
|
async () => {
|
|
106
99
|
const sc = await spinUp({ agent: "test-harness" });
|
|
107
100
|
try {
|
|
108
101
|
await sc.sendDM(BG_DISPATCH_PROMPT);
|
|
109
102
|
|
|
110
|
-
//
|
|
111
|
-
|
|
112
|
-
const card = await sc.expectPinnedCard({ timeout: 15_000 });
|
|
113
|
-
expect(card.messageId).toBeGreaterThan(0);
|
|
103
|
+
// Parent ack reply — confirms the parent turn closed.
|
|
104
|
+
await sc.expectMessage(/.+/, { from: "bot", timeout: 45_000 });
|
|
114
105
|
|
|
115
|
-
//
|
|
116
|
-
//
|
|
117
|
-
//
|
|
118
|
-
//
|
|
119
|
-
// pre-fix the card would unpin).
|
|
120
|
-
await sc.expectMessage(/.+/, { from: "bot", timeout: 30_000 });
|
|
121
|
-
|
|
122
|
-
// AC-2: header MUST be 🌀 Background (post-#1039) or, if the
|
|
123
|
-
// bg dispatch happened so fast the worker hasn't started yet,
|
|
124
|
-
// it might still be ⚙️ Working with the parent zone done. We
|
|
125
|
-
// poll for the background phase with a 45s budget — long
|
|
126
|
-
// enough for the worker to actually start firing tools, short
|
|
127
|
-
// enough that "we never saw Background" surfaces as a real
|
|
128
|
-
// bug, not a timeout-tuning issue.
|
|
106
|
+
// AC-1 step 1: worker-feed message appears after first-paint delay
|
|
107
|
+
// (~8s default). The message starts with "🛠 Worker" and shows
|
|
108
|
+
// "running ·" while the worker is in flight. Generous timeout so a
|
|
109
|
+
// slow first tool_use + narrative doesn't false-flag.
|
|
129
110
|
//
|
|
130
|
-
//
|
|
131
|
-
// parent
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
timeout: 45_000,
|
|
111
|
+
// Distinct from the parent's ack — `expectMessage` starts observing
|
|
112
|
+
// from after the parent ack, so the feed paint is the next match.
|
|
113
|
+
const feed = await sc.expectMessage(WORKER_FEED_RE, {
|
|
114
|
+
from: "bot",
|
|
115
|
+
timeout: 75_000,
|
|
136
116
|
});
|
|
137
|
-
expect(
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
//
|
|
141
|
-
//
|
|
142
|
-
//
|
|
143
|
-
expect(
|
|
117
|
+
expect(feed.messageId).toBeGreaterThan(0);
|
|
118
|
+
expect(feed.text).toMatch(WORKER_FEED_RE);
|
|
119
|
+
|
|
120
|
+
// AC-2 step 1: feed body MUST show "running ·" (the in-flight
|
|
121
|
+
// status), NOT the terminal "finished ·" — the worker hasn't
|
|
122
|
+
// completed yet.
|
|
123
|
+
expect(feed.text).toMatch(WORKER_RUNNING_RE);
|
|
124
|
+
expect(feed.text).not.toMatch(WORKER_DONE_RE);
|
|
144
125
|
|
|
145
|
-
// AC-3:
|
|
146
|
-
// the current
|
|
147
|
-
//
|
|
148
|
-
// differ (elapsed counter
|
|
126
|
+
// AC-3: feed edits land regularly while the worker runs. Snapshot
|
|
127
|
+
// the current body, wait 12s (well above the 2.5s edit throttle,
|
|
128
|
+
// and enough that at least one step + sleep cycle completes), then
|
|
129
|
+
// re-fetch the SAME message. The body MUST differ (elapsed counter
|
|
130
|
+
// or narrative step advances).
|
|
149
131
|
//
|
|
150
132
|
// We re-fetch the SAME message via `driver.getMessage(chatId,
|
|
151
|
-
//
|
|
152
|
-
// listens for NEW
|
|
153
|
-
//
|
|
154
|
-
//
|
|
155
|
-
// though the card is alive and being edited (caught in the
|
|
156
|
-
// first run of this scenario).
|
|
133
|
+
// msgId)` rather than `expectMessage(WORKER_FEED_RE)` because the
|
|
134
|
+
// latter listens for NEW messages. The feed edits in-place; a new
|
|
135
|
+
// send only happens on re-post (stale messageId). So re-fetching is
|
|
136
|
+
// the right shape.
|
|
157
137
|
//
|
|
158
|
-
//
|
|
159
|
-
//
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
// `getMessage` returns null and we surface it with a clear
|
|
163
|
-
// assertion.
|
|
164
|
-
const beforeDelta = bgPhaseCard.text;
|
|
165
|
-
await new Promise((r) => setTimeout(r, 6_000));
|
|
138
|
+
// 12s instead of 6s: the first edit arrives ~6-8s after paint (one
|
|
139
|
+
// step/sleep cycle), so 6s was racy. 12s gives a safe 2x margin.
|
|
140
|
+
const beforeDelta = feed.text;
|
|
141
|
+
await new Promise((r) => setTimeout(r, 12_000));
|
|
166
142
|
const afterDeltaMsg = await sc.driver.getMessage(
|
|
167
143
|
sc.botUserId,
|
|
168
|
-
|
|
144
|
+
feed.messageId,
|
|
169
145
|
);
|
|
170
|
-
expect(afterDeltaMsg, "
|
|
146
|
+
expect(afterDeltaMsg, "feed message disappeared mid-flight (AC-1 regression)").not.toBeNull();
|
|
171
147
|
expect(afterDeltaMsg!.text).not.toBe(beforeDelta);
|
|
172
148
|
|
|
173
|
-
// AC-2 closing half: bg terminates →
|
|
174
|
-
//
|
|
175
|
-
//
|
|
176
|
-
//
|
|
177
|
-
//
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
149
|
+
// AC-2 closing half: bg terminates → body flips to "finished ·
|
|
150
|
+
// completed". The terminal edit is triggered by the subagent-watcher's
|
|
151
|
+
// stall detection (60s after the last JSONL activity), because
|
|
152
|
+
// background Claude Code workers don't always emit a sub_agent_turn_end
|
|
153
|
+
// event. Budget: worker steps (~60s) + stall window (60s) + slack.
|
|
154
|
+
// From first-paint to terminal is typically 140-165s.
|
|
155
|
+
let doneText: string | null = null;
|
|
156
|
+
const deadline = Date.now() + 180_000;
|
|
157
|
+
while (Date.now() < deadline) {
|
|
158
|
+
const m = await sc.driver.getMessage(sc.botUserId, feed.messageId);
|
|
159
|
+
if (m != null && WORKER_DONE_RE.test(m.text)) {
|
|
160
|
+
doneText = m.text;
|
|
161
|
+
break;
|
|
162
|
+
}
|
|
163
|
+
await new Promise((r) => setTimeout(r, 3_000));
|
|
164
|
+
}
|
|
165
|
+
expect(doneText, "worker-feed never reached a terminal recap").not.toBeNull();
|
|
166
|
+
expect(doneText!).toMatch(/tools?/i);
|
|
167
|
+
// Body MUST have changed between first paint and terminal.
|
|
168
|
+
expect(doneText).not.toBe(beforeDelta);
|
|
182
169
|
} finally {
|
|
183
170
|
await sc.tearDown();
|
|
184
171
|
}
|
|
185
172
|
},
|
|
186
|
-
// Outer per-test budget: sum of inner deadlines (
|
|
187
|
-
//
|
|
188
|
-
|
|
189
|
-
300_000,
|
|
173
|
+
// Outer per-test budget: sum of inner deadlines (45 + 75 + 12 + 180 =
|
|
174
|
+
// 312s) + spinUp settle (~12s) + slack.
|
|
175
|
+
360_000,
|
|
190
176
|
);
|
|
191
177
|
});
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD: "the answer pings" — notification ownership (R8 / PR-2; design
|
|
3
|
+
* `docs/message-emission-determinism.md` §over-ping).
|
|
4
|
+
*
|
|
5
|
+
* The residual the bare one-ping-per-turn safety net left: when a turn opens
|
|
6
|
+
* with an interim ACK that pings first, the ack claims the turn's single ping
|
|
7
|
+
* slot and the LATER substantive answer used to be downgraded to silent — the
|
|
8
|
+
* reply is last on screen, but the user's phone never buzzed for the actual
|
|
9
|
+
* answer. PR-2 makes `decideOverPing` aware of WHO holds the slot and lets a
|
|
10
|
+
* substantive answer UPGRADE over an ack's slot, so the answer pings.
|
|
11
|
+
*
|
|
12
|
+
* This scenario drives the exact sequence end-to-end: an "On it" style ack
|
|
13
|
+
* (pings, claims the slot) followed by a ≥300-char substantive answer, and
|
|
14
|
+
* asserts the ANSWER arrived non-silent via `assertAnswerPinged`
|
|
15
|
+
* (mtcute's `ObservedMessage.silent`).
|
|
16
|
+
*
|
|
17
|
+
* Runs under CI `uat-gate`; the full live MTProto run needs the test-harness
|
|
18
|
+
* agent + a vault session, so locally this self-skips green (no driver).
|
|
19
|
+
*
|
|
20
|
+
* Scope caveat: this end-to-end scenario only exercises PR-2's upgrade code
|
|
21
|
+
* path when the harness model delivers its final answer via the `reply` tool.
|
|
22
|
+
* If the model answers via `stream_reply` instead, that path bypasses the
|
|
23
|
+
* over-ping safety net entirely (it never reaches `decideOverPing`), so the
|
|
24
|
+
* upgrade-over-ack logic is never touched. The model's tool choice isn't
|
|
25
|
+
* forceable here, which makes this scenario a WEAKER backstop than the unit
|
|
26
|
+
* matrix — the real proof of the upgrade behaviour lives in the deterministic
|
|
27
|
+
* unit tests in `over-ping-final-answer-decoupling.test.ts`. Treat this as a
|
|
28
|
+
* live smoke-test of the happy path, not the source of truth.
|
|
29
|
+
*/
|
|
30
|
+
import { describe, it, expect, beforeAll } from "vitest";
|
|
31
|
+
import { spinUp, type Scenario } from "../harness.js";
|
|
32
|
+
import { assertAnswerPinged, isAnswer } from "../assertions.js";
|
|
33
|
+
import { collectTurn } from "../real-work-prompts.js";
|
|
34
|
+
|
|
35
|
+
/** Overall budget for the ack-then-answer turn. */
|
|
36
|
+
const TURN_BUDGET_MS = 130_000;
|
|
37
|
+
/** The answer must clear the substantive-length backstop (≥200). */
|
|
38
|
+
const MIN_ANSWER_CHARS = 200;
|
|
39
|
+
|
|
40
|
+
describe("uat: the substantive answer pings even after an ack pinged (DM)", () => {
|
|
41
|
+
let sc: Scenario | null = null;
|
|
42
|
+
|
|
43
|
+
beforeAll(async () => {
|
|
44
|
+
try {
|
|
45
|
+
sc = await spinUp({ agent: "test-harness" });
|
|
46
|
+
await sc.driver.primeDialogs();
|
|
47
|
+
} catch (err) {
|
|
48
|
+
console.warn(
|
|
49
|
+
`[answer-pings] no live driver — self-skipping green: ${(err as Error).message}`,
|
|
50
|
+
);
|
|
51
|
+
sc = null;
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
it(
|
|
56
|
+
"an ack pings first, then the substantive answer also pings (R8 / PR-2 upgrade)",
|
|
57
|
+
async () => {
|
|
58
|
+
if (sc == null) return; // self-skip green
|
|
59
|
+
const { driver, botUserId, driverUserId } = sc;
|
|
60
|
+
|
|
61
|
+
// Prompt the model into the ack-then-answer cadence: a quick pinging
|
|
62
|
+
// "On it" reply, then — after a beat — a thorough ≥300-character answer
|
|
63
|
+
// as a fresh (also pinging) reply. The model's exact wording isn't
|
|
64
|
+
// forceable, so we accept any substantive (≥200-char) answer that lands;
|
|
65
|
+
// collectTurn skips the short ack (below minAnswerChars) and latches onto
|
|
66
|
+
// the real answer.
|
|
67
|
+
const obs = await collectTurn(
|
|
68
|
+
driver,
|
|
69
|
+
botUserId,
|
|
70
|
+
driverUserId,
|
|
71
|
+
"First send a very short interim reply 'On it.' (pinging — do NOT set " +
|
|
72
|
+
"disable_notification). THEN, as a separate second reply, give me a " +
|
|
73
|
+
"thorough answer of at least 300 characters explaining what a Telegram " +
|
|
74
|
+
"supergroup is, how forum topics partition it, and how a bot routes a " +
|
|
75
|
+
"reply back to the topic a question came from. The long second reply is " +
|
|
76
|
+
"your final answer.",
|
|
77
|
+
{ timeoutMs: TURN_BUDGET_MS, minAnswerChars: MIN_ANSWER_CHARS, settleMs: 12_000 },
|
|
78
|
+
);
|
|
79
|
+
|
|
80
|
+
if (obs.answer == null) {
|
|
81
|
+
console.warn("[answer-pings] INCONCLUSIVE — no substantive answer landed in budget.");
|
|
82
|
+
return;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// Sanity: this is the answer lane, not a feed surface.
|
|
86
|
+
expect(isAnswer(obs.answer, driverUserId)).toBe(true);
|
|
87
|
+
|
|
88
|
+
// The load-bearing assertion: the substantive answer is non-silent. If an
|
|
89
|
+
// earlier ack-ping had downgraded it (the pre-PR-2 residual), this throws.
|
|
90
|
+
assertAnswerPinged(obs.answer);
|
|
91
|
+
},
|
|
92
|
+
TURN_BUDGET_MS + 30_000,
|
|
93
|
+
);
|
|
94
|
+
});
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JTBD: "no stale 'thinking…' card opens beneath an answer the user already
|
|
3
|
+
* received in an EARLIER turn" — the cross-turn form of the reply-is-last
|
|
4
|
+
* invariant (design `docs/message-emission-determinism.md` §9 lever 4 / race
|
|
5
|
+
* C/D; PR1).
|
|
6
|
+
*
|
|
7
|
+
* The in-turn levers (#2557, sticky `finalAnswerEverDelivered`) only govern the
|
|
8
|
+
* CURRENT turn. The cross-turn surfaces — the obligation `represent` sweep and
|
|
9
|
+
* the heartbeat/liveness timer — can OPEN a card in a LATER synthetic turn,
|
|
10
|
+
* surfacing a card beneath an answer delivered in an earlier turn. PR1's lever 4
|
|
11
|
+
* gates those synthetic card-OPEN paths on `hasOutboundDeliveredSince`: if a
|
|
12
|
+
* substantive answer already landed since the obligation was raised, the card
|
|
13
|
+
* OPEN is suppressed (the represent SEND is unaffected — only the decorative
|
|
14
|
+
* card).
|
|
15
|
+
*
|
|
16
|
+
* This scenario delivers a substantive answer in turn N, then keeps pulling
|
|
17
|
+
* send-order history through a long settle window (during which the obligation
|
|
18
|
+
* sweep / heartbeat may fire a synthetic represent/liveness surface in turn
|
|
19
|
+
* N+1), and asserts no activity/worker-feed card opened BELOW the delivered
|
|
20
|
+
* answer. `assertReplyIsLast` scopes to the answer's window up to the next
|
|
21
|
+
* driver message — and because a cross-turn synthetic surface carries NO
|
|
22
|
+
* intervening driver message, a card-below-answer it opens falls inside that
|
|
23
|
+
* window and is correctly flagged.
|
|
24
|
+
*
|
|
25
|
+
* Runs under CI `uat-gate`; the full live MTProto run needs the test-harness
|
|
26
|
+
* agent + a vault session, so locally this self-skips green (no driver) — same
|
|
27
|
+
* shape as `jtbd-reply-is-last-dm.test.ts`.
|
|
28
|
+
*/
|
|
29
|
+
import { describe, it, expect, beforeAll } from "vitest";
|
|
30
|
+
import { spinUp, type Scenario } from "../harness.js";
|
|
31
|
+
import {
|
|
32
|
+
assertReplyIsLast,
|
|
33
|
+
isAnswer,
|
|
34
|
+
isActivityFeedMessage,
|
|
35
|
+
isWorkerFeedMessage,
|
|
36
|
+
} from "../assertions.js";
|
|
37
|
+
import { collectTurn } from "../real-work-prompts.js";
|
|
38
|
+
|
|
39
|
+
/** Per-case overall budget. */
|
|
40
|
+
const TURN_BUDGET_MS = 140_000;
|
|
41
|
+
/** History pull depth — covers the answer turn + any cross-turn synthetic surface. */
|
|
42
|
+
const HISTORY_LIMIT = 80;
|
|
43
|
+
/**
|
|
44
|
+
* Settle window AFTER the answer lands. Long enough that the obligation sweep
|
|
45
|
+
* (and the heartbeat liveness timer) has at least one chance to fire a
|
|
46
|
+
* cross-turn synthetic surface — the window PR1 lever 4 guards. The obligation
|
|
47
|
+
* sweep runs on its own interval, so we cannot force a represent deterministically;
|
|
48
|
+
* the durable assertion is "IF a synthetic surface fires, it must not open a
|
|
49
|
+
* card below the answer." A run where no represent fires is a valid green pass.
|
|
50
|
+
*/
|
|
51
|
+
const POST_ANSWER_SETTLE_MS = 20_000;
|
|
52
|
+
|
|
53
|
+
describe("uat: no cross-turn card opens beneath an earlier answer (DM)", () => {
|
|
54
|
+
let sc: Scenario | null = null;
|
|
55
|
+
|
|
56
|
+
beforeAll(async () => {
|
|
57
|
+
try {
|
|
58
|
+
sc = await spinUp({ agent: "test-harness" });
|
|
59
|
+
await sc.driver.primeDialogs();
|
|
60
|
+
} catch (err) {
|
|
61
|
+
console.warn(
|
|
62
|
+
`[cross-turn-card] no live driver — self-skipping green: ${(err as Error).message}`,
|
|
63
|
+
);
|
|
64
|
+
sc = null;
|
|
65
|
+
}
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
it(
|
|
69
|
+
"a substantive answer in turn N is not followed by a card opened in turn N+1 (lever 4 / race C/D)",
|
|
70
|
+
async () => {
|
|
71
|
+
if (sc == null) return; // self-skip green
|
|
72
|
+
const { driver, botUserId, driverUserId } = sc;
|
|
73
|
+
|
|
74
|
+
// Deliver a substantive answer (≥200 chars → trips the substantive proxy
|
|
75
|
+
// the cross-turn gate keys on). A tool is used so a card legitimately
|
|
76
|
+
// opens DURING the turn — the test then proves nothing opens BELOW the
|
|
77
|
+
// reply afterwards, across the cross-turn boundary.
|
|
78
|
+
const obs = await collectTurn(
|
|
79
|
+
driver,
|
|
80
|
+
botUserId,
|
|
81
|
+
driverUserId,
|
|
82
|
+
"Use your Bash tool to run `uname -a`, then give me a thorough answer " +
|
|
83
|
+
"(at least 220 characters) explaining what the output means field by " +
|
|
84
|
+
"field. That detailed message is your final answer.",
|
|
85
|
+
{ timeoutMs: TURN_BUDGET_MS, minAnswerChars: 200, settleMs: POST_ANSWER_SETTLE_MS },
|
|
86
|
+
);
|
|
87
|
+
|
|
88
|
+
if (obs.answer == null) {
|
|
89
|
+
console.warn("[cross-turn-card] INCONCLUSIVE — no answer landed in budget.");
|
|
90
|
+
return;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
// Sanity: the answer is a real answer-lane message, not a feed surface.
|
|
94
|
+
expect(isAnswer(obs.answer, driverUserId)).toBe(true);
|
|
95
|
+
expect(isActivityFeedMessage(obs.answer)).toBe(false);
|
|
96
|
+
expect(isWorkerFeedMessage(obs.answer)).toBe(false);
|
|
97
|
+
|
|
98
|
+
// Pull full server send-order history AFTER the long settle. Any
|
|
99
|
+
// cross-turn synthetic surface (represent / heartbeat liveness) that
|
|
100
|
+
// opened a card would now be present with a HIGHER message_id than the
|
|
101
|
+
// answer and — having no intervening driver message — inside the answer's
|
|
102
|
+
// turn window, so assertReplyIsLast flags it. Lever 4 must have suppressed
|
|
103
|
+
// that OPEN.
|
|
104
|
+
const history = await driver.getHistory(botUserId, HISTORY_LIMIT);
|
|
105
|
+
assertReplyIsLast(history, driverUserId, { turn: obs.answer });
|
|
106
|
+
},
|
|
107
|
+
TURN_BUDGET_MS + 40_000,
|
|
108
|
+
);
|
|
109
|
+
});
|