switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/dist/agent-scheduler/index.js +122 -88
  2. package/dist/auth-broker/index.js +463 -177
  3. package/dist/cli/autoaccept-poll.js +4842 -35
  4. package/dist/cli/drive-write-pretool.mjs +17 -14
  5. package/dist/cli/notion-write-pretool.mjs +117 -86
  6. package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
  7. package/dist/cli/self-improve-stop.mjs +428 -0
  8. package/dist/cli/skill-validate-pretool.mjs +72 -72
  9. package/dist/cli/switchroom.js +3249 -1241
  10. package/dist/cli/ui/index.html +1 -1
  11. package/dist/host-control/main.js +2833 -355
  12. package/dist/vault/approvals/kernel-server.js +7482 -7439
  13. package/dist/vault/broker/server.js +11315 -11272
  14. package/examples/minimal.yaml +1 -0
  15. package/examples/switchroom.yaml +1 -0
  16. package/package.json +3 -3
  17. package/profiles/_base/start.sh.hbs +88 -1
  18. package/profiles/_shared/execution-discipline.md.hbs +18 -0
  19. package/profiles/default/CLAUDE.md.hbs +3 -22
  20. package/telegram-plugin/.claude-plugin/plugin.json +2 -2
  21. package/telegram-plugin/answer-stream-flag.ts +12 -49
  22. package/telegram-plugin/answer-stream.ts +5 -150
  23. package/telegram-plugin/auth-snapshot-format.ts +280 -48
  24. package/telegram-plugin/auto-fallback-fleet.ts +44 -1
  25. package/telegram-plugin/context-exhaustion.ts +12 -0
  26. package/telegram-plugin/demo-mask.ts +154 -0
  27. package/telegram-plugin/dist/bridge/bridge.js +167 -124
  28. package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
  29. package/telegram-plugin/dist/server.js +215 -172
  30. package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
  31. package/telegram-plugin/draft-stream.ts +47 -410
  32. package/telegram-plugin/final-answer-detect.ts +17 -12
  33. package/telegram-plugin/fleet-fallback-resume.ts +131 -0
  34. package/telegram-plugin/format.ts +56 -19
  35. package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
  36. package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
  37. package/telegram-plugin/gateway/auth-command.ts +70 -14
  38. package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
  39. package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
  40. package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
  41. package/telegram-plugin/gateway/current-turn-map.ts +188 -0
  42. package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
  43. package/telegram-plugin/gateway/effort-command.ts +8 -3
  44. package/telegram-plugin/gateway/emission-authority.ts +369 -0
  45. package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
  46. package/telegram-plugin/gateway/gateway.ts +1837 -291
  47. package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
  48. package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
  49. package/telegram-plugin/gateway/represent-guard.ts +72 -0
  50. package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
  51. package/telegram-plugin/gateway/status-surface-log.ts +14 -3
  52. package/telegram-plugin/history.ts +33 -11
  53. package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
  54. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
  55. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
  56. package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
  57. package/telegram-plugin/issues-card.ts +4 -0
  58. package/telegram-plugin/model-unavailable.ts +124 -0
  59. package/telegram-plugin/narrative-dedup.ts +69 -0
  60. package/telegram-plugin/over-ping-safety-net.ts +70 -4
  61. package/telegram-plugin/package.json +3 -3
  62. package/telegram-plugin/pending-work-progress.ts +12 -0
  63. package/telegram-plugin/permission-rule.ts +32 -5
  64. package/telegram-plugin/permission-title.ts +152 -9
  65. package/telegram-plugin/quota-check.ts +13 -0
  66. package/telegram-plugin/quota-watch.ts +135 -7
  67. package/telegram-plugin/registry/turns-schema.test.ts +24 -0
  68. package/telegram-plugin/registry/turns-schema.ts +9 -0
  69. package/telegram-plugin/runtime-metrics.ts +13 -0
  70. package/telegram-plugin/session-tail.ts +96 -11
  71. package/telegram-plugin/silence-poke.ts +170 -24
  72. package/telegram-plugin/slot-banner-driver.ts +3 -0
  73. package/telegram-plugin/status-no-truncate.ts +44 -0
  74. package/telegram-plugin/status-reactions.ts +20 -3
  75. package/telegram-plugin/stream-controller.ts +4 -23
  76. package/telegram-plugin/stream-reply-handler.ts +6 -24
  77. package/telegram-plugin/streaming-metrics.ts +91 -0
  78. package/telegram-plugin/subagent-watcher.ts +212 -66
  79. package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
  80. package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
  81. package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
  82. package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
  83. package/telegram-plugin/tests/answer-stream.test.ts +2 -411
  84. package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
  85. package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
  86. package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
  87. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
  88. package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
  89. package/telegram-plugin/tests/demo-mask.test.ts +127 -0
  90. package/telegram-plugin/tests/draft-stream.test.ts +0 -827
  91. package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
  92. package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
  93. package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
  94. package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
  95. package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
  96. package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
  97. package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
  98. package/telegram-plugin/tests/feed-survival.test.ts +526 -0
  99. package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
  100. package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
  101. package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
  102. package/telegram-plugin/tests/history.test.ts +60 -0
  103. package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
  104. package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
  105. package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
  106. package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
  107. package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
  108. package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
  109. package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
  110. package/telegram-plugin/tests/permission-rule.test.ts +17 -0
  111. package/telegram-plugin/tests/permission-title.test.ts +206 -17
  112. package/telegram-plugin/tests/quota-watch.test.ts +252 -9
  113. package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
  114. package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
  115. package/telegram-plugin/tests/represent-guard.test.ts +162 -0
  116. package/telegram-plugin/tests/session-tail.test.ts +147 -3
  117. package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
  118. package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
  119. package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
  120. package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
  121. package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
  122. package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
  123. package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
  124. package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
  125. package/telegram-plugin/tests/telegram-format.test.ts +101 -6
  126. package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
  127. package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
  128. package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
  129. package/telegram-plugin/tests/tool-labels.test.ts +67 -0
  130. package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
  131. package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
  132. package/telegram-plugin/tests/welcome-text.test.ts +32 -3
  133. package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
  134. package/telegram-plugin/tool-activity-summary.ts +375 -58
  135. package/telegram-plugin/turn-liveness-floor.ts +240 -0
  136. package/telegram-plugin/uat/assertions.ts +115 -0
  137. package/telegram-plugin/uat/driver.ts +68 -0
  138. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
  139. package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
  140. package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
  141. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
  142. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
  143. package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
  144. package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
  145. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
  146. package/telegram-plugin/welcome-text.ts +13 -1
  147. package/telegram-plugin/worker-activity-feed.ts +157 -82
  148. package/telegram-plugin/draft-transport.ts +0 -122
  149. package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
  150. package/telegram-plugin/tests/draft-transport.test.ts +0 -211
@@ -5,187 +5,173 @@
5
5
  * Verifies three acceptance criteria from the RFC in a single run because
6
6
  * they share setup:
7
7
  *
8
- * AC-1 — Background-dispatch-and-continue: card stays pinned past
9
- * parent `turn_end`; fleet zone surfaces the running sub-agent.
10
- * AC-2 Done semantics: header reads 🌀 Background (not ✅ Done)
11
- * while the bg sub-agent runs; flips to Done after it
12
- * terminates.
13
- * AC-3 Live activity: card body materially changes across a 15s
14
- * window while bg work is in flight (elapsed counter or fleet
15
- * row's `last activity` advances) proves the heartbeat +
16
- * subagent-watcher are actually feeding the renderer.
8
+ * AC-1 — Background-dispatch-and-continue: worker-feed message appears
9
+ * while the background sub-agent runs; persists past parent
10
+ * `turn_end` so the user can watch the worker in flight.
11
+ * AC-2 Done semantics: feed message reads `running ·` while the bg
12
+ * sub-agent runs; flips to `finished · completed` (or `failed`)
13
+ * after it terminates.
14
+ * AC-3 Live activity: feed body materially changes across a 12s window
15
+ * while bg work is in flight (elapsed counter or narrative step
16
+ * advances) — proves the subagent-watcher is actually feeding the
17
+ * renderer.
17
18
  *
18
19
  * Prompt strategy: **Option 1 (explicit tool-naming)** per the RFC §
19
20
  * "Background-dispatch prompt". An earlier Option-2 (naturalistic)
20
21
  * attempt produced exactly the failure mode the RFC predicted —
21
- * model ran the sleeps inline via Bash, card never reached Background
22
+ * model ran the sleeps inline via Bash, feed never reached Background
22
23
  * phase. This test verifies the *visibility infra*, not the LLM's
23
24
  * delegation judgment; pinning the tool name and arg keeps the
24
25
  * scenario deterministic.
25
26
  *
26
- * Requires the same env as the other DM scenarios (see SETUP.md §6)
27
- * and the test-harness override `progress_card.delay_ms: 1000` so the
28
- * card actually fires on a short turn (SETUP.md §5).
27
+ * Architecture note (post-#1122 PR3): the pinned progress card was
28
+ * deleted. Background sub-agent visibility is now surfaced via the
29
+ * worker-activity-feed (`SWITCHROOM_WORKER_ACTIVITY_FEED=1`): a regular
30
+ * Telegram message that posts once the worker has been running for
31
+ * `firstPaintMin` (8s default on test-harness) and edits in-place as
32
+ * activity arrives. This test drives assertions against that feed.
29
33
  *
30
- * Runtime budget is generous the inner deadlines sum to ~150s
31
- * worst-case (5s pin + 30s parent-ack + 30s background phase + 15s
32
- * delta-snapshot + 120s done) plus ~12s spinUp overhead. The outer
33
- * `it()` timeout absorbs the lot.
34
+ * Requires the same env as the other DM scenarios (see SETUP.md §6).
35
+ *
36
+ * Root causes fixed in #2501 (this PR):
37
+ * Bug 1 — orphan correlation. `backfillJsonlAgentId` used a fuzzy
38
+ * (agentType, description) match to link a newly-discovered JSONL
39
+ * to its registry row. When the match failed (description null,
40
+ * or race), `jsonl_agent_id` stayed NULL, so
41
+ * `resolveWorkerFeedDispatch(getSubagentByJsonlId(db, id), …)`
42
+ * returned `{ isBackground: false }` — routing the worker as a
43
+ * foreground sub-agent and suppressing the worker-feed. Fix:
44
+ * prefer the direct `toolUseId` PK lookup that Claude Code already
45
+ * writes to `agent-<id>.meta.json`.
46
+ * Bug 2 — liveness writes silently skipped. With `jsonl_agent_id = NULL`
47
+ * (Bug 1 not fixed), `bumpSubagentActivity` queries by
48
+ * `jsonl_agent_id` and finds nothing — every liveness tick is a
49
+ * no-op and the last_activity_at column never updates. Fixed as a
50
+ * consequence of Bug 1 (once the row is linked, liveness writes
51
+ * land).
52
+ *
53
+ * Runtime budget is generous — the inner deadlines sum to ~312s
54
+ * worst-case (45s parent-ack + 75s feed-first-paint + 12s delta + 180s
55
+ * done) plus ~12s spinUp overhead. The outer `it()` timeout absorbs the lot.
56
+ * The 180s done-window accommodates the stall-detection path: the watcher
57
+ * fires `onFinish` 60s after the last JSONL event, because background
58
+ * workers don't reliably emit `sub_agent_turn_end`.
34
59
  */
35
60
 
36
61
  import { describe, expect, it } from "vitest";
37
62
  import { spinUp } from "../harness.js";
63
+ import { WORKER_FEED_RE } from "../assertions.js";
38
64
 
39
65
  // Explicit dispatch prompt (Option 1 per the RFC §"Background-dispatch
40
66
  // prompt"). The naturalistic Option-2 version didn't reliably get the
41
67
  // model to use the Agent tool with run_in_background:true — first
42
68
  // attempt produced the failure mode the RFC predicted (parent ran the
43
- // sleeps inline via Bash; card never transitioned to Background).
69
+ // sleeps inline via Bash; feed never surfaced Background-phase activity).
44
70
  //
45
71
  // This test asserts the VISIBILITY INFRA works, not that the model
46
72
  // makes good delegation judgments. Naming the tool + the arg lets the
47
- // scenario be deterministic. If the model can't be made to use the
48
- // Agent tool even with this prompt, that's an unrelated bug (model
49
- // alignment / tool registration) and the scenario fails distinctly
50
- // from the visibility-infra failure modes we're trying to catch.
73
+ // scenario be deterministic.
51
74
  //
52
- // Time profile: ~60s of bg work, paced with three separate sleeps so
53
- // the worker emits multiple tool_use events the subagent-watcher can
54
- // surface as fresh `last activity` updates. We need the Background
55
- // phase to last long enough that we can take a snapshot, wait one
56
- // heartbeat tick (5s default), and snapshot again.
75
+ // Time profile: ~60s of bg work, paced with ten short steps so the
76
+ // worker emits multiple tool_use + narrative events the subagent-watcher
77
+ // can surface as fresh edits. We need the Background phase to last long
78
+ // enough to clear the 8s first-paint threshold and take a snapshot.
57
79
  const BG_DISPATCH_PROMPT =
58
80
  `Use the Agent tool with subagent_type "general-purpose" and ` +
59
81
  `run_in_background: true to dispatch a worker with this exact task: ` +
60
- `"Run \`sleep 20\` via the Bash tool, then \`echo step1\`, then ` +
61
- `\`sleep 20\` again, then \`echo step2\`, then \`sleep 20\` a third ` +
62
- `time, then \`echo done\`. That's three separate Bash tool calls ` +
63
- `with sleeps between echoes." After dispatching, send a brief reply ` +
64
- `saying you've kicked off the background worker so I can watch the ` +
65
- `progress card.`;
82
+ `"Do ten steps, ONE AT A TIME, k = 1 through 10. Before each step ` +
83
+ `write a brief one-sentence narration of what you are about to do, ` +
84
+ `then run \`sleep 2\` via the Bash tool, then run \`echo step-k\` via ` +
85
+ `the Bash tool (substitute the real number for k). Run every sleep and ` +
86
+ `every echo as its OWN separate Bash call never batch or chain them ` +
87
+ `with && — and narrate before each so progress surfaces incrementally. ` +
88
+ `Do not stop early; complete all ten steps." After dispatching, send a ` +
89
+ `brief reply saying you've kicked off the background worker so I can ` +
90
+ `watch the progress feed.`;
91
+
92
+ const WORKER_RUNNING_RE = /running\s*·/i;
93
+ const WORKER_DONE_RE = /finished\s*·\s*(completed|failed)/i;
66
94
 
67
- /**
68
- * STATUS: currently red — surfaces two real production bugs the
69
- * RFC §Risks predicted as possible-but-unverified. Marked `it.fails`
70
- * so a future fix flips it green and a regression flips it red again.
71
- *
72
- * Bug 1 — orphan correlation. The parent's `Agent` tool_use_id
73
- * doesn't get matched to the spawned `sub_agent_started`
74
- * event. Gateway log: `pendingSpawns=0 correlated=orphan`.
75
- * Result: `isBackgroundDispatch` is never set on the fleet
76
- * member; the card's header phase transitions to Background
77
- * only by accident (orphans defer too, but they don't carry
78
- * the bg flag).
79
- *
80
- * Bug 2 — subagent-watcher can't track the worker. Gateway log:
81
- * `subagent-watcher: liveness skip <agentId> — row not in
82
- * DB yet (Phase 2 Pre hook pending)`. Result: no
83
- * sub_agent_tool_use events reach the fleet member; the
84
- * fleet row's `last activity` field never updates with the
85
- * worker's actual tool calls. The card edits we see are
86
- * just elapsed-counter ticks from the heartbeat.
87
- *
88
- * Both bugs are real and live on `main`. The scenario above passes
89
- * AC-1 (card stays pinned), partially passes AC-2 (Background phase
90
- * fires) and AC-3 (card body changes — from heartbeat alone), and
91
- * fails AC-2's closing half (card never reaches Done in 120s because
92
- * the orphan never terminates from the gateway's view).
93
- *
94
- * When Bug 1 + Bug 2 are fixed, change `describe.skip` to `describe`
95
- * below — the assertions are correct; only the production code is
96
- * wrong.
97
- *
98
- * Update post-#1105: all five RFC bugs (1–5 in earlier PRs, 6–7 in
99
- * #1105) merged. Unskipped here for the next UAT re-run. If 6/6 ACs
100
- * pass, close #709 / #776 / #782 / #788.
101
- */
102
95
  describe("uat: background sub-agent visibility (#709/#776/#782/#788)", () => {
103
96
  it(
104
- "card stays pinned with 🌀 Background header + live fleet activity, then flips to ✅ Done",
97
+ "worker-feed appears with running status then flips to finished once the sub-agent completes",
105
98
  async () => {
106
99
  const sc = await spinUp({ agent: "test-harness" });
107
100
  try {
108
101
  await sc.sendDM(BG_DISPATCH_PROMPT);
109
102
 
110
- // AC-1 step 1: card pins quickly (delay_ms: 1000 on test-harness).
111
- // Generous timeout so a slow first-turn doesn't false-flag.
112
- const card = await sc.expectPinnedCard({ timeout: 15_000 });
113
- expect(card.messageId).toBeGreaterThan(0);
103
+ // Parent ack reply confirms the parent turn closed.
104
+ await sc.expectMessage(/.+/, { from: "bot", timeout: 45_000 });
114
105
 
115
- // Parent ack reply. Note: we DON'T strictly require the model
116
- // to mention "dispatch" in the reply naturalistic prompt means
117
- // the model picks the wording. We just need *some* bot reply
118
- // so we know the parent turn closed (which is the point where
119
- // pre-fix the card would unpin).
120
- await sc.expectMessage(/.+/, { from: "bot", timeout: 30_000 });
121
-
122
- // AC-2: header MUST be 🌀 Background (post-#1039) or, if the
123
- // bg dispatch happened so fast the worker hasn't started yet,
124
- // it might still be ⚙️ Working with the parent zone done. We
125
- // poll for the background phase with a 45s budget — long
126
- // enough for the worker to actually start firing tools, short
127
- // enough that "we never saw Background" surfaces as a real
128
- // bug, not a timeout-tuning issue.
106
+ // AC-1 step 1: worker-feed message appears after first-paint delay
107
+ // (~8s default). The message starts with "🛠 Worker" and shows
108
+ // "running ·" while the worker is in flight. Generous timeout so a
109
+ // slow first tool_use + narrative doesn't false-flag.
129
110
  //
130
- // The dual-acceptable phases below model the realistic flow:
131
- // parent reply lands header should be Background (or
132
- // briefly still Working if the parent's `done` event lags
133
- // the bg dispatch's tool_use).
134
- const bgPhaseCard = await sc.waitForCardPhase(card, "background", {
135
- timeout: 45_000,
111
+ // Distinct from the parent's ack — `expectMessage` starts observing
112
+ // from after the parent ack, so the feed paint is the next match.
113
+ const feed = await sc.expectMessage(WORKER_FEED_RE, {
114
+ from: "bot",
115
+ timeout: 75_000,
136
116
  });
137
- expect(bgPhaseCard.text).toMatch(/🌀|Background/i);
138
- // The negative — Done MUST NOT have fired before bg started.
139
- // Asserts the defer-gate is doing its job. If this trips, the
140
- // `hasLiveBackground` correlation at progress-card-driver.ts:1108
141
- // is broken (or the bg dispatch never registered as a fleet
142
- // member at all — see RFC §Phase 2 diagnosis paths).
143
- expect(bgPhaseCard.text).not.toMatch(/✅|\bDone\b/i);
117
+ expect(feed.messageId).toBeGreaterThan(0);
118
+ expect(feed.text).toMatch(WORKER_FEED_RE);
119
+
120
+ // AC-2 step 1: feed body MUST show "running ·" (the in-flight
121
+ // status), NOT the terminal "finished ·" — the worker hasn't
122
+ // completed yet.
123
+ expect(feed.text).toMatch(WORKER_RUNNING_RE);
124
+ expect(feed.text).not.toMatch(WORKER_DONE_RE);
144
125
 
145
- // AC-3: card edits land regularly while bg runs. Snapshot
146
- // the current card body, wait one heartbeat tick (5s default
147
- // + 1s slack), then fetch the card body again. The body MUST
148
- // differ (elapsed counter, fleet last-activity age, etc.).
126
+ // AC-3: feed edits land regularly while the worker runs. Snapshot
127
+ // the current body, wait 12s (well above the 2.5s edit throttle,
128
+ // and enough that at least one step + sleep cycle completes), then
129
+ // re-fetch the SAME message. The body MUST differ (elapsed counter
130
+ // or narrative step advances).
149
131
  //
150
132
  // We re-fetch the SAME message via `driver.getMessage(chatId,
151
- // cardId)` rather than `expectPinnedCard` because the latter
152
- // listens for NEW pin events. Once the card is pinned, no
153
- // further pin event fires `expectPinnedCard` would wait
154
- // for an event that never comes and time out spuriously even
155
- // though the card is alive and being edited (caught in the
156
- // first run of this scenario).
133
+ // msgId)` rather than `expectMessage(WORKER_FEED_RE)` because the
134
+ // latter listens for NEW messages. The feed edits in-place; a new
135
+ // send only happens on re-post (stale messageId). So re-fetching is
136
+ // the right shape.
157
137
  //
158
- // If the card freezes heartbeat dead, subagent-watcher not
159
- // flushing, fleet member never registered `afterDelta` will
160
- // equal `beforeDelta` and surface the bug cleanly. If the
161
- // card was unpinned by an over-eager defer-gate release,
162
- // `getMessage` returns null and we surface it with a clear
163
- // assertion.
164
- const beforeDelta = bgPhaseCard.text;
165
- await new Promise((r) => setTimeout(r, 6_000));
138
+ // 12s instead of 6s: the first edit arrives ~6-8s after paint (one
139
+ // step/sleep cycle), so 6s was racy. 12s gives a safe 2x margin.
140
+ const beforeDelta = feed.text;
141
+ await new Promise((r) => setTimeout(r, 12_000));
166
142
  const afterDeltaMsg = await sc.driver.getMessage(
167
143
  sc.botUserId,
168
- bgPhaseCard.messageId,
144
+ feed.messageId,
169
145
  );
170
- expect(afterDeltaMsg, "card message disappeared mid-flight (AC-1 regression)").not.toBeNull();
146
+ expect(afterDeltaMsg, "feed message disappeared mid-flight (AC-1 regression)").not.toBeNull();
171
147
  expect(afterDeltaMsg!.text).not.toBe(beforeDelta);
172
148
 
173
- // AC-2 closing half: bg terminates → header flips to Done.
174
- // Generous budget the inner sleeps sum to ~60s but
175
- // post-completion the deferred-completion gate plus the
176
- // heartbeat cadence can add another 5-30s before the card
177
- // finalises.
178
- const doneCard = await sc.waitForCardPhase(bgPhaseCard, "done", {
179
- timeout: 120_000,
180
- });
181
- expect(doneCard.text).toMatch(/✅|Done/i);
149
+ // AC-2 closing half: bg terminates → body flips to "finished ·
150
+ // completed". The terminal edit is triggered by the subagent-watcher's
151
+ // stall detection (60s after the last JSONL activity), because
152
+ // background Claude Code workers don't always emit a sub_agent_turn_end
153
+ // event. Budget: worker steps (~60s) + stall window (60s) + slack.
154
+ // From first-paint to terminal is typically 140-165s.
155
+ let doneText: string | null = null;
156
+ const deadline = Date.now() + 180_000;
157
+ while (Date.now() < deadline) {
158
+ const m = await sc.driver.getMessage(sc.botUserId, feed.messageId);
159
+ if (m != null && WORKER_DONE_RE.test(m.text)) {
160
+ doneText = m.text;
161
+ break;
162
+ }
163
+ await new Promise((r) => setTimeout(r, 3_000));
164
+ }
165
+ expect(doneText, "worker-feed never reached a terminal recap").not.toBeNull();
166
+ expect(doneText!).toMatch(/tools?/i);
167
+ // Body MUST have changed between first paint and terminal.
168
+ expect(doneText).not.toBe(beforeDelta);
182
169
  } finally {
183
170
  await sc.tearDown();
184
171
  }
185
172
  },
186
- // Outer per-test budget: sum of inner deadlines (15 + 30 + 45 + 15 +
187
- // 10 + 120 = 235s) + spinUp settle (~12s) + slack. Round up to keep
188
- // the inner-deadline error visible if any of them trip.
189
- 300_000,
173
+ // Outer per-test budget: sum of inner deadlines (45 + 75 + 16 + 180 =
174
+ // 316s) + spinUp settle (~12s) + slack.
175
+ 360_000,
190
176
  );
191
177
  });
@@ -0,0 +1,94 @@
1
+ /**
2
+ * JTBD: "the answer pings" — notification ownership (R8 / PR-2; design
3
+ * `docs/message-emission-determinism.md` §over-ping).
4
+ *
5
+ * The residual the bare one-ping-per-turn safety net left: when a turn opens
6
+ * with an interim ACK that pings first, the ack claims the turn's single ping
7
+ * slot and the LATER substantive answer used to be downgraded to silent — the
8
+ * reply is last on screen, but the user's phone never buzzed for the actual
9
+ * answer. PR-2 makes `decideOverPing` aware of WHO holds the slot and lets a
10
+ * substantive answer UPGRADE over an ack's slot, so the answer pings.
11
+ *
12
+ * This scenario drives the exact sequence end-to-end: an "On it" style ack
13
+ * (pings, claims the slot) followed by a ≥300-char substantive answer, and
14
+ * asserts the ANSWER arrived non-silent via `assertAnswerPinged`
15
+ * (mtcute's `ObservedMessage.silent`).
16
+ *
17
+ * Runs under CI `uat-gate`; the full live MTProto run needs the test-harness
18
+ * agent + a vault session, so locally this self-skips green (no driver).
19
+ *
20
+ * Scope caveat: this end-to-end scenario only exercises PR-2's upgrade code
21
+ * path when the harness model delivers its final answer via the `reply` tool.
22
+ * If the model answers via `stream_reply` instead, that path bypasses the
23
+ * over-ping safety net entirely (it never reaches `decideOverPing`), so the
24
+ * upgrade-over-ack logic is never touched. The model's tool choice isn't
25
+ * forceable here, which makes this scenario a WEAKER backstop than the unit
26
+ * matrix — the real proof of the upgrade behaviour lives in the deterministic
27
+ * unit tests in `over-ping-final-answer-decoupling.test.ts`. Treat this as a
28
+ * live smoke-test of the happy path, not the source of truth.
29
+ */
30
+ import { describe, it, expect, beforeAll } from "vitest";
31
+ import { spinUp, type Scenario } from "../harness.js";
32
+ import { assertAnswerPinged, isAnswer } from "../assertions.js";
33
+ import { collectTurn } from "../real-work-prompts.js";
34
+
35
+ /** Overall budget for the ack-then-answer turn. */
36
+ const TURN_BUDGET_MS = 130_000;
37
+ /** The answer must clear the substantive-length backstop (≥200). */
38
+ const MIN_ANSWER_CHARS = 200;
39
+
40
+ describe("uat: the substantive answer pings even after an ack pinged (DM)", () => {
41
+ let sc: Scenario | null = null;
42
+
43
+ beforeAll(async () => {
44
+ try {
45
+ sc = await spinUp({ agent: "test-harness" });
46
+ await sc.driver.primeDialogs();
47
+ } catch (err) {
48
+ console.warn(
49
+ `[answer-pings] no live driver — self-skipping green: ${(err as Error).message}`,
50
+ );
51
+ sc = null;
52
+ }
53
+ });
54
+
55
+ it(
56
+ "an ack pings first, then the substantive answer also pings (R8 / PR-2 upgrade)",
57
+ async () => {
58
+ if (sc == null) return; // self-skip green
59
+ const { driver, botUserId, driverUserId } = sc;
60
+
61
+ // Prompt the model into the ack-then-answer cadence: a quick pinging
62
+ // "On it" reply, then — after a beat — a thorough ≥300-character answer
63
+ // as a fresh (also pinging) reply. The model's exact wording isn't
64
+ // forceable, so we accept any substantive (≥200-char) answer that lands;
65
+ // collectTurn skips the short ack (below minAnswerChars) and latches onto
66
+ // the real answer.
67
+ const obs = await collectTurn(
68
+ driver,
69
+ botUserId,
70
+ driverUserId,
71
+ "First send a very short interim reply 'On it.' (pinging — do NOT set " +
72
+ "disable_notification). THEN, as a separate second reply, give me a " +
73
+ "thorough answer of at least 300 characters explaining what a Telegram " +
74
+ "supergroup is, how forum topics partition it, and how a bot routes a " +
75
+ "reply back to the topic a question came from. The long second reply is " +
76
+ "your final answer.",
77
+ { timeoutMs: TURN_BUDGET_MS, minAnswerChars: MIN_ANSWER_CHARS, settleMs: 12_000 },
78
+ );
79
+
80
+ if (obs.answer == null) {
81
+ console.warn("[answer-pings] INCONCLUSIVE — no substantive answer landed in budget.");
82
+ return;
83
+ }
84
+
85
+ // Sanity: this is the answer lane, not a feed surface.
86
+ expect(isAnswer(obs.answer, driverUserId)).toBe(true);
87
+
88
+ // The load-bearing assertion: the substantive answer is non-silent. If an
89
+ // earlier ack-ping had downgraded it (the pre-PR-2 residual), this throws.
90
+ assertAnswerPinged(obs.answer);
91
+ },
92
+ TURN_BUDGET_MS + 30_000,
93
+ );
94
+ });
@@ -0,0 +1,109 @@
1
+ /**
2
+ * JTBD: "no stale 'thinking…' card opens beneath an answer the user already
3
+ * received in an EARLIER turn" — the cross-turn form of the reply-is-last
4
+ * invariant (design `docs/message-emission-determinism.md` §9 lever 4 / race
5
+ * C/D; PR1).
6
+ *
7
+ * The in-turn levers (#2557, sticky `finalAnswerEverDelivered`) only govern the
8
+ * CURRENT turn. The cross-turn surfaces — the obligation `represent` sweep and
9
+ * the heartbeat/liveness timer — can OPEN a card in a LATER synthetic turn,
10
+ * surfacing a card beneath an answer delivered in an earlier turn. PR1's lever 4
11
+ * gates those synthetic card-OPEN paths on `hasOutboundDeliveredSince`: if a
12
+ * substantive answer already landed since the obligation was raised, the card
13
+ * OPEN is suppressed (the represent SEND is unaffected — only the decorative
14
+ * card).
15
+ *
16
+ * This scenario delivers a substantive answer in turn N, then keeps pulling
17
+ * send-order history through a long settle window (during which the obligation
18
+ * sweep / heartbeat may fire a synthetic represent/liveness surface in turn
19
+ * N+1), and asserts no activity/worker-feed card opened BELOW the delivered
20
+ * answer. `assertReplyIsLast` scopes to the answer's window up to the next
21
+ * driver message — and because a cross-turn synthetic surface carries NO
22
+ * intervening driver message, a card-below-answer it opens falls inside that
23
+ * window and is correctly flagged.
24
+ *
25
+ * Runs under CI `uat-gate`; the full live MTProto run needs the test-harness
26
+ * agent + a vault session, so locally this self-skips green (no driver) — same
27
+ * shape as `jtbd-reply-is-last-dm.test.ts`.
28
+ */
29
+ import { describe, it, expect, beforeAll } from "vitest";
30
+ import { spinUp, type Scenario } from "../harness.js";
31
+ import {
32
+ assertReplyIsLast,
33
+ isAnswer,
34
+ isActivityFeedMessage,
35
+ isWorkerFeedMessage,
36
+ } from "../assertions.js";
37
+ import { collectTurn } from "../real-work-prompts.js";
38
+
39
+ /** Per-case overall budget in ms: passed to collectTurn as `timeoutMs` and, plus 40 s of slack, used as the `it()` timeout. */
40
+ const TURN_BUDGET_MS = 140_000;
41
+ /** History pull depth — the limit passed to `driver.getHistory`; sized to cover the answer turn + any cross-turn synthetic surface. */
42
+ const HISTORY_LIMIT = 80;
43
+ /**
44
+ * Settle window (ms) AFTER the answer lands; passed to collectTurn as `settleMs`. Long enough that the obligation sweep
45
+ * (and the heartbeat liveness timer) has at least one chance to fire a
46
+ * cross-turn synthetic surface — the window PR1 lever 4 guards. The obligation
47
+ * sweep runs on its own interval, so we cannot force a represent deterministically;
48
+ * the durable assertion is "IF a synthetic surface fires, it must not open a
49
+ * card below the answer." A run where no represent fires is a valid green pass.
50
+ */
51
+ const POST_ANSWER_SETTLE_MS = 20_000;
52
+
53
+ describe("uat: no cross-turn card opens beneath an earlier answer (DM)", () => {
54
+ let sc: Scenario | null = null; // stays null when beforeAll cannot bring up the live driver; the test case checks this to self-skip
55
+
56
+ beforeAll(async () => {
57
+ try {
58
+ sc = await spinUp({ agent: "test-harness" });
59
+ await sc.driver.primeDialogs();
60
+ } catch (err) { // any setup failure is treated as "no live driver". NOTE(review): this also turns a broken harness into a green pass — confirm that is intended under CI
61
+ console.warn(
62
+ `[cross-turn-card] no live driver — self-skipping green: ${(err as Error).message}`, // NOTE(review): `as Error` is a compile-time cast only; a non-Error throw would log "undefined" here
63
+ );
64
+ sc = null;
65
+ }
66
+ });
67
+
68
+ it(
69
+ "a substantive answer in turn N is not followed by a card opened in turn N+1 (lever 4 / race C/D)",
70
+ async () => {
71
+ if (sc == null) return; // self-skip green — beforeAll left sc null, so the case passes without asserting anything
72
+ const { driver, botUserId, driverUserId } = sc;
73
+
74
+ // Deliver a substantive answer (≥200 chars → trips the substantive proxy
75
+ // the cross-turn gate keys on). A tool is used so a card legitimately
76
+ // opens DURING the turn — the test then proves nothing opens BELOW the
77
+ // reply afterwards, across the cross-turn boundary.
78
+ const obs = await collectTurn(
79
+ driver,
80
+ botUserId,
81
+ driverUserId,
82
+ "Use your Bash tool to run `uname -a`, then give me a thorough answer " +
83
+ "(at least 220 characters) explaining what the output means field by " +
84
+ "field. That detailed message is your final answer.",
85
+ { timeoutMs: TURN_BUDGET_MS, minAnswerChars: 200, settleMs: POST_ANSWER_SETTLE_MS }, // 200 is the "≥200 chars" threshold from the comment above; the prompt asks for 220, presumably to leave margin
86
+ );
87
+
88
+ if (obs.answer == null) {
89
+ console.warn("[cross-turn-card] INCONCLUSIVE — no answer landed in budget.");
90
+ return; // inconclusive run: nothing is asserted, so the case passes rather than fails
91
+ }
92
+
93
+ // Sanity: the answer is a real answer-lane message, not a feed surface.
94
+ expect(isAnswer(obs.answer, driverUserId)).toBe(true);
95
+ expect(isActivityFeedMessage(obs.answer)).toBe(false);
96
+ expect(isWorkerFeedMessage(obs.answer)).toBe(false);
97
+
98
+ // Pull full server send-order history AFTER the long settle. Any
99
+ // cross-turn synthetic surface (represent / heartbeat liveness) that
100
+ // opened a card would now be present with a HIGHER message_id than the
101
+ // answer and — having no intervening driver message — inside the answer's
102
+ // turn window, so assertReplyIsLast flags it. Lever 4 must have suppressed
103
+ // that OPEN.
104
+ const history = await driver.getHistory(botUserId, HISTORY_LIMIT);
105
+ assertReplyIsLast(history, driverUserId, { turn: obs.answer });
106
+ },
107
+ TURN_BUDGET_MS + 40_000, // third argument to it(): the per-test timeout in ms — the turn budget plus 40 s of slack for the history pull
108
+ );
109
+ });