switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/dist/agent-scheduler/index.js +122 -88
  2. package/dist/auth-broker/index.js +463 -177
  3. package/dist/cli/autoaccept-poll.js +4842 -35
  4. package/dist/cli/drive-write-pretool.mjs +17 -14
  5. package/dist/cli/notion-write-pretool.mjs +117 -86
  6. package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
  7. package/dist/cli/self-improve-stop.mjs +428 -0
  8. package/dist/cli/skill-validate-pretool.mjs +72 -72
  9. package/dist/cli/switchroom.js +3249 -1241
  10. package/dist/cli/ui/index.html +1 -1
  11. package/dist/host-control/main.js +2833 -355
  12. package/dist/vault/approvals/kernel-server.js +7482 -7439
  13. package/dist/vault/broker/server.js +11315 -11272
  14. package/examples/minimal.yaml +1 -0
  15. package/examples/switchroom.yaml +1 -0
  16. package/package.json +3 -3
  17. package/profiles/_base/start.sh.hbs +88 -1
  18. package/profiles/_shared/execution-discipline.md.hbs +18 -0
  19. package/profiles/default/CLAUDE.md.hbs +3 -22
  20. package/telegram-plugin/.claude-plugin/plugin.json +2 -2
  21. package/telegram-plugin/answer-stream-flag.ts +12 -49
  22. package/telegram-plugin/answer-stream.ts +5 -150
  23. package/telegram-plugin/auth-snapshot-format.ts +280 -48
  24. package/telegram-plugin/auto-fallback-fleet.ts +44 -1
  25. package/telegram-plugin/context-exhaustion.ts +12 -0
  26. package/telegram-plugin/demo-mask.ts +154 -0
  27. package/telegram-plugin/dist/bridge/bridge.js +167 -124
  28. package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
  29. package/telegram-plugin/dist/server.js +215 -172
  30. package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
  31. package/telegram-plugin/draft-stream.ts +47 -410
  32. package/telegram-plugin/final-answer-detect.ts +17 -12
  33. package/telegram-plugin/fleet-fallback-resume.ts +131 -0
  34. package/telegram-plugin/format.ts +56 -19
  35. package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
  36. package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
  37. package/telegram-plugin/gateway/auth-command.ts +70 -14
  38. package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
  39. package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
  40. package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
  41. package/telegram-plugin/gateway/current-turn-map.ts +188 -0
  42. package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
  43. package/telegram-plugin/gateway/effort-command.ts +8 -3
  44. package/telegram-plugin/gateway/emission-authority.ts +369 -0
  45. package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
  46. package/telegram-plugin/gateway/gateway.ts +1837 -291
  47. package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
  48. package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
  49. package/telegram-plugin/gateway/represent-guard.ts +72 -0
  50. package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
  51. package/telegram-plugin/gateway/status-surface-log.ts +14 -3
  52. package/telegram-plugin/history.ts +33 -11
  53. package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
  54. package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
  55. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
  56. package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
  57. package/telegram-plugin/issues-card.ts +4 -0
  58. package/telegram-plugin/model-unavailable.ts +124 -0
  59. package/telegram-plugin/narrative-dedup.ts +69 -0
  60. package/telegram-plugin/over-ping-safety-net.ts +70 -4
  61. package/telegram-plugin/package.json +3 -3
  62. package/telegram-plugin/pending-work-progress.ts +12 -0
  63. package/telegram-plugin/permission-rule.ts +32 -5
  64. package/telegram-plugin/permission-title.ts +152 -9
  65. package/telegram-plugin/quota-check.ts +13 -0
  66. package/telegram-plugin/quota-watch.ts +135 -7
  67. package/telegram-plugin/registry/turns-schema.test.ts +24 -0
  68. package/telegram-plugin/registry/turns-schema.ts +9 -0
  69. package/telegram-plugin/runtime-metrics.ts +13 -0
  70. package/telegram-plugin/session-tail.ts +96 -11
  71. package/telegram-plugin/silence-poke.ts +170 -24
  72. package/telegram-plugin/slot-banner-driver.ts +3 -0
  73. package/telegram-plugin/status-no-truncate.ts +44 -0
  74. package/telegram-plugin/status-reactions.ts +20 -3
  75. package/telegram-plugin/stream-controller.ts +4 -23
  76. package/telegram-plugin/stream-reply-handler.ts +6 -24
  77. package/telegram-plugin/streaming-metrics.ts +91 -0
  78. package/telegram-plugin/subagent-watcher.ts +212 -66
  79. package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
  80. package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
  81. package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
  82. package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
  83. package/telegram-plugin/tests/answer-stream.test.ts +2 -411
  84. package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
  85. package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
  86. package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
  87. package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
  88. package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
  89. package/telegram-plugin/tests/demo-mask.test.ts +127 -0
  90. package/telegram-plugin/tests/draft-stream.test.ts +0 -827
  91. package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
  92. package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
  93. package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
  94. package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
  95. package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
  96. package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
  97. package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
  98. package/telegram-plugin/tests/feed-survival.test.ts +526 -0
  99. package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
  100. package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
  101. package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
  102. package/telegram-plugin/tests/history.test.ts +60 -0
  103. package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
  104. package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
  105. package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
  106. package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
  107. package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
  108. package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
  109. package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
  110. package/telegram-plugin/tests/permission-rule.test.ts +17 -0
  111. package/telegram-plugin/tests/permission-title.test.ts +206 -17
  112. package/telegram-plugin/tests/quota-watch.test.ts +252 -9
  113. package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
  114. package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
  115. package/telegram-plugin/tests/represent-guard.test.ts +162 -0
  116. package/telegram-plugin/tests/session-tail.test.ts +147 -3
  117. package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
  118. package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
  119. package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
  120. package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
  121. package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
  122. package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
  123. package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
  124. package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
  125. package/telegram-plugin/tests/telegram-format.test.ts +101 -6
  126. package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
  127. package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
  128. package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
  129. package/telegram-plugin/tests/tool-labels.test.ts +67 -0
  130. package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
  131. package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
  132. package/telegram-plugin/tests/welcome-text.test.ts +32 -3
  133. package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
  134. package/telegram-plugin/tool-activity-summary.ts +375 -58
  135. package/telegram-plugin/turn-liveness-floor.ts +240 -0
  136. package/telegram-plugin/uat/assertions.ts +115 -0
  137. package/telegram-plugin/uat/driver.ts +68 -0
  138. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
  139. package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
  140. package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
  141. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
  142. package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
  143. package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
  144. package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
  145. package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
  146. package/telegram-plugin/welcome-text.ts +13 -1
  147. package/telegram-plugin/worker-activity-feed.ts +157 -82
  148. package/telegram-plugin/draft-transport.ts +0 -122
  149. package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
  150. package/telegram-plugin/tests/draft-transport.test.ts +0 -211
@@ -0,0 +1,118 @@
1
+ import { describe, it, expect } from 'vitest'
2
+ import {
3
+ normalizeNarrative,
4
+ prefixSimilarity,
5
+ isDraftOfReply,
6
+ DRAFT_SUPPRESS_THRESHOLD,
7
+ REPLY_TOOLS,
8
+ } from '../narrative-dedup.js'
9
+
10
+ describe('narrative-dedup', () => {
11
+ it('pins the threshold so a silent retune breaks the test', () => {
12
+ expect(DRAFT_SUPPRESS_THRESHOLD).toBe(0.8)
13
+ })
14
+
15
+ it('REPLY_TOOLS holds exactly reply + stream_reply', () => {
16
+ expect(REPLY_TOOLS.has('reply')).toBe(true)
17
+ expect(REPLY_TOOLS.has('stream_reply')).toBe(true)
18
+ expect(REPLY_TOOLS.has('Bash')).toBe(false)
19
+ })
20
+
21
+ describe('normalizeNarrative', () => {
22
+ it('strips markdown emphasis/heading/quote marks, collapses whitespace, lowercases', () => {
23
+ expect(normalizeNarrative('**Bold** _italic_ `code`')).toBe('bold italic code')
24
+ expect(normalizeNarrative('> # Heading\n text')).toBe('heading text')
25
+ })
26
+ })
27
+
28
+ describe('prefixSimilarity', () => {
29
+ it('returns 1 for identical strings', () => {
30
+ expect(prefixSimilarity('hello there', 'hello there')).toBe(1)
31
+ })
32
+
33
+ it('returns 0 when either side is empty (no divide-by-zero)', () => {
34
+ expect(prefixSimilarity('', 'something')).toBe(0)
35
+ expect(prefixSimilarity('something', '')).toBe(0)
36
+ expect(prefixSimilarity('', '')).toBe(0)
37
+ })
38
+
39
+ it('ratio is over the SHORTER normalized string', () => {
40
+ // "abc" vs "abcdef": shared prefix 3 of shorter length 3 = 1.0
41
+ expect(prefixSimilarity('abc', 'abcdef')).toBe(1)
42
+ // "abx" vs "abcdef": shared prefix 2 of shorter length 3 ≈ 0.667
43
+ expect(prefixSimilarity('abx', 'abcdef')).toBeCloseTo(2 / 3, 5)
44
+ })
45
+ })
46
+
47
+ describe('isDraftOfReply', () => {
48
+ it('SUPPRESS: identical draft and reply', () => {
49
+ const t = 'The repo is at /home/user/code/switchroom.'
50
+ expect(isDraftOfReply(t, t)).toBe(true)
51
+ })
52
+
53
+ it('SUPPRESS: draft whose trailing sentence was trimmed before sending (~0.85 prefix)', () => {
54
+ const draft = 'The repo is at /home/user/code/switchroom. I will start now.'
55
+ const reply = 'The repo is at /home/user/code/switchroom.'
56
+ // reply is the shorter string and is a full prefix of the draft → 1.0
57
+ expect(prefixSimilarity(draft, reply)).toBe(1)
58
+ expect(isDraftOfReply(draft, reply)).toBe(true)
59
+ // And the near-prefix case (draft and reply diverge only in their last few characters):
60
+ const draft2 = 'Found both repos and confirmed the remote is correct here.'
61
+ const reply2 = 'Found both repos and confirmed the remote is correct.'
62
+ expect(prefixSimilarity(draft2, reply2)).toBeGreaterThanOrEqual(0.85)
63
+ expect(isDraftOfReply(draft2, reply2)).toBe(true)
64
+ })
65
+
66
+ it('SHOW: post-action narration that merely precedes a different reply', () => {
67
+ // "Sent. Waiting on the build…" vs an unrelated reply payload — short
68
+ // string, near-zero shared prefix → below threshold → SHOW.
69
+ const narration = 'Sent. Waiting on the build…'
70
+ const reply = "Here's the result of the build: all green."
71
+ expect(prefixSimilarity(narration, reply)).toBeLessThan(DRAFT_SUPPRESS_THRESHOLD)
72
+ expect(isDraftOfReply(narration, reply)).toBe(false)
73
+ })
74
+
75
+ it('SHOW: empty reply text never suppresses (no divide-by-zero)', () => {
76
+ expect(isDraftOfReply('On it. Let me find the repo…', '')).toBe(false)
77
+ })
78
+
79
+ it('SUPPRESS: draft differs from reply only by markdown decoration', () => {
80
+ const draft = 'Here is the **plan**: do A then B.'
81
+ const reply = 'Here is the plan: do A then B.'
82
+ // After normalization the markdown stars vanish → identical → suppress.
83
+ expect(normalizeNarrative(draft)).toBe(normalizeNarrative(reply))
84
+ expect(isDraftOfReply(draft, reply)).toBe(true)
85
+ })
86
+
87
+ it('NIT 2: the doubled-capturedText proxy mis-suppresses; the actual reply text does not', () => {
88
+ // The bug: flushPendingNarrativeAtTurnEnd used to compare a trailing
89
+ // narration against capturedText.join(''). When the model emits the same
90
+ // short string twice in a turn — e.g. "Done." as working narration and
91
+ // then "Done." as the reply — that proxy becomes the CONCATENATION
92
+ // "Done.Done.", whose prefix the trailing narration still matches above
93
+ // threshold → genuine trailing narration WRONGLY suppressed.
94
+ const trailing = 'Done.'
95
+ const doubledProxy = 'Done.' + 'Done.' // capturedText.join('') of two "Done." blocks
96
+ const actualReply = 'Done.'
97
+
98
+ // Old (broken) comparison: trailing vs the doubled proxy → wrongly suppresses.
99
+ expect(isDraftOfReply(trailing, doubledProxy)).toBe(true)
100
+
101
+ // New comparison: trailing vs the ACTUAL reply text. Here the reply text
102
+ // really IS "Done.", so a trailing "Done." is a genuine duplicate and is
103
+ // correctly suppressed — the fix preserves the common-case suppression.
104
+ expect(isDraftOfReply(trailing, actualReply)).toBe(true)
105
+ })
106
+
107
+ it('NIT 2: genuine trailing narration is preserved when the reply text differs', () => {
108
+ // The case the proxy hurt most: the turn's reply is a SHORT distinct
109
+ // string and the trailing narration is genuine liveness. Comparing
110
+ // against the actual reply text (not a concatenation that happens to
111
+ // share a prefix) keeps the trailing narration SHOWN.
112
+ const trailingNarration = 'Done — all green, pushing now.'
113
+ const actualReply = 'Here is the summary you asked for: 3 files changed.'
114
+ // Below threshold against the real reply → SHOW (not suppressed).
115
+ expect(isDraftOfReply(trailingNarration, actualReply)).toBe(false)
116
+ })
117
+ })
118
+ })
@@ -0,0 +1,285 @@
1
+ /**
2
+ * Unit tests for the activity-feed-teardown fix (orphaned-reply backstop).
3
+ *
4
+ * Root cause: the orphaned-reply backstop fired a synthetic turn_end
5
+ * (`durationMs: -1`) after 30 s of silence, even mid-tool-call. That nulled
6
+ * `currentTurn` and dropped every subsequent `tool_label`, darkening the live
7
+ * activity feed for the rest of the turn.
8
+ *
9
+ * Fix: three layers described in the PR.
10
+ * PRIMARY — fuse fires mid-tool → re-arm instead (bounded by ORPHANED_REPLY_MAX_REARMS).
11
+ * SECONDARY — tool_label re-arms the fuse so active label streams keep it fresh.
12
+ * DEFENSIVE — turn_end entry rejects the synthetic event if tools are in flight.
13
+ *
14
+ * These tests cover the pure / unit-testable surfaces:
15
+ * - shouldArmOrphanedReplyTimeout (existing surface; exercised here without a midToolCall argument)
16
+ * - ORPHANED_REPLY_MAX_REARMS constant math
17
+ * - The re-arm guard logic (pure decision extracted from the closure)
18
+ * - The defensive turn_end discriminator (durationMs === -1 + in-flight check)
19
+ */
20
+
21
+ import { describe, it, expect } from 'vitest'
22
+ import {
23
+ shouldArmOrphanedReplyTimeout,
24
+ ORPHANED_REPLY_TIMEOUT_MS,
25
+ ORPHANED_REPLY_MAX_REARMS,
26
+ } from '../context-exhaustion.js'
27
+ import { ToolFlightTracker } from '../gateway/interrupt-defer.js'
28
+
29
+ // ---------------------------------------------------------------------------
30
+ // Helpers — pure decision functions mirroring the gateway closure logic.
31
+ // These extract the discriminable parts of the fix so they are unit-testable
32
+ // without instantiating the full gateway.
33
+ // ---------------------------------------------------------------------------
34
+
35
+ /**
36
+ * Mirrors the PRIMARY fix decision inside the setTimeout callback: re-arm the backstop (true) or fire turn_end (false)?
37
+ * Returns true only when opts.midToolCall is true and opts.rearmCount is strictly below opts.maxRearms.
38
+ */
39
+ function shouldRearmInsteadOfFire(opts: {
40
+ midToolCall: boolean
41
+ rearmCount: number
42
+ maxRearms: number
43
+ }): boolean {
44
+ return opts.midToolCall && opts.rearmCount < opts.maxRearms
45
+ }
46
+
47
+ /**
48
+ * Mirrors the DEFENSIVE fix at turn_end entry: should a synthetic turn_end (durationMs === -1) be suppressed?
49
+ * Returns true only when opts.durationMs is exactly -1 and opts.midToolCall is true; any other duration is never suppressed.
50
+ */
51
+ function shouldSuppressSyntheticTurnEnd(opts: {
52
+ durationMs: number
53
+ midToolCall: boolean
54
+ }): boolean {
55
+ return opts.durationMs === -1 && opts.midToolCall
56
+ }
57
+
58
+ // ---------------------------------------------------------------------------
59
+ // Tests: ORPHANED_REPLY_MAX_REARMS constant
60
+ // ---------------------------------------------------------------------------
61
+
62
+ describe('ORPHANED_REPLY_MAX_REARMS', () => {
63
+ it('is 20 (20 × 30 s = 10 min cap)', () => {
64
+ expect(ORPHANED_REPLY_MAX_REARMS).toBe(20)
65
+ })
66
+
67
+ it('combined with ORPHANED_REPLY_TIMEOUT_MS covers at least 10 min of tool activity', () => {
68
+ const coverageMs = ORPHANED_REPLY_MAX_REARMS * ORPHANED_REPLY_TIMEOUT_MS
69
+ // 20 × 30 000 ms = 600 000 ms = 10 min
70
+ expect(coverageMs).toBeGreaterThanOrEqual(10 * 60 * 1000)
71
+ })
72
+
73
+ it('fuse duration is still 30 s', () => {
74
+ expect(ORPHANED_REPLY_TIMEOUT_MS).toBe(30_000)
75
+ })
76
+ })
77
+
78
+ // ---------------------------------------------------------------------------
79
+ // Tests: PRIMARY fix — re-arm guard
80
+ // ---------------------------------------------------------------------------
81
+
82
+ describe('PRIMARY fix: re-arm guard (shouldRearmInsteadOfFire)', () => {
83
+ it('re-arms when a tool is in flight and rearm count is under the cap', () => {
84
+ expect(shouldRearmInsteadOfFire({ midToolCall: true, rearmCount: 0, maxRearms: 20 })).toBe(true)
85
+ expect(shouldRearmInsteadOfFire({ midToolCall: true, rearmCount: 19, maxRearms: 20 })).toBe(true)
86
+ })
87
+
88
+ it('fires once rearm count reaches the cap, even mid-tool-call', () => {
89
+ expect(shouldRearmInsteadOfFire({ midToolCall: true, rearmCount: 20, maxRearms: 20 })).toBe(false)
90
+ expect(shouldRearmInsteadOfFire({ midToolCall: true, rearmCount: 21, maxRearms: 20 })).toBe(false)
91
+ })
92
+
93
+ it('fires immediately when no tool is in flight, regardless of rearm count', () => {
94
+ expect(shouldRearmInsteadOfFire({ midToolCall: false, rearmCount: 0, maxRearms: 20 })).toBe(false)
95
+ expect(shouldRearmInsteadOfFire({ midToolCall: false, rearmCount: 5, maxRearms: 20 })).toBe(false)
96
+ })
97
+
98
+ it('rearm count transitions: 0 → cap-1 → cap fires', () => {
99
+ const max = ORPHANED_REPLY_MAX_REARMS
100
+ for (let i = 0; i < max; i++) {
101
+ expect(shouldRearmInsteadOfFire({ midToolCall: true, rearmCount: i, maxRearms: max })).toBe(true)
102
+ }
103
+ // At exactly the cap: fire
104
+ expect(shouldRearmInsteadOfFire({ midToolCall: true, rearmCount: max, maxRearms: max })).toBe(false)
105
+ })
106
+ })
107
+
108
+ // ---------------------------------------------------------------------------
109
+ // Tests: DEFENSIVE fix — synthetic turn_end suppressor
110
+ // ---------------------------------------------------------------------------
111
+
112
+ describe('DEFENSIVE fix: synthetic turn_end suppressor', () => {
113
+ it('suppresses a synthetic turn_end (durationMs === -1) when tools are in flight', () => {
114
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: -1, midToolCall: true })).toBe(true)
115
+ })
116
+
117
+ it('does NOT suppress a synthetic turn_end when no tools are in flight', () => {
118
+ // No tools → the backstop should fire normally (turn is genuinely orphaned)
119
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: -1, midToolCall: false })).toBe(false)
120
+ })
121
+
122
+ it('does NOT suppress an authoritative turn_end (durationMs >= 0)', () => {
123
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: 0, midToolCall: true })).toBe(false)
124
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: 1, midToolCall: true })).toBe(false)
125
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: 12345, midToolCall: true })).toBe(false)
126
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: 0, midToolCall: false })).toBe(false)
127
+ })
128
+
129
+ it('only durationMs === -1 is the synthetic discriminator', () => {
130
+ // Values near -1 must not accidentally trigger suppression
131
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: -2, midToolCall: true })).toBe(false)
132
+ expect(shouldSuppressSyntheticTurnEnd({ durationMs: -0.5, midToolCall: true })).toBe(false)
133
+ })
134
+ })
135
+
136
+ // ---------------------------------------------------------------------------
137
+ // Tests: ToolFlightTracker integration with the guard logic
138
+ // ---------------------------------------------------------------------------
139
+
140
+ describe('ToolFlightTracker + guard integration', () => {
141
+ it('re-arm fires when a Bash tool is in flight', () => {
142
+ const tracker = new ToolFlightTracker()
143
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_1' })
144
+
145
+ expect(shouldRearmInsteadOfFire({
146
+ midToolCall: tracker.isMidToolCall(),
147
+ rearmCount: 0,
148
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
149
+ })).toBe(true)
150
+ })
151
+
152
+ it('fires normally after tool_result completes the tool', () => {
153
+ const tracker = new ToolFlightTracker()
154
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_1' })
155
+ tracker.onEvent({ kind: 'tool_result', toolUseId: 'bash_1' })
156
+
157
+ expect(shouldRearmInsteadOfFire({
158
+ midToolCall: tracker.isMidToolCall(),
159
+ rearmCount: 0,
160
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
161
+ })).toBe(false)
162
+ })
163
+
164
+ it('defensive guard suppresses synthetic turn_end mid-Bash', () => {
165
+ const tracker = new ToolFlightTracker()
166
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_2' })
167
+
168
+ expect(shouldSuppressSyntheticTurnEnd({
169
+ durationMs: -1,
170
+ midToolCall: tracker.isMidToolCall(),
171
+ })).toBe(true)
172
+ })
173
+
174
+ it('defensive guard allows synthetic turn_end after all tools complete', () => {
175
+ const tracker = new ToolFlightTracker()
176
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_2' })
177
+ tracker.onEvent({ kind: 'tool_result', toolUseId: 'bash_2' })
178
+
179
+ expect(shouldSuppressSyntheticTurnEnd({
180
+ durationMs: -1,
181
+ midToolCall: tracker.isMidToolCall(),
182
+ })).toBe(false)
183
+ })
184
+
185
+ it('parallel tools: re-arm persists while ANY tool is in flight', () => {
186
+ const tracker = new ToolFlightTracker()
187
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'read_1' })
188
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'read_2' })
189
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'edit_1' })
190
+
191
+ // Still re-arming: 3 tools open
192
+ expect(shouldRearmInsteadOfFire({
193
+ midToolCall: tracker.isMidToolCall(),
194
+ rearmCount: 0,
195
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
196
+ })).toBe(true)
197
+
198
+ // Two complete
199
+ tracker.onEvent({ kind: 'tool_result', toolUseId: 'read_1' })
200
+ tracker.onEvent({ kind: 'tool_result', toolUseId: 'read_2' })
201
+
202
+ // Still re-arming: edit_1 open
203
+ expect(shouldRearmInsteadOfFire({
204
+ midToolCall: tracker.isMidToolCall(),
205
+ rearmCount: 1,
206
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
207
+ })).toBe(true)
208
+
209
+ // All complete
210
+ tracker.onEvent({ kind: 'tool_result', toolUseId: 'edit_1' })
211
+
212
+ expect(shouldRearmInsteadOfFire({
213
+ midToolCall: tracker.isMidToolCall(),
214
+ rearmCount: 2,
215
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
216
+ })).toBe(false)
217
+ })
218
+
219
+ it('cap fires even mid-tool after 20 re-arms (wedged tool surfaces)', () => {
220
+ const tracker = new ToolFlightTracker()
221
+ tracker.onEvent({ kind: 'tool_use', toolUseId: 'hung_bash' })
222
+
223
+ // First 20 re-arms proceed
224
+ for (let i = 0; i < ORPHANED_REPLY_MAX_REARMS; i++) {
225
+ expect(shouldRearmInsteadOfFire({
226
+ midToolCall: tracker.isMidToolCall(),
227
+ rearmCount: i,
228
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
229
+ })).toBe(true)
230
+ }
231
+
232
+ // 21st check: rearmCount has reached the cap — fire despite the in-flight tool
233
+ expect(shouldRearmInsteadOfFire({
234
+ midToolCall: tracker.isMidToolCall(),
235
+ rearmCount: ORPHANED_REPLY_MAX_REARMS,
236
+ maxRearms: ORPHANED_REPLY_MAX_REARMS,
237
+ })).toBe(false)
238
+ })
239
+ })
240
+
241
+ // ---------------------------------------------------------------------------
242
+ // Tests: shouldArmOrphanedReplyTimeout (existing surface, unchanged)
243
+ // ---------------------------------------------------------------------------
244
+
245
+ describe('shouldArmOrphanedReplyTimeout (existing — unchanged by this fix)', () => {
246
+ it('arms when conditions are met', () => {
247
+ expect(
248
+ shouldArmOrphanedReplyTimeout({
249
+ currentSessionChatId: '123',
250
+ capturedTextCount: 1,
251
+ replyCalled: false,
252
+ }),
253
+ ).toBe(true)
254
+ })
255
+
256
+ it('does not arm after reply has been called', () => {
257
+ expect(
258
+ shouldArmOrphanedReplyTimeout({
259
+ currentSessionChatId: '123',
260
+ capturedTextCount: 5,
261
+ replyCalled: true,
262
+ }),
263
+ ).toBe(false)
264
+ })
265
+
266
+ it('does not arm when no chat is active', () => {
267
+ expect(
268
+ shouldArmOrphanedReplyTimeout({
269
+ currentSessionChatId: null,
270
+ capturedTextCount: 1,
271
+ replyCalled: false,
272
+ }),
273
+ ).toBe(false)
274
+ })
275
+
276
+ it('does not arm when no text captured yet', () => {
277
+ expect(
278
+ shouldArmOrphanedReplyTimeout({
279
+ currentSessionChatId: '123',
280
+ capturedTextCount: 0,
281
+ replyCalled: false,
282
+ }),
283
+ ).toBe(false)
284
+ })
285
+ })
@@ -0,0 +1,194 @@
1
+ /**
2
+ * Regression contract for #2533: the over-ping anti-spam downgrade MUST NOT
3
+ * pollute final-answer classification.
4
+ *
5
+ * The bug (surfaced by the `midturn-silent-dm` UAT against v0.15.57): in
6
+ * `executeReply`, the #1675 over-ping safety net reassigns the local
7
+ * `disableNotification` to `true` to silence a 2nd+ ping in a turn, and the
8
+ * final-answer classifier was then read with that *downgraded* value. So a
9
+ * final answer the model INTENDED to ping but the anti-spam net silenced was
10
+ * misclassified as not-final → `finalAnswerDelivered` stayed false → a
11
+ * spurious silent-end re-prompt (#1664) AND a false 'undelivered' 😐 (#2530).
12
+ *
13
+ * The contract this pins (what `executeStreamReply` already did, and what
14
+ * #2533 made `executeReply` do): final-answer classification keys on the
15
+ * MODEL'S ORIGINAL INTENT (`args.disable_notification`), not the
16
+ * over-ping-downgraded send value. The actual SEND still honours the
17
+ * downgrade — only the classification is decoupled.
18
+ *
19
+ * `executeReply` itself isn't unit-callable (it lives in the 22k-line
20
+ * gateway), so this ties the two real pure modules together to reproduce the
21
+ * exact failing sequence and assert the invariant.
22
+ */
23
+
24
+ import { describe, it, expect } from 'vitest'
25
+ import { decideOverPing } from '../over-ping-safety-net.js'
26
+ import { isFinalAnswerReply, isSubstantiveFinalReply } from '../final-answer-detect.js'
27
+
28
+ describe('#2533 — over-ping downgrade must not pollute final-answer classification', () => {
29
+ // The midturn-silent-dm failing sequence: an interim ack pings, then a
30
+ // SHORT final answer the model also intended to ping.
31
+ const ACK = { text: 'On it.', modelWantsPing: true }
32
+ const FINAL = { text: 'Hostname is example-host.', modelWantsPing: true } // <200 chars
33
+
34
+ it('reproduces the sequence: ack claims the ping slot, the short final gets over-ping-suppressed', () => {
35
+ // Beat 1 — the ack pings; first ping of the turn claims the slot (not suppressed).
36
+ const ackDecision = decideOverPing({ modelRequestedPing: ACK.modelWantsPing, firstPingAt: null, nowMs: 1_000 })
37
+ expect(ackDecision.suppress).toBe(false)
38
+ expect(ackDecision.claimSlot).toBe(true)
39
+ const firstPingAt = 1_000
40
+
41
+ // Beat 2 — the real final answer also wants to ping, but the slot is taken → SUPPRESS.
42
+ const finalDecision = decideOverPing({ modelRequestedPing: FINAL.modelWantsPing, firstPingAt, nowMs: 2_000 })
43
+ expect(finalDecision.suppress).toBe(true) // the gateway would downgrade disable_notification:false → true
44
+ })
45
+
46
+ it('classifying on the MODEL INTENT marks the suppressed short final as final (correct)', () => {
47
+ // This is what the gateway MUST do (#2533): use args.disable_notification,
48
+ // NOT the over-ping-downgraded value.
49
+ const modelDisableNotification = !FINAL.modelWantsPing // model wanted to ping → false
50
+ expect(
51
+ isFinalAnswerReply({ text: FINAL.text, disableNotification: modelDisableNotification }),
52
+ ).toBe(true) // delivered final → finalAnswerDelivered=true → no spurious re-prompt, no false 😐
53
+ })
54
+
55
+ it('classifying on the DOWNGRADED value misclassifies it as not-final (the bug #2533 fixed)', () => {
56
+ // The over-ping net forced disable_notification → true. If classification
57
+ // reads THAT (the pre-#2533 bug), the short non-done final is seen as an
58
+ // interim ack → finalAnswerDelivered stays false → spurious re-prompt + 😐.
59
+ const downgradedDisableNotification = true
60
+ expect(
61
+ isFinalAnswerReply({ text: FINAL.text, disableNotification: downgradedDisableNotification }),
62
+ ).toBe(false) // <-- this WRONG classification is exactly what the gateway must NOT produce
63
+ })
64
+
65
+ it('a genuinely-silent interim ack (model set disable_notification:true) is still NOT final — fix does not over-correct', () => {
66
+ // The decoupling must not turn EVERY reply final: a short reply the MODEL
67
+ // marked silent (a real interim ack) still classifies non-final on model intent.
68
+ const modelSilentAck = true
69
+ expect(
70
+ isFinalAnswerReply({ text: 'looking into that…', disableNotification: modelSilentAck }),
71
+ ).toBe(false)
72
+ })
73
+
74
+ it('a long over-ping-suppressed answer was already final regardless (length backstop) — fix matters for SHORT finals', () => {
75
+ const long = 'x'.repeat(250)
76
+ // Even classifying on the downgraded value, length ≥200 makes it final — so
77
+ // the bug only ever bit SHORT over-ping-suppressed finals (the #2533 case).
78
+ expect(isFinalAnswerReply({ text: long, disableNotification: true })).toBe(true)
79
+ expect(isFinalAnswerReply({ text: long, disableNotification: false })).toBe(true)
80
+ })
81
+ })
82
+
83
+ /**
84
+ * Notification ownership (R8 / PR-2 — design
85
+ * `docs/message-emission-determinism.md` §over-ping). The substantive final answer must OWN the
86
+ * turn's single device ping. The residual the bare "first ping wins" rule
87
+ * left: an interim ack pings first and claims the slot, so the later
88
+ * substantive answer is downgraded to silent — "the reply is last but the
89
+ * phone never buzzed for the answer." `decideOverPing` is now aware of WHO
90
+ * holds the slot (`firstPingWasSubstantive`) and WHO is asking
91
+ * (`substantive`) and UPGRADES a substantive answer over an ack's slot,
92
+ * while still suppressing every double-ping the #1674 guard exists for.
93
+ *
94
+ * The 2×2 ownership matrix (model wants to ping, slot already held); the tests below label its four cells "row 1"–"row 4":
95
+ *
96
+ * incoming \ slot held by │ ACK (non-substantive) │ SUBSTANTIVE
97
+ * ────────────────────────┼───────────────────────┼─────────────
98
+ * SUBSTANTIVE answer │ UPGRADE (ping, claim) │ suppress (#1674)
99
+ * ACK │ suppress (orig) │ suppress
100
+ */
101
+ describe('R8 / PR-2 — substantive final answer OWNS the turn ping (upgrade matrix)', () => {
102
+ const SUBSTANTIVE = 'x'.repeat(300) // ≥200 → isSubstantiveFinalReply true
103
+ const ACK = 'On it.' // <200, non-done → ack
104
+
105
+ it('row 1 — substantive answer pinging over an ACK-held slot ⇒ UPGRADE (not suppressed)', () => {
106
+ // The ack pinged first (claimed the slot, non-substantive).
107
+ const ack = decideOverPing({
108
+ modelRequestedPing: true,
109
+ firstPingAt: null,
110
+ substantive: isSubstantiveFinalReply({ text: ACK, disableNotification: false }),
111
+ nowMs: 1_000,
112
+ })
113
+ expect(ack.claimSlot).toBe(true)
114
+ expect(ack.upgrade).toBe(false)
115
+ // Now the substantive answer wants to ping; the slot is ack-held.
116
+ const answer = decideOverPing({
117
+ modelRequestedPing: true,
118
+ firstPingAt: 1_000,
119
+ substantive: isSubstantiveFinalReply({ text: SUBSTANTIVE, disableNotification: false }),
120
+ firstPingWasSubstantive: false, // the ack
121
+ nowMs: 2_000,
122
+ })
123
+ expect(answer.suppress).toBe(false) // the ANSWER pings — phone buzzes for the answer
124
+ expect(answer.claimSlot).toBe(true) // slot upgraded to substantive
125
+ expect(answer.upgrade).toBe(true)
126
+ })
127
+
128
+ it('row 2 — ACK pinging over a SUBSTANTIVE-held slot ⇒ suppress (no double-ping after the answer)', () => {
129
+ const d = decideOverPing({
130
+ modelRequestedPing: true,
131
+ firstPingAt: 1_000,
132
+ substantive: isSubstantiveFinalReply({ text: ACK, disableNotification: false }),
133
+ firstPingWasSubstantive: true, // the real answer already owned the slot
134
+ nowMs: 2_000,
135
+ })
136
+ expect(d.suppress).toBe(true)
137
+ expect(d.claimSlot).toBe(false)
138
+ expect(d.upgrade).toBe(false)
139
+ })
140
+
141
+ it('row 3 — SUBSTANTIVE over a SUBSTANTIVE-held slot ⇒ suppress (preserves the #1674 model-double-ping guard)', () => {
142
+ // The reproducer #1674 targeted: a substantive answer pinged, then a
143
+ // substantive wrap-up also wants to ping. One beep, not two.
144
+ const d = decideOverPing({
145
+ modelRequestedPing: true,
146
+ firstPingAt: 30_000,
147
+ substantive: isSubstantiveFinalReply({ text: SUBSTANTIVE, disableNotification: false }),
148
+ firstPingWasSubstantive: true,
149
+ nowMs: 36_000,
150
+ })
151
+ expect(d.suppress).toBe(true)
152
+ expect(d.upgrade).toBe(false)
153
+ expect(d.sinceFirstPingMs).toBe(6_000)
154
+ })
155
+
156
+ it('row 4 — ACK over an ACK-held slot ⇒ suppress (original one-ping-per-turn behaviour, unchanged)', () => {
157
+ const d = decideOverPing({
158
+ modelRequestedPing: true,
159
+ firstPingAt: 1_000,
160
+ substantive: isSubstantiveFinalReply({ text: ACK, disableNotification: false }),
161
+ firstPingWasSubstantive: false,
162
+ nowMs: 2_000,
163
+ })
164
+ expect(d.suppress).toBe(true)
165
+ expect(d.claimSlot).toBe(false)
166
+ expect(d.upgrade).toBe(false)
167
+ })
168
+
169
+ it('the upgrade fires AT MOST once: after an upgrade, a further ack does NOT re-upgrade', () => {
170
+ // ack pings (slot=ack) → answer upgrades (slot=substantive) → a trailing
171
+ // ack must now suppress, not ping a third time.
172
+ const trailingAck = decideOverPing({
173
+ modelRequestedPing: true,
174
+ firstPingAt: 2_000, // upgraded slot timestamp
175
+ substantive: false,
176
+ firstPingWasSubstantive: true, // slot now substantive after the upgrade
177
+ nowMs: 3_000,
178
+ })
179
+ expect(trailingAck.suppress).toBe(true)
180
+ expect(trailingAck.upgrade).toBe(false)
181
+ })
182
+
183
+ it('a substantive FIRST ping still claims (no upgrade flag) — upgrade is strictly the second-ping case', () => {
184
+ const d = decideOverPing({
185
+ modelRequestedPing: true,
186
+ firstPingAt: null, // no prior ping this turn
187
+ substantive: true,
188
+ nowMs: 1_000,
189
+ })
190
+ expect(d.claimSlot).toBe(true)
191
+ expect(d.upgrade).toBe(false) // first ping is a claim, not an upgrade
192
+ expect(d.suppress).toBe(false)
193
+ })
194
+ })
@@ -38,7 +38,7 @@ describe('decideOverPing — at-most-one-ping-per-turn safety net', () => {
38
38
  firstPingAt: null,
39
39
  nowMs: 1_000,
40
40
  })
41
- expect(d1).toEqual({ suppress: false, claimSlot: false, sinceFirstPingMs: null })
41
+ expect(d1).toEqual({ suppress: false, claimSlot: false, upgrade: false, sinceFirstPingMs: null })
42
42
 
43
43
  // Prior ping already landed — silent reply still no-op, NOT claimed
44
44
  const d2 = decideOverPing({
@@ -46,7 +46,7 @@ describe('decideOverPing — at-most-one-ping-per-turn safety net', () => {
46
46
  firstPingAt: 1_000,
47
47
  nowMs: 5_000,
48
48
  })
49
- expect(d2).toEqual({ suppress: false, claimSlot: false, sinceFirstPingMs: null })
49
+ expect(d2).toEqual({ suppress: false, claimSlot: false, upgrade: false, sinceFirstPingMs: null })
50
50
  })
51
51
 
52
52
  it('handles the edge case where firstPingAt equals nowMs (instant double-call)', () => {