npm - switchroom - Versions diffs - 0.15.44 → 0.16.4 - Mend

switchroom 0.15.44 → 0.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (150) hide show

package/dist/agent-scheduler/index.js +122 -88
package/dist/auth-broker/index.js +463 -177
package/dist/cli/autoaccept-poll.js +4842 -35
package/dist/cli/drive-write-pretool.mjs +17 -14
package/dist/cli/notion-write-pretool.mjs +117 -86
package/dist/cli/self-improve-apply-guard-pretool.mjs +626 -0
package/dist/cli/self-improve-stop.mjs +428 -0
package/dist/cli/skill-validate-pretool.mjs +72 -72
package/dist/cli/switchroom.js +3249 -1241
package/dist/cli/ui/index.html +1 -1
package/dist/host-control/main.js +2833 -355
package/dist/vault/approvals/kernel-server.js +7482 -7439
package/dist/vault/broker/server.js +11315 -11272
package/examples/minimal.yaml +1 -0
package/examples/switchroom.yaml +1 -0
package/package.json +3 -3
package/profiles/_base/start.sh.hbs +88 -1
package/profiles/_shared/execution-discipline.md.hbs +18 -0
package/profiles/default/CLAUDE.md.hbs +3 -22
package/telegram-plugin/.claude-plugin/plugin.json +2 -2
package/telegram-plugin/answer-stream-flag.ts +12 -49
package/telegram-plugin/answer-stream.ts +5 -150
package/telegram-plugin/auth-snapshot-format.ts +280 -48
package/telegram-plugin/auto-fallback-fleet.ts +44 -1
package/telegram-plugin/context-exhaustion.ts +12 -0
package/telegram-plugin/demo-mask.ts +154 -0
package/telegram-plugin/dist/bridge/bridge.js +167 -124
package/telegram-plugin/dist/gateway/gateway.js +3039 -1159
package/telegram-plugin/dist/server.js +215 -172
package/telegram-plugin/docs/waiting-ux-spec.md +2 -2
package/telegram-plugin/draft-stream.ts +47 -410
package/telegram-plugin/final-answer-detect.ts +17 -12
package/telegram-plugin/fleet-fallback-resume.ts +131 -0
package/telegram-plugin/format.ts +56 -19
package/telegram-plugin/gateway/auth-add-flow.ts +332 -127
package/telegram-plugin/gateway/auth-broker-client.ts +2 -2
package/telegram-plugin/gateway/auth-command.ts +70 -14
package/telegram-plugin/gateway/clean-shutdown-marker.ts +44 -0
package/telegram-plugin/gateway/config-approval-handler.test.ts +91 -4
package/telegram-plugin/gateway/config-approval-handler.ts +94 -13
package/telegram-plugin/gateway/current-turn-map.ts +188 -0
package/telegram-plugin/gateway/disconnect-flush.ts +3 -1
package/telegram-plugin/gateway/effort-command.ts +8 -3
package/telegram-plugin/gateway/emission-authority.ts +369 -0
package/telegram-plugin/gateway/feed-open-gate.ts +292 -0
package/telegram-plugin/gateway/gateway.ts +1837 -291
package/telegram-plugin/gateway/inject-handler.test.ts +2 -1
package/telegram-plugin/gateway/ms365-write-approval.test.ts +4 -4
package/telegram-plugin/gateway/represent-guard.ts +72 -0
package/telegram-plugin/gateway/status-surface-log.test.ts +5 -4
package/telegram-plugin/gateway/status-surface-log.ts +14 -3
package/telegram-plugin/history.ts +33 -11
package/telegram-plugin/hooks/repo-context-pretool.mjs +26 -0
package/telegram-plugin/hooks/subagent-tracker-posttool.mjs +5 -0
package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +8 -0
package/telegram-plugin/hooks/tool-label-pretool.mjs +39 -15
package/telegram-plugin/issues-card.ts +4 -0
package/telegram-plugin/model-unavailable.ts +124 -0
package/telegram-plugin/narrative-dedup.ts +69 -0
package/telegram-plugin/over-ping-safety-net.ts +70 -4
package/telegram-plugin/package.json +3 -3
package/telegram-plugin/pending-work-progress.ts +12 -0
package/telegram-plugin/permission-rule.ts +32 -5
package/telegram-plugin/permission-title.ts +152 -9
package/telegram-plugin/quota-check.ts +13 -0
package/telegram-plugin/quota-watch.ts +135 -7
package/telegram-plugin/registry/turns-schema.test.ts +24 -0
package/telegram-plugin/registry/turns-schema.ts +9 -0
package/telegram-plugin/runtime-metrics.ts +13 -0
package/telegram-plugin/session-tail.ts +96 -11
package/telegram-plugin/silence-poke.ts +170 -24
package/telegram-plugin/slot-banner-driver.ts +3 -0
package/telegram-plugin/status-no-truncate.ts +44 -0
package/telegram-plugin/status-reactions.ts +20 -3
package/telegram-plugin/stream-controller.ts +4 -23
package/telegram-plugin/stream-reply-handler.ts +6 -24
package/telegram-plugin/streaming-metrics.ts +91 -0
package/telegram-plugin/subagent-watcher.ts +212 -66
package/telegram-plugin/tests/activity-ever-opened-sticky.test.ts +47 -0
package/telegram-plugin/tests/answer-stream-dedup.test.ts +9 -26
package/telegram-plugin/tests/answer-stream-flag.test.ts +25 -58
package/telegram-plugin/tests/answer-stream-silent-markers.test.ts +41 -51
package/telegram-plugin/tests/answer-stream.test.ts +2 -411
package/telegram-plugin/tests/auth-add-flow.test.ts +488 -253
package/telegram-plugin/tests/auth-command-format2.test.ts +71 -1
package/telegram-plugin/tests/auth-snapshot-format.test.ts +376 -6
package/telegram-plugin/tests/auto-fallback-fleet.test.ts +120 -0
package/telegram-plugin/tests/cross-turn-card-gate.test.ts +424 -0
package/telegram-plugin/tests/demo-mask.test.ts +127 -0
package/telegram-plugin/tests/draft-stream.test.ts +0 -827
package/telegram-plugin/tests/emission-authority-card-drain-gate.test.ts +236 -0
package/telegram-plugin/tests/emission-authority-facade.test.ts +488 -0
package/telegram-plugin/tests/emission-authority-open-gate.test.ts +179 -0
package/telegram-plugin/tests/emission-authority-ping-gate.test.ts +395 -0
package/telegram-plugin/tests/emission-determinism-wiring.test.ts +177 -0
package/telegram-plugin/tests/feed-heartbeat-liveness-open.test.ts +146 -0
package/telegram-plugin/tests/feed-open-gate.test.ts +259 -0
package/telegram-plugin/tests/feed-survival.test.ts +526 -0
package/telegram-plugin/tests/fleet-fallback-resume.test.ts +197 -0
package/telegram-plugin/tests/gateway-clean-shutdown-marker.test.ts +117 -0
package/telegram-plugin/tests/gateway-no-reply-single-emit.test.ts +4 -11
package/telegram-plugin/tests/history.test.ts +60 -0
package/telegram-plugin/tests/model-unavailable.test.ts +118 -0
package/telegram-plugin/tests/narrative-dedup.test.ts +118 -0
package/telegram-plugin/tests/orphaned-reply-rearm.test.ts +285 -0
package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts +194 -0
package/telegram-plugin/tests/over-ping-safety-net.test.ts +2 -2
package/telegram-plugin/tests/per-topic-current-turn.test.ts +373 -0
package/telegram-plugin/tests/permission-card-origin-kill-switch.test.ts +42 -0
package/telegram-plugin/tests/permission-rule.test.ts +17 -0
package/telegram-plugin/tests/permission-title.test.ts +206 -17
package/telegram-plugin/tests/quota-watch.test.ts +252 -9
package/telegram-plugin/tests/reply-terminal-reaction.test.ts +6 -1
package/telegram-plugin/tests/repo-context-pretool.test.ts +62 -0
package/telegram-plugin/tests/represent-guard.test.ts +162 -0
package/telegram-plugin/tests/session-tail.test.ts +147 -3
package/telegram-plugin/tests/silence-liveness-wiring.test.ts +18 -0
package/telegram-plugin/tests/status-card-budget-parity.test.ts +72 -0
package/telegram-plugin/tests/status-surface-log.test.ts +146 -0
package/telegram-plugin/tests/subagent-watcher-clip-narrative.test.ts +58 -0
package/telegram-plugin/tests/subagent-watcher-parent-turn-key.test.ts +102 -0
package/telegram-plugin/tests/subagent-watcher-workflow-visibility.test.ts +225 -0
package/telegram-plugin/tests/subagent-watcher.test.ts +147 -0
package/telegram-plugin/tests/telegram-activity-visibility-integration.test.ts +597 -0
package/telegram-plugin/tests/telegram-format.test.ts +101 -6
package/telegram-plugin/tests/tool-activity-summary.test.ts +550 -15
package/telegram-plugin/tests/tool-label-pretool.test.ts +73 -0
package/telegram-plugin/tests/tool-label-sidecar.test.ts +44 -0
package/telegram-plugin/tests/tool-labels.test.ts +67 -0
package/telegram-plugin/tests/turn-liveness-floor.test.ts +196 -0
package/telegram-plugin/tests/turn-liveness-invariant.test.ts +340 -0
package/telegram-plugin/tests/welcome-text.test.ts +32 -3
package/telegram-plugin/tests/worker-activity-feed.test.ts +470 -22
package/telegram-plugin/tool-activity-summary.ts +375 -58
package/telegram-plugin/turn-liveness-floor.ts +240 -0
package/telegram-plugin/uat/assertions.ts +115 -0
package/telegram-plugin/uat/driver.ts +68 -0
package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +119 -133
package/telegram-plugin/uat/scenarios/jtbd-answer-pings.test.ts +94 -0
package/telegram-plugin/uat/scenarios/jtbd-cross-turn-card-dm.test.ts +109 -0
package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-thinkgap-dm.test.ts +478 -0
package/telegram-plugin/uat/scenarios/jtbd-foreground-feed-visibility-dm.test.ts +396 -0
package/telegram-plugin/uat/scenarios/jtbd-liveness-feed-open-dm.test.ts +202 -0
package/telegram-plugin/uat/scenarios/jtbd-reply-is-last-dm.test.ts +202 -0
package/telegram-plugin/uat/scenarios/reactions-dm.test.ts +93 -87
package/telegram-plugin/welcome-text.ts +13 -1
package/telegram-plugin/worker-activity-feed.ts +157 -82
package/telegram-plugin/draft-transport.ts +0 -122
package/telegram-plugin/tests/draft-retirement-wiring.test.ts +0 -82
package/telegram-plugin/tests/draft-transport.test.ts +0 -211

package/telegram-plugin/tests/narrative-dedup.test.ts ADDED Viewed

@@ -0,0 +1,118 @@
+import { describe, it, expect } from 'vitest'
+import {
+  normalizeNarrative,
+  prefixSimilarity,
+  isDraftOfReply,
+  DRAFT_SUPPRESS_THRESHOLD,
+  REPLY_TOOLS,
+} from '../narrative-dedup.js'
+describe('narrative-dedup', () => {
+  it('pins the threshold so a silent retune breaks the test', () => {
+    expect(DRAFT_SUPPRESS_THRESHOLD).toBe(0.8)
+  })
+  it('REPLY_TOOLS holds exactly reply + stream_reply', () => {
+    // Pin the complete membership: has()-only checks would still pass if a third tool were added.
+    expect(Array.from(REPLY_TOOLS).sort()).toEqual(['reply', 'stream_reply'])
+    expect(REPLY_TOOLS.has('Bash')).toBe(false)
+  })
+  describe('normalizeNarrative', () => {
+    it('strips markdown emphasis/heading/quote marks, collapses whitespace, lowercases', () => {
+      const rawSamples = ['**Bold**  _italic_   `code`', '> # Heading\n  text']
+      expect(rawSamples.map((raw) => normalizeNarrative(raw))).toEqual(['bold italic code', 'heading text'])
+    })
+  })
+  describe('prefixSimilarity', () => {
+    it('returns 1 for identical strings', () => {
+      expect(prefixSimilarity('hello there', 'hello there')).toBe(1)
+    })
+    it('returns 0 when either side is empty (no divide-by-zero)', () => {
+      for (const [left, right] of [['', 'something'], ['something', ''], ['', '']] as const) {
+        expect(prefixSimilarity(left, right)).toBe(0)
+      }
+    })
+    it('ratio is over the SHORTER normalized string', () => {
+      // 'abc' is entirely a prefix of 'abcdef': 3 shared chars / shorter length 3 → exactly 1.
+      expect(prefixSimilarity('abc', 'abcdef')).toBe(1)
+      // 'abx' diverges at its third char: 2 shared chars / shorter length 3 → about 0.667.
+      expect(prefixSimilarity('abx', 'abcdef')).toBeCloseTo(2 / 3, 5)
+    })
+  })
+  describe('isDraftOfReply', () => {
+    it('SUPPRESS: identical draft and reply', () => {
+      const t = 'The repo is at /home/user/code/switchroom.'
+      expect(isDraftOfReply(t, t)).toBe(true)
+    })
+    it('SUPPRESS: draft whose trailing sentence was trimmed before sending (~0.85 prefix)', () => {
+      const draft = 'The repo is at /home/user/code/switchroom. I will start now.'
+      const reply = 'The repo is at /home/user/code/switchroom.'
+      // The reply is the shorter string and the draft begins with all of it → ratio is exactly 1.
+      expect(prefixSimilarity(draft, reply)).toBe(1)
+      expect(isDraftOfReply(draft, reply)).toBe(true)
+      // Second pair: only the last word before the full stop was trimmed, so the strings differ only at their tails.
+      const draft2 = 'Found both repos and confirmed the remote is correct here.'
+      const reply2 = 'Found both repos and confirmed the remote is correct.'
+      expect(prefixSimilarity(draft2, reply2)).toBeGreaterThanOrEqual(0.85)
+      expect(isDraftOfReply(draft2, reply2)).toBe(true)
+    })
+    it('SHOW: post-action narration that merely precedes a different reply', () => {
+      // The narration and the reply start with different words ("Sent. …" vs "Here's …"),
+      // so the prefix ratio stays below DRAFT_SUPPRESS_THRESHOLD and the narration is SHOWN.
+      const narration = 'Sent. Waiting on the build…'
+      const reply = "Here's the result of the build: all green."
+      expect(prefixSimilarity(narration, reply)).toBeLessThan(DRAFT_SUPPRESS_THRESHOLD)
+      expect(isDraftOfReply(narration, reply)).toBe(false)
+    })
+    it('SHOW: empty reply text never suppresses (no divide-by-zero)', () => {
+      expect(isDraftOfReply('On it. Let me find the repo…', '')).toBe(false)
+    })
+    it('SUPPRESS: draft differs from reply only by markdown decoration', () => {
+      const draft = 'Here is the **plan**: do A then B.'
+      const reply = 'Here is the plan: do A then B.'
+      // Normalization removes the ** markers, so both sides normalize to the same text → suppress.
+      expect(normalizeNarrative(draft)).toBe(normalizeNarrative(reply))
+      expect(isDraftOfReply(draft, reply)).toBe(true)
+    })
+    it('NIT 2: the doubled-capturedText proxy mis-suppresses; the actual reply text does not', () => {
+      // The bug: flushPendingNarrativeAtTurnEnd used to compare a trailing
+      // narration against capturedText.join(''). When the model emits the same
+      // short string twice in a turn — e.g. "Done." as working narration and
+      // then "Done." as the reply — that proxy becomes the CONCATENATION
+      // "Done.Done.", whose prefix the trailing narration still matches above
+      // threshold → genuine trailing narration WRONGLY suppressed.
+      const trailing = 'Done.'
+      const doubledProxy = 'Done.' + 'Done.' // capturedText.join('') of two "Done." blocks
+      const actualReply = 'Done.'
+      // Old (broken) comparison: trailing vs the doubled proxy → suppressed (the mis-suppression described above).
+      expect(isDraftOfReply(trailing, doubledProxy)).toBe(true)
+      // New comparison: trailing vs the ACTUAL reply text. Here the reply text
+      // really IS "Done.", so a trailing "Done." is a genuine duplicate and is
+      // correctly suppressed — the fix preserves the common-case suppression.
+      expect(isDraftOfReply(trailing, actualReply)).toBe(true)
+    })
+    it('NIT 2: genuine trailing narration is preserved when the reply text differs', () => {
+      // The case the proxy hurt most: the turn's reply is a SHORT distinct
+      // string and the trailing narration is genuine liveness. Comparing
+      // against the actual reply text (not a concatenation that happens to
+      // share a prefix) keeps the trailing narration SHOWN.
+      const trailingNarration = 'Done — all green, pushing now.'
+      const actualReply = 'Here is the summary you asked for: 3 files changed.'
+      // isDraftOfReply is false against the real reply text → narration is SHOWN, not suppressed.
+      expect(isDraftOfReply(trailingNarration, actualReply)).toBe(false)
+    })
+  })
+})

package/telegram-plugin/tests/orphaned-reply-rearm.test.ts ADDED Viewed

@@ -0,0 +1,285 @@
+/**
+ * Unit tests for the activity-feed-teardown fix (orphaned-reply backstop).
+ *
+ * Root cause: the orphaned-reply backstop fired a synthetic turn_end
+ * (`durationMs: -1`) after 30 s of silence, even mid-tool-call. That nulled
+ * `currentTurn` and dropped every subsequent `tool_label`, darkening the live
+ * activity feed for the rest of the turn.
+ *
+ * Fix: three layers described in the PR.
+ *   PRIMARY   — fuse fires mid-tool → re-arm instead (bounded by ORPHANED_REPLY_MAX_REARMS).
+ *   SECONDARY — tool_label re-arms the fuse so active label streams keep it fresh.
+ *   DEFENSIVE — turn_end entry rejects the synthetic event if tools are in flight.
+ *
+ * These tests cover the pure / unit-testable surfaces:
+ *   - shouldArmOrphanedReplyTimeout (existing surface; called here without any midToolCall input)
+ *   - ORPHANED_REPLY_MAX_REARMS constant math
+ *   - The re-arm guard logic (pure decision extracted from the closure)
+ *   - The defensive turn_end discriminator (durationMs === -1 + in-flight check)
+ */
+import { describe, it, expect } from 'vitest'
+import {
+  shouldArmOrphanedReplyTimeout,
+  ORPHANED_REPLY_TIMEOUT_MS,
+  ORPHANED_REPLY_MAX_REARMS,
+} from '../context-exhaustion.js'
+import { ToolFlightTracker } from '../gateway/interrupt-defer.js'
+// ---------------------------------------------------------------------------
+// Helpers — pure decision functions mirroring the gateway closure logic.
+// These extract the discriminable parts of the fix so they are unit-testable
+// without instantiating the full gateway.
+// ---------------------------------------------------------------------------
+/**
+ * PRIMARY fix, as decided in the setTimeout callback: true = re-arm the backstop, false = fire turn_end.
+ */
+function shouldRearmInsteadOfFire(opts: {
+  midToolCall: boolean
+  rearmCount: number
+  maxRearms: number
+}): boolean {
+  const underCap = opts.rearmCount < opts.maxRearms
+  return opts.midToolCall && underCap
+}
+/**
+ * DEFENSIVE fix at turn_end entry: true when the event is synthetic (durationMs === -1) and a tool is in flight.
+ */
+function shouldSuppressSyntheticTurnEnd(opts: {
+  durationMs: number
+  midToolCall: boolean
+}): boolean {
+  const isSynthetic = opts.durationMs === -1
+  return isSynthetic && opts.midToolCall
+}
+// ---------------------------------------------------------------------------
+// Tests: ORPHANED_REPLY_MAX_REARMS constant
+// ---------------------------------------------------------------------------
+describe('ORPHANED_REPLY_MAX_REARMS', () => {
+  it('is 20 (20 × 30 s = 10 min cap)', () => {
+    expect(ORPHANED_REPLY_MAX_REARMS).toBe(20)
+  })
+  it('combined with ORPHANED_REPLY_TIMEOUT_MS covers at least 10 min of tool activity', () => {
+    const tenMinutesMs = 10 * 60 * 1000
+    // Total window = re-arm budget × fuse length (20 × 30 000 ms = 600 000 ms).
+    expect(ORPHANED_REPLY_MAX_REARMS * ORPHANED_REPLY_TIMEOUT_MS).toBeGreaterThanOrEqual(tenMinutesMs)
+  })
+  it('fuse duration is still 30 s', () => {
+    expect(ORPHANED_REPLY_TIMEOUT_MS).toBe(30_000)
+  })
+})
+// ---------------------------------------------------------------------------
+// Tests: PRIMARY fix — re-arm guard
+// ---------------------------------------------------------------------------
+describe('PRIMARY fix: re-arm guard (shouldRearmInsteadOfFire)', () => {
+  // Positional shorthand for the guard; the cap defaults to the literal 20 used by the fixed-cap cases.
+  const decide = (midToolCall: boolean, rearmCount: number, maxRearms = 20) => shouldRearmInsteadOfFire({ midToolCall, rearmCount, maxRearms })
+  it('re-arms when a tool is in flight and rearm count is under the cap', () => {
+    expect(decide(true, 0)).toBe(true)
+    expect(decide(true, 19)).toBe(true)
+  })
+  it('fires once rearm count reaches the cap, even mid-tool-call', () => {
+    expect(decide(true, 20)).toBe(false)
+    expect(decide(true, 21)).toBe(false)
+  })
+  it('fires immediately when no tool is in flight, regardless of rearm count', () => {
+    expect(decide(false, 0)).toBe(false)
+    expect(decide(false, 5)).toBe(false)
+  })
+  it('rearm count transitions: 0 → cap-1 → cap fires', () => {
+    const cap = ORPHANED_REPLY_MAX_REARMS
+    for (let count = 0; count <= cap; count++) {
+      expect(decide(true, count, cap)).toBe(count < cap) // re-arm below the cap, fire at it
+    }
+  })
+})
+// ---------------------------------------------------------------------------
+// Tests: DEFENSIVE fix — synthetic turn_end suppressor
+// ---------------------------------------------------------------------------
+describe('DEFENSIVE fix: synthetic turn_end suppressor', () => {
+  it('suppresses a synthetic turn_end (durationMs === -1) when tools are in flight', () => {
+    expect(shouldSuppressSyntheticTurnEnd({ durationMs: -1, midToolCall: true })).toBe(true)
+  })
+  it('does NOT suppress a synthetic turn_end when no tools are in flight', () => {
+    // With nothing in flight the synthetic event is let through (the turn is genuinely orphaned).
+    expect(shouldSuppressSyntheticTurnEnd({ durationMs: -1, midToolCall: false })).toBe(false)
+  })
+  it('does NOT suppress an authoritative turn_end (durationMs >= 0)', () => {
+    for (const durationMs of [0, 1, 12345]) {
+      expect(shouldSuppressSyntheticTurnEnd({ durationMs, midToolCall: true })).toBe(false)
+    }
+    expect(shouldSuppressSyntheticTurnEnd({ durationMs: 0, midToolCall: false })).toBe(false)
+  })
+  it('only durationMs === -1 is the synthetic discriminator', () => {
+    // Neighbouring negative values must not be mistaken for the -1 sentinel.
+    const nearMisses = [-2, -0.5].map((durationMs) => shouldSuppressSyntheticTurnEnd({ durationMs, midToolCall: true }))
+    expect(nearMisses).toEqual([false, false])
+  })
+})
+// ---------------------------------------------------------------------------
+// Tests: ToolFlightTracker integration with the guard logic
+// ---------------------------------------------------------------------------
+describe('ToolFlightTracker + guard integration', () => {
+  it('re-arm fires when a Bash tool is in flight', () => {
+    const tracker = new ToolFlightTracker()
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_1' })
+    expect(shouldRearmInsteadOfFire({
+      midToolCall: tracker.isMidToolCall(),
+      rearmCount: 0,
+      maxRearms: ORPHANED_REPLY_MAX_REARMS,
+    })).toBe(true)
+  })
+  it('fires normally after tool_result completes the tool', () => {
+    const tracker = new ToolFlightTracker()
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_1' })
+    tracker.onEvent({ kind: 'tool_result', toolUseId: 'bash_1' })
+    expect(shouldRearmInsteadOfFire({
+      midToolCall: tracker.isMidToolCall(),
+      rearmCount: 0,
+      maxRearms: ORPHANED_REPLY_MAX_REARMS,
+    })).toBe(false)
+  })
+  it('defensive guard suppresses synthetic turn_end mid-Bash', () => {
+    const tracker = new ToolFlightTracker()
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_2' })
+    expect(shouldSuppressSyntheticTurnEnd({
+      durationMs: -1,
+      midToolCall: tracker.isMidToolCall(),
+    })).toBe(true)
+  })
+  it('defensive guard allows synthetic turn_end after all tools complete', () => {
+    const tracker = new ToolFlightTracker()
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'bash_2' })
+    tracker.onEvent({ kind: 'tool_result', toolUseId: 'bash_2' })
+    expect(shouldSuppressSyntheticTurnEnd({
+      durationMs: -1,
+      midToolCall: tracker.isMidToolCall(),
+    })).toBe(false)
+  })
+  it('parallel tools: re-arm persists while ANY tool is in flight', () => {
+    const tracker = new ToolFlightTracker()
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'read_1' })
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'read_2' })
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'edit_1' })
+    // Three tool_use events and no tool_result yet → the guard re-arms.
+    expect(shouldRearmInsteadOfFire({
+      midToolCall: tracker.isMidToolCall(),
+      rearmCount: 0,
+      maxRearms: ORPHANED_REPLY_MAX_REARMS,
+    })).toBe(true)
+    // Close read_1 and read_2; edit_1 stays open.
+    tracker.onEvent({ kind: 'tool_result', toolUseId: 'read_1' })
+    tracker.onEvent({ kind: 'tool_result', toolUseId: 'read_2' })
+    // One open tool is enough to keep re-arming.
+    expect(shouldRearmInsteadOfFire({
+      midToolCall: tracker.isMidToolCall(),
+      rearmCount: 1,
+      maxRearms: ORPHANED_REPLY_MAX_REARMS,
+    })).toBe(true)
+    // Close the last tool: nothing is in flight any more, so the guard fires.
+    tracker.onEvent({ kind: 'tool_result', toolUseId: 'edit_1' })
+    expect(shouldRearmInsteadOfFire({
+      midToolCall: tracker.isMidToolCall(),
+      rearmCount: 2,
+      maxRearms: ORPHANED_REPLY_MAX_REARMS,
+    })).toBe(false)
+  })
+  it('cap fires even mid-tool after 20 re-arms (wedged tool surfaces)', () => {
+    const tracker = new ToolFlightTracker()
+    tracker.onEvent({ kind: 'tool_use', toolUseId: 'hung_bash' })
+    // rearmCount 0 … cap-1: every one of these expiries re-arms.
+    for (let i = 0; i < ORPHANED_REPLY_MAX_REARMS; i++) {
+      expect(shouldRearmInsteadOfFire({
+        midToolCall: tracker.isMidToolCall(),
+        rearmCount: i,
+        maxRearms: ORPHANED_REPLY_MAX_REARMS,
+      })).toBe(true)
+    }
+    // rearmCount === cap: the budget is spent, so the guard fires although the tool never completed.
+    expect(shouldRearmInsteadOfFire({
+      midToolCall: tracker.isMidToolCall(),
+      rearmCount: ORPHANED_REPLY_MAX_REARMS,
+      maxRearms: ORPHANED_REPLY_MAX_REARMS,
+    })).toBe(false)
+  })
+})
+// ---------------------------------------------------------------------------
+// Tests: shouldArmOrphanedReplyTimeout (existing surface, unchanged)
+// ---------------------------------------------------------------------------
+describe('shouldArmOrphanedReplyTimeout (existing — unchanged by this fix)', () => {
+  it('arms when conditions are met', () => {
+    // Chat id present, one text block captured, no reply yet.
+    const armed = shouldArmOrphanedReplyTimeout({
+      currentSessionChatId: '123',
+      capturedTextCount: 1,
+      replyCalled: false,
+    })
+    expect(armed).toBe(true)
+  })
+  it('does not arm after reply has been called', () => {
+    // replyCalled is the only field that differs from an arming input.
+    const armed = shouldArmOrphanedReplyTimeout({
+      currentSessionChatId: '123',
+      capturedTextCount: 5,
+      replyCalled: true,
+    })
+    expect(armed).toBe(false)
+  })
+  it('does not arm when no chat is active', () => {
+    // currentSessionChatId is null.
+    const armed = shouldArmOrphanedReplyTimeout({
+      currentSessionChatId: null,
+      capturedTextCount: 1,
+      replyCalled: false,
+    })
+    expect(armed).toBe(false)
+  })
+  it('does not arm when no text captured yet', () => {
+    // capturedTextCount is 0.
+    const armed = shouldArmOrphanedReplyTimeout({
+      currentSessionChatId: '123',
+      capturedTextCount: 0,
+      replyCalled: false,
+    })
+    expect(armed).toBe(false)
+  })
+})

package/telegram-plugin/tests/over-ping-final-answer-decoupling.test.ts ADDED Viewed

@@ -0,0 +1,194 @@
+/**
+ * Regression contract for #2533: the over-ping anti-spam downgrade MUST NOT
+ * pollute final-answer classification.
+ *
+ * The bug (surfaced by the `midturn-silent-dm` UAT against v0.15.57): in
+ * `executeReply`, the #1675 over-ping safety net reassigns the local
+ * `disableNotification` to `true` to silence a 2nd+ ping in a turn, and the
+ * final-answer classifier was then read with that *downgraded* value. So a
+ * final answer the model INTENDED to ping but the anti-spam net silenced was
+ * misclassified as not-final → `finalAnswerDelivered` stayed false → a
+ * spurious silent-end re-prompt (#1664) AND a false 'undelivered' 😐 (#2530).
+ *
+ * The contract this pins (what `executeStreamReply` already did, and what
+ * #2533 made `executeReply` do): final-answer classification keys on the
+ * MODEL'S ORIGINAL INTENT (`args.disable_notification`), not the
+ * over-ping-downgraded send value. The actual SEND still honours the
+ * downgrade — only the classification is decoupled.
+ *
+ * `executeReply` itself isn't unit-callable (it lives in the 22k-line
+ * gateway), so this ties the two real pure modules together to reproduce the
+ * exact failing sequence and assert the invariant.
+ */
+import { describe, it, expect } from 'vitest'
+import { decideOverPing } from '../over-ping-safety-net.js'
+import { isFinalAnswerReply, isSubstantiveFinalReply } from '../final-answer-detect.js'
+describe('#2533 — over-ping downgrade must not pollute final-answer classification', () => { // ties decideOverPing and isFinalAnswerReply together on one ack-then-final sequence
+  // The midturn-silent-dm failing sequence: an interim ack pings, then a
+  // SHORT final answer the model also intended to ping.
+  const ACK = { text: 'On it.', modelWantsPing: true } // interim acknowledgement that asks for a ping
+  const FINAL = { text: 'Hostname is example-host.', modelWantsPing: true } // <200 chars
+  it('reproduces the sequence: ack claims the ping slot, the short final gets over-ping-suppressed', () => {
+    // Beat 1 — the ack pings; first ping of the turn claims the slot (not suppressed).
+    const ackDecision = decideOverPing({ modelRequestedPing: ACK.modelWantsPing, firstPingAt: null, nowMs: 1_000 })
+    expect(ackDecision.suppress).toBe(false)
+    expect(ackDecision.claimSlot).toBe(true)
+    const firstPingAt = 1_000 // mirrors the nowMs of the ack call that claimed the slot
+    // Beat 2 — the real final answer also wants to ping, but the slot is taken → SUPPRESS.
+    const finalDecision = decideOverPing({ modelRequestedPing: FINAL.modelWantsPing, firstPingAt, nowMs: 2_000 })
+    expect(finalDecision.suppress).toBe(true) // the gateway would downgrade disable_notification:false → true
+  })
+  it('classifying on the MODEL INTENT marks the suppressed short final as final (correct)', () => {
+    // This is what the gateway MUST do (#2533): use args.disable_notification,
+    // NOT the over-ping-downgraded value.
+    const modelDisableNotification = !FINAL.modelWantsPing // model wanted to ping → false
+    expect(
+      isFinalAnswerReply({ text: FINAL.text, disableNotification: modelDisableNotification }),
+    ).toBe(true) // delivered final → finalAnswerDelivered=true → no spurious re-prompt, no false 😐
+  })
+  it('classifying on the DOWNGRADED value misclassifies it as not-final (the bug #2533 fixed)', () => {
+    // The over-ping net forced disable_notification → true. If classification
+    // reads THAT (the pre-#2533 bug), the short non-done final is seen as an
+    // interim ack → finalAnswerDelivered stays false → spurious re-prompt + 😐.
+    const downgradedDisableNotification = true // the send value after the over-ping net silences the reply
+    expect(
+      isFinalAnswerReply({ text: FINAL.text, disableNotification: downgradedDisableNotification }),
+    ).toBe(false) // <-- this WRONG classification is exactly what the gateway must NOT produce
+  })
+  it('a genuinely-silent interim ack (model set disable_notification:true) is still NOT final — fix does not over-correct', () => {
+    // The decoupling must not turn EVERY reply final: a short reply the MODEL
+    // marked silent (a real interim ack) still classifies non-final on model intent.
+    const modelSilentAck = true // here the silence is the model's own choice, not a downgrade
+    expect(
+      isFinalAnswerReply({ text: 'looking into that…', disableNotification: modelSilentAck }),
+    ).toBe(false)
+  })
+  it('a long over-ping-suppressed answer was already final regardless (length backstop) — fix matters for SHORT finals', () => {
+    const long = 'x'.repeat(250) // 250 chars, above the 200-char length backstop
+    // Even classifying on the downgraded value, length ≥200 makes it final — so
+    // the bug only ever bit SHORT over-ping-suppressed finals (the #2533 case).
+    expect(isFinalAnswerReply({ text: long, disableNotification: true })).toBe(true)
+    expect(isFinalAnswerReply({ text: long, disableNotification: false })).toBe(true)
+  })
+})
+/**
+ * Notification ownership (R8 / PR-2 — design
+ * `docs/message-emission-determinism.md` §over-ping). The substantive final answer must OWN the
+ * turn's single device ping. The residual the bare "first ping wins" rule
+ * left: an interim ack pings first and claims the slot, so the later
+ * substantive answer is downgraded to silent — "the reply is last but the
+ * phone never buzzed for the answer." `decideOverPing` is now aware of WHO
+ * holds the slot (`firstPingWasSubstantive`) and WHO is asking
+ * (`substantive`) and UPGRADES a substantive answer over an ack's slot,
+ * while still suppressing every double-ping the #1674 guard exists for.
+ *
+ * The 2×2 ownership matrix (model wants to ping, slot already held):
+ *
+ *   incoming \ slot held by │ ACK (non-substantive) │ SUBSTANTIVE
+ *   ────────────────────────┼───────────────────────┼─────────────
+ *   SUBSTANTIVE answer       │ UPGRADE (ping, claim)  │ suppress (#1674)
+ *   ACK                      │ suppress (orig)        │ suppress
+ */
+describe('R8 / PR-2 — substantive final answer OWNS the turn ping (upgrade matrix)', () => { // one test per ownership-matrix cell, plus two edge cases
+  const SUBSTANTIVE = 'x'.repeat(300) // ≥200 → isSubstantiveFinalReply true
+  const ACK = 'On it.' // <200, non-done → ack
+  it('row 1 — substantive answer pinging over an ACK-held slot ⇒ UPGRADE (not suppressed)', () => {
+    // The ack pinged first (claimed the slot, non-substantive).
+    const ack = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: null, // no prior ping this turn, so the ack is the first claimant
+      substantive: isSubstantiveFinalReply({ text: ACK, disableNotification: false }),
+      nowMs: 1_000,
+    })
+    expect(ack.claimSlot).toBe(true)
+    expect(ack.upgrade).toBe(false)
+    // Now the substantive answer wants to ping; the slot is ack-held.
+    const answer = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: 1_000, // the ack's ping time from the call above
+      substantive: isSubstantiveFinalReply({ text: SUBSTANTIVE, disableNotification: false }),
+      firstPingWasSubstantive: false, // the ack
+      nowMs: 2_000,
+    })
+    expect(answer.suppress).toBe(false) // the ANSWER pings — phone buzzes for the answer
+    expect(answer.claimSlot).toBe(true) // slot upgraded to substantive
+    expect(answer.upgrade).toBe(true)
+  })
+  it('row 2 — ACK pinging over a SUBSTANTIVE-held slot ⇒ suppress (no double-ping after the answer)', () => {
+    const d = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: 1_000,
+      substantive: isSubstantiveFinalReply({ text: ACK, disableNotification: false }),
+      firstPingWasSubstantive: true, // the real answer already owned the slot
+      nowMs: 2_000,
+    })
+    expect(d.suppress).toBe(true)
+    expect(d.claimSlot).toBe(false)
+    expect(d.upgrade).toBe(false)
+  })
+  it('row 3 — SUBSTANTIVE over a SUBSTANTIVE-held slot ⇒ suppress (preserves the #1674 model-double-ping guard)', () => {
+    // The reproducer #1674 targeted: a substantive answer pinged, then a
+    // substantive wrap-up also wants to ping. One beep, not two.
+    const d = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: 30_000,
+      substantive: isSubstantiveFinalReply({ text: SUBSTANTIVE, disableNotification: false }),
+      firstPingWasSubstantive: true,
+      nowMs: 36_000, // 6_000 ms after firstPingAt; asserted via sinceFirstPingMs below
+    })
+    expect(d.suppress).toBe(true)
+    expect(d.upgrade).toBe(false)
+    expect(d.sinceFirstPingMs).toBe(6_000)
+  })
+  it('row 4 — ACK over an ACK-held slot ⇒ suppress (original one-ping-per-turn behaviour, unchanged)', () => {
+    const d = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: 1_000,
+      substantive: isSubstantiveFinalReply({ text: ACK, disableNotification: false }),
+      firstPingWasSubstantive: false, // slot holder was an ack too
+      nowMs: 2_000,
+    })
+    expect(d.suppress).toBe(true)
+    expect(d.claimSlot).toBe(false)
+    expect(d.upgrade).toBe(false)
+  })
+  it('the upgrade fires AT MOST once: after an upgrade, a further ack does NOT re-upgrade', () => {
+    // ack pings (slot=ack) → answer upgrades (slot=substantive) → a trailing
+    // ack must now suppress, not ping a third time.
+    const trailingAck = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: 2_000, // upgraded slot timestamp
+      substantive: false, // passed as a literal here rather than computed via isSubstantiveFinalReply
+      firstPingWasSubstantive: true, // slot now substantive after the upgrade
+      nowMs: 3_000,
+    })
+    expect(trailingAck.suppress).toBe(true)
+    expect(trailingAck.upgrade).toBe(false)
+  })
+  it('a substantive FIRST ping still claims (no upgrade flag) — upgrade is strictly the second-ping case', () => {
+    const d = decideOverPing({
+      modelRequestedPing: true,
+      firstPingAt: null, // no prior ping this turn
+      substantive: true,
+      nowMs: 1_000,
+    })
+    expect(d.claimSlot).toBe(true)
+    expect(d.upgrade).toBe(false) // first ping is a claim, not an upgrade
+    expect(d.suppress).toBe(false)
+  })
+})

package/telegram-plugin/tests/over-ping-safety-net.test.ts CHANGED Viewed

@@ -38,7 +38,7 @@ describe('decideOverPing — at-most-one-ping-per-turn safety net', () => {
       firstPingAt: null,
       nowMs: 1_000,
     })
-    expect(d1).toEqual({ suppress: false, claimSlot: false, sinceFirstPingMs: null })
+    expect(d1).toEqual({ suppress: false, claimSlot: false, upgrade: false, sinceFirstPingMs: null })
     // Prior ping already landed — silent reply still no-op, NOT claimed
     const d2 = decideOverPing({
@@ -46,7 +46,7 @@ describe('decideOverPing — at-most-one-ping-per-turn safety net', () => {
       firstPingAt: 1_000,
       nowMs: 5_000,
     })
-    expect(d2).toEqual({ suppress: false, claimSlot: false, sinceFirstPingMs: null })
+    expect(d2).toEqual({ suppress: false, claimSlot: false, upgrade: false, sinceFirstPingMs: null })
   })
   it('handles the edge case where firstPingAt equals nowMs (instant double-call)', () => {