npm - switchroom - Versions diffs - 0.13.12 → 0.13.14 - Mend

switchroom 0.13.12 → 0.13.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/cli/switchroom.js +60 -5
package/package.json +1 -1
package/telegram-plugin/dist/gateway/gateway.js +290 -88
package/telegram-plugin/final-answer-detect.ts +83 -0
package/telegram-plugin/gateway/gateway.ts +213 -11
package/telegram-plugin/hooks/silent-end-interrupt-stop.mjs +17 -5
package/telegram-plugin/pending-work-progress.ts +377 -0
package/telegram-plugin/runtime-metrics.ts +20 -0
package/telegram-plugin/silent-end.ts +37 -11
package/telegram-plugin/tests/final-answer-detect.test.ts +89 -0
package/telegram-plugin/tests/pending-work-progress.test.ts +354 -0
package/telegram-plugin/tests/silent-end.test.ts +118 -0
package/telegram-plugin/uat/scenarios/cross-turn-pending-progress-dm.test.ts +237 -0

package/telegram-plugin/pending-work-progress.ts ADDED Viewed

@@ -0,0 +1,377 @@
+/**
+ * Cross-turn pending-async progress — issue #1445.
+ *
+ * When a turn ends with pending background async work (the model
+ * dispatched `Agent` / `Task` and ended its turn before the worker
+ * returned), keep editing the model's last reply *in place* at
+ * intervals so the user sees ambient liveness during the wait — without
+ * any new pinged messages and without re-introducing the retired
+ * progress card.
+ *
+ * Background data justifying this module (2026-05-23 forensic + UAT):
+ *
+ * - silence-poke success rate is 0–7% across hundreds of fires
+ *   (finn: 0/78, clerk: 6/91, klanker: 5/158) — the polite levels
+ *   reach the model as `<system-reminder>`s piggybacked on the next
+ *   tool result, so they (a) only land if the model is actively
+ *   cycling tools, (b) compete with hundreds of other tokens, and (c)
+ *   only ever exist while the turn is open. The 300s framework
+ *   fallback is the only user-visible silence-poke output, and its
+ *   first job is to *kill the wedged turn*.
+ *
+ * - The dominant user-visible failure mode (issue #1445) is in fact
+ *   cross-turn: the model calls `Agent` (or `Bash` with
+ *   `run_in_background:true`), sends one ack reply that pings, then
+ *   ends the turn. The silence-poke ladder is *gone* the moment
+ *   endTurn() fires. The user then sees nothing for 10–30+ minutes
+ *   until the worker returns. A live UAT confirmed: a deliberate
+ *   `sleep 350` prompt produced one `[PING] Background sleep running;
+ *   awaiting completion notification.` at +19s and the turn ended.
+ *
+ * Mechanism:
+ *
+ *   tool_use(Agent|Task)        → mark chat key `pending=true`
+ *   outbound reply              → capture anchor (messageId, text)
+ *   turn_end with pending+anchor → activate the timer for the key
+ *   tick (every 5s, edit every  → editMessageText against the anchor
+ *     EDIT_INTERVAL_MS)            appending/refreshing the suffix
+ *                                  "\n\n— still working (Nm)"
+ *   inbound user message        → clear (user re-engaged or moved on)
+ *   subagent_handback inject    → clear (model about to re-engage)
+ *   MAX_LIFETIME_MS budget cap  → clear (give up; 30 min default)
+ *
+ * Single shared timer for the whole gateway — like silence-poke's
+ * `tick()`, the per-key cost is O(map size) per poll. The poll
+ * interval is short (5s) but edits are spaced at EDIT_INTERVAL_MS so
+ * the Telegram bot.api editMessageText rate stays well under limits.
+ *
+ * Edits are plain text (no parseMode). The suffix is appended to the
+ * model's authored text; on subsequent edits the prior suffix is
+ * stripped before re-appending so the message never accumulates
+ * duplicate suffixes.
+ *
+ * Kill switch: `SWITCHROOM_DISABLE_PENDING_PROGRESS=1` disables the
+ * whole subsystem. The conversational-pacing prompt is unaffected.
+ */
+/** Minimum spacing between two in-place edits of the same anchor
+ *  message; `tick` skips a key until this much time has passed since
+ *  its last edit. */
+export const EDIT_INTERVAL_MS = 60_000
+/** Default period of the shared poll timer created by `startTimer`
+ *  (overridable through `PendingProgressDeps.pollIntervalMs`). */
+export const POLL_INTERVAL_MS = 5_000
+/** Per-activation lifetime budget. Once a key has been active this
+ *  long, `tick` clears it with reason `timeout`. */
+export const MAX_LIFETIME_MS = 30 * 60_000
+/** Telegram message length limit is 4096; budget headroom for the
+ *  suffix and any escape expansion. If the anchor text plus suffix
+ *  would exceed this, we skip the edit (the user still sees the
+ *  original) rather than truncate the model's authored prose. */
+export const TELEGRAM_MSG_CAP = 4000
+/**
+ * Regex matching the suffix we append. Used to strip a prior suffix
+ * before appending the next one. The `\d+` covers "1m" / "12m" / etc.
+ * Kept anchored to end-of-string so it only matches a suffix in the
+ * trailing position, not similar text earlier in the message.
+ * Must stay in sync with the suffix template built in `tick`
+ * (two newlines, an em dash, then "still working (<N>m)").
+ */
+const SUFFIX_RE = /\n\n— still working \(\d+m\)$/
+/**
+ * Everything `PendingProgressDeps.editMessage` needs to perform one
+ * in-place edit. Built by `tick` for each due key.
+ */
+export interface PendingProgressEditCtx {
+  /** Chat part of the state key (text before the first `:`). */
+  chatId: string
+  /** Thread part of the state key, or null when the key has no
+   *  numeric thread component. */
+  threadId: number | null
+  /** The anchor message to edit — the last outbound reply captured
+   *  for this key. */
+  messageId: number
+  /** Full replacement text: the anchor's original text plus the
+   *  "still working" suffix. */
+  newText: string
+}
+/**
+ * Discriminated union — kept structurally identical to the
+ * `pending_progress_*` variants in `runtime-metrics.ts:RuntimeMetricEvent`
+ * so the gateway's `emitMetric: emitRuntimeMetric` wire-up typechecks
+ * cleanly with no cast. `started` carries only the chat key; `edited`
+ * always carries the cumulative elapsed time; `cleared` carries an
+ * optional elapsed + the reason (`inbound` | `handback` | `timeout` |
+ * `manual`).
+ *
+ * Elapsed values are measured from the activation timestamp
+ * (`State.activatedAt`). `clearPending` reports `elapsedMs: 0` when the
+ * key it clears was never activated.
+ */
+export type PendingProgressMetric =
+  | { kind: 'pending_progress_started'; chatKey: string }
+  | { kind: 'pending_progress_edited'; chatKey: string; elapsedMs: number }
+  | {
+      kind: 'pending_progress_cleared'
+      chatKey: string
+      elapsedMs?: number
+      reason?: string
+    }
+/**
+ * Injected collaborators for the pending-progress subsystem. Installed
+ * by `startTimer` (or `__setDepsForTests`) and dropped by `stopTimer`.
+ */
+export interface PendingProgressDeps {
+  /** Performs the in-place message edit. A rejection is caught by
+   *  `tick` and written to stderr; it never propagates. */
+  editMessage: (ctx: PendingProgressEditCtx) => Promise<void>
+  /** Optional metrics sink for the started / edited / cleared
+   *  lifecycle events. */
+  emitMetric?: (event: PendingProgressMetric) => void
+  /** Optional clock override for tests. */
+  nowMs?: () => number
+  /** Optional poll interval override for tests. */
+  pollIntervalMs?: number
+}
+/** Per-chat-key bookkeeping held in `stateByKey`. */
+interface State {
+  /** True after a `tool_use(Agent|Task)` was observed for this key in
+   *  the current turn. Cleared on next turn start. */
+  pending: boolean
+  /** The captured anchor — last outbound reply message_id for this
+   *  key. */
+  anchorMessageId: number | null
+  /** The captured anchor text — what the model wrote, *minus* any
+   *  prior pending-progress suffix. Used as the base for every edit. */
+  anchorOriginalText: string
+  /** Wall-clock ms when the cross-turn ambient state was *activated*
+   *  (at turn_end with pending+anchor). null before activation. */
+  activatedAt: number | null
+  /** Wall-clock ms of the last edit attempt — gates the
+   *  EDIT_INTERVAL_MS cadence. null before activation; `noteTurnEnd`
+   *  seeds it with the activation time so the first edit is due one
+   *  full interval later. Also advanced when an edit is skipped for
+   *  exceeding TELEGRAM_MSG_CAP. */
+  lastEditAt: number | null
+}
+// Module-level singletons: one state map, one timer and one set of
+// deps shared by every chat key.
+/** Per-key state; entries are created by `ensure` and removed by
+ *  `noteTurnEnd` (nothing to do) or `clearPending`. */
+const stateByKey = new Map<string, State>()
+/** Handle of the shared poll interval, or null while stopped. */
+let timer: ReturnType<typeof setInterval> | null = null
+/** Deps installed by `startTimer` / `__setDepsForTests`; null after
+ *  `stopTimer`. */
+let activeDeps: PendingProgressDeps | null = null
+/**
+ * Kill-switch check. The subsystem is on unless
+ * `SWITCHROOM_DISABLE_PENDING_PROGRESS` is exactly `1` or `true`.
+ */
+function enabled(): boolean {
+  const flag = process.env.SWITCHROOM_DISABLE_PENDING_PROGRESS
+  if (flag === '1' || flag === 'true') return false
+  return true
+}
+/** Current time in ms: the injected test clock when one is installed,
+ *  otherwise `Date.now()`. */
+function nowMs(): number {
+  if (activeDeps?.nowMs) return activeDeps.nowMs()
+  return Date.now()
+}
+/** Get the state entry for `key`, creating a blank (inactive, no
+ *  anchor) entry on first use. */
+function ensure(key: string): State {
+  const existing = stateByKey.get(key)
+  if (existing) return existing
+  const fresh: State = {
+    pending: false,
+    anchorMessageId: null,
+    anchorOriginalText: '',
+    activatedAt: null,
+    lastEditAt: null,
+  }
+  stateByKey.set(key, fresh)
+  return fresh
+}
+/**
+ * Fresh turn — reset the per-turn `pending` flag and the per-turn
+ * anchor. The cross-turn `activated` state is per-PRIOR-turn and is
+ * cleared by the explicit clear paths (`clearPending` with reason
+ * `inbound` / `handback` / `timeout`), not by a new turn. The gateway
+ * wires those clears at TWO sites for full coverage:
+ *
+ *   1. `handleInbound` (real user message) → `clearPending('inbound')`
+ *      — the fast path; fires the moment the gateway sees an inbound,
+ *      before the new turn atom is even built.
+ *   2. `handleSessionEvent` `enqueue` case (every fresh turn atom)
+ *      → `clearPending('handback')` — the backstop covering
+ *      synthesised wakes (subagent-handback, cron, vault grant,
+ *      restart marker) that push directly to `pendingInboundBuffer`
+ *      and bypass `handleInbound`. Idempotent w/r/t the first clear.
+ *
+ * `startTurn` itself only matters if the state map already has an
+ * entry for `key` — which post-fix is impossible (the clears
+ * delete it). Kept for test ergonomics and as defence-in-depth.
+ */
+export function startTurn(key: string): void {
+  if (!enabled()) return
+  const entry = stateByKey.get(key)
+  if (entry == null) return
+  // Reset only what belongs to the turn that is starting. The
+  // activation timestamps (activatedAt / lastEditAt) describe the
+  // PRIOR turn's ambient progress and are left for clearPending.
+  Object.assign(entry, {
+    pending: false,
+    anchorMessageId: null,
+    anchorOriginalText: '',
+  })
+}
+/**
+ * Flag `key` as having dispatched background async work during the
+ * current turn. Safe to call repeatedly. Intended to be called when
+ * the gateway sees a `tool_use` for `Agent` or `Task`. No-op while the
+ * kill switch is set.
+ */
+export function noteAsyncDispatch(key: string): void {
+  if (enabled()) {
+    const entry = ensure(key)
+    entry.pending = true
+  }
+}
+/**
+ * Record an outbound reply as the candidate anchor for cross-turn
+ * editing; the most recent call wins. Intended to be called on every
+ * successful bot reply send. A trailing pending-progress suffix in
+ * `opts.text` (rare — should only happen if we sent something to
+ * ourselves) is stripped before storing, so later edits never stack
+ * two suffixes.
+ */
+export function noteOutbound(
+  key: string,
+  opts: { messageId: number; text: string },
+): void {
+  if (!enabled()) return
+  const entry = ensure(key)
+  entry.anchorMessageId = opts.messageId
+  const withoutSuffix = opts.text.replace(SUFFIX_RE, '')
+  entry.anchorOriginalText = withoutSuffix
+}
+/**
+ * Called at turn_end. If the turn had a pending async dispatch AND
+ * captured an anchor, activate the cross-turn ambient state — the
+ * timer will start editing.
+ *
+ * If pending=false OR no anchor was captured, drop the state entry
+ * entirely (nothing for us to do).
+ *
+ * No-op when the kill switch is set or when there is no state entry
+ * for `key`.
+ */
+export function noteTurnEnd(key: string): void {
+  if (!enabled()) return
+  const s = stateByKey.get(key)
+  if (s == null) return
+  if (s.pending && s.anchorMessageId != null) {
+    s.activatedAt = nowMs()
+    // Seed lastEditAt with the activation time so the first edit fires
+    // one full EDIT_INTERVAL_MS after activation — not immediately.
+    // (It must not stay null here: `tick` treats a null lastEditAt as
+    // "0 ms since the last edit" and would never consider an edit due.)
+    s.lastEditAt = s.activatedAt
+    activeDeps?.emitMetric?.({
+      kind: 'pending_progress_started',
+      chatKey: key,
+    })
+  } else {
+    stateByKey.delete(key)
+  }
+}
+/**
+ * Drop all pending-progress state for a chat and emit a
+ * `pending_progress_cleared` metric. Does nothing when there is no
+ * entry for `key`, which makes repeated clears harmless. Not gated by
+ * the kill switch. Reasons:
+ *   'inbound'   — user sent a new message, they're re-engaged
+ *   'handback'  — switchroom injected a subagent_handback channel turn
+ *   'timeout'   — exceeded MAX_LIFETIME_MS
+ *   'manual'    — test / debug
+ *
+ * The reported `elapsedMs` is the time since activation, or 0 when the
+ * entry was never activated.
+ */
+export function clearPending(
+  key: string,
+  reason: 'inbound' | 'handback' | 'timeout' | 'manual',
+): void {
+  const entry = stateByKey.get(key)
+  if (entry === undefined) return
+  let elapsedMs = 0
+  if (entry.activatedAt != null) {
+    elapsedMs = nowMs() - entry.activatedAt
+  }
+  stateByKey.delete(key)
+  activeDeps?.emitMetric?.({
+    kind: 'pending_progress_cleared',
+    chatKey: key,
+    elapsedMs,
+    reason,
+  })
+}
+/**
+ * Install `deps` and start the single shared poll interval. A second
+ * call while the timer is running is ignored (the first deps stay in
+ * place). Does nothing when the kill switch is set. The interval is
+ * unref'd where supported so it does not by itself keep the process
+ * alive.
+ */
+export function startTimer(deps: PendingProgressDeps): void {
+  if (!enabled() || timer != null) return
+  activeDeps = deps
+  const periodMs = deps.pollIntervalMs ?? POLL_INTERVAL_MS
+  const handle = setInterval(() => tick(nowMs()), periodMs)
+  timer = handle
+  if (typeof handle.unref === 'function') handle.unref()
+}
+/** Stop the shared poll interval (if running) and drop the installed
+ *  deps. Safe to call when already stopped. Per-key state is kept. */
+export function stopTimer(): void {
+  const handle = timer
+  timer = null
+  activeDeps = null
+  if (handle != null) clearInterval(handle)
+}
+/**
+ * Split a `<chatId>:<threadIdOrEmpty>` key (the `statusKey` shape used
+ * throughout the gateway) at its first `:`. The thread part becomes
+ * null when it is missing, empty, the literal `undefined`, or not a
+ * finite number.
+ */
+function parseKey(key: string): { chatId: string; threadId: number | null } {
+  const sep = key.indexOf(':')
+  if (sep === -1) return { chatId: key, threadId: null }
+  const chatId = key.slice(0, sep)
+  const rest = key.slice(sep + 1)
+  if (rest === '' || rest === 'undefined') return { chatId, threadId: null }
+  const parsed = Number(rest)
+  if (!Number.isFinite(parsed)) return { chatId, threadId: null }
+  return { chatId, threadId: parsed }
+}
+/**
+ * One poll pass over every tracked key. For each *activated* key with
+ * an anchor:
+ *   - clears it with reason `timeout` once MAX_LIFETIME_MS has elapsed;
+ *   - otherwise, when at least EDIT_INTERVAL_MS has passed since the
+ *     last edit (or since activation if none yet), dispatches a
+ *     fire-and-forget edit of the anchor message with a refreshed
+ *     "still working (Nm)" suffix.
+ * Does nothing when no deps are installed.
+ *
+ * @param now Wall-clock ms to evaluate against.
+ */
+function tick(now: number): void {
+  // Snapshot the deps for this pass. The async callbacks below outlive
+  // the tick, and `stopTimer()` may null `activeDeps` before they run —
+  // reading the module variable there would throw and mis-report a
+  // successful edit as "edit failed".
+  const deps = activeDeps
+  if (deps == null) return
+  for (const [key, s] of stateByKey.entries()) {
+    if (s.activatedAt == null || s.anchorMessageId == null) continue
+    const elapsed = now - s.activatedAt
+    if (elapsed >= MAX_LIFETIME_MS) {
+      clearPending(key, 'timeout')
+      continue
+    }
+    // No edit yet → measure from activation, so a null lastEditAt can
+    // never suppress edits indefinitely.
+    const sinceEdit = now - (s.lastEditAt ?? s.activatedAt)
+    if (sinceEdit < EDIT_INTERVAL_MS) continue
+    // Due: advance the cadence marker whether we edit or skip, so an
+    // over-long message is not re-evaluated on every poll.
+    s.lastEditAt = now
+    // Build suffix from elapsed wall-clock. Always at least 1m so the
+    // user-visible counter reads honestly (we only edit at intervals
+    // ≥ EDIT_INTERVAL_MS = 60s).
+    const minutes = Math.max(1, Math.round(elapsed / 60_000))
+    const suffix = `\n\n— still working (${minutes}m)`
+    const newText = s.anchorOriginalText + suffix
+    if (newText.length > TELEGRAM_MSG_CAP) {
+      // Don't truncate the model's prose — just skip this edit.
+      // The previous edit (or the original) is still visible.
+      continue
+    }
+    const { chatId, threadId } = parseKey(key)
+    const editCtx: PendingProgressEditCtx = {
+      chatId,
+      threadId,
+      messageId: s.anchorMessageId,
+      newText,
+    }
+    // Fire-and-forget so a slow edit doesn't block the tick loop.
+    // Errors are logged but never bubble (a 429 / "message not modified"
+    // / chat-deleted is a soft failure).
+    void Promise.resolve()
+      .then(() => deps.editMessage(editCtx))
+      .then(() => {
+        deps.emitMetric?.({
+          kind: 'pending_progress_edited',
+          chatKey: key,
+          elapsedMs: elapsed,
+        })
+      })
+      .catch((err: unknown) => {
+        // Rejections are not guaranteed to be Error instances.
+        const detail = err instanceof Error ? err.message : String(err)
+        process.stderr.write(
+          `pending-work-progress: edit failed key=${key} ` +
+            `msg=${editCtx.messageId}: ${detail}\n`,
+        )
+      })
+  }
+}
+// ─── Test helpers ─────────────────────────────────────────────────────────
+/**
+ * Test-only: drive one tick deterministically with an explicit `now`.
+ * Deps must be installed first (via `startTimer` or
+ * `__setDepsForTests`); with no deps the tick returns immediately.
+ */
+export function __tickForTests(now: number): void {
+  tick(now)
+}
+/**
+ * Test-only: install deps without starting the real timer. Pass null
+ * to uninstall.
+ */
+export function __setDepsForTests(deps: PendingProgressDeps | null): void {
+  activeDeps = deps
+}
+/** Test-only: peek at per-key state. Returns the live entry (not a
+ *  copy), or undefined when the key is not tracked. */
+export function __getStateForTests(key: string): State | undefined {
+  return stateByKey.get(key)
+}
+/** Test-only: full reset — forgets every key, stops the timer and
+ *  drops any installed deps. */
+export function __resetAllForTests(): void {
+  stateByKey.clear()
+  stopTimer()
+}

package/telegram-plugin/runtime-metrics.ts CHANGED Viewed

@@ -104,6 +104,26 @@ export type RuntimeMetricEvent =
       fallback_kind: 'working' | 'thinking'
       silence_ms: number
     }
+  /**
+   * #1445 cross-turn pending-async ambient lifecycle. `started` fires
+   * when a turn ends with a captured anchor AND a pending Agent/Task/
+   * Bash-background dispatch — i.e. the framework will now edit the
+   * model's last reply in place every ~60s until cleared. `edited`
+   * fires on each successful in-place edit; `elapsedMs` is how long
+   * ambient has been running for this chat. `cleared` fires when
+   * ambient stops — `reason` says why (inbound / handback / timeout).
+   * Targets: edited/started ratio is the "still alive minutes per
+   * activation" health proxy; cleared.reason='inbound' should
+   * dominate (model + user resolving naturally).
+   */
+  | { kind: 'pending_progress_started'; chatKey: string }
+  | { kind: 'pending_progress_edited'; chatKey: string; elapsedMs: number }
+  | {
+      kind: 'pending_progress_cleared'
+      chatKey: string
+      elapsedMs?: number
+      reason?: string
+    }
 /**
  * The JSONL sink lives under the runtime state dir so it's per-agent

package/telegram-plugin/silent-end.ts CHANGED Viewed

@@ -182,22 +182,39 @@ export function readSilentEndState(deps?: SilentEndDeps): SilentEndState | null
 }
 /**
- * Record a user-message turn that ended with zero outbound messages and
- * report whether the deterministic re-prompt has been exhausted. This is
- * the gateway's single entry point for the main turn-end path.
+ * Record a user-message turn that ended WITHOUT the model delivering a
+ * final answer, and report whether the deterministic re-prompt has been
+ * exhausted. This is the gateway's single entry point for the main
+ * turn-end path.
  *
- *   - First silent-end of a turn (no prior state, or prior `retryCount`
+ * #1664 — the trigger generalized from "zero outbound" to "no final
+ * answer delivered". Two cases reach here now:
+ *   1. Zero outbound — the turn ended with nothing sent at all (the
+ *      original #1122/#1161 silent-end case).
+ *   2. Interim-ack only — the model sent an ack via reply/stream_reply
+ *      but ended the turn with its real answer as plain transcript text
+ *      (rendered into an ephemeral answer-lane draft that gets retracted
+ *      at turn_end, never finalized). The gateway tracks this via
+ *      `CurrentTurn.finalAnswerDelivered`; case 1 is just the subset
+ *      where that flag is false because nothing landed.
+ * In both cases the model still owes the user an answer, so the same
+ * re-prompt safety net applies — the framework re-prompts; the model
+ * re-delivers via the reply tool (never the framework materializing a
+ * message from the draft — see `reference/principles.md`).
+ *
+ *   - First undelivered turn-end (no prior state, or prior `retryCount`
  *     still below `SILENT_END_MAX_RETRIES`) → writes the state file via
  *     `writeSilentEndState`, so `silent-end-interrupt-stop.mjs` blocks
  *     the stop and re-prompts the agent. Returns `{ exhausted: false }`.
  *
- *   - A silent-end where the prior state for the SAME turn already shows
- *     `retryCount >= SILENT_END_MAX_RETRIES` → the Stop hook already
- *     spent its re-prompt and the agent is STILL silent. Recovery has
- *     failed. Clears the state file (so the Stop hook on this final turn
- *     finds nothing pending and allows the stop cleanly) and returns
- *     `{ exhausted: true }` — the caller MUST then deliver a user-facing
- *     fallback so the turn never just vanishes (#1161).
+ *   - An undelivered turn-end where the prior state for the SAME turn
+ *     already shows `retryCount >= SILENT_END_MAX_RETRIES` → the Stop
+ *     hook already spent its re-prompt and the agent is STILL
+ *     undelivered. Recovery has failed. Clears the state file (so the
+ *     Stop hook on this final turn finds nothing pending and allows the
+ *     stop cleanly) and returns `{ exhausted: true }` — the caller MUST
+ *     then deliver a user-facing fallback so the turn never just
+ *     vanishes (#1161).
  *
  * Chat-less autonomous wakeup turns never reach here: the gateway only
  * creates a `currentTurn` (and therefore only runs a turn-end handler)
@@ -228,3 +245,12 @@ export function recordSilentTurnEnd(
   writeSilentEndState(args, deps)
   return { exhausted: false }
 }
+/**
+ * #1664 — semantic alias for `recordSilentTurnEnd`. The trigger is now
+ * "no final answer delivered", of which "zero outbound" is one case; new
+ * callsites should prefer this name so the intent reads correctly. The
+ * behaviour, retry semantics, and `{exhausted}` contract are identical —
+ * `recordSilentTurnEnd` is kept for the existing callers and tests.
+ *
+ * This is the same function value under a second name (not a wrapper),
+ * so `recordUndeliveredTurnEnd === recordSilentTurnEnd`.
+ */
+export const recordUndeliveredTurnEnd = recordSilentTurnEnd

package/telegram-plugin/tests/final-answer-detect.test.ts ADDED Viewed

@@ -0,0 +1,89 @@
+/**
+ * Unit coverage for the #1664 final-answer detection predicate.
+ *
+ * `isFinalAnswerReply` is the finer signal the silent-end re-prompt needs:
+ * the gateway's `replyCalled` flag flips on the first reply / stream_reply
+ * tool use and cannot tell an interim ack from the real answer. This
+ * predicate classifies each reply so a turn whose every reply was "interim"
+ * (and whose real answer ended up as plain transcript text) ends with
+ * `finalAnswerDelivered === false` and triggers the re-prompt — the #1664
+ * bug (streamed answers rendered to a draft, retracted at turn_end, lost).
+ *
+ * These tests pin the pure predicate. The gateway wires it into
+ * executeReply / executeStreamReply (covered by the gateway integration
+ * surface); pinning the policy here keeps it auditable without importing
+ * the multi-thousand-line gateway module.
+ */
+import { describe, it, expect } from 'vitest'
+import { isFinalAnswerReply, FINAL_ANSWER_MIN_CHARS } from '../final-answer-detect.js'
+// Pins the pure classification policy: notification flag, length
+// backstop (threshold and above), and the stream_reply `done` override.
+describe('isFinalAnswerReply — #1664 final-answer classification', () => {
+  it('classifies a notification-bearing reply as the final answer', () => {
+    // disable_notification:false is the pacing contract's "final answer"
+    // signal — interim updates pass disable_notification:true.
+    expect(
+      isFinalAnswerReply({ text: 'short answer', disableNotification: false }),
+    ).toBe(true)
+  })
+  it('classifies a short interim ack (disable_notification:true) as NOT final', () => {
+    expect(
+      isFinalAnswerReply({ text: 'on it…', disableNotification: true }),
+    ).toBe(false)
+  })
+  it('length backstop: a long reply mis-marked interim still counts as final', () => {
+    // Comfortably past the threshold, so this case covers "above the
+    // limit" rather than repeating the exact-boundary case below.
+    const longText = 'x'.repeat(FINAL_ANSWER_MIN_CHARS + 50)
+    expect(
+      isFinalAnswerReply({ text: longText, disableNotification: true }),
+    ).toBe(true)
+  })
+  it('length backstop is inclusive at exactly FINAL_ANSWER_MIN_CHARS', () => {
+    expect(
+      isFinalAnswerReply({
+        text: 'x'.repeat(FINAL_ANSWER_MIN_CHARS),
+        disableNotification: true,
+      }),
+    ).toBe(true)
+    // One char under the threshold and marked interim → still interim.
+    expect(
+      isFinalAnswerReply({
+        text: 'x'.repeat(FINAL_ANSWER_MIN_CHARS - 1),
+        disableNotification: true,
+      }),
+    ).toBe(false)
+  })
+  it('stream_reply done=true is always the final answer, even short + interim', () => {
+    // A done=true call explicitly closes the stream — it IS the answer,
+    // regardless of length or the notification flag.
+    expect(
+      isFinalAnswerReply({ text: 'ok', disableNotification: true, done: true }),
+    ).toBe(true)
+  })
+  it('a non-terminal stream_reply chunk (done=false) is classified like a plain reply', () => {
+    // Short interim chunk → not final.
+    expect(
+      isFinalAnswerReply({ text: 'thinking…', disableNotification: true, done: false }),
+    ).toBe(false)
+    // Notification-bearing chunk → final.
+    expect(
+      isFinalAnswerReply({ text: 'here it is', disableNotification: false, done: false }),
+    ).toBe(true)
+  })
+  it('an empty reply marked interim is NOT the final answer', () => {
+    expect(
+      isFinalAnswerReply({ text: '', disableNotification: true }),
+    ).toBe(false)
+  })
+  it('FINAL_ANSWER_MIN_CHARS is the documented 200-char backstop', () => {
+    // Guards the constant against silent drift — the value is referenced
+    // in the CurrentTurn doc-comment and the Stop-hook rationale.
+    expect(FINAL_ANSWER_MIN_CHARS).toBe(200)
+  })
+})