npm - switchroom - Versions diffs - 0.14.21 → 0.14.22 - Mend

switchroom 0.14.21 → 0.14.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/dist/agent-scheduler/index.js +0 -1
package/dist/auth-broker/index.js +0 -1
package/dist/cli/notion-write-pretool.mjs +0 -1
package/dist/cli/switchroom.js +14 -6
package/dist/host-control/main.js +0 -1
package/dist/vault/approvals/kernel-server.js +0 -1
package/dist/vault/broker/server.js +0 -1
package/package.json +3 -3
package/profiles/_base/start.sh.hbs +11 -24
package/profiles/_shared/telegram-style.md.hbs +2 -2
package/profiles/default/CLAUDE.md.hbs +4 -1
package/skills/switchroom-runtime/SKILL.md +6 -16
package/telegram-plugin/agent-dir.ts +15 -0
package/telegram-plugin/dist/gateway/gateway.js +640 -509
package/telegram-plugin/gateway/gateway.ts +216 -61
package/telegram-plugin/gateway/inbound-spool.ts +15 -0
package/telegram-plugin/gateway/resume-inbound-builder.ts +180 -0
package/telegram-plugin/registry/turns-schema.ts +138 -33
package/telegram-plugin/stream-reply-handler.ts +1 -11
package/telegram-plugin/tests/agent-dir.test.ts +25 -0
package/telegram-plugin/tests/e2e.test.ts +2 -77
package/telegram-plugin/tests/inbound-spool.test.ts +45 -0
package/telegram-plugin/tests/multi-turn-continuity.test.ts +0 -1
package/telegram-plugin/tests/outbound-ordering.test.ts +0 -1
package/telegram-plugin/tests/parse-mode-rotation.test.ts +0 -1
package/telegram-plugin/tests/races.test.ts +0 -26
package/telegram-plugin/tests/registry-turns.test.ts +106 -29
package/telegram-plugin/tests/resume-inbound-builder.test.ts +182 -0
package/telegram-plugin/tests/status-accent.test.ts +0 -1
package/telegram-plugin/tests/stream-reply-error-paths.test.ts +0 -1
package/telegram-plugin/tests/stream-reply-handler.test.ts +0 -24
package/telegram-plugin/tests/streaming-e2e.test.ts +0 -1
package/telegram-plugin/tests/streaming-orchestration.test.ts +0 -1
package/telegram-plugin/tests/tool-activity-summary.test.ts +44 -0
package/telegram-plugin/tests/turns-writer.test.ts +16 -6
package/telegram-plugin/tool-activity-summary.ts +55 -0
package/telegram-plugin/uat/driver.ts +3 -1
package/telegram-plugin/handoff-continuity.ts +0 -206
package/telegram-plugin/tests/handoff-continuity.test.ts +0 -262

package/telegram-plugin/gateway/gateway.ts CHANGED Viewed

@@ -66,7 +66,7 @@ import { StatusReactionController } from '../status-reactions.js'
 import { DeferredDoneReactions } from '../reaction-defer.js'
 import { createWorkerActivityFeed, isWorkerActivityFeedEnabled } from '../worker-activity-feed.js'
 import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
-import { appendActivityLabel } from '../tool-activity-summary.js'
+import { appendActivityLabel, renderActivityFeedWithNested } from '../tool-activity-summary.js'
 import { toolLabel } from '../tool-labels.js'
 import { createTypingWrapper } from '../typing-wrap.js'
 import { type DraftStreamHandle } from '../draft-stream.js'
@@ -210,14 +210,7 @@ import {
   isTurnFlushSafetyEnabled,
 } from '../turn-flush-safety.js'
 // #1122 PR3: turn-flush-prose-recovery removed with the progress card.
-import {
-  resolveAgentDirFromEnv,
-  consumeHandoffTopic,
-  shouldShowHandoffLine,
-  formatHandoffLine,
-  writeLastTurnSummary,
-  type HandoffFormat,
-} from '../handoff-continuity.js'
+import { resolveAgentDirFromEnv } from '../agent-dir.js'
 import {
   addActiveReaction,
   removeActiveReaction,
@@ -396,6 +389,7 @@ import {
   touchTurnActiveMarker,
   removeTurnActiveMarker,
   sweepStaleTurnActiveMarker,
+  TURN_ACTIVE_MARKER_FILE,
 } from './turn-active-marker.js'
 import {
   VERSION,
@@ -423,12 +417,17 @@ import {
 import { resolveVaultApprovalPosture } from '../vault-approval-posture.js'
 import {
   openTurnsDb,
-  markOrphanedAsRestarted,
+  markOrphanedWithTimeoutClassification,
   recordTurnStart,
   recordTurnEnd,
-  findMostRecentInterruptedTurn,
+  findLatestTurnIfInterrupted,
   findRecentTurnsForChat,
 } from '../registry/turns-schema.js'
+import {
+  buildResumeInterruptedInbound,
+  buildResumeWatchdogReportInbound,
+  selectResumeBuilder,
+} from './resume-inbound-builder.js'
 import { applySubagentsSchema, getSubagentByJsonlId } from '../registry/subagents-schema.js'
 import { resolveWorkerFeedDispatch, type WorkerFeedDispatch } from './worker-feed-dispatch.js'
 import { formatIdleFooter } from '../idle-footer.js'
@@ -969,13 +968,26 @@ if (HISTORY_ENABLED) {
   }
 }
-// ─── Turn-tracking registry (Stage 3a of simplify-restart, Phase 0 of #250) ─
-// On boot, open the per-agent registry.db and stamp any rows that never got
-// an ended_at as ended_via='restart'. Those are turns where the previous
-// gateway died mid-flight (SIGKILL / OOM / hard reboot — any path that
-// skipped the SIGTERM handler). Stages 3b/3c will populate new rows during
-// turn enqueue/end and on graceful shutdown; Stage 4 reads on cold start.
+// ─── Turn-tracking registry + honest-restart-resume ────────────────────────
+// On boot, open the per-agent registry.db and reap any turn that never got an
+// ended_at — those were killed mid-flight (operator restart, SIGKILL, OOM,
+// hard reboot). The reaper CLASSIFIES each orphan from the on-disk
+// turn-active marker's age:
+//   - marker older than the hang-watchdog window → 'timeout' (the turn
+//     stalled with no tool progress; report it, don't blindly resume).
+//   - otherwise → 'restart' (a clean interrupt; resume it).
+// Then, if the LATEST turn was interrupted, we build a synthetic resume /
+// report inbound and (further down, once the inbound spool exists) inject it
+// so the agent wakes on its own and either picks the work back up or tells
+// the user why it stopped — no human nudge required.
+//
+// The classifier MUST read the marker before the boot-cleanup sweep removes
+// it (the sweep runs much later, in the bridge-registration path). This block
+// runs at module top, so the marker is still present here.
 let turnsDb: ReturnType<typeof openTurnsDb> | null = null
+// Stashed here; pushed to the spool once it's constructed below. The spool's
+// turn_key-keyed dedup makes a re-stash across multiple restarts a no-op.
+let bootResumeInbound: { agent: string; msg: InboundMessage } | null = null
 try {
   // STATE_DIR is `<agentDir>/telegram` in production. openTurnsDb expects
   // the parent (agent dir) and joins `telegram/registry.db` itself.
@@ -987,23 +999,88 @@ try {
   // schema; subagents lives alongside in registry.db. Idempotent — safe on
   // pre-existing DBs (handles the jsonl_agent_id column migration).
   applySubagentsSchema(turnsDb)
-  const reaped = markOrphanedAsRestarted(turnsDb)
+  // Read the turn-active marker (the in-flight turn the watchdog tracks)
+  // BEFORE classifying — its age (now − mtime) is "ms since last tool
+  // progress" and its payload carries the in-flight turn_key.
+  let markerTurnKey: string | null = null
+  let markerAgeMs: number | null = null
+  try {
+    const markerPath = join(STATE_DIR, TURN_ACTIVE_MARKER_FILE)
+    if (existsSync(markerPath)) {
+      const st = statSync(markerPath)
+      markerAgeMs = Date.now() - st.mtimeMs
+      try {
+        const payload = JSON.parse(readFileSync(markerPath, 'utf8')) as { turnKey?: unknown }
+        if (typeof payload.turnKey === 'string' && payload.turnKey.length > 0) {
+          markerTurnKey = payload.turnKey
+        }
+      } catch { /* unreadable/torn marker — age alone still classifies */ }
+    }
+  } catch { /* stat failure — treat as no marker (plain restart) */ }
+  // TURN_HANG_SECS is the watchdog's hang threshold (default 300s); the
+  // classifier uses the same signal so "would the watchdog have killed it"
+  // is answered identically whether or not the watchdog is live (it's
+  // disabled under Docker, but the staleness judgement still holds).
+  const hangSecs = Number(process.env.TURN_HANG_SECS)
+  const hangThresholdMs = (Number.isFinite(hangSecs) && hangSecs > 0 ? hangSecs : 300) * 1000
+  const reasonSnapshot =
+    markerAgeMs != null ? JSON.stringify({ idleMs: Math.round(markerAgeMs) }) : null
+  const { reaped, timeoutTurnKey } = markOrphanedWithTimeoutClassification(turnsDb, {
+    markerTurnKey,
+    markerAgeMs,
+    hangThresholdMs,
+    reasonSnapshot,
+  })
   if (reaped > 0) {
-    process.stderr.write(`telegram gateway: turn-registry boot-reaper stamped ${reaped} orphaned turn(s) as ended_via='restart'\n`)
+    process.stderr.write(
+      `telegram gateway: turn-registry boot-reaper stamped ${reaped} orphaned turn(s)` +
+      `${timeoutTurnKey ? ` (turnKey=${timeoutTurnKey} as 'timeout', markerAgeMs=${markerAgeMs})` : " as 'restart'"}\n`,
+    )
   } else {
     process.stderr.write(`telegram gateway: turn-registry initialized at ${join(agentDir, 'telegram', 'registry.db')}\n`)
   }
-  // Stage 4: surface the most-recently-interrupted turn to start.sh as a
-  // shell-sourceable env file. The agent's start.sh reads this on next
-  // boot, exports the env vars to the spawned `claude` process, and
-  // deletes the file (one-shot — only ever applies to the immediately
-  // following session). If there's no interrupted turn (clean previous
-  // shutdown), we delete any stale file so the resume protocol doesn't
-  // mis-fire.
+  // Build the boot resume/report inbound for the LATEST turn if it was
+  // interrupted. selectResumeBuilder owns the resume-vs-report policy.
+  const pending = findLatestTurnIfInterrupted(turnsDb)
+  const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
+  if (pending != null && selfAgent) {
+    const kind = selectResumeBuilder(pending.ended_via)
+    if (kind === 'resume') {
+      bootResumeInbound = { agent: selfAgent, msg: buildResumeInterruptedInbound({ turn: pending }) }
+    } else if (kind === 'report') {
+      // idleMs: this boot's measured marker age if it just classified this
+      // turn; otherwise recover it from the persisted interrupt_reason (a
+      // later boot, marker already swept); else fall back to total runtime.
+      let idleMs = pending.turn_key === timeoutTurnKey && markerAgeMs != null ? markerAgeMs : null
+      if (idleMs == null && pending.interrupt_reason) {
+        try {
+          const parsed = JSON.parse(pending.interrupt_reason) as { idleMs?: unknown }
+          if (typeof parsed.idleMs === 'number' && Number.isFinite(parsed.idleMs)) idleMs = parsed.idleMs
+        } catch { /* malformed snapshot — fall through */ }
+      }
+      if (idleMs == null) idleMs = Math.max(0, Date.now() - pending.started_at)
+      bootResumeInbound = {
+        agent: selfAgent,
+        msg: buildResumeWatchdogReportInbound({ turn: pending, idleMs }),
+      }
+    }
+    if (bootResumeInbound != null) {
+      process.stderr.write(
+        `telegram gateway: boot-resume queued kind=${kind} turnKey=${pending.turn_key} ` +
+        `endedVia=${pending.ended_via ?? 'open'} chat=${pending.chat_id}\n`,
+      )
+    }
+  }
+  // Diagnostic env file (one-shot, sourced by start.sh) — kept for the
+  // wake-audit context. The injected inbound above is the real wake signal;
+  // these vars are passive context only.
   const pendingEnvPath = join(agentDir, '.pending-turn.env')
   try {
-    const pending = findMostRecentInterruptedTurn(turnsDb)
     if (pending != null) {
       const lines = [
         `SWITCHROOM_PENDING_TURN=true`,
@@ -1013,14 +1090,12 @@ try {
         pending.last_user_msg_id != null ? `SWITCHROOM_PENDING_USER_MSG_ID=${pending.last_user_msg_id}` : `SWITCHROOM_PENDING_USER_MSG_ID=`,
         `SWITCHROOM_PENDING_ENDED_VIA=${pending.ended_via ?? 'unknown'}`,
         `SWITCHROOM_PENDING_STARTED_AT=${pending.started_at}`,
+        pending.interrupt_reason != null ? `SWITCHROOM_PENDING_INTERRUPT_REASON=${pending.interrupt_reason}` : `SWITCHROOM_PENDING_INTERRUPT_REASON=`,
       ]
       // Atomic write: tmp + rename. Without this, a crash mid-write
       // (power loss, OOM, panic) leaves a truncated `.pending-turn.env`
       // that start.sh `source`s — partial SWITCHROOM_PENDING_* vars
-      // half-trigger the resume protocol with incomplete context, or
-      // a malformed line breaks shell parsing inside the source.
-      // Same pattern used by the access-file write a few hundred lines
-      // above and by src/issues/store.ts.
+      // or a malformed line can then break shell parsing inside the source.
       const pendingEnvTmp = `${pendingEnvPath}.tmp-${process.pid}`
       writeFileSync(pendingEnvTmp, lines.join('\n') + '\n', { mode: 0o600 })
       renameSync(pendingEnvTmp, pendingEnvPath)
@@ -1030,7 +1105,7 @@ try {
       process.stderr.write(`telegram gateway: pending-turn env cleared (clean previous shutdown)\n`)
     }
   } catch (err) {
-    process.stderr.write(`telegram gateway: pending-turn env write failed (${(err as Error).message}) — resume protocol may not fire\n`)
+    process.stderr.write(`telegram gateway: pending-turn env write failed (${(err as Error).message})\n`)
   }
 } catch (err) {
   process.stderr.write(`telegram gateway: turn-registry init failed (${(err as Error).message}) — turn tracking disabled\n`)
@@ -1399,6 +1474,13 @@ type CurrentTurn = {
   // (via `renderActivityFeed`) as a capped chronological list into the
   // in-place edited activity message and clears on reply. Reset per turn.
   mirrorLines: string[]
+  // Model A — foreground sub-agent nesting. A foreground sub-agent (Task/Agent
+  // with no run_in_background) runs INSIDE this turn while the parent blocks at
+  // the Task tool, so its live steps nest under the parent's activity feed
+  // rather than a separate message. Keyed by jsonl agent id; value = the
+  // sub-agent's accumulated narrative lines (oldest→newest, deduped + capped).
+  // Background workers are NOT here — they get the standalone worker feed.
+  foregroundSubAgents: Map<string, string[]>
   // Issue #195 — answer-lane streaming. Lazily created on the first text
   // event of a turn (once enough text has accumulated, the stream itself
   // gates on minInitialChars). Materialized and cleared at turn_end.
@@ -2129,23 +2211,6 @@ function probeAvailableReactions(chatId: string): void {
   })()
 }
-// ─── Handoff continuity ───────────────────────────────────────────────────
-let pendingHandoffTopic: string | null = null
-function initHandoffContinuity(): void {
-  if (!shouldShowHandoffLine()) { pendingHandoffTopic = null; return }
-  const agentDir = resolveAgentDirFromEnv()
-  if (agentDir == null) { pendingHandoffTopic = null; return }
-  pendingHandoffTopic = consumeHandoffTopic(agentDir)
-}
-function takeHandoffPrefix(format: HandoffFormat): string {
-  if (pendingHandoffTopic == null) return ''
-  const line = formatHandoffLine(pendingHandoffTopic, format)
-  pendingHandoffTopic = null
-  return line
-}
 // ─── Text chunking ────────────────────────────────────────────────────────
 const PHOTO_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp'])
@@ -3942,6 +4007,21 @@ const inboundSpool = STATIC
       },
     })
 const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
+// Honest-restart-resume: inject the boot resume/report inbound built by the
+// registry classifier above. When the spool exists we only PUT it (the
+// boot-replay loop below pulls it into the in-memory buffer exactly once via
+// liveEntries — pushing here too would double-queue). The turn_key-keyed
+// spoolId makes this a no-op if a prior restart already queued the same turn
+// and it hasn't been delivered yet — so a multi-restart sequence resumes a
+// given turn once, not N times. When there's no spool (STATIC mode) push
+// straight to the in-memory buffer.
+if (bootResumeInbound != null) {
+  if (inboundSpool != null) {
+    inboundSpool.put(bootResumeInbound.agent, bootResumeInbound.msg)
+  } else {
+    pendingInboundBuffer.push(bootResumeInbound.agent, bootResumeInbound.msg)
+  }
+}
 // Boot-replay: re-queue every un-acked spooled inbound into the
 // in-memory buffer so the existing drain triggers (onClientRegistered
 // / silence-poke #1546 / idle-drain #1549) deliver them. push →
@@ -5249,13 +5329,6 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
     effectiveText = text
   }
-  {
-    const prefix = takeHandoffPrefix(
-      format === 'html' ? 'html' : format === 'markdownv2' ? 'markdownv2' : 'text',
-    )
-    if (prefix.length > 0) effectiveText = prefix + effectiveText
-  }
   assertAllowedChat(chat_id)
   let threadId = resolveThreadId(chat_id, args.message_thread_id as string | undefined)
@@ -5989,7 +6062,6 @@ async function executeStreamReply(args: Record<string, unknown>): Promise<unknow
       markdownToHtml,
       escapeMarkdownV2,
       repairEscapedWhitespace,
-      takeHandoffPrefix,
       assertAllowedChat,
       resolveThreadId,
       disableLinkPreview: access.disableLinkPreview !== false,
@@ -7158,6 +7230,27 @@ function closeProgressLane(chatId: string, threadId: number | undefined): void {
   }
 }
+/** Accumulation cap for a foreground sub-agent's nested narrative lines.
+ *  Slightly larger than NESTED_MAX_LINES so the render's "↳ +N earlier…"
+ *  header is meaningful without growing unbounded on a long sub-agent. */
+const FOREGROUND_SUBAGENT_ACCUM_MAX = 12
+/**
+ * Render this turn's activity feed, nesting any active foreground sub-agent's
+ * narrative beneath the parent's own steps (Model A). With no active
+ * foreground sub-agent this is exactly the flat feed. Multiple concurrent
+ * foreground sub-agents (rare — parallel Task dispatch) flatten in insertion
+ * order; the single-sub-agent common case nests precisely under its
+ * Delegating line.
+ */
+function composeTurnActivity(turn: CurrentTurn): string | null {
+  const childLines: string[] = []
+  for (const narrative of turn.foregroundSubAgents.values()) {
+    childLines.push(...narrative)
+  }
+  return renderActivityFeedWithNested(turn.mirrorLines, childLines)
+}
 /**
  * Drain the tool-activity summary's pending render queue. Single-flight
  * by construction (caller assigns the returned promise to
@@ -7324,6 +7417,7 @@ function handleSessionEvent(ev: SessionEvent): void {
           activityPendingRender: null,
           activityLastSentRender: null,
           mirrorLines: [],
+          foregroundSubAgents: new Map(),
           answerStream: null,
           isDm: isDmChatId(ev.chatId),
         }
@@ -7501,7 +7595,10 @@ function handleSessionEvent(ev: SessionEvent): void {
       if (turn.replyCalled) return
       const rendered = appendActivityLabel(turn.mirrorLines, ev.label)
       if (rendered != null) {
-        turn.activityPendingRender = rendered
+        // Recompose so any active foreground sub-agent's nested block (Model A)
+        // is preserved when the parent appends its own step. composeTurnActivity
+        // == the flat render when no foreground sub-agent is active.
+        turn.activityPendingRender = composeTurnActivity(turn) ?? rendered
         if (turn.activityInFlight == null) {
           turn.activityInFlight = drainActivitySummary(turn)
         }
@@ -8508,7 +8605,6 @@ function handlePtyActivity(text: string): void {
       markdownToHtml,
       escapeMarkdownV2,
       repairEscapedWhitespace,
-      takeHandoffPrefix: () => '',
       assertAllowedChat,
       resolveThreadId,
       disableLinkPreview: access.disableLinkPreview !== false,
@@ -16982,7 +17078,6 @@ process.on('SIGINT', () => void shutdown('SIGINT'))
 // ─── Startup ──────────────────────────────────────────────────────────────
-initHandoffContinuity()
 // Top-level error handlers route through shutdown() so the startup lock is
 // released cleanly. Without this, a top-level throw would leave the lock
@@ -17577,6 +17672,12 @@ void (async () => {
             // supersedes the coarse 5-min bucket relay below to avoid
             // double-surfacing the same progress beat.
             const workerFeedEnabled = isWorkerActivityFeedEnabled(process.env.SWITCHROOM_WORKER_ACTIVITY_FEED)
+            // Model A — foreground sub-agent nesting in the parent's live
+            // activity draft. ON by default; this edits the SAME activity-
+            // summary message the tool_label feed already owns (not the
+            // compose draft, so no answer-stream contention). The kill-switch
+            // disables only the nesting; the parent's own feed is unaffected.
+            const foregroundNestingEnabled = process.env.SWITCHROOM_FOREGROUND_SUBAGENT_NESTING !== '0'
             const workerActivityFeed = createWorkerActivityFeed({
               bot: {
                 sendMessage: async (cid, text, sendOpts) => {
@@ -17735,6 +17836,28 @@ void (async () => {
                   } catch { /* best-effort */ }
                 }
                 const isBackground = dispatch.isBackground
+                if (!isBackground) {
+                  // Model A — a foreground sub-agent finished. Collapse its
+                  // nested child block from the parent's activity draft; the
+                  // parent resumes and the sub-agent's result returns inline as
+                  // the Task tool result, so there's no handback to deliver. Reaction
+                  // promotion already ran above.
+                  const turn = currentTurn
+                  if (
+                    turn != null &&
+                    turn.foregroundSubAgents.delete(agentId) &&
+                    !turn.replyCalled
+                  ) {
+                    const rendered = composeTurnActivity(turn)
+                    if (rendered != null) {
+                      turn.activityPendingRender = rendered
+                      if (turn.activityInFlight == null) {
+                        turn.activityInFlight = drainActivitySummary(turn)
+                      }
+                    }
+                  }
+                  return
+                }
                 // #PR2 live worker-feed: force the terminal recap edit on
                 // the worker's live message. No-op when no message was ever
                 // posted (trivial workers stay silent; handback covers them).
@@ -17843,7 +17966,39 @@ void (async () => {
                   } catch { /* best-effort */ }
                 }
                 const isBackground = dispatch.isBackground
-                if (!isBackground) return // skip overhead for foreground
+                if (!isBackground) {
+                  // Model A — a foreground sub-agent runs inside the parent's
+                  // turn, so its live narrative nests under the parent's
+                  // activity draft rather than a separate worker message. Pure
+                  // jsonl-tail → render (no model call), inside the
+                  // subscription-honest boundary.
+                  if (!foregroundNestingEnabled) return // kill-switch: skip overhead
+                  const turn = currentTurn
+                  if (turn == null || turn.replyCalled) return
+                  const child = latestSummary.trim().slice(0, 120)
+                  if (child.length === 0) return
+                  let narrative = turn.foregroundSubAgents.get(agentId)
+                  if (narrative == null) {
+                    narrative = []
+                    turn.foregroundSubAgents.set(agentId, narrative)
+                  }
+                  // Dedup against the immediately-preceding line — the watcher
+                  // re-emits the same narrative across ticks while a tool runs.
+                  if (narrative[narrative.length - 1] !== child) {
+                    narrative.push(child)
+                    if (narrative.length > FOREGROUND_SUBAGENT_ACCUM_MAX) {
+                      narrative.splice(0, narrative.length - FOREGROUND_SUBAGENT_ACCUM_MAX)
+                    }
+                  }
+                  const rendered = composeTurnActivity(turn)
+                  if (rendered != null) {
+                    turn.activityPendingRender = rendered
+                    if (turn.activityInFlight == null) {
+                      turn.activityInFlight = drainActivitySummary(turn)
+                    }
+                  }
+                  return
+                }
                 // #PR2 live worker-feed: when ON, the worker's live chat
                 // message owns the progress beat. Push a running cue and

package/telegram-plugin/gateway/inbound-spool.ts CHANGED Viewed

@@ -79,6 +79,21 @@ export function spoolId(msg: InboundMessage): string {
   ) {
     return `s:progress:${msg.meta.subagent_jsonl_id}:${msg.meta.bucket_idx}`
   }
+  // Boot-resume inbounds (honest-restart-resume): deterministic per
+  // interrupted turn so a multi-restart sequence (operator restarts again
+  // before the agent drains the first resume) collapses to ONE resume of
+  // a given turn instead of stacking N. Keying on the synthetic messageId
+  // (=ts, fresh every boot) would re-fire each boot; the turn_key is the
+  // stable identity. Both resume sources share the namespace because a
+  // given turn can only be one or the other.
+  if (
+    (msg.meta?.source === 'resume_interrupted' ||
+      msg.meta?.source === 'resume_watchdog_timeout') &&
+    typeof msg.meta?.resume_turn_key === 'string' &&
+    msg.meta.resume_turn_key.length > 0
+  ) {
+    return `s:resume:${msg.meta.resume_turn_key}`
+  }
   if (typeof msg.messageId === 'number' && msg.messageId > 0) {
     return `m:${msg.chatId}:${msg.messageId}`
   }

package/telegram-plugin/gateway/resume-inbound-builder.ts ADDED Viewed

@@ -0,0 +1,180 @@
+/**
+ * Pure builders for the synthetic inbounds the gateway injects at boot
+ * when it inherits an interrupted turn from the previous process.
+ *
+ * Two shapes, selected by how the prior turn ended (see
+ * `selectResumeBuilder`):
+ *
+ *   - `resume_interrupted` — the turn was cut off mid-flight by an
+ *     operator restart / SIGTERM / crash while it was still making
+ *     progress. The agent should pick the work back up and tell the user
+ *     it's resuming. Blanket resume regardless of how long ago — the
+ *     elapsed time rides along so the model can frame it ("picking up the
+ *     X you asked ~3h ago").
+ *
+ *   - `resume_watchdog_timeout` — the turn stalled with no tool progress
+ *     for the full hang-watchdog window and was (or would have been)
+ *     killed as a hang. The agent must NOT silently resume; it reports
+ *     what happened honestly and asks whether to retry or take a
+ *     different angle. The honest cause is "no observable progress for N
+ *     minutes" — the framework deliberately does not invent a deeper root
+ *     cause, and neither should the model.
+ *
+ * Why a separate module (mirrors `vault-grant-inbound-builders.ts`): the
+ * InboundMessage shape is load-bearing. `meta.source` is what the bridge
+ * forwards verbatim and Claude Code renders as `<channel source="…">`, so
+ * the model keys on it to know this is a boot-resume turn rather than a
+ * human message. `meta.resume_turn_key` is the dedup anchor the spool
+ * uses (see `spoolId`) so a multi-restart sequence resumes a given turn
+ * exactly once. Pinning the builders against fixture tests keeps that
+ * contract honest without booting a real gateway.
+ */
+import type { InboundMessage } from './ipc-protocol.js'
+import type { Turn, TurnEndedVia } from '../registry/turns-schema.js'
+/** Render an elapsed duration as a coarse, human-friendly approximation
+ *  the model can drop straight into prose ("~3h ago"). Deliberately
+ *  coarse — minute/hour/day buckets, never "2h 47m" precision the user
+ *  doesn't care about on a resume. */
+export function humanizeElapsed(ms: number): string {
+  if (!Number.isFinite(ms) || ms < 0) return 'an unknown amount of time'
+  const sec = Math.round(ms / 1000)
+  if (sec < 45) return 'moments'
+  const min = Math.round(sec / 60)
+  if (min < 60) return `~${min} min`
+  const hr = Math.round(min / 60)
+  if (hr < 24) return `~${hr}h`
+  const days = Math.round(hr / 24)
+  return `~${days} day${days === 1 ? '' : 's'}`
+}
+export interface ResumeInboundContext {
+  /** The interrupted turn, straight from the registry. */
+  turn: Turn
+  /** Wall-clock ms. Drives `ts`, `messageId`, and the elapsed framing.
+   *  Defaults to Date.now(). */
+  nowMs?: number
+}
+function threadIdNum(turn: Turn): number | undefined {
+  if (turn.thread_id == null) return undefined
+  const n = Number(turn.thread_id)
+  return Number.isFinite(n) ? n : undefined
+}
+function promptClause(turn: Turn): string {
+  const p = turn.user_prompt_preview?.trim()
+  if (!p) return ''
+  // Quote-trim so a long preview doesn't bloat the channel body.
+  const snippet = p.length > 160 ? p.slice(0, 160) + '…' : p
+  return ` The request was: "${snippet}".`
+}
+/**
+ * Build the `resume_interrupted` inbound — a clean mid-flight interrupt
+ * the agent should pick back up.
+ */
+export function buildResumeInterruptedInbound(ctx: ResumeInboundContext): InboundMessage {
+  const ts = ctx.nowMs ?? Date.now()
+  const elapsed = humanizeElapsed(ts - ctx.turn.started_at)
+  const meta: Record<string, string> = {
+    source: 'resume_interrupted',
+    resume_turn_key: ctx.turn.turn_key,
+    interrupted_via: ctx.turn.ended_via ?? 'restart',
+    started_at: String(ctx.turn.started_at),
+  }
+  if (ctx.turn.user_prompt_preview) meta.original_prompt = ctx.turn.user_prompt_preview
+  const threadId = threadIdNum(ctx.turn)
+  return {
+    type: 'inbound',
+    chatId: ctx.turn.chat_id,
+    ...(threadId != null ? { threadId } : {}),
+    messageId: ts,
+    user: 'switchroom',
+    userId: 0,
+    ts,
+    text:
+      `You just restarted. Your previous turn was interrupted ${elapsed} ago, ` +
+      `before it finished — it was cut off by a restart, not completed.` +
+      promptClause(ctx.turn) +
+      ` Pick that work back up now and continue it through to completion. ` +
+      `In your first message, briefly let the user know you're resuming what ` +
+      `was interrupted (mention roughly how long ago in plain language) so ` +
+      `they're not left wondering — then carry on with the actual task. Do ` +
+      `not ask whether to resume; just resume. If you genuinely can't tell ` +
+      `what the work was, say so and ask.`,
+    meta,
+  }
+}
+/**
+ * Build the `resume_watchdog_timeout` inbound — a stalled turn the agent
+ * must report (not silently resume).
+ *
+ * `idleMs` is the no-progress duration the boot classifier measured (the
+ * marker age). It is passed explicitly rather than read off the turn so
+ * the caller can recover it from the persisted `interrupt_reason` on a
+ * later boot when the live marker is gone.
+ */
+export function buildResumeWatchdogReportInbound(
+  ctx: ResumeInboundContext & { idleMs: number },
+): InboundMessage {
+  const ts = ctx.nowMs ?? Date.now()
+  const idle = humanizeElapsed(ctx.idleMs)
+  const since = humanizeElapsed(ts - ctx.turn.started_at)
+  const toolClause =
+    ctx.turn.tool_call_count != null && ctx.turn.tool_call_count > 0
+      ? ` You'd run ${ctx.turn.tool_call_count} tool call${ctx.turn.tool_call_count === 1 ? '' : 's'} before it stalled.`
+      : ''
+  const meta: Record<string, string> = {
+    source: 'resume_watchdog_timeout',
+    resume_turn_key: ctx.turn.turn_key,
+    interrupted_via: 'timeout',
+    idle_ms: String(ctx.idleMs),
+    started_at: String(ctx.turn.started_at),
+  }
+  if (ctx.turn.tool_call_count != null) meta.tool_call_count = String(ctx.turn.tool_call_count)
+  if (ctx.turn.user_prompt_preview) meta.original_prompt = ctx.turn.user_prompt_preview
+  const threadId = threadIdNum(ctx.turn)
+  return {
+    type: 'inbound',
+    chatId: ctx.turn.chat_id,
+    ...(threadId != null ? { threadId } : {}),
+    messageId: ts,
+    user: 'switchroom',
+    userId: 0,
+    ts,
+    text:
+      `You just restarted. Your previous turn (started ${since} ago) was ` +
+      `killed by the hang-watchdog: it made no observable progress for ${idle} ` +
+      `and the watchdog restarts a turn that goes that long without activity.` +
+      toolClause +
+      promptClause(ctx.turn) +
+      ` Do NOT silently resume it — it may hang again the same way. Instead, ` +
+      `tell the user plainly what happened: that your last turn was killed ` +
+      `after ${idle} of no progress, and roughly what it was doing. Then ask ` +
+      `whether they want you to retry it or take a different angle. Report ` +
+      `only the honest cause — no observable progress for that long — don't ` +
+      `speculate about a deeper root cause you can't see.`,
+    meta,
+  }
+}
+/**
+ * Decide which resume inbound (if any) a given interrupt warrants. Pure —
+ * the gateway calls this with the classified `ended_via` so the
+ * report-vs-resume policy lives in one testable place.
+ *
+ *   - 'timeout'                         → 'report'  (watchdog kill)
+ *   - 'restart' | 'sigterm' | 'unknown' → 'resume'  (clean interrupt)
+ *   - null (still open at boot)         → 'resume'  (killed mid-flight)
+ *   - 'stop'                            → null      (finished; nothing to do)
+ */
+export function selectResumeBuilder(
+  endedVia: TurnEndedVia | null,
+): 'resume' | 'report' | null {
+  if (endedVia === 'timeout') return 'report'
+  if (endedVia === 'restart' || endedVia === 'sigterm' || endedVia === 'unknown') return 'resume'
+  if (endedVia == null) return 'resume' // still-open at boot = killed mid-flight
+  return null
+}