switchroom 0.14.21 → 0.14.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/dist/agent-scheduler/index.js +0 -1
  2. package/dist/auth-broker/index.js +0 -1
  3. package/dist/cli/notion-write-pretool.mjs +0 -1
  4. package/dist/cli/switchroom.js +14 -6
  5. package/dist/host-control/main.js +0 -1
  6. package/dist/vault/approvals/kernel-server.js +0 -1
  7. package/dist/vault/broker/server.js +0 -1
  8. package/package.json +3 -3
  9. package/profiles/_base/start.sh.hbs +11 -24
  10. package/profiles/_shared/telegram-style.md.hbs +2 -2
  11. package/profiles/default/CLAUDE.md.hbs +4 -1
  12. package/skills/switchroom-runtime/SKILL.md +6 -16
  13. package/telegram-plugin/agent-dir.ts +15 -0
  14. package/telegram-plugin/dist/gateway/gateway.js +788 -513
  15. package/telegram-plugin/gateway/gateway.ts +216 -61
  16. package/telegram-plugin/gateway/inbound-spool.ts +15 -0
  17. package/telegram-plugin/gateway/resume-inbound-builder.ts +180 -0
  18. package/telegram-plugin/registry/turns-schema.ts +138 -33
  19. package/telegram-plugin/stream-reply-handler.ts +1 -11
  20. package/telegram-plugin/subagent-watcher.ts +79 -5
  21. package/telegram-plugin/tests/agent-dir.test.ts +25 -0
  22. package/telegram-plugin/tests/e2e.test.ts +2 -77
  23. package/telegram-plugin/tests/inbound-spool.test.ts +45 -0
  24. package/telegram-plugin/tests/multi-turn-continuity.test.ts +0 -1
  25. package/telegram-plugin/tests/outbound-ordering.test.ts +0 -1
  26. package/telegram-plugin/tests/parse-mode-rotation.test.ts +0 -1
  27. package/telegram-plugin/tests/races.test.ts +0 -26
  28. package/telegram-plugin/tests/registry-turns.test.ts +106 -29
  29. package/telegram-plugin/tests/resume-inbound-builder.test.ts +182 -0
  30. package/telegram-plugin/tests/status-accent.test.ts +0 -1
  31. package/telegram-plugin/tests/stream-reply-error-paths.test.ts +0 -1
  32. package/telegram-plugin/tests/stream-reply-handler.test.ts +0 -24
  33. package/telegram-plugin/tests/streaming-e2e.test.ts +0 -1
  34. package/telegram-plugin/tests/streaming-orchestration.test.ts +0 -1
  35. package/telegram-plugin/tests/subagent-registry-bugs.test.ts +7 -3
  36. package/telegram-plugin/tests/subagent-watcher-handback-gaps.test.ts +293 -0
  37. package/telegram-plugin/tests/subagent-watcher.test.ts +23 -15
  38. package/telegram-plugin/tests/tool-activity-summary.test.ts +44 -0
  39. package/telegram-plugin/tests/turns-writer.test.ts +16 -6
  40. package/telegram-plugin/tool-activity-summary.ts +55 -0
  41. package/telegram-plugin/uat/driver.ts +3 -1
  42. package/telegram-plugin/handoff-continuity.ts +0 -206
  43. package/telegram-plugin/tests/handoff-continuity.test.ts +0 -262
@@ -66,7 +66,7 @@ import { StatusReactionController } from '../status-reactions.js'
66
66
  import { DeferredDoneReactions } from '../reaction-defer.js'
67
67
  import { createWorkerActivityFeed, isWorkerActivityFeedEnabled } from '../worker-activity-feed.js'
68
68
  import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
69
- import { appendActivityLabel } from '../tool-activity-summary.js'
69
+ import { appendActivityLabel, renderActivityFeedWithNested } from '../tool-activity-summary.js'
70
70
  import { toolLabel } from '../tool-labels.js'
71
71
  import { createTypingWrapper } from '../typing-wrap.js'
72
72
  import { type DraftStreamHandle } from '../draft-stream.js'
@@ -210,14 +210,7 @@ import {
210
210
  isTurnFlushSafetyEnabled,
211
211
  } from '../turn-flush-safety.js'
212
212
  // #1122 PR3: turn-flush-prose-recovery removed with the progress card.
213
- import {
214
- resolveAgentDirFromEnv,
215
- consumeHandoffTopic,
216
- shouldShowHandoffLine,
217
- formatHandoffLine,
218
- writeLastTurnSummary,
219
- type HandoffFormat,
220
- } from '../handoff-continuity.js'
213
+ import { resolveAgentDirFromEnv } from '../agent-dir.js'
221
214
  import {
222
215
  addActiveReaction,
223
216
  removeActiveReaction,
@@ -396,6 +389,7 @@ import {
396
389
  touchTurnActiveMarker,
397
390
  removeTurnActiveMarker,
398
391
  sweepStaleTurnActiveMarker,
392
+ TURN_ACTIVE_MARKER_FILE,
399
393
  } from './turn-active-marker.js'
400
394
  import {
401
395
  VERSION,
@@ -423,12 +417,17 @@ import {
423
417
  import { resolveVaultApprovalPosture } from '../vault-approval-posture.js'
424
418
  import {
425
419
  openTurnsDb,
426
- markOrphanedAsRestarted,
420
+ markOrphanedWithTimeoutClassification,
427
421
  recordTurnStart,
428
422
  recordTurnEnd,
429
- findMostRecentInterruptedTurn,
423
+ findLatestTurnIfInterrupted,
430
424
  findRecentTurnsForChat,
431
425
  } from '../registry/turns-schema.js'
426
+ import {
427
+ buildResumeInterruptedInbound,
428
+ buildResumeWatchdogReportInbound,
429
+ selectResumeBuilder,
430
+ } from './resume-inbound-builder.js'
432
431
  import { applySubagentsSchema, getSubagentByJsonlId } from '../registry/subagents-schema.js'
433
432
  import { resolveWorkerFeedDispatch, type WorkerFeedDispatch } from './worker-feed-dispatch.js'
434
433
  import { formatIdleFooter } from '../idle-footer.js'
@@ -969,13 +968,26 @@ if (HISTORY_ENABLED) {
969
968
  }
970
969
  }
971
970
 
972
- // ─── Turn-tracking registry (Stage 3a of simplify-restart, Phase 0 of #250) ─
973
- // On boot, open the per-agent registry.db and stamp any rows that never got
974
- // an ended_at as ended_via='restart'. Those are turns where the previous
975
- // gateway died mid-flight (SIGKILL / OOM / hard reboot — any path that
976
- // skipped the SIGTERM handler). Stages 3b/3c will populate new rows during
977
- // turn enqueue/end and on graceful shutdown; Stage 4 reads on cold start.
971
+ // ─── Turn-tracking registry + honest-restart-resume ────────────────────────
972
+ // On boot, open the per-agent registry.db and reap any turn that never got an
973
+ // ended_at — those were killed mid-flight (operator restart, SIGKILL, OOM,
974
+ // hard reboot). The reaper CLASSIFIES each orphan from the on-disk
975
+ // turn-active marker's age:
976
+ // - marker older than the hang-watchdog window → 'timeout' (the turn
977
+ // stalled with no tool progress; report it, don't blindly resume).
978
+ // - otherwise → 'restart' (a clean interrupt; resume it).
979
+ // Then, if the LATEST turn was interrupted, we build a synthetic resume /
980
+ // report inbound and (further down, once the inbound spool exists) inject it
981
+ // so the agent wakes on its own and either picks the work back up or tells
982
+ // the user why it stopped — no human nudge required.
983
+ //
984
+ // The classifier MUST read the marker before the boot-cleanup sweep removes
985
+ // it (the sweep runs much later, in the bridge-registration path). This block
986
+ // runs at module top, so the marker is still present here.
978
987
  let turnsDb: ReturnType<typeof openTurnsDb> | null = null
988
+ // Stashed here; pushed to the spool once it's constructed below. The spool's
989
+ // turn_key-keyed dedup makes a re-stash across multiple restarts a no-op.
990
+ let bootResumeInbound: { agent: string; msg: InboundMessage } | null = null
979
991
  try {
980
992
  // STATE_DIR is `<agentDir>/telegram` in production. openTurnsDb expects
981
993
  // the parent (agent dir) and joins `telegram/registry.db` itself.
@@ -987,23 +999,88 @@ try {
987
999
  // schema; subagents lives alongside in registry.db. Idempotent — safe on
988
1000
  // pre-existing DBs (handles the jsonl_agent_id column migration).
989
1001
  applySubagentsSchema(turnsDb)
990
- const reaped = markOrphanedAsRestarted(turnsDb)
1002
+
1003
+ // Read the turn-active marker (the in-flight turn the watchdog tracks)
1004
+ // BEFORE classifying — its mtime is "ms since last tool progress" and its
1005
+ // payload carries the in-flight turn_key.
1006
+ let markerTurnKey: string | null = null
1007
+ let markerAgeMs: number | null = null
1008
+ try {
1009
+ const markerPath = join(STATE_DIR, TURN_ACTIVE_MARKER_FILE)
1010
+ if (existsSync(markerPath)) {
1011
+ const st = statSync(markerPath)
1012
+ markerAgeMs = Date.now() - st.mtimeMs
1013
+ try {
1014
+ const payload = JSON.parse(readFileSync(markerPath, 'utf8')) as { turnKey?: unknown }
1015
+ if (typeof payload.turnKey === 'string' && payload.turnKey.length > 0) {
1016
+ markerTurnKey = payload.turnKey
1017
+ }
1018
+ } catch { /* unreadable/torn marker — age alone still classifies */ }
1019
+ }
1020
+ } catch { /* stat failure — treat as no marker (plain restart) */ }
1021
+
1022
+ // TURN_HANG_SECS is the watchdog's hang threshold (default 300s); the
1023
+ // classifier uses the same signal so "would the watchdog have killed it"
1024
+ // is answered identically whether or not the watchdog is live (it's
1025
+ // disabled under Docker, but the staleness judgement still holds).
1026
+ const hangSecs = Number(process.env.TURN_HANG_SECS)
1027
+ const hangThresholdMs = (Number.isFinite(hangSecs) && hangSecs > 0 ? hangSecs : 300) * 1000
1028
+ const reasonSnapshot =
1029
+ markerAgeMs != null ? JSON.stringify({ idleMs: Math.round(markerAgeMs) }) : null
1030
+
1031
+ const { reaped, timeoutTurnKey } = markOrphanedWithTimeoutClassification(turnsDb, {
1032
+ markerTurnKey,
1033
+ markerAgeMs,
1034
+ hangThresholdMs,
1035
+ reasonSnapshot,
1036
+ })
991
1037
  if (reaped > 0) {
992
- process.stderr.write(`telegram gateway: turn-registry boot-reaper stamped ${reaped} orphaned turn(s) as ended_via='restart'\n`)
1038
+ process.stderr.write(
1039
+ `telegram gateway: turn-registry boot-reaper stamped ${reaped} orphaned turn(s)` +
1040
+ `${timeoutTurnKey ? ` (turnKey=${timeoutTurnKey} as 'timeout', markerAgeMs=${markerAgeMs})` : " as 'restart'"}\n`,
1041
+ )
993
1042
  } else {
994
1043
  process.stderr.write(`telegram gateway: turn-registry initialized at ${join(agentDir, 'telegram', 'registry.db')}\n`)
995
1044
  }
996
1045
 
997
- // Stage 4: surface the most-recently-interrupted turn to start.sh as a
998
- // shell-sourceable env file. The agent's start.sh reads this on next
999
- // boot, exports the env vars to the spawned `claude` process, and
1000
- // deletes the file (one-shot — only ever applies to the immediately
1001
- // following session). If there's no interrupted turn (clean previous
1002
- // shutdown), we delete any stale file so the resume protocol doesn't
1003
- // mis-fire.
1046
+ // Build the boot resume/report inbound for the LATEST turn if it was
1047
+ // interrupted. selectResumeBuilder owns the resume-vs-report policy.
1048
+ const pending = findLatestTurnIfInterrupted(turnsDb)
1049
+ const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
1050
+ if (pending != null && selfAgent) {
1051
+ const kind = selectResumeBuilder(pending.ended_via)
1052
+ if (kind === 'resume') {
1053
+ bootResumeInbound = { agent: selfAgent, msg: buildResumeInterruptedInbound({ turn: pending }) }
1054
+ } else if (kind === 'report') {
1055
+ // idleMs: this boot's measured marker age if it just classified this
1056
+ // turn; otherwise recover it from the persisted interrupt_reason (a
1057
+ // later boot, marker already swept); else fall back to total runtime.
1058
+ let idleMs = pending.turn_key === timeoutTurnKey && markerAgeMs != null ? markerAgeMs : null
1059
+ if (idleMs == null && pending.interrupt_reason) {
1060
+ try {
1061
+ const parsed = JSON.parse(pending.interrupt_reason) as { idleMs?: unknown }
1062
+ if (typeof parsed.idleMs === 'number' && Number.isFinite(parsed.idleMs)) idleMs = parsed.idleMs
1063
+ } catch { /* malformed snapshot — fall through */ }
1064
+ }
1065
+ if (idleMs == null) idleMs = Math.max(0, Date.now() - pending.started_at)
1066
+ bootResumeInbound = {
1067
+ agent: selfAgent,
1068
+ msg: buildResumeWatchdogReportInbound({ turn: pending, idleMs }),
1069
+ }
1070
+ }
1071
+ if (bootResumeInbound != null) {
1072
+ process.stderr.write(
1073
+ `telegram gateway: boot-resume queued kind=${kind} turnKey=${pending.turn_key} ` +
1074
+ `endedVia=${pending.ended_via ?? 'open'} chat=${pending.chat_id}\n`,
1075
+ )
1076
+ }
1077
+ }
1078
+
1079
+ // Diagnostic env file (one-shot, sourced by start.sh) — kept for the
1080
+ // wake-audit context. The injected inbound above is the real wake signal;
1081
+ // these vars are passive context only.
1004
1082
  const pendingEnvPath = join(agentDir, '.pending-turn.env')
1005
1083
  try {
1006
- const pending = findMostRecentInterruptedTurn(turnsDb)
1007
1084
  if (pending != null) {
1008
1085
  const lines = [
1009
1086
  `SWITCHROOM_PENDING_TURN=true`,
@@ -1013,14 +1090,12 @@ try {
1013
1090
  pending.last_user_msg_id != null ? `SWITCHROOM_PENDING_USER_MSG_ID=${pending.last_user_msg_id}` : `SWITCHROOM_PENDING_USER_MSG_ID=`,
1014
1091
  `SWITCHROOM_PENDING_ENDED_VIA=${pending.ended_via ?? 'unknown'}`,
1015
1092
  `SWITCHROOM_PENDING_STARTED_AT=${pending.started_at}`,
1093
+ pending.interrupt_reason != null ? `SWITCHROOM_PENDING_INTERRUPT_REASON=${pending.interrupt_reason}` : `SWITCHROOM_PENDING_INTERRUPT_REASON=`,
1016
1094
  ]
1017
1095
  // Atomic write: tmp + rename. Without this, a crash mid-write
1018
1096
  // (power loss, OOM, panic) leaves a truncated `.pending-turn.env`
1019
1097
  // that start.sh `source`s — partial SWITCHROOM_PENDING_* vars
1020
- // half-trigger the resume protocol with incomplete context, or
1021
- // a malformed line breaks shell parsing inside the source.
1022
- // Same pattern used by the access-file write a few hundred lines
1023
- // above and by src/issues/store.ts.
1098
+ // or a malformed line break shell parsing inside the source.
1024
1099
  const pendingEnvTmp = `${pendingEnvPath}.tmp-${process.pid}`
1025
1100
  writeFileSync(pendingEnvTmp, lines.join('\n') + '\n', { mode: 0o600 })
1026
1101
  renameSync(pendingEnvTmp, pendingEnvPath)
@@ -1030,7 +1105,7 @@ try {
1030
1105
  process.stderr.write(`telegram gateway: pending-turn env cleared (clean previous shutdown)\n`)
1031
1106
  }
1032
1107
  } catch (err) {
1033
- process.stderr.write(`telegram gateway: pending-turn env write failed (${(err as Error).message}) — resume protocol may not fire\n`)
1108
+ process.stderr.write(`telegram gateway: pending-turn env write failed (${(err as Error).message})\n`)
1034
1109
  }
1035
1110
  } catch (err) {
1036
1111
  process.stderr.write(`telegram gateway: turn-registry init failed (${(err as Error).message}) — turn tracking disabled\n`)
@@ -1399,6 +1474,13 @@ type CurrentTurn = {
1399
1474
  // (via `renderActivityFeed`) as a capped chronological list into the
1400
1475
  // in-place edited activity message and clears on reply. Reset per turn.
1401
1476
  mirrorLines: string[]
1477
+ // Model A — foreground sub-agent nesting. A foreground sub-agent (Task/Agent
1478
+ // with no run_in_background) runs INSIDE this turn while the parent blocks at
1479
+ // the Task tool, so its live steps nest under the parent's activity feed
1480
+ // rather than a separate message. Keyed by jsonl agent id; value = the
1481
+ // sub-agent's accumulated narrative lines (oldest→newest, deduped + capped).
1482
+ // Background workers are NOT here — they get the standalone worker feed.
1483
+ foregroundSubAgents: Map<string, string[]>
1402
1484
  // Issue #195 — answer-lane streaming. Lazily created on the first text
1403
1485
  // event of a turn (once enough text has accumulated, the stream itself
1404
1486
  // gates on minInitialChars). Materialized and cleared at turn_end.
@@ -2129,23 +2211,6 @@ function probeAvailableReactions(chatId: string): void {
2129
2211
  })()
2130
2212
  }
2131
2213
 
2132
- // ─── Handoff continuity ───────────────────────────────────────────────────
2133
- let pendingHandoffTopic: string | null = null
2134
-
2135
- function initHandoffContinuity(): void {
2136
- if (!shouldShowHandoffLine()) { pendingHandoffTopic = null; return }
2137
- const agentDir = resolveAgentDirFromEnv()
2138
- if (agentDir == null) { pendingHandoffTopic = null; return }
2139
- pendingHandoffTopic = consumeHandoffTopic(agentDir)
2140
- }
2141
-
2142
- function takeHandoffPrefix(format: HandoffFormat): string {
2143
- if (pendingHandoffTopic == null) return ''
2144
- const line = formatHandoffLine(pendingHandoffTopic, format)
2145
- pendingHandoffTopic = null
2146
- return line
2147
- }
2148
-
2149
2214
  // ─── Text chunking ────────────────────────────────────────────────────────
2150
2215
  const PHOTO_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp'])
2151
2216
 
@@ -3942,6 +4007,21 @@ const inboundSpool = STATIC
3942
4007
  },
3943
4008
  })
3944
4009
  const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
4010
+ // Honest-restart-resume: inject the boot resume/report inbound built by the
4011
+ // registry classifier above. When the spool exists we only PUT it (the
4012
+ // boot-replay loop below pulls it into the in-memory buffer exactly once via
4013
+ // liveEntries — pushing here too would double-queue). The turn_key-keyed
4014
+ // spoolId makes this a no-op if a prior restart already queued the same turn
4015
+ // and it hasn't been delivered yet — so a multi-restart sequence resumes a
4016
+ // given turn once, not N times. When there's no spool (STATIC mode) push
4017
+ // straight to the in-memory buffer.
4018
+ if (bootResumeInbound != null) {
4019
+ if (inboundSpool != null) {
4020
+ inboundSpool.put(bootResumeInbound.agent, bootResumeInbound.msg)
4021
+ } else {
4022
+ pendingInboundBuffer.push(bootResumeInbound.agent, bootResumeInbound.msg)
4023
+ }
4024
+ }
3945
4025
  // Boot-replay: re-queue every un-acked spooled inbound into the
3946
4026
  // in-memory buffer so the existing drain triggers (onClientRegistered
3947
4027
  // / silence-poke #1546 / idle-drain #1549) deliver them. push →
@@ -5249,13 +5329,6 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
5249
5329
  effectiveText = text
5250
5330
  }
5251
5331
 
5252
- {
5253
- const prefix = takeHandoffPrefix(
5254
- format === 'html' ? 'html' : format === 'markdownv2' ? 'markdownv2' : 'text',
5255
- )
5256
- if (prefix.length > 0) effectiveText = prefix + effectiveText
5257
- }
5258
-
5259
5332
  assertAllowedChat(chat_id)
5260
5333
 
5261
5334
  let threadId = resolveThreadId(chat_id, args.message_thread_id as string | undefined)
@@ -5989,7 +6062,6 @@ async function executeStreamReply(args: Record<string, unknown>): Promise<unknow
5989
6062
  markdownToHtml,
5990
6063
  escapeMarkdownV2,
5991
6064
  repairEscapedWhitespace,
5992
- takeHandoffPrefix,
5993
6065
  assertAllowedChat,
5994
6066
  resolveThreadId,
5995
6067
  disableLinkPreview: access.disableLinkPreview !== false,
@@ -7158,6 +7230,27 @@ function closeProgressLane(chatId: string, threadId: number | undefined): void {
7158
7230
  }
7159
7231
  }
7160
7232
 
7233
+ /** Accumulation cap for a foreground sub-agent's nested narrative lines.
7234
+ * Slightly larger than NESTED_MAX_LINES so the render's "↳ +N earlier…"
7235
+ * header is meaningful without growing unbounded on a long sub-agent. */
7236
+ const FOREGROUND_SUBAGENT_ACCUM_MAX = 12
7237
+
7238
+ /**
7239
+ * Render this turn's activity feed, nesting any active foreground sub-agent's
7240
+ * narrative beneath the parent's own steps (Model A). With no active
7241
+ * foreground sub-agent this is exactly the flat feed. Multiple concurrent
7242
+ * foreground sub-agents (rare — parallel Task dispatch) flatten in insertion
7243
+ * order; the single-sub-agent common case nests precisely under its
7244
+ * Delegating line.
7245
+ */
7246
+ function composeTurnActivity(turn: CurrentTurn): string | null {
7247
+ const childLines: string[] = []
7248
+ for (const narrative of turn.foregroundSubAgents.values()) {
7249
+ childLines.push(...narrative)
7250
+ }
7251
+ return renderActivityFeedWithNested(turn.mirrorLines, childLines)
7252
+ }
7253
+
7161
7254
  /**
7162
7255
  * Drain the tool-activity summary's pending render queue. Single-flight
7163
7256
  * by construction (caller assigns the returned promise to
@@ -7324,6 +7417,7 @@ function handleSessionEvent(ev: SessionEvent): void {
7324
7417
  activityPendingRender: null,
7325
7418
  activityLastSentRender: null,
7326
7419
  mirrorLines: [],
7420
+ foregroundSubAgents: new Map(),
7327
7421
  answerStream: null,
7328
7422
  isDm: isDmChatId(ev.chatId),
7329
7423
  }
@@ -7501,7 +7595,10 @@ function handleSessionEvent(ev: SessionEvent): void {
7501
7595
  if (turn.replyCalled) return
7502
7596
  const rendered = appendActivityLabel(turn.mirrorLines, ev.label)
7503
7597
  if (rendered != null) {
7504
- turn.activityPendingRender = rendered
7598
+ // Recompose so any active foreground sub-agent's nested block (Model A)
7599
+ // is preserved when the parent appends its own step. composeTurnActivity
7600
+ // == the flat render when no foreground sub-agent is active.
7601
+ turn.activityPendingRender = composeTurnActivity(turn) ?? rendered
7505
7602
  if (turn.activityInFlight == null) {
7506
7603
  turn.activityInFlight = drainActivitySummary(turn)
7507
7604
  }
@@ -8508,7 +8605,6 @@ function handlePtyActivity(text: string): void {
8508
8605
  markdownToHtml,
8509
8606
  escapeMarkdownV2,
8510
8607
  repairEscapedWhitespace,
8511
- takeHandoffPrefix: () => '',
8512
8608
  assertAllowedChat,
8513
8609
  resolveThreadId,
8514
8610
  disableLinkPreview: access.disableLinkPreview !== false,
@@ -16982,7 +17078,6 @@ process.on('SIGINT', () => void shutdown('SIGINT'))
16982
17078
 
16983
17079
 
16984
17080
  // ─── Startup ──────────────────────────────────────────────────────────────
16985
- initHandoffContinuity()
16986
17081
 
16987
17082
  // Top-level error handlers route through shutdown() so the startup lock is
16988
17083
  // released cleanly. Without this, a top-level throw would leave the lock
@@ -17577,6 +17672,12 @@ void (async () => {
17577
17672
  // supersedes the coarse 5-min bucket relay below to avoid
17578
17673
  // double-surfacing the same progress beat.
17579
17674
  const workerFeedEnabled = isWorkerActivityFeedEnabled(process.env.SWITCHROOM_WORKER_ACTIVITY_FEED)
17675
+ // Model A — foreground sub-agent nesting in the parent's live
17676
+ // activity draft. ON by default; this edits the SAME activity-
17677
+ // summary message the tool_label feed already owns (not the
17678
+ // compose draft, so no answer-stream contention). The kill-switch
17679
+ // disables only the nesting; the parent's own feed is unaffected.
17680
+ const foregroundNestingEnabled = process.env.SWITCHROOM_FOREGROUND_SUBAGENT_NESTING !== '0'
17580
17681
  const workerActivityFeed = createWorkerActivityFeed({
17581
17682
  bot: {
17582
17683
  sendMessage: async (cid, text, sendOpts) => {
@@ -17735,6 +17836,28 @@ void (async () => {
17735
17836
  } catch { /* best-effort */ }
17736
17837
  }
17737
17838
  const isBackground = dispatch.isBackground
17839
+ if (!isBackground) {
17840
+ // Model A — a foreground sub-agent finished. Collapse its
17841
+ // nested child block from the parent's activity draft; the
17842
+ // parent resumes and its result returns inline as the Task
17843
+ // tool result, so there's no handback to deliver. Reaction
17844
+ // promotion already ran above.
17845
+ const turn = currentTurn
17846
+ if (
17847
+ turn != null &&
17848
+ turn.foregroundSubAgents.delete(agentId) &&
17849
+ !turn.replyCalled
17850
+ ) {
17851
+ const rendered = composeTurnActivity(turn)
17852
+ if (rendered != null) {
17853
+ turn.activityPendingRender = rendered
17854
+ if (turn.activityInFlight == null) {
17855
+ turn.activityInFlight = drainActivitySummary(turn)
17856
+ }
17857
+ }
17858
+ }
17859
+ return
17860
+ }
17738
17861
  // #PR2 live worker-feed: force the terminal recap edit on
17739
17862
  // the worker's live message. No-op when no message was ever
17740
17863
  // posted (trivial workers stay silent; handback covers them).
@@ -17843,7 +17966,39 @@ void (async () => {
17843
17966
  } catch { /* best-effort */ }
17844
17967
  }
17845
17968
  const isBackground = dispatch.isBackground
17846
- if (!isBackground) return // skip overhead for foreground
17969
+ if (!isBackground) {
17970
+ // Model A — a foreground sub-agent runs inside the parent's
17971
+ // turn, so its live narrative nests under the parent's
17972
+ // activity draft rather than a separate worker message. Pure
17973
+ // jsonl-tail → render (no model call), inside the
17974
+ // subscription-honest boundary.
17975
+ if (!foregroundNestingEnabled) return // kill-switch: skip overhead
17976
+ const turn = currentTurn
17977
+ if (turn == null || turn.replyCalled) return
17978
+ const child = latestSummary.trim().slice(0, 120)
17979
+ if (child.length === 0) return
17980
+ let narrative = turn.foregroundSubAgents.get(agentId)
17981
+ if (narrative == null) {
17982
+ narrative = []
17983
+ turn.foregroundSubAgents.set(agentId, narrative)
17984
+ }
17985
+ // Dedup against the immediately-preceding line — the watcher
17986
+ // re-emits the same narrative across ticks while a tool runs.
17987
+ if (narrative[narrative.length - 1] !== child) {
17988
+ narrative.push(child)
17989
+ if (narrative.length > FOREGROUND_SUBAGENT_ACCUM_MAX) {
17990
+ narrative.splice(0, narrative.length - FOREGROUND_SUBAGENT_ACCUM_MAX)
17991
+ }
17992
+ }
17993
+ const rendered = composeTurnActivity(turn)
17994
+ if (rendered != null) {
17995
+ turn.activityPendingRender = rendered
17996
+ if (turn.activityInFlight == null) {
17997
+ turn.activityInFlight = drainActivitySummary(turn)
17998
+ }
17999
+ }
18000
+ return
18001
+ }
17847
18002
 
17848
18003
  // #PR2 live worker-feed: when ON, the worker's live chat
17849
18004
  // message owns the progress beat. Push a running cue and
@@ -79,6 +79,21 @@ export function spoolId(msg: InboundMessage): string {
79
79
  ) {
80
80
  return `s:progress:${msg.meta.subagent_jsonl_id}:${msg.meta.bucket_idx}`
81
81
  }
82
+ // Boot-resume inbounds (honest-restart-resume): deterministic per
83
+ // interrupted turn so a multi-restart sequence (operator restarts again
84
+ // before the agent drains the first resume) collapses to ONE resume of
85
+ // a given turn instead of stacking N. Keying on the synthetic messageId
86
+ // (=ts, fresh every boot) would re-fire each boot; the turn_key is the
87
+ // stable identity. Both resume sources share the namespace because a
88
+ // given turn can only be one or the other.
89
+ if (
90
+ (msg.meta?.source === 'resume_interrupted' ||
91
+ msg.meta?.source === 'resume_watchdog_timeout') &&
92
+ typeof msg.meta?.resume_turn_key === 'string' &&
93
+ msg.meta.resume_turn_key.length > 0
94
+ ) {
95
+ return `s:resume:${msg.meta.resume_turn_key}`
96
+ }
82
97
  if (typeof msg.messageId === 'number' && msg.messageId > 0) {
83
98
  return `m:${msg.chatId}:${msg.messageId}`
84
99
  }
@@ -0,0 +1,180 @@
1
+ /**
2
+ * Pure builders for the synthetic inbounds the gateway injects at boot
3
+ * when it inherits an interrupted turn from the previous process.
4
+ *
5
+ * Two shapes, selected by how the prior turn ended (see
6
+ * `selectResumeBuilder`):
7
+ *
8
+ * - `resume_interrupted` — the turn was cut off mid-flight by an
9
+ * operator restart / SIGTERM / crash while it was still making
10
+ * progress. The agent should pick the work back up and tell the user
11
+ * it's resuming. Blanket resume regardless of how long ago — the
12
+ * elapsed time rides along so the model can frame it ("picking up the
13
+ * X you asked ~3h ago").
14
+ *
15
+ * - `resume_watchdog_timeout` — the turn stalled with no tool progress
16
+ * for the full hang-watchdog window and was (or would have been)
17
+ * killed as a hang. The agent must NOT silently resume; it reports
18
+ * what happened honestly and asks whether to retry or take a
19
+ * different angle. The honest cause is "no observable progress for N
20
+ * minutes" — the framework deliberately does not invent a deeper root
21
+ * cause, and neither should the model.
22
+ *
23
+ * Why a separate module (mirrors `vault-grant-inbound-builders.ts`): the
24
+ * InboundMessage shape is load-bearing. `meta.source` is what the bridge
25
+ * forwards verbatim and Claude Code renders as `<channel source="…">`, so
26
+ * the model keys on it to know this is a boot-resume turn rather than a
27
+ * human message. `meta.resume_turn_key` is the dedup anchor the spool
28
+ * uses (see `spoolId`) so a multi-restart sequence resumes a given turn
29
+ * exactly once. Pinning the builders against fixture tests keeps that
30
+ * contract honest without booting a real gateway.
31
+ */
32
+
33
+ import type { InboundMessage } from './ipc-protocol.js'
34
+ import type { Turn, TurnEndedVia } from '../registry/turns-schema.js'
35
+
36
+ /** Render an elapsed duration as a coarse, human-friendly approximation
37
+ * the model can drop straight into prose ("~3h ago"). Deliberately
38
+ * coarse — minute/hour/day buckets, never "2h 47m" precision the user
39
+ * doesn't care about on a resume. */
40
+ export function humanizeElapsed(ms: number): string {
41
+ if (!Number.isFinite(ms) || ms < 0) return 'an unknown amount of time'
42
+ const sec = Math.round(ms / 1000)
43
+ if (sec < 45) return 'moments'
44
+ const min = Math.round(sec / 60)
45
+ if (min < 60) return `~${min} min`
46
+ const hr = Math.round(min / 60)
47
+ if (hr < 24) return `~${hr}h`
48
+ const days = Math.round(hr / 24)
49
+ return `~${days} day${days === 1 ? '' : 's'}`
50
+ }
51
+
52
+ export interface ResumeInboundContext {
53
+ /** The interrupted turn, straight from the registry. */
54
+ turn: Turn
55
+ /** Wall-clock ms. Drives `ts`, `messageId`, and the elapsed framing.
56
+ * Defaults to Date.now(). */
57
+ nowMs?: number
58
+ }
59
+
60
+ function threadIdNum(turn: Turn): number | undefined {
61
+ if (turn.thread_id == null) return undefined
62
+ const n = Number(turn.thread_id)
63
+ return Number.isFinite(n) ? n : undefined
64
+ }
65
+
66
+ function promptClause(turn: Turn): string {
67
+ const p = turn.user_prompt_preview?.trim()
68
+ if (!p) return ''
69
+ // Quote-trim so a long preview doesn't bloat the channel body.
70
+ const snippet = p.length > 160 ? p.slice(0, 160) + '…' : p
71
+ return ` The request was: "${snippet}".`
72
+ }
73
+
74
+ /**
75
+ * Build the `resume_interrupted` inbound — a clean mid-flight interrupt
76
+ * the agent should pick back up.
77
+ */
78
+ export function buildResumeInterruptedInbound(ctx: ResumeInboundContext): InboundMessage {
79
+ const ts = ctx.nowMs ?? Date.now()
80
+ const elapsed = humanizeElapsed(ts - ctx.turn.started_at)
81
+ const meta: Record<string, string> = {
82
+ source: 'resume_interrupted',
83
+ resume_turn_key: ctx.turn.turn_key,
84
+ interrupted_via: ctx.turn.ended_via ?? 'restart',
85
+ started_at: String(ctx.turn.started_at),
86
+ }
87
+ if (ctx.turn.user_prompt_preview) meta.original_prompt = ctx.turn.user_prompt_preview
88
+ const threadId = threadIdNum(ctx.turn)
89
+ return {
90
+ type: 'inbound',
91
+ chatId: ctx.turn.chat_id,
92
+ ...(threadId != null ? { threadId } : {}),
93
+ messageId: ts,
94
+ user: 'switchroom',
95
+ userId: 0,
96
+ ts,
97
+ text:
98
+ `You just restarted. Your previous turn was interrupted ${elapsed} ago, ` +
99
+ `before it finished — it was cut off by a restart, not completed.` +
100
+ promptClause(ctx.turn) +
101
+ ` Pick that work back up now and continue it through to completion. ` +
102
+ `In your first message, briefly let the user know you're resuming what ` +
103
+ `was interrupted (mention roughly how long ago in plain language) so ` +
104
+ `they're not left wondering — then carry on with the actual task. Do ` +
105
+ `not ask whether to resume; just resume. If you genuinely can't tell ` +
106
+ `what the work was, say so and ask.`,
107
+ meta,
108
+ }
109
+ }
110
+
111
+ /**
112
+ * Build the `resume_watchdog_timeout` inbound — a stalled turn the agent
113
+ * must report (not silently resume).
114
+ *
115
+ * `idleMs` is the no-progress duration the boot classifier measured (the
116
+ * marker age). It is passed explicitly rather than read off the turn so
117
+ * the caller can recover it from the persisted `interrupt_reason` on a
118
+ * later boot when the live marker is gone.
119
+ */
120
+ export function buildResumeWatchdogReportInbound(
121
+ ctx: ResumeInboundContext & { idleMs: number },
122
+ ): InboundMessage {
123
+ const ts = ctx.nowMs ?? Date.now()
124
+ const idle = humanizeElapsed(ctx.idleMs)
125
+ const since = humanizeElapsed(ts - ctx.turn.started_at)
126
+ const toolClause =
127
+ ctx.turn.tool_call_count != null && ctx.turn.tool_call_count > 0
128
+ ? ` You'd run ${ctx.turn.tool_call_count} tool call${ctx.turn.tool_call_count === 1 ? '' : 's'} before it stalled.`
129
+ : ''
130
+ const meta: Record<string, string> = {
131
+ source: 'resume_watchdog_timeout',
132
+ resume_turn_key: ctx.turn.turn_key,
133
+ interrupted_via: 'timeout',
134
+ idle_ms: String(ctx.idleMs),
135
+ started_at: String(ctx.turn.started_at),
136
+ }
137
+ if (ctx.turn.tool_call_count != null) meta.tool_call_count = String(ctx.turn.tool_call_count)
138
+ if (ctx.turn.user_prompt_preview) meta.original_prompt = ctx.turn.user_prompt_preview
139
+ const threadId = threadIdNum(ctx.turn)
140
+ return {
141
+ type: 'inbound',
142
+ chatId: ctx.turn.chat_id,
143
+ ...(threadId != null ? { threadId } : {}),
144
+ messageId: ts,
145
+ user: 'switchroom',
146
+ userId: 0,
147
+ ts,
148
+ text:
149
+ `You just restarted. Your previous turn (started ${since} ago) was ` +
150
+ `killed by the hang-watchdog: it made no observable progress for ${idle} ` +
151
+ `and the watchdog restarts a turn that goes that long without activity.` +
152
+ toolClause +
153
+ promptClause(ctx.turn) +
154
+ ` Do NOT silently resume it — it may hang again the same way. Instead, ` +
155
+ `tell the user plainly what happened: that your last turn was killed ` +
156
+ `after ${idle} of no progress, and roughly what it was doing. Then ask ` +
157
+ `whether they want you to retry it or take a different angle. Report ` +
158
+ `only the honest cause — no observable progress for that long — don't ` +
159
+ `speculate about a deeper root cause you can't see.`,
160
+ meta,
161
+ }
162
+ }
163
+
164
+ /**
165
+ * Decide which resume inbound (if any) a given interrupt warrants. Pure —
166
+ * the gateway calls this with the classified `ended_via` so the
167
+ * report-vs-resume policy lives in one testable place.
168
+ *
169
+ * - 'timeout' → 'report' (watchdog kill)
170
+ * - 'restart' | 'sigterm' | 'unknown' → 'resume' (clean interrupt)
171
+ * - 'stop' → null (finished; nothing to do)
172
+ */
173
+ export function selectResumeBuilder(
174
+ endedVia: TurnEndedVia | null,
175
+ ): 'resume' | 'report' | null {
176
+ if (endedVia === 'timeout') return 'report'
177
+ if (endedVia === 'restart' || endedVia === 'sigterm' || endedVia === 'unknown') return 'resume'
178
+ if (endedVia == null) return 'resume' // still-open at boot = killed mid-flight
179
+ return null
180
+ }