switchroom 0.14.21 → 0.14.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +0 -1
- package/dist/auth-broker/index.js +0 -1
- package/dist/cli/notion-write-pretool.mjs +0 -1
- package/dist/cli/switchroom.js +14 -6
- package/dist/host-control/main.js +0 -1
- package/dist/vault/approvals/kernel-server.js +0 -1
- package/dist/vault/broker/server.js +0 -1
- package/package.json +3 -3
- package/profiles/_base/start.sh.hbs +11 -24
- package/profiles/_shared/telegram-style.md.hbs +2 -2
- package/profiles/default/CLAUDE.md.hbs +4 -1
- package/skills/switchroom-runtime/SKILL.md +6 -16
- package/telegram-plugin/agent-dir.ts +15 -0
- package/telegram-plugin/dist/gateway/gateway.js +640 -509
- package/telegram-plugin/gateway/gateway.ts +216 -61
- package/telegram-plugin/gateway/inbound-spool.ts +15 -0
- package/telegram-plugin/gateway/resume-inbound-builder.ts +180 -0
- package/telegram-plugin/registry/turns-schema.ts +138 -33
- package/telegram-plugin/stream-reply-handler.ts +1 -11
- package/telegram-plugin/tests/agent-dir.test.ts +25 -0
- package/telegram-plugin/tests/e2e.test.ts +2 -77
- package/telegram-plugin/tests/inbound-spool.test.ts +45 -0
- package/telegram-plugin/tests/multi-turn-continuity.test.ts +0 -1
- package/telegram-plugin/tests/outbound-ordering.test.ts +0 -1
- package/telegram-plugin/tests/parse-mode-rotation.test.ts +0 -1
- package/telegram-plugin/tests/races.test.ts +0 -26
- package/telegram-plugin/tests/registry-turns.test.ts +106 -29
- package/telegram-plugin/tests/resume-inbound-builder.test.ts +182 -0
- package/telegram-plugin/tests/status-accent.test.ts +0 -1
- package/telegram-plugin/tests/stream-reply-error-paths.test.ts +0 -1
- package/telegram-plugin/tests/stream-reply-handler.test.ts +0 -24
- package/telegram-plugin/tests/streaming-e2e.test.ts +0 -1
- package/telegram-plugin/tests/streaming-orchestration.test.ts +0 -1
- package/telegram-plugin/tests/tool-activity-summary.test.ts +44 -0
- package/telegram-plugin/tests/turns-writer.test.ts +16 -6
- package/telegram-plugin/tool-activity-summary.ts +55 -0
- package/telegram-plugin/uat/driver.ts +3 -1
- package/telegram-plugin/handoff-continuity.ts +0 -206
- package/telegram-plugin/tests/handoff-continuity.test.ts +0 -262
|
@@ -66,7 +66,7 @@ import { StatusReactionController } from '../status-reactions.js'
|
|
|
66
66
|
import { DeferredDoneReactions } from '../reaction-defer.js'
|
|
67
67
|
import { createWorkerActivityFeed, isWorkerActivityFeedEnabled } from '../worker-activity-feed.js'
|
|
68
68
|
import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
|
|
69
|
-
import { appendActivityLabel } from '../tool-activity-summary.js'
|
|
69
|
+
import { appendActivityLabel, renderActivityFeedWithNested } from '../tool-activity-summary.js'
|
|
70
70
|
import { toolLabel } from '../tool-labels.js'
|
|
71
71
|
import { createTypingWrapper } from '../typing-wrap.js'
|
|
72
72
|
import { type DraftStreamHandle } from '../draft-stream.js'
|
|
@@ -210,14 +210,7 @@ import {
|
|
|
210
210
|
isTurnFlushSafetyEnabled,
|
|
211
211
|
} from '../turn-flush-safety.js'
|
|
212
212
|
// #1122 PR3: turn-flush-prose-recovery removed with the progress card.
|
|
213
|
-
import {
|
|
214
|
-
resolveAgentDirFromEnv,
|
|
215
|
-
consumeHandoffTopic,
|
|
216
|
-
shouldShowHandoffLine,
|
|
217
|
-
formatHandoffLine,
|
|
218
|
-
writeLastTurnSummary,
|
|
219
|
-
type HandoffFormat,
|
|
220
|
-
} from '../handoff-continuity.js'
|
|
213
|
+
import { resolveAgentDirFromEnv } from '../agent-dir.js'
|
|
221
214
|
import {
|
|
222
215
|
addActiveReaction,
|
|
223
216
|
removeActiveReaction,
|
|
@@ -396,6 +389,7 @@ import {
|
|
|
396
389
|
touchTurnActiveMarker,
|
|
397
390
|
removeTurnActiveMarker,
|
|
398
391
|
sweepStaleTurnActiveMarker,
|
|
392
|
+
TURN_ACTIVE_MARKER_FILE,
|
|
399
393
|
} from './turn-active-marker.js'
|
|
400
394
|
import {
|
|
401
395
|
VERSION,
|
|
@@ -423,12 +417,17 @@ import {
|
|
|
423
417
|
import { resolveVaultApprovalPosture } from '../vault-approval-posture.js'
|
|
424
418
|
import {
|
|
425
419
|
openTurnsDb,
|
|
426
|
-
|
|
420
|
+
markOrphanedWithTimeoutClassification,
|
|
427
421
|
recordTurnStart,
|
|
428
422
|
recordTurnEnd,
|
|
429
|
-
|
|
423
|
+
findLatestTurnIfInterrupted,
|
|
430
424
|
findRecentTurnsForChat,
|
|
431
425
|
} from '../registry/turns-schema.js'
|
|
426
|
+
import {
|
|
427
|
+
buildResumeInterruptedInbound,
|
|
428
|
+
buildResumeWatchdogReportInbound,
|
|
429
|
+
selectResumeBuilder,
|
|
430
|
+
} from './resume-inbound-builder.js'
|
|
432
431
|
import { applySubagentsSchema, getSubagentByJsonlId } from '../registry/subagents-schema.js'
|
|
433
432
|
import { resolveWorkerFeedDispatch, type WorkerFeedDispatch } from './worker-feed-dispatch.js'
|
|
434
433
|
import { formatIdleFooter } from '../idle-footer.js'
|
|
@@ -969,13 +968,26 @@ if (HISTORY_ENABLED) {
|
|
|
969
968
|
}
|
|
970
969
|
}
|
|
971
970
|
|
|
972
|
-
// ─── Turn-tracking registry
|
|
973
|
-
// On boot, open the per-agent registry.db and
|
|
974
|
-
//
|
|
975
|
-
//
|
|
976
|
-
//
|
|
977
|
-
//
|
|
971
|
+
// ─── Turn-tracking registry + honest-restart-resume ────────────────────────
|
|
972
|
+
// On boot, open the per-agent registry.db and reap any turn that never got an
|
|
973
|
+
// ended_at — those were killed mid-flight (operator restart, SIGKILL, OOM,
|
|
974
|
+
// hard reboot). The reaper CLASSIFIES each orphan from the on-disk
|
|
975
|
+
// turn-active marker's age:
|
|
976
|
+
// - marker older than the hang-watchdog window → 'timeout' (the turn
|
|
977
|
+
// stalled with no tool progress; report it, don't blindly resume).
|
|
978
|
+
// - otherwise → 'restart' (a clean interrupt; resume it).
|
|
979
|
+
// Then, if the LATEST turn was interrupted, we build a synthetic resume /
|
|
980
|
+
// report inbound and (further down, once the inbound spool exists) inject it
|
|
981
|
+
// so the agent wakes on its own and either picks the work back up or tells
|
|
982
|
+
// the user why it stopped — no human nudge required.
|
|
983
|
+
//
|
|
984
|
+
// The classifier MUST read the marker before the boot-cleanup sweep removes
|
|
985
|
+
// it (the sweep runs much later, in the bridge-registration path). This block
|
|
986
|
+
// runs at module top, so the marker is still present here.
|
|
978
987
|
let turnsDb: ReturnType<typeof openTurnsDb> | null = null
|
|
988
|
+
// Stashed here; pushed to the spool once it's constructed below. The spool's
|
|
989
|
+
// turn_key-keyed dedup makes a re-stash across multiple restarts a no-op.
|
|
990
|
+
let bootResumeInbound: { agent: string; msg: InboundMessage } | null = null
|
|
979
991
|
try {
|
|
980
992
|
// STATE_DIR is `<agentDir>/telegram` in production. openTurnsDb expects
|
|
981
993
|
// the parent (agent dir) and joins `telegram/registry.db` itself.
|
|
@@ -987,23 +999,88 @@ try {
|
|
|
987
999
|
// schema; subagents lives alongside in registry.db. Idempotent — safe on
|
|
988
1000
|
// pre-existing DBs (handles the jsonl_agent_id column migration).
|
|
989
1001
|
applySubagentsSchema(turnsDb)
|
|
990
|
-
|
|
1002
|
+
|
|
1003
|
+
// Read the turn-active marker (the in-flight turn the watchdog tracks)
|
|
1004
|
+
// BEFORE classifying — its mtime is "ms since last tool progress" and its
|
|
1005
|
+
// payload carries the in-flight turn_key.
|
|
1006
|
+
let markerTurnKey: string | null = null
|
|
1007
|
+
let markerAgeMs: number | null = null
|
|
1008
|
+
try {
|
|
1009
|
+
const markerPath = join(STATE_DIR, TURN_ACTIVE_MARKER_FILE)
|
|
1010
|
+
if (existsSync(markerPath)) {
|
|
1011
|
+
const st = statSync(markerPath)
|
|
1012
|
+
markerAgeMs = Date.now() - st.mtimeMs
|
|
1013
|
+
try {
|
|
1014
|
+
const payload = JSON.parse(readFileSync(markerPath, 'utf8')) as { turnKey?: unknown }
|
|
1015
|
+
if (typeof payload.turnKey === 'string' && payload.turnKey.length > 0) {
|
|
1016
|
+
markerTurnKey = payload.turnKey
|
|
1017
|
+
}
|
|
1018
|
+
} catch { /* unreadable/torn marker — age alone still classifies */ }
|
|
1019
|
+
}
|
|
1020
|
+
} catch { /* stat failure — treat as no marker (plain restart) */ }
|
|
1021
|
+
|
|
1022
|
+
// TURN_HANG_SECS is the watchdog's hang threshold (default 300s); the
|
|
1023
|
+
// classifier uses the same signal so "would the watchdog have killed it"
|
|
1024
|
+
// is answered identically whether or not the watchdog is live (it's
|
|
1025
|
+
// disabled under Docker, but the staleness judgement still holds).
|
|
1026
|
+
const hangSecs = Number(process.env.TURN_HANG_SECS)
|
|
1027
|
+
const hangThresholdMs = (Number.isFinite(hangSecs) && hangSecs > 0 ? hangSecs : 300) * 1000
|
|
1028
|
+
const reasonSnapshot =
|
|
1029
|
+
markerAgeMs != null ? JSON.stringify({ idleMs: Math.round(markerAgeMs) }) : null
|
|
1030
|
+
|
|
1031
|
+
const { reaped, timeoutTurnKey } = markOrphanedWithTimeoutClassification(turnsDb, {
|
|
1032
|
+
markerTurnKey,
|
|
1033
|
+
markerAgeMs,
|
|
1034
|
+
hangThresholdMs,
|
|
1035
|
+
reasonSnapshot,
|
|
1036
|
+
})
|
|
991
1037
|
if (reaped > 0) {
|
|
992
|
-
process.stderr.write(
|
|
1038
|
+
process.stderr.write(
|
|
1039
|
+
`telegram gateway: turn-registry boot-reaper stamped ${reaped} orphaned turn(s)` +
|
|
1040
|
+
`${timeoutTurnKey ? ` (turnKey=${timeoutTurnKey} as 'timeout', markerAgeMs=${markerAgeMs})` : " as 'restart'"}\n`,
|
|
1041
|
+
)
|
|
993
1042
|
} else {
|
|
994
1043
|
process.stderr.write(`telegram gateway: turn-registry initialized at ${join(agentDir, 'telegram', 'registry.db')}\n`)
|
|
995
1044
|
}
|
|
996
1045
|
|
|
997
|
-
//
|
|
998
|
-
//
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1046
|
+
// Build the boot resume/report inbound for the LATEST turn if it was
|
|
1047
|
+
// interrupted. selectResumeBuilder owns the resume-vs-report policy.
|
|
1048
|
+
const pending = findLatestTurnIfInterrupted(turnsDb)
|
|
1049
|
+
const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
1050
|
+
if (pending != null && selfAgent) {
|
|
1051
|
+
const kind = selectResumeBuilder(pending.ended_via)
|
|
1052
|
+
if (kind === 'resume') {
|
|
1053
|
+
bootResumeInbound = { agent: selfAgent, msg: buildResumeInterruptedInbound({ turn: pending }) }
|
|
1054
|
+
} else if (kind === 'report') {
|
|
1055
|
+
// idleMs: this boot's measured marker age if it just classified this
|
|
1056
|
+
// turn; otherwise recover it from the persisted interrupt_reason (a
|
|
1057
|
+
// later boot, marker already swept); else fall back to total runtime.
|
|
1058
|
+
let idleMs = pending.turn_key === timeoutTurnKey && markerAgeMs != null ? markerAgeMs : null
|
|
1059
|
+
if (idleMs == null && pending.interrupt_reason) {
|
|
1060
|
+
try {
|
|
1061
|
+
const parsed = JSON.parse(pending.interrupt_reason) as { idleMs?: unknown }
|
|
1062
|
+
if (typeof parsed.idleMs === 'number' && Number.isFinite(parsed.idleMs)) idleMs = parsed.idleMs
|
|
1063
|
+
} catch { /* malformed snapshot — fall through */ }
|
|
1064
|
+
}
|
|
1065
|
+
if (idleMs == null) idleMs = Math.max(0, Date.now() - pending.started_at)
|
|
1066
|
+
bootResumeInbound = {
|
|
1067
|
+
agent: selfAgent,
|
|
1068
|
+
msg: buildResumeWatchdogReportInbound({ turn: pending, idleMs }),
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
if (bootResumeInbound != null) {
|
|
1072
|
+
process.stderr.write(
|
|
1073
|
+
`telegram gateway: boot-resume queued kind=${kind} turnKey=${pending.turn_key} ` +
|
|
1074
|
+
`endedVia=${pending.ended_via ?? 'open'} chat=${pending.chat_id}\n`,
|
|
1075
|
+
)
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
// Diagnostic env file (one-shot, sourced by start.sh) — kept for the
|
|
1080
|
+
// wake-audit context. The injected inbound above is the real wake signal;
|
|
1081
|
+
// these vars are passive context only.
|
|
1004
1082
|
const pendingEnvPath = join(agentDir, '.pending-turn.env')
|
|
1005
1083
|
try {
|
|
1006
|
-
const pending = findMostRecentInterruptedTurn(turnsDb)
|
|
1007
1084
|
if (pending != null) {
|
|
1008
1085
|
const lines = [
|
|
1009
1086
|
`SWITCHROOM_PENDING_TURN=true`,
|
|
@@ -1013,14 +1090,12 @@ try {
|
|
|
1013
1090
|
pending.last_user_msg_id != null ? `SWITCHROOM_PENDING_USER_MSG_ID=${pending.last_user_msg_id}` : `SWITCHROOM_PENDING_USER_MSG_ID=`,
|
|
1014
1091
|
`SWITCHROOM_PENDING_ENDED_VIA=${pending.ended_via ?? 'unknown'}`,
|
|
1015
1092
|
`SWITCHROOM_PENDING_STARTED_AT=${pending.started_at}`,
|
|
1093
|
+
pending.interrupt_reason != null ? `SWITCHROOM_PENDING_INTERRUPT_REASON=${pending.interrupt_reason}` : `SWITCHROOM_PENDING_INTERRUPT_REASON=`,
|
|
1016
1094
|
]
|
|
1017
1095
|
// Atomic write: tmp + rename. Without this, a crash mid-write
|
|
1018
1096
|
// (power loss, OOM, panic) leaves a truncated `.pending-turn.env`
|
|
1019
1097
|
// that start.sh `source`s — partial SWITCHROOM_PENDING_* vars
|
|
1020
|
-
//
|
|
1021
|
-
// a malformed line breaks shell parsing inside the source.
|
|
1022
|
-
// Same pattern used by the access-file write a few hundred lines
|
|
1023
|
-
// above and by src/issues/store.ts.
|
|
1098
|
+
// or a malformed line would break shell parsing inside the source.
|
|
1024
1099
|
const pendingEnvTmp = `${pendingEnvPath}.tmp-${process.pid}`
|
|
1025
1100
|
writeFileSync(pendingEnvTmp, lines.join('\n') + '\n', { mode: 0o600 })
|
|
1026
1101
|
renameSync(pendingEnvTmp, pendingEnvPath)
|
|
@@ -1030,7 +1105,7 @@ try {
|
|
|
1030
1105
|
process.stderr.write(`telegram gateway: pending-turn env cleared (clean previous shutdown)\n`)
|
|
1031
1106
|
}
|
|
1032
1107
|
} catch (err) {
|
|
1033
|
-
process.stderr.write(`telegram gateway: pending-turn env write failed (${(err as Error).message})
|
|
1108
|
+
process.stderr.write(`telegram gateway: pending-turn env write failed (${(err as Error).message})\n`)
|
|
1034
1109
|
}
|
|
1035
1110
|
} catch (err) {
|
|
1036
1111
|
process.stderr.write(`telegram gateway: turn-registry init failed (${(err as Error).message}) — turn tracking disabled\n`)
|
|
@@ -1399,6 +1474,13 @@ type CurrentTurn = {
|
|
|
1399
1474
|
// (via `renderActivityFeed`) as a capped chronological list into the
|
|
1400
1475
|
// in-place edited activity message and clears on reply. Reset per turn.
|
|
1401
1476
|
mirrorLines: string[]
|
|
1477
|
+
// Model A — foreground sub-agent nesting. A foreground sub-agent (Task/Agent
|
|
1478
|
+
// with no run_in_background) runs INSIDE this turn while the parent blocks at
|
|
1479
|
+
// the Task tool, so its live steps nest under the parent's activity feed
|
|
1480
|
+
// rather than a separate message. Keyed by jsonl agent id; value = the
|
|
1481
|
+
// sub-agent's accumulated narrative lines (oldest→newest, deduped + capped).
|
|
1482
|
+
// Background workers are NOT here — they get the standalone worker feed.
|
|
1483
|
+
foregroundSubAgents: Map<string, string[]>
|
|
1402
1484
|
// Issue #195 — answer-lane streaming. Lazily created on the first text
|
|
1403
1485
|
// event of a turn (once enough text has accumulated, the stream itself
|
|
1404
1486
|
// gates on minInitialChars). Materialized and cleared at turn_end.
|
|
@@ -2129,23 +2211,6 @@ function probeAvailableReactions(chatId: string): void {
|
|
|
2129
2211
|
})()
|
|
2130
2212
|
}
|
|
2131
2213
|
|
|
2132
|
-
// ─── Handoff continuity ───────────────────────────────────────────────────
|
|
2133
|
-
let pendingHandoffTopic: string | null = null
|
|
2134
|
-
|
|
2135
|
-
function initHandoffContinuity(): void {
|
|
2136
|
-
if (!shouldShowHandoffLine()) { pendingHandoffTopic = null; return }
|
|
2137
|
-
const agentDir = resolveAgentDirFromEnv()
|
|
2138
|
-
if (agentDir == null) { pendingHandoffTopic = null; return }
|
|
2139
|
-
pendingHandoffTopic = consumeHandoffTopic(agentDir)
|
|
2140
|
-
}
|
|
2141
|
-
|
|
2142
|
-
function takeHandoffPrefix(format: HandoffFormat): string {
|
|
2143
|
-
if (pendingHandoffTopic == null) return ''
|
|
2144
|
-
const line = formatHandoffLine(pendingHandoffTopic, format)
|
|
2145
|
-
pendingHandoffTopic = null
|
|
2146
|
-
return line
|
|
2147
|
-
}
|
|
2148
|
-
|
|
2149
2214
|
// ─── Text chunking ────────────────────────────────────────────────────────
|
|
2150
2215
|
const PHOTO_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp'])
|
|
2151
2216
|
|
|
@@ -3942,6 +4007,21 @@ const inboundSpool = STATIC
|
|
|
3942
4007
|
},
|
|
3943
4008
|
})
|
|
3944
4009
|
const pendingInboundBuffer = createPendingInboundBuffer({ spool: inboundSpool })
|
|
4010
|
+
// Honest-restart-resume: inject the boot resume/report inbound built by the
|
|
4011
|
+
// registry classifier above. When the spool exists we only PUT it (the
|
|
4012
|
+
// boot-replay loop below pulls it into the in-memory buffer exactly once via
|
|
4013
|
+
// liveEntries — pushing here too would double-queue). The turn_key-keyed
|
|
4014
|
+
// spoolId makes this a no-op if a prior restart already queued the same turn
|
|
4015
|
+
// and it hasn't been delivered yet — so a multi-restart sequence resumes a
|
|
4016
|
+
// given turn once, not N times. When there's no spool (STATIC mode) push
|
|
4017
|
+
// straight to the in-memory buffer.
|
|
4018
|
+
if (bootResumeInbound != null) {
|
|
4019
|
+
if (inboundSpool != null) {
|
|
4020
|
+
inboundSpool.put(bootResumeInbound.agent, bootResumeInbound.msg)
|
|
4021
|
+
} else {
|
|
4022
|
+
pendingInboundBuffer.push(bootResumeInbound.agent, bootResumeInbound.msg)
|
|
4023
|
+
}
|
|
4024
|
+
}
|
|
3945
4025
|
// Boot-replay: re-queue every un-acked spooled inbound into the
|
|
3946
4026
|
// in-memory buffer so the existing drain triggers (onClientRegistered
|
|
3947
4027
|
// / silence-poke #1546 / idle-drain #1549) deliver them. push →
|
|
@@ -5249,13 +5329,6 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
|
|
|
5249
5329
|
effectiveText = text
|
|
5250
5330
|
}
|
|
5251
5331
|
|
|
5252
|
-
{
|
|
5253
|
-
const prefix = takeHandoffPrefix(
|
|
5254
|
-
format === 'html' ? 'html' : format === 'markdownv2' ? 'markdownv2' : 'text',
|
|
5255
|
-
)
|
|
5256
|
-
if (prefix.length > 0) effectiveText = prefix + effectiveText
|
|
5257
|
-
}
|
|
5258
|
-
|
|
5259
5332
|
assertAllowedChat(chat_id)
|
|
5260
5333
|
|
|
5261
5334
|
let threadId = resolveThreadId(chat_id, args.message_thread_id as string | undefined)
|
|
@@ -5989,7 +6062,6 @@ async function executeStreamReply(args: Record<string, unknown>): Promise<unknow
|
|
|
5989
6062
|
markdownToHtml,
|
|
5990
6063
|
escapeMarkdownV2,
|
|
5991
6064
|
repairEscapedWhitespace,
|
|
5992
|
-
takeHandoffPrefix,
|
|
5993
6065
|
assertAllowedChat,
|
|
5994
6066
|
resolveThreadId,
|
|
5995
6067
|
disableLinkPreview: access.disableLinkPreview !== false,
|
|
@@ -7158,6 +7230,27 @@ function closeProgressLane(chatId: string, threadId: number | undefined): void {
|
|
|
7158
7230
|
}
|
|
7159
7231
|
}
|
|
7160
7232
|
|
|
7233
|
+
/** Accumulation cap for a foreground sub-agent's nested narrative lines.
|
|
7234
|
+
* Slightly larger than NESTED_MAX_LINES so the render's "↳ +N earlier…"
|
|
7235
|
+
* header is meaningful without growing unbounded on a long sub-agent. */
|
|
7236
|
+
const FOREGROUND_SUBAGENT_ACCUM_MAX = 12
|
|
7237
|
+
|
|
7238
|
+
/**
 * Build the activity-feed text for a turn, with foreground sub-agent
 * narrative nested under the parent's own steps (Model A).
 *
 * The narrative lines of every entry in `turn.foregroundSubAgents` are
 * concatenated in the map's insertion order and passed, together with the
 * parent's `turn.mirrorLines`, to `renderActivityFeedWithNested`. When the
 * map is empty the child list passed along is empty.
 *
 * @param turn The turn whose activity feed is being rendered.
 * @returns The value produced by `renderActivityFeedWithNested` for the two
 *   line lists (a string, or null).
 */
function composeTurnActivity(turn: CurrentTurn): string | null {
  const nestedLines = Array.from(turn.foregroundSubAgents.values()).flat()
  return renderActivityFeedWithNested(turn.mirrorLines, nestedLines)
}
|
|
7253
|
+
|
|
7161
7254
|
/**
|
|
7162
7255
|
* Drain the tool-activity summary's pending render queue. Single-flight
|
|
7163
7256
|
* by construction (caller assigns the returned promise to
|
|
@@ -7324,6 +7417,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
7324
7417
|
activityPendingRender: null,
|
|
7325
7418
|
activityLastSentRender: null,
|
|
7326
7419
|
mirrorLines: [],
|
|
7420
|
+
foregroundSubAgents: new Map(),
|
|
7327
7421
|
answerStream: null,
|
|
7328
7422
|
isDm: isDmChatId(ev.chatId),
|
|
7329
7423
|
}
|
|
@@ -7501,7 +7595,10 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
7501
7595
|
if (turn.replyCalled) return
|
|
7502
7596
|
const rendered = appendActivityLabel(turn.mirrorLines, ev.label)
|
|
7503
7597
|
if (rendered != null) {
|
|
7504
|
-
|
|
7598
|
+
// Recompose so any active foreground sub-agent's nested block (Model A)
|
|
7599
|
+
// is preserved when the parent appends its own step. composeTurnActivity
|
|
7600
|
+
// == the flat render when no foreground sub-agent is active.
|
|
7601
|
+
turn.activityPendingRender = composeTurnActivity(turn) ?? rendered
|
|
7505
7602
|
if (turn.activityInFlight == null) {
|
|
7506
7603
|
turn.activityInFlight = drainActivitySummary(turn)
|
|
7507
7604
|
}
|
|
@@ -8508,7 +8605,6 @@ function handlePtyActivity(text: string): void {
|
|
|
8508
8605
|
markdownToHtml,
|
|
8509
8606
|
escapeMarkdownV2,
|
|
8510
8607
|
repairEscapedWhitespace,
|
|
8511
|
-
takeHandoffPrefix: () => '',
|
|
8512
8608
|
assertAllowedChat,
|
|
8513
8609
|
resolveThreadId,
|
|
8514
8610
|
disableLinkPreview: access.disableLinkPreview !== false,
|
|
@@ -16982,7 +17078,6 @@ process.on('SIGINT', () => void shutdown('SIGINT'))
|
|
|
16982
17078
|
|
|
16983
17079
|
|
|
16984
17080
|
// ─── Startup ──────────────────────────────────────────────────────────────
|
|
16985
|
-
initHandoffContinuity()
|
|
16986
17081
|
|
|
16987
17082
|
// Top-level error handlers route through shutdown() so the startup lock is
|
|
16988
17083
|
// released cleanly. Without this, a top-level throw would leave the lock
|
|
@@ -17577,6 +17672,12 @@ void (async () => {
|
|
|
17577
17672
|
// supersedes the coarse 5-min bucket relay below to avoid
|
|
17578
17673
|
// double-surfacing the same progress beat.
|
|
17579
17674
|
const workerFeedEnabled = isWorkerActivityFeedEnabled(process.env.SWITCHROOM_WORKER_ACTIVITY_FEED)
|
|
17675
|
+
// Model A — foreground sub-agent nesting in the parent's live
|
|
17676
|
+
// activity draft. ON by default; this edits the SAME activity-
|
|
17677
|
+
// summary message the tool_label feed already owns (not the
|
|
17678
|
+
// compose draft, so no answer-stream contention). The kill-switch
|
|
17679
|
+
// disables only the nesting; the parent's own feed is unaffected.
|
|
17680
|
+
const foregroundNestingEnabled = process.env.SWITCHROOM_FOREGROUND_SUBAGENT_NESTING !== '0'
|
|
17580
17681
|
const workerActivityFeed = createWorkerActivityFeed({
|
|
17581
17682
|
bot: {
|
|
17582
17683
|
sendMessage: async (cid, text, sendOpts) => {
|
|
@@ -17735,6 +17836,28 @@ void (async () => {
|
|
|
17735
17836
|
} catch { /* best-effort */ }
|
|
17736
17837
|
}
|
|
17737
17838
|
const isBackground = dispatch.isBackground
|
|
17839
|
+
if (!isBackground) {
|
|
17840
|
+
// Model A — a foreground sub-agent finished. Collapse its
|
|
17841
|
+
// nested child block from the parent's activity draft; the
|
|
17842
|
+
// parent resumes and its result returns inline as the Task
|
|
17843
|
+
// tool result, so there's no handback to deliver. Reaction
|
|
17844
|
+
// promotion already ran above.
|
|
17845
|
+
const turn = currentTurn
|
|
17846
|
+
if (
|
|
17847
|
+
turn != null &&
|
|
17848
|
+
turn.foregroundSubAgents.delete(agentId) &&
|
|
17849
|
+
!turn.replyCalled
|
|
17850
|
+
) {
|
|
17851
|
+
const rendered = composeTurnActivity(turn)
|
|
17852
|
+
if (rendered != null) {
|
|
17853
|
+
turn.activityPendingRender = rendered
|
|
17854
|
+
if (turn.activityInFlight == null) {
|
|
17855
|
+
turn.activityInFlight = drainActivitySummary(turn)
|
|
17856
|
+
}
|
|
17857
|
+
}
|
|
17858
|
+
}
|
|
17859
|
+
return
|
|
17860
|
+
}
|
|
17738
17861
|
// #PR2 live worker-feed: force the terminal recap edit on
|
|
17739
17862
|
// the worker's live message. No-op when no message was ever
|
|
17740
17863
|
// posted (trivial workers stay silent; handback covers them).
|
|
@@ -17843,7 +17966,39 @@ void (async () => {
|
|
|
17843
17966
|
} catch { /* best-effort */ }
|
|
17844
17967
|
}
|
|
17845
17968
|
const isBackground = dispatch.isBackground
|
|
17846
|
-
if (!isBackground)
|
|
17969
|
+
if (!isBackground) {
|
|
17970
|
+
// Model A — a foreground sub-agent runs inside the parent's
|
|
17971
|
+
// turn, so its live narrative nests under the parent's
|
|
17972
|
+
// activity draft rather than a separate worker message. Pure
|
|
17973
|
+
// jsonl-tail → render (no model call), inside the
|
|
17974
|
+
// subscription-honest boundary.
|
|
17975
|
+
if (!foregroundNestingEnabled) return // kill-switch: skip overhead
|
|
17976
|
+
const turn = currentTurn
|
|
17977
|
+
if (turn == null || turn.replyCalled) return
|
|
17978
|
+
const child = latestSummary.trim().slice(0, 120)
|
|
17979
|
+
if (child.length === 0) return
|
|
17980
|
+
let narrative = turn.foregroundSubAgents.get(agentId)
|
|
17981
|
+
if (narrative == null) {
|
|
17982
|
+
narrative = []
|
|
17983
|
+
turn.foregroundSubAgents.set(agentId, narrative)
|
|
17984
|
+
}
|
|
17985
|
+
// Dedup against the immediately-preceding line — the watcher
|
|
17986
|
+
// re-emits the same narrative across ticks while a tool runs.
|
|
17987
|
+
if (narrative[narrative.length - 1] !== child) {
|
|
17988
|
+
narrative.push(child)
|
|
17989
|
+
if (narrative.length > FOREGROUND_SUBAGENT_ACCUM_MAX) {
|
|
17990
|
+
narrative.splice(0, narrative.length - FOREGROUND_SUBAGENT_ACCUM_MAX)
|
|
17991
|
+
}
|
|
17992
|
+
}
|
|
17993
|
+
const rendered = composeTurnActivity(turn)
|
|
17994
|
+
if (rendered != null) {
|
|
17995
|
+
turn.activityPendingRender = rendered
|
|
17996
|
+
if (turn.activityInFlight == null) {
|
|
17997
|
+
turn.activityInFlight = drainActivitySummary(turn)
|
|
17998
|
+
}
|
|
17999
|
+
}
|
|
18000
|
+
return
|
|
18001
|
+
}
|
|
17847
18002
|
|
|
17848
18003
|
// #PR2 live worker-feed: when ON, the worker's live chat
|
|
17849
18004
|
// message owns the progress beat. Push a running cue and
|
|
@@ -79,6 +79,21 @@ export function spoolId(msg: InboundMessage): string {
|
|
|
79
79
|
) {
|
|
80
80
|
return `s:progress:${msg.meta.subagent_jsonl_id}:${msg.meta.bucket_idx}`
|
|
81
81
|
}
|
|
82
|
+
// Boot-resume inbounds (honest-restart-resume): deterministic per
|
|
83
|
+
// interrupted turn so a multi-restart sequence (operator restarts again
|
|
84
|
+
// before the agent drains the first resume) collapses to ONE resume of
|
|
85
|
+
// a given turn instead of stacking N. Keyed on the synthetic messageId
|
|
86
|
+
// (=ts, fresh every boot) would re-fire each boot; the turn_key is the
|
|
87
|
+
// stable identity. Both resume sources share the namespace because a
|
|
88
|
+
// given turn can only be one or the other.
|
|
89
|
+
if (
|
|
90
|
+
(msg.meta?.source === 'resume_interrupted' ||
|
|
91
|
+
msg.meta?.source === 'resume_watchdog_timeout') &&
|
|
92
|
+
typeof msg.meta?.resume_turn_key === 'string' &&
|
|
93
|
+
msg.meta.resume_turn_key.length > 0
|
|
94
|
+
) {
|
|
95
|
+
return `s:resume:${msg.meta.resume_turn_key}`
|
|
96
|
+
}
|
|
82
97
|
if (typeof msg.messageId === 'number' && msg.messageId > 0) {
|
|
83
98
|
return `m:${msg.chatId}:${msg.messageId}`
|
|
84
99
|
}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure builders for the synthetic inbounds the gateway injects at boot
|
|
3
|
+
* when it inherits an interrupted turn from the previous process.
|
|
4
|
+
*
|
|
5
|
+
* Two shapes, selected by how the prior turn ended (see
|
|
6
|
+
* `selectResumeBuilder`):
|
|
7
|
+
*
|
|
8
|
+
* - `resume_interrupted` — the turn was cut off mid-flight by an
|
|
9
|
+
* operator restart / SIGTERM / crash while it was still making
|
|
10
|
+
* progress. The agent should pick the work back up and tell the user
|
|
11
|
+
* it's resuming. Blanket resume regardless of how long ago — the
|
|
12
|
+
* elapsed time rides along so the model can frame it ("picking up the
|
|
13
|
+
* X you asked ~3h ago").
|
|
14
|
+
*
|
|
15
|
+
* - `resume_watchdog_timeout` — the turn stalled with no tool progress
|
|
16
|
+
* for the full hang-watchdog window and was (or would have been)
|
|
17
|
+
* killed as a hang. The agent must NOT silently resume; it reports
|
|
18
|
+
* what happened honestly and asks whether to retry or take a
|
|
19
|
+
* different angle. The honest cause is "no observable progress for N
|
|
20
|
+
* minutes" — the framework deliberately does not invent a deeper root
|
|
21
|
+
* cause, and neither should the model.
|
|
22
|
+
*
|
|
23
|
+
* Why a separate module (mirrors `vault-grant-inbound-builders.ts`): the
|
|
24
|
+
* InboundMessage shape is load-bearing. `meta.source` is what the bridge
|
|
25
|
+
* forwards verbatim and Claude Code renders as `<channel source="…">`, so
|
|
26
|
+
* the model keys on it to know this is a boot-resume turn rather than a
|
|
27
|
+
* human message. `meta.resume_turn_key` is the dedup anchor the spool
|
|
28
|
+
* uses (see `spoolId`) so a multi-restart sequence resumes a given turn
|
|
29
|
+
* exactly once. Pinning the builders against fixture tests keeps that
|
|
30
|
+
* contract honest without booting a real gateway.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import type { InboundMessage } from './ipc-protocol.js'
|
|
34
|
+
import type { Turn, TurnEndedVia } from '../registry/turns-schema.js'
|
|
35
|
+
|
|
36
|
+
/**
 * Turn an elapsed duration into a rough, prose-ready phrase such as
 * "~3h" or "~2 days". Precision is intentionally limited to minute / hour /
 * day buckets; each bucket is derived by rounding the previous one.
 *
 * @param ms Elapsed time in milliseconds.
 * @returns `'an unknown amount of time'` for negative or non-finite input,
 *   `'moments'` below 45 rounded seconds, otherwise `~N min`, `~Nh`, or
 *   `~N day(s)`.
 */
export function humanizeElapsed(ms: number): string {
  const usable = Number.isFinite(ms) && ms >= 0
  if (!usable) return 'an unknown amount of time'

  const seconds = Math.round(ms / 1000)
  if (seconds < 45) return 'moments'

  const minutes = Math.round(seconds / 60)
  if (minutes < 60) return `~${minutes} min`

  const hours = Math.round(minutes / 60)
  if (hours < 24) return `~${hours}h`

  const dayCount = Math.round(hours / 24)
  const unit = dayCount === 1 ? 'day' : 'days'
  return `~${dayCount} ${unit}`
}
|
|
51
|
+
|
|
52
|
+
/** Input for building a boot-resume inbound. */
export interface ResumeInboundContext {
  /** Registry row of the turn that was interrupted. */
  turn: Turn
  /**
   * Current wall-clock time in milliseconds. Supplies the inbound's `ts`
   * and `messageId` and the elapsed-time wording. When omitted,
   * `Date.now()` is used.
   */
  nowMs?: number
}
|
|
59
|
+
|
|
60
|
+
/**
 * Convert a turn's stored `thread_id` into the numeric `threadId` carried on
 * an inbound message.
 *
 * @param turn The interrupted turn.
 * @returns The numeric thread id, or `undefined` when the turn has no thread
 *   id or the stored value is blank or not a finite number.
 */
function threadIdNum(turn: Turn): number | undefined {
  if (turn.thread_id == null) return undefined
  // Number('') and Number('   ') evaluate to 0, which would pass the
  // finiteness check below and fabricate a thread id of 0. Treat blank
  // values as "no thread" instead.
  if (String(turn.thread_id).trim() === '') return undefined
  const n = Number(turn.thread_id)
  return Number.isFinite(n) ? n : undefined
}
|
|
65
|
+
|
|
66
|
+
/**
 * Render the optional "The request was: …" sentence for a resume inbound.
 *
 * @param turn The interrupted turn; only `user_prompt_preview` is read.
 * @returns An empty string when the preview is missing or blank; otherwise a
 *   sentence (with a leading space) quoting the trimmed preview, truncated
 *   to at most 160 UTF-16 code units plus an ellipsis when longer.
 */
function promptClause(turn: Turn): string {
  const p = turn.user_prompt_preview?.trim()
  if (!p) return ''
  // Quote-trim so a long preview doesn't bloat the channel body.
  const limit = 160
  if (p.length <= limit) return ` The request was: "${p}".`
  // `slice` counts UTF-16 code units, so a cut at `limit` can land between
  // the two halves of a surrogate pair (e.g. an emoji) and leave a lone
  // high surrogate in the text. Back off one unit when that would happen.
  const lastUnit = p.charCodeAt(limit - 1)
  const cut = lastUnit >= 0xd800 && lastUnit <= 0xdbff ? limit - 1 : limit
  return ` The request was: "${p.slice(0, cut)}…".`
}
|
|
73
|
+
|
|
74
|
+
/**
 * Construct the synthetic `resume_interrupted` inbound for a turn that was
 * cut off mid-flight and should be picked back up.
 *
 * `ts` and `messageId` are both set to `ctx.nowMs` (or `Date.now()` when it
 * is omitted). `meta` carries the source tag, the turn key, how the turn
 * ended (defaulting to `'restart'`), its start time, and — when present —
 * the original prompt preview. `threadId` is included only when the turn
 * has a usable thread id.
 *
 * @param ctx The interrupted turn plus an optional clock override.
 * @returns The inbound message to inject.
 */
export function buildResumeInterruptedInbound(ctx: ResumeInboundContext): InboundMessage {
  const { turn } = ctx
  const now = ctx.nowMs ?? Date.now()
  const ago = humanizeElapsed(now - turn.started_at)

  const meta: Record<string, string> = {
    source: 'resume_interrupted',
    resume_turn_key: turn.turn_key,
    interrupted_via: turn.ended_via ?? 'restart',
    started_at: String(turn.started_at),
  }
  if (turn.user_prompt_preview) {
    meta.original_prompt = turn.user_prompt_preview
  }

  // Instruction body handed to the model; pieces are joined with no separator.
  const body = [
    `You just restarted. Your previous turn was interrupted ${ago} ago, `,
    `before it finished — it was cut off by a restart, not completed.`,
    promptClause(turn),
    ` Pick that work back up now and continue it through to completion. `,
    `In your first message, briefly let the user know you're resuming what `,
    `was interrupted (mention roughly how long ago in plain language) so `,
    `they're not left wondering — then carry on with the actual task. Do `,
    `not ask whether to resume; just resume. If you genuinely can't tell `,
    `what the work was, say so and ask.`,
  ].join('')

  const thread = threadIdNum(turn)
  const threadField = thread != null ? { threadId: thread } : {}

  return {
    type: 'inbound',
    chatId: turn.chat_id,
    ...threadField,
    messageId: now,
    user: 'switchroom',
    userId: 0,
    ts: now,
    text: body,
    meta,
  }
}
|
|
110
|
+
|
|
111
|
+
/**
 * Build the `resume_watchdog_timeout` inbound: the synthetic message for a
 * stalled turn, which the agent must report to the user instead of
 * silently resuming.
 *
 * `idleMs` is the no-progress duration the boot classifier measured (the
 * marker age). It is passed in explicitly, not read off the turn, so the
 * caller can recover it from the persisted `interrupt_reason` on a later
 * boot when the live marker is gone.
 */
export function buildResumeWatchdogReportInbound(
  ctx: ResumeInboundContext & { idleMs: number },
): InboundMessage {
  const { turn, idleMs } = ctx
  const now = ctx.nowMs ?? Date.now()
  const idle = humanizeElapsed(idleMs)
  const since = humanizeElapsed(now - turn.started_at)
  const calls = turn.tool_call_count

  // Mention tool activity only when at least one call was recorded.
  let toolClause = ''
  if (calls != null && calls > 0) {
    const noun = calls === 1 ? 'tool call' : 'tool calls'
    toolClause = ` You'd run ${calls} ${noun} before it stalled.`
  }

  const meta: Record<string, string> = {
    source: 'resume_watchdog_timeout',
    resume_turn_key: turn.turn_key,
    interrupted_via: 'timeout',
    idle_ms: String(idleMs),
    started_at: String(turn.started_at),
    ...(calls != null ? { tool_call_count: String(calls) } : {}),
    ...(turn.user_prompt_preview ? { original_prompt: turn.user_prompt_preview } : {}),
  }

  const threadId = threadIdNum(turn)

  const text = [
    `You just restarted. Your previous turn (started ${since} ago) was `,
    `killed by the hang-watchdog: it made no observable progress for ${idle} `,
    `and the watchdog restarts a turn that goes that long without activity.`,
    toolClause,
    promptClause(turn),
    ` Do NOT silently resume it — it may hang again the same way. Instead, `,
    `tell the user plainly what happened: that your last turn was killed `,
    `after ${idle} of no progress, and roughly what it was doing. Then ask `,
    `whether they want you to retry it or take a different angle. Report `,
    `only the honest cause — no observable progress for that long — don't `,
    `speculate about a deeper root cause you can't see.`,
  ].join('')

  return {
    type: 'inbound',
    chatId: turn.chat_id,
    // Leave the key out entirely when the turn has no usable thread id.
    ...(threadId === undefined ? {} : { threadId }),
    messageId: now,
    user: 'switchroom',
    userId: 0,
    ts: now,
    text,
    meta,
  }
}
|
|
163
|
+
|
|
164
|
+
/**
 * Map an interrupt's classified `ended_via` to the resume inbound it
 * warrants, if any. Pure, so the report-vs-resume policy the gateway
 * relies on lives in one testable place.
 *
 *   - 'timeout'                          → 'report' (watchdog kill)
 *   - 'restart' | 'sigterm' | 'unknown'  → 'resume' (clean interrupt)
 *   - null / undefined                   → 'resume' (still open at boot)
 *   - anything else, e.g. 'stop'         → null     (finished; nothing to do)
 */
export function selectResumeBuilder(
  endedVia: TurnEndedVia | null,
): 'resume' | 'report' | null {
  // A turn that is still open at boot was killed mid-flight.
  if (endedVia == null) return 'resume'

  switch (endedVia) {
    case 'timeout':
      return 'report'
    case 'restart':
    case 'sigterm':
    case 'unknown':
      return 'resume'
    default:
      return null
  }
}
|