switchroom 0.15.37 → 0.15.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/dist/agent-scheduler/index.js +89 -89
  2. package/dist/auth-broker/index.js +89 -89
  3. package/dist/cli/autoaccept-poll.js +13 -7
  4. package/dist/cli/drive-write-pretool.mjs +10 -10
  5. package/dist/cli/notion-write-pretool.mjs +91 -91
  6. package/dist/cli/skill-validate-pretool.mjs +72 -72
  7. package/dist/cli/switchroom.js +821 -572
  8. package/dist/cli/ui/index.html +87 -17
  9. package/dist/host-control/main.js +158 -158
  10. package/dist/vault/approvals/kernel-server.js +91 -91
  11. package/dist/vault/broker/server.js +92 -92
  12. package/package.json +1 -1
  13. package/profiles/_base/cron-session.sh.hbs +1 -1
  14. package/profiles/_base/start.sh.hbs +1 -1
  15. package/skills/switchroom-manage/SKILL.md +1 -1
  16. package/skills/switchroom-runtime/SKILL.md +1 -1
  17. package/telegram-plugin/answer-stream.ts +1 -1
  18. package/telegram-plugin/bridge/bridge.ts +18 -1
  19. package/telegram-plugin/bridge/ipc-client.ts +4 -1
  20. package/telegram-plugin/bridge/tool-filter.ts +77 -0
  21. package/telegram-plugin/chat-lock.ts +1 -1
  22. package/telegram-plugin/credits-watch.ts +1 -1
  23. package/telegram-plugin/dist/bridge/bridge.js +141 -115
  24. package/telegram-plugin/dist/gateway/gateway.js +318 -207
  25. package/telegram-plugin/dist/server.js +193 -164
  26. package/telegram-plugin/gateway/auto-classify-mid-turn.ts +1 -1
  27. package/telegram-plugin/gateway/boot-card.ts +5 -1
  28. package/telegram-plugin/gateway/boot-probes.ts +62 -0
  29. package/telegram-plugin/gateway/cron-session.ts +1 -1
  30. package/telegram-plugin/gateway/gateway.ts +133 -12
  31. package/telegram-plugin/gateway/grant-restart.ts +1 -1
  32. package/telegram-plugin/gateway/inbound-delivery-machine-dispatch.ts +1 -1
  33. package/telegram-plugin/gateway/inbound-delivery-machine-shadow.ts +1 -1
  34. package/telegram-plugin/gateway/inbound-delivery-machine.ts +1 -1
  35. package/telegram-plugin/gateway/interrupt-defer.ts +1 -1
  36. package/telegram-plugin/gateway/ipc-protocol.ts +12 -0
  37. package/telegram-plugin/gateway/permission-card-origin.ts +62 -0
  38. package/telegram-plugin/gateway/permission-timeout.ts +70 -0
  39. package/telegram-plugin/gateway/prefix-warmup.ts +1 -1
  40. package/telegram-plugin/gateway/webhook-ingest-server.test.ts +1 -1
  41. package/telegram-plugin/gateway/webhook-ingest-server.ts +1 -1
  42. package/telegram-plugin/hooks/subagent-tracker-pretool.mjs +1 -1
  43. package/telegram-plugin/interrupt-marker.ts +1 -1
  44. package/telegram-plugin/over-ping-safety-net.ts +1 -1
  45. package/telegram-plugin/scoped-approval.ts +1 -1
  46. package/telegram-plugin/secret-detect/vault-error.ts +1 -1
  47. package/telegram-plugin/silence-poke.ts +2 -2
  48. package/telegram-plugin/silent-reply-anchor.ts +1 -1
  49. package/telegram-plugin/slot-banner-driver.ts +1 -1
  50. package/telegram-plugin/startup-reset.ts +1 -1
  51. package/telegram-plugin/tests/boot-probes-connections.test.ts +66 -0
  52. package/telegram-plugin/tests/gateway-startup-reset.test.ts +1 -1
  53. package/telegram-plugin/tests/inbound-delivery-machine.test.ts +1 -1
  54. package/telegram-plugin/tests/permission-card-origin.test.ts +97 -0
  55. package/telegram-plugin/tests/permission-card-routing.test.ts +23 -0
  56. package/telegram-plugin/tests/permission-no-repeat-wiring.test.ts +76 -0
  57. package/telegram-plugin/tests/permission-timeout.test.ts +87 -0
  58. package/telegram-plugin/tests/scoped-approval.test.ts +1 -1
  59. package/telegram-plugin/tests/silence-poke.test.ts +1 -1
  60. package/telegram-plugin/tests/tool-filter.test.ts +87 -0
  61. package/telegram-plugin/tests/turn-flush-safety.test.ts +1 -1
  62. package/telegram-plugin/turn-flush-safety.ts +1 -1
  63. package/telegram-plugin/uat/assertions.ts +1 -1
  64. package/telegram-plugin/uat/scenarios/bg-sub-agent-dispatch-dm.test.ts +1 -1
  65. package/telegram-plugin/uat/scenarios/fuzz-extended-dm.test.ts +1 -1
  66. package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts +1 -1
  67. package/telegram-plugin/uat/scenarios/jtbd-fast-trivial-dm.test.ts +2 -2
  68. package/telegram-plugin/uat/scenarios/jtbd-forwarded-burst-dm.test.ts +1 -1
  69. package/telegram-plugin/uat/scenarios/jtbd-memory-survives-restart-dm.test.ts +1 -1
  70. package/telegram-plugin/uat/scenarios/jtbd-rapid-followup-dm.test.ts +1 -1
  71. package/telegram-plugin/uat/scenarios/jtbd-reflective-status-reaction-dm.test.ts +1 -1
  72. package/telegram-plugin/uat/scenarios/jtbd-wake-audit-content-dm.test.ts +1 -1
@@ -7,7 +7,7 @@
7
7
  *
8
8
  * Today a no-prefix mid-turn message always QUEUES (the default flipped
9
9
  * 2026-04-17 away from the blunt "everything steers" — see
10
- * reference/steer-or-queue-mid-flight.md). This module is the basis for a
10
+ * reference/jobs/steer-or-queue-mid-flight.md). This module is the basis for a
11
11
  * smarter default. It ships first in SHADOW mode (the gateway logs what it WOULD
12
12
  * decide but still queues), to gather real-world data — how often mid-turn
13
13
  * messages are same-topic continuations vs cross-topic new tasks, and the
@@ -46,6 +46,7 @@ import {
46
46
  probeBroker,
47
47
  probeKernel,
48
48
  probeSkills,
49
+ probeConnections,
49
50
  watchAgentProcess,
50
51
  AGENT_LIVE_WINDOW_MS,
51
52
  AGENT_LIVE_POLL_INTERVAL_MS,
@@ -120,6 +121,7 @@ export type ProbeKey =
120
121
  | 'broker'
121
122
  | 'kernel'
122
123
  | 'skills'
124
+ | 'connections'
123
125
 
124
126
  export type ProbeMap = Partial<Record<ProbeKey, ProbeResult | null>>
125
127
 
@@ -253,11 +255,12 @@ const PROBE_LABELS: Record<ProbeKey, string> = {
253
255
  broker: 'Broker',
254
256
  kernel: 'Kernel',
255
257
  skills: 'Skills',
258
+ connections: 'Connections',
256
259
  }
257
260
 
258
261
  const PROBE_KEYS: ReadonlyArray<ProbeKey> = [
259
262
  'account', 'agent', 'gateway', 'quota', 'hindsight',
260
- 'scheduler', 'broker', 'kernel', 'skills',
263
+ 'scheduler', 'broker', 'kernel', 'skills', 'connections',
261
264
  ]
262
265
 
263
266
  const REASON_EMOJI: Record<RestartReason, string> = {
@@ -617,6 +620,7 @@ export async function runAllProbes(opts: RunProbesOpts): Promise<ProbeMap> {
617
620
  probeBroker(undefined, { dockerMode: opts.dockerMode }).then(r => { probes.broker = r }),
618
621
  probeKernel(undefined, { dockerMode: opts.dockerMode }).then(r => { probes.kernel = r }),
619
622
  probeSkills(opts.agentDir, { agentName: opts.agentSlug ?? opts.agentName }).then(r => { probes.skills = r }),
623
+ probeConnections(opts.agentDir).then(r => { probes.connections = r }),
620
624
  ])
621
625
 
622
626
  return probes
@@ -1421,6 +1421,68 @@ function renderBucketedSkills(switchroom: string[], agent: string[]): string {
1421
1421
  return parts.length === 0 ? 'none resolved' : parts.join(' · ')
1422
1422
  }
1423
1423
 
1424
+ // ─── Probe: Connections (configured-but-unauthed MCP integrations) ───────────
1425
+
1426
+ /**
1427
+ * Surface configured-but-unauthed MCP connections at agent start. The auth
1428
+ * verdict can't be computed in-container (this boot probe must not do
1429
+ * vault/grant work — see the module header), so `switchroom apply` computes
1430
+ * it host-side and drops a snapshot at
1431
+ * `<agentDir>/.claude/connection-health.json` (src/agents/connection-health.ts).
1432
+ * This probe just reads it.
1433
+ *
1434
+ * ok — snapshot missing/unparseable (not yet computed → assume
1435
+ * healthy, don't nag) OR zero issues
1436
+ * degraded — ≥1 connection configured but not authed; detail names the
1437
+ * servers, nextStep carries the first fix
1438
+ *
1439
+ * Never `fail`: an unauthed integration degrades that one capability, it
1440
+ * doesn't take the agent down, and the silent-when-healthy boot card
1441
+ * should not red an agent over a missing third-party token.
1442
+ */
1443
+ export interface ConnectionIssueShape {
1444
+ server: string
1445
+ key: string
1446
+ kind: string
1447
+ detail: string
1448
+ fix: string
1449
+ }
1450
+
1451
+ export async function probeConnections(
1452
+ agentDir: string,
1453
+ opts: { readFileImpl?: (path: string) => string } = {},
1454
+ ): Promise<ProbeResult> {
1455
+ return withTimeout('Connections', (async (): Promise<ProbeResult> => {
1456
+ const path = join(agentDir, '.claude', 'connection-health.json')
1457
+ const read = opts.readFileImpl ?? ((p: string) => readFileSync(p, 'utf8'))
1458
+ let issues: ConnectionIssueShape[] = []
1459
+ try {
1460
+ const parsed = JSON.parse(read(path)) as { issues?: ConnectionIssueShape[] }
1461
+ issues = Array.isArray(parsed.issues) ? parsed.issues : []
1462
+ } catch {
1463
+ // ENOENT (never applied with this build) or malformed — assume healthy.
1464
+ return { status: 'ok', label: 'Connections', detail: 'no issues' }
1465
+ }
1466
+ if (issues.length === 0) {
1467
+ return { status: 'ok', label: 'Connections', detail: 'all authed' }
1468
+ }
1469
+ const servers = [...new Set(issues.map((i) => i.server))]
1470
+ const named = servers.slice(0, 4).join(', ')
1471
+ const more = servers.length > 4 ? ` +${servers.length - 4} more` : ''
1472
+ const first = issues[0]
1473
+ const extra =
1474
+ issues.length > 1
1475
+ ? ` (+${issues.length - 1} more — run \`switchroom doctor\`)`
1476
+ : ''
1477
+ return {
1478
+ status: 'degraded',
1479
+ label: 'Connections',
1480
+ detail: `${servers.length} integration(s) configured but not authed: ${named}${more}`,
1481
+ nextStep: `${first.fix}${extra}`,
1482
+ }
1483
+ })())
1484
+ }
1485
+
1424
1486
  export interface SkillsFsImpl {
1425
1487
  readdir: (p: string) => string[]
1426
1488
  exists: (p: string) => boolean
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Cheap-cron session identity — docs/rfcs/cheap-cron-sessions.md §3.3.
2
+ * Cheap-cron session identity — reference/rfcs/cheap-cron-sessions.md §3.3.
3
3
  *
4
4
  * Rather than rekey the gateway's hardened single-bridge machinery
5
5
  * (agentIndex / pendingInboundBuffer / handleRegister, each carrying
@@ -67,6 +67,13 @@ import { DeferredDoneReactions } from '../reaction-defer.js'
67
67
  import { createWorkerActivityFeed, isWorkerActivityFeedEnabled } from '../worker-activity-feed.js'
68
68
  import { formatTurnLifecycle, detectStatusSurfaceDegraded } from './status-surface-log.js'
69
69
  import { parseSourceMessageId } from './source-message-id.js'
70
+ import {
71
+ permissionSignature,
72
+ timeoutDenyMessage,
73
+ duplicateDenyMessage,
74
+ isRecentTimeoutDuplicate,
75
+ } from './permission-timeout.js'
76
+ import { pickRecoveredPermissionOrigin } from './permission-card-origin.js'
70
77
  import { isTelegramReplyTool, isTelegramSurfaceTool } from '../tool-names.js'
71
78
  import { appendActivityLabel, renderActivityFeedWithNested } from '../tool-activity-summary.js'
72
79
  import { toolLabel } from '../tool-labels.js'
@@ -563,7 +570,7 @@ const INBOX_DIR = join(STATE_DIR, 'inbox')
563
570
  * different agent's container from inside our own (no docker.sock).
564
571
  * - else (v0.6 legacy non-docker path, scheduled for removal in
565
572
  * Phase 3 of the host-control daemon rollout — see
566
- * `docs/rfcs/host-control-daemon.md`): detached `systemctl --user
573
+ * `reference/rfcs/host-control-daemon.md`): detached `systemctl --user
567
574
  * restart` of the two units. This branch is never reached on
568
575
  * v0.7+ docker installs (the `isDocker` guard above takes the
569
576
  * docker branch); only callable on legacy systemd hosts that
@@ -1901,7 +1908,7 @@ type CurrentTurn = {
1901
1908
  // #1675 (over-ping safety net): wall-clock ms of the first reply
1902
1909
  // this turn that landed with `disable_notification: false` (a real
1903
1910
  // device ping). The conversational-pacing contract
1904
- // (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
1911
+ // (`reference/rfcs/conversational-pacing.md` beat 5) says EXACTLY ONE
1905
1912
  // ping per turn — the final answer. When the model violates that
1906
1913
  // (sends a substantive answer pinged + a wrap-up "Delivered…" or
1907
1914
  // meta-narration also pinged), subsequent reply calls with
@@ -3280,6 +3287,29 @@ function resolvePermissionCardTargets(): Array<{ chatId: string; threadId: numbe
3280
3287
  if (turn != null) {
3281
3288
  return [{ chatId: turn.sessionChatId, threadId: turn.sessionThreadId }]
3282
3289
  }
3290
+ // currentTurn was nulled — most commonly because the orphaned-reply backstop
3291
+ // force-closed the turn while the single claude session kept running and then
3292
+ // hit a permission-gated tool (e.g. a retry after a first card auto-denied:
3293
+ // marko Rentals-budget, 2026-06-17). Recover the originating topic from the
3294
+ // recently-started turn registry so the card lands where the operator is
3295
+ // working, instead of fanning out to operator DMs (thread-stripped) where it
3296
+ // sits unseen until the 10-min TTL auto-denies it. Kill switch (=0) restores
3297
+ // the legacy DM fan-out.
3298
+ if (PERMISSION_CARD_ORIGIN_RECOVERY_ENABLED) {
3299
+ const recovered = pickRecoveredPermissionOrigin(
3300
+ recentTurnsById.values(),
3301
+ Date.now(),
3302
+ PERMISSION_CARD_ORIGIN_MAX_AGE_MS,
3303
+ )
3304
+ if (recovered != null) {
3305
+ process.stderr.write(
3306
+ `telegram gateway: permission-card origin recovered from recent turn ` +
3307
+ `chat=${recovered.chatId} thread=${recovered.threadId ?? '-'} ` +
3308
+ `(currentTurn was null — force-closed turn)\n`,
3309
+ )
3310
+ return [recovered]
3311
+ }
3312
+ }
3283
3313
  const sg = resolveAgentSupergroupChatId()
3284
3314
  const topic = resolveAgentOutboundTopic({
3285
3315
  kind: 'permission',
@@ -3699,6 +3729,39 @@ const STATUS_QUERY_RE = /^\s*status\??\s*$/i
3699
3729
  const PERMISSION_REPLY_RE = /^\s*(y|yes|n|no)\s+([a-km-z]{5})\s*$/i
3700
3730
  const pendingPermissions = new Map<string, { tool_name: string; description: string; input_preview: string; startedAt: number }>()
3701
3731
  const PERMISSION_TTL_MS = 10 * 60_000
3732
+ // No-repeat-on-timeout (marko Rentals-budget loop, 2026-06-17). When a card
3733
+ // auto-denies on TTL, the model is told it was a TIMEOUT (not a denial) so it
3734
+ // doesn't retry; if it retries the identical (tool, input) anyway while the
3735
+ // operator is still absent, we short-circuit-deny it WITHOUT posting a second
3736
+ // card. `permissionTimeoutSignatures` maps signature → last-timeout epoch ms;
3737
+ // it is cleared the moment the operator is active again (answers any card, or
3738
+ // sends a message), so suppression only ever holds during genuine absence.
3739
+ // Kill switch: SWITCHROOM_PERMISSION_NO_REPEAT=0.
3740
+ const PERMISSION_NO_REPEAT_ENABLED =
3741
+ process.env.SWITCHROOM_PERMISSION_NO_REPEAT !== '0'
3742
+ // Safety cap on how long a timed-out signature suppresses retries even if the
3743
+ // operator-activity reset is somehow missed; the reset is the primary bound.
3744
+ const PERMISSION_DUPLICATE_WINDOW_MS = 60 * 60_000
3745
+ const permissionTimeoutSignatures = new Map<string, number>()
3746
+ function clearPermissionTimeoutSuppression(reason: string): void {
3747
+ if (permissionTimeoutSignatures.size === 0) return
3748
+ const n = permissionTimeoutSignatures.size
3749
+ permissionTimeoutSignatures.clear()
3750
+ process.stderr.write(
3751
+ `telegram gateway: permission no-repeat suppression cleared (${n} sig(s)) — ${reason}\n`,
3752
+ )
3753
+ }
3754
+ // Permission/approval-card origin recovery (marko Rentals-budget, 2026-06-17).
3755
+ // When `currentTurn` was force-closed by the orphaned-reply backstop but the
3756
+ // claude session kept running into a permission-gated tool, recover the card's
3757
+ // origin topic from the recently-started turn registry instead of fanning out
3758
+ // to operator DMs. Kill switch: SWITCHROOM_PERMISSION_CARD_ORIGIN_RECOVERY=0.
3759
+ const PERMISSION_CARD_ORIGIN_RECOVERY_ENABLED =
3760
+ process.env.SWITCHROOM_PERMISSION_CARD_ORIGIN_RECOVERY !== '0'
3761
+ // A backstop-closed turn is seconds-to-minutes old; bound recovery so a
3762
+ // long-idle agent's stale registry entry can't mis-route a much later
3763
+ // permission into an old topic (it falls back to the operator-DM fan-out).
3764
+ const PERMISSION_CARD_ORIGIN_MAX_AGE_MS = 30 * 60_000
3702
3765
 
3703
3766
  // #1977 — single-tap correlation for the durable "🔁 Always allow"
3704
3767
  // flow. When the gateway dispatches a `config_propose_edit` to hostd in
@@ -4305,23 +4368,46 @@ const pendingStateReaper = setInterval(() => {
4305
4368
  // permission (or takes a fallback). Routed through
4306
4369
  // dispatchPermissionVerdict so it's buffered+redelivered too if
4307
4370
  // the bridge is also offline at sweep time.
4308
- dispatchPermissionVerdict({ type: 'permission', requestId: k, behavior: 'deny' })
4371
+ // Carry a TIMEOUT reason to the model (claude renders it as "…the user
4372
+ // said: …") so it can tell a timeout from a real denial and not retry
4373
+ // the identical call — the duplicate-card loop this series closes.
4374
+ const timeoutMinutes = Math.round(PERMISSION_TTL_MS / 60000)
4375
+ dispatchPermissionVerdict({
4376
+ type: 'permission',
4377
+ requestId: k,
4378
+ behavior: 'deny',
4379
+ message: timeoutDenyMessage(timeoutMinutes),
4380
+ })
4309
4381
  // The auto-deny un-parks the suspended turn — flip 🙏 → working so
4310
4382
  // it doesn't sit on the awaiting glyph (or stall) after the timeout.
4311
4383
  resumeReactionAfterVerdict()
4312
4384
  postPermissionResumeMessage({
4313
4385
  behavior: 'deny',
4314
4386
  action: naturalAction(v.tool_name, v.input_preview),
4315
- timeoutMinutes: Math.round(PERMISSION_TTL_MS / 60000),
4387
+ timeoutMinutes,
4316
4388
  })
4389
+ // Remember this (tool, input) timed out so an immediate identical retry
4390
+ // (while the operator is still absent) is short-circuited without a
4391
+ // second card. Cleared on operator activity.
4392
+ if (PERMISSION_NO_REPEAT_ENABLED) {
4393
+ permissionTimeoutSignatures.set(
4394
+ permissionSignature(v.tool_name, v.input_preview),
4395
+ now,
4396
+ )
4397
+ }
4317
4398
  process.stderr.write(
4318
4399
  `telegram gateway: permission TTL expired — auto-deny request=${k} ` +
4319
4400
  `tool=${v.tool_name} (no operator response in ` +
4320
- `${Math.round(PERMISSION_TTL_MS / 60000)}m)\n`,
4401
+ `${timeoutMinutes}m)\n`,
4321
4402
  )
4322
4403
  pendingPermissions.delete(k)
4323
4404
  }
4324
4405
  }
4406
+ // Drop no-repeat suppression entries past the safety-cap window (the primary
4407
+ // bound is the operator-activity reset; this just keeps the map from growing).
4408
+ for (const [sig, at] of permissionTimeoutSignatures) {
4409
+ if (now - at > PERMISSION_DUPLICATE_WINDOW_MS) permissionTimeoutSignatures.delete(sig)
4410
+ }
4325
4411
  for (const [k, v] of vaultPassphraseCache) {
4326
4412
  if (now > v.expiresAt) vaultPassphraseCache.delete(k)
4327
4413
  }
@@ -5803,7 +5889,7 @@ const ipcServer: IpcServer = createIpcServer({
5803
5889
  // (5-min cooldown per agent), and skipped if no boot chat resolves.
5804
5890
  // Claude responds NO_REPLY per inline instruction; existing
5805
5891
  // silent-marker suppression at gateway.ts:5906 swallows the
5806
- // outbound. See docs/rfcs/cold-start-ttfo.md Option A.
5892
+ // outbound. See reference/rfcs/cold-start-ttfo.md Option A.
5807
5893
  if (client.agentName != null) {
5808
5894
  maybeFireWarmup({
5809
5895
  selfAgent: client.agentName,
@@ -6097,6 +6183,30 @@ const ipcServer: IpcServer = createIpcServer({
6097
6183
  return
6098
6184
  }
6099
6185
  }
6186
+ // No-repeat short-circuit: this exact (tool, input) already timed out and
6187
+ // the operator hasn't been active since (the suppression map is cleared on
6188
+ // any operator activity). Deny it WITH a timeout-duplicate reason and post
6189
+ // NO second card — the model retrying into an absent operator is the loop
6190
+ // this closes. The turn still unblocks (deny verdict), and a returning
6191
+ // operator resets suppression so the next ask gets a fresh card.
6192
+ if (PERMISSION_NO_REPEAT_ENABLED) {
6193
+ const sig = permissionSignature(toolName, inputPreview)
6194
+ if (isRecentTimeoutDuplicate(permissionTimeoutSignatures, sig, Date.now(), PERMISSION_DUPLICATE_WINDOW_MS)) {
6195
+ // no-card-verdict: no card was posted and the turn was never parked on
6196
+ // the awaiting glyph, so we omit the resume-reaction flip / resume msg.
6197
+ dispatchPermissionVerdict({
6198
+ type: 'permission',
6199
+ requestId,
6200
+ behavior: 'deny',
6201
+ message: duplicateDenyMessage,
6202
+ })
6203
+ process.stderr.write(
6204
+ `telegram gateway: permission no-repeat short-circuit — duplicate of a ` +
6205
+ `timed-out request tool=${toolName} request=${requestId} (no card posted)\n`,
6206
+ )
6207
+ return
6208
+ }
6209
+ }
6100
6210
  pendingPermissions.set(requestId, { tool_name: toolName, description, input_preview: inputPreview, startedAt: Date.now() })
6101
6211
  // Natural-language card body — a plain sentence ("Gymbro wants to
6102
6212
  // edit: supplement-log.md" + a why-line), never a raw tool id.
@@ -6586,7 +6696,7 @@ const ipcServer: IpcServer = createIpcServer({
6586
6696
  const source = typeof msg.inbound.meta?.source === 'string'
6587
6697
  ? msg.inbound.meta.source
6588
6698
  : 'unknown'
6589
- // Cheap-cron (docs/rfcs/cheap-cron-sessions.md §3.3): a Tier-1 fire
6699
+ // Cheap-cron (reference/rfcs/cheap-cron-sessions.md §3.3): a Tier-1 fire
6590
6700
  // carries meta.session='cron' → route to the derived `<agent>-cron`
6591
6701
  // bridge (a 2nd interactive Sonnet session in the same container).
6592
6702
  // Every other fire (and all of today's callers) routes to the agent
@@ -7148,7 +7258,7 @@ async function executeReply(args: Record<string, unknown>): Promise<{ content: A
7148
7258
  let disableNotification = args.disable_notification === true
7149
7259
 
7150
7260
  // #1675 over-ping safety net. The conversational-pacing contract
7151
- // (`reference/conversational-pacing.md` beat 5) says EXACTLY ONE
7261
+ // (`reference/rfcs/conversational-pacing.md` beat 5) says EXACTLY ONE
7152
7262
  // device ping per turn — the final answer. The model sometimes
7153
7263
  // violates this by sending a substantive answer pinged + a wrap-up
7154
7264
  // ("Delivered all three steps…", "Sent.", or meta-narration) ALSO
@@ -10254,7 +10364,7 @@ function handleSessionEvent(ev: SessionEvent): void {
10254
10364
  // only fires for text-only turns where the stream IS the
10255
10365
  // answer): PING. The user reached for the agent and the
10256
10366
  // model produced an answer; per beat 5 of
10257
- // `reference/conversational-pacing.md` the final answer MUST
10367
+ // `reference/rfcs/conversational-pacing.md` the final answer MUST
10258
10368
  // ping the device exactly once. Without this carve-out, a
10259
10369
  // short text-only turn ("on it" being the whole response)
10260
10370
  // lands silently and the user has no notification to know
@@ -11580,6 +11690,11 @@ async function handleInbound(
11580
11690
  return
11581
11691
  }
11582
11692
 
11693
+ // A real message from an allowed sender (gate passed) ⇒ the operator is
11694
+ // present, so reset any no-repeat suppression: the next time the agent asks
11695
+ // for something that timed out earlier, they should see a fresh card.
11696
+ clearPermissionTimeoutSuppression('operator inbound')
11697
+
11583
11698
  // Capture wall-clock receive time for inbound_ack metric (#203).
11584
11699
  // Must be after gate() so early-exit paths (drop/pair) don't skew the delta.
11585
11700
  //
@@ -11699,7 +11814,7 @@ async function handleInbound(
11699
11814
  }
11700
11815
 
11701
11816
  // `!`-prefix interrupt (#575). Closes
11702
- // `reference/steer-or-queue-mid-flight.md`'s correction path.
11817
+ // `reference/jobs/steer-or-queue-mid-flight.md`'s correction path.
11703
11818
  //
11704
11819
  // Behavior:
11705
11820
  // 1. SIGINT the agent service. This kills any in-flight turn —
@@ -13156,7 +13271,7 @@ function resolveBootChatId(
13156
13271
  // operator sees lifecycle events in a predictable lane instead of
13157
13272
  // chat-root. For fleet-mode / DM agents the helper returns undefined
13158
13273
  // → behavior unchanged (lands at chat-root as today). PR4b of
13159
- // supergroup-mode rollout (docs/rfcs/supergroup-mode.md).
13274
+ // supergroup-mode rollout (reference/rfcs/supergroup-mode.md).
13160
13275
  const supergroupBootTopic = resolveAgentOutboundTopic({ kind: 'boot' })
13161
13276
  const bootSupergroup = resolveAgentSupergroupChatId()
13162
13277
  // The boot topic is valid only in the agent's supergroup — attach it per
@@ -14254,7 +14369,7 @@ async function buildLiveProbeRows(agentName: string): Promise<StatusProbeRow[]>
14254
14369
  // Render order matches the boot card's PROBE_KEYS so the two
14255
14370
  // surfaces tell the same story in the same order.
14256
14371
  const order = ['account', 'agent', 'gateway', 'quota', 'hindsight',
14257
- 'scheduler', 'broker', 'kernel', 'skills'] as const
14372
+ 'scheduler', 'broker', 'kernel', 'skills', 'connections'] as const
14258
14373
  for (const k of order) {
14259
14374
  const r = probes[k]
14260
14375
  if (!r) continue
@@ -15134,6 +15249,8 @@ async function handlePermissionSlash(ctx: Context, behavior: 'allow' | 'deny'):
15134
15249
  )
15135
15250
  return
15136
15251
  }
15252
+ // Operator answered via slash ⇒ present; reset no-repeat suppression.
15253
+ clearPermissionTimeoutSuppression('operator answered via /approve|/deny')
15137
15254
  // Forward to connected bridges — same IPC the button handler uses.
15138
15255
  dispatchPermissionVerdict({ type: 'permission', requestId: request_id, behavior })
15139
15256
  resumeReactionAfterVerdict()
@@ -19640,6 +19757,9 @@ bot.on('callback_query:data', async ctx => {
19640
19757
  // scopes (resolveTimeBox → null) and the disabled tier (ttl<=0) stay truly
19641
19758
  // once. The verdict is still dispatched WITHOUT a `rule` (below), so the
19642
19759
  // bridge never caches it untimed — the window lives only in scopedGrants.
19760
+ // Operator tapped a verdict ⇒ they are present; reset no-repeat suppression
19761
+ // so a later identical ask is shown fresh rather than silently short-circuited.
19762
+ clearPermissionTimeoutSuppression('operator answered a permission card')
19643
19763
  const pd = pendingPermissions.get(request_id)
19644
19764
  const resumeAction = pd ? naturalAction(pd.tool_name, pd.input_preview) : ''
19645
19765
  const scopedTtl = scopedApprovalTtlMs()
@@ -20919,6 +21039,7 @@ async function shutdown(signal: string): Promise<void> {
20919
21039
  pendingReauthFlows.clear()
20920
21040
  pendingVaultOps.clear()
20921
21041
  pendingPermissions.clear()
21042
+ permissionTimeoutSignatures.clear()
20922
21043
 
20923
21044
  try {
20924
21045
  await ipcServer.close()
@@ -6,7 +6,7 @@
6
6
  * turn-deferred-vs-now — unit-tests without gateway.ts's boot side-effects
7
7
  * (same pattern as scoped-approval.ts / admin-commands/index.ts).
8
8
  *
9
- * Contract (reference/access-model.md): the restart only ever follows an
9
+ * Contract (reference/rfcs/access-model.md): the restart only ever follows an
10
10
  * operator-approved, single-agent, additive `tools.allow` edit, and only
11
11
  * ever bounces the CALLER's own agent — never a peer, never fleet-wide.
12
12
  */
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * InboundDeliveryStateMachine — DISPATCH (Phase 2b PR 3a, bridgeUp cutover).
3
3
  *
4
- * Per RFC `docs/rfcs/inbound-delivery-state-machine.md`, the state
4
+ * Per RFC `reference/rfcs/inbound-delivery-state-machine.md`, the state
5
5
  * machine is pure: `transition(state, event) → { state', effects[] }`.
6
6
  * The gateway's job is to (a) emit events at the right moments and
7
7
  * (b) execute the returned effects against real I/O. This module owns
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * InboundDeliveryStateMachine — SHADOW MODE wiring (Phase 2b PR 2).
3
3
  *
4
- * Per RFC `docs/rfcs/inbound-delivery-state-machine.md` Phase 2b PR 2:
4
+ * Per RFC `reference/rfcs/inbound-delivery-state-machine.md` Phase 2b PR 2:
5
5
  * the state machine runs ALONGSIDE the existing imperative gateway
6
6
  * code, recording predicted effects to a structured trace. Behavior
7
7
  * is unchanged — every existing code path still executes the actual
@@ -2,7 +2,7 @@
2
2
  * InboundDeliveryStateMachine — pure transition function for the
3
3
  * gateway's inbound→bridge→outbound pipeline.
4
4
  *
5
- * Per `docs/rfcs/inbound-delivery-state-machine.md` (RFC merged in
5
+ * Per `reference/rfcs/inbound-delivery-state-machine.md` (RFC merged in
6
6
  * PR #1576): the gateway's delivery state was implicit and scattered
7
7
  * across 8+ pieces of mutable state. The wedge cluster of 2026-05-19
8
8
  * (9 PRs in 36h all patching variants of "inbound stranded → 5-min
@@ -3,7 +3,7 @@
3
3
  // A `!`-prefix interrupt SIGINTs the agent's in-flight turn (tmux C-c) and
4
4
  // then resumes with the replacement body as a fresh turn. Firing the SIGINT
5
5
  // the instant `!` arrives can land mid-tool-call — a C-c during a Write or a
6
- // Bash leaves the tool's work half-done. `reference/steer-or-queue-mid-flight.md`
6
+ // Bash leaves the tool's work half-done. `reference/jobs/steer-or-queue-mid-flight.md`
7
7
  // names this exact anti-pattern: "Mid-tool-call is not 'amend time.'"
8
8
  //
9
9
  // We can't pause claude's internal loop (the unmodified-CLI constraint — the
@@ -38,6 +38,18 @@ export interface PermissionEvent {
38
38
  * (`mcp__<server>__*`).
39
39
  */
40
40
  rule?: string;
41
+ /**
42
+ * Optional human-readable reason for the verdict, surfaced to the model
43
+ * verbatim by claude's permission channel as "…the user said: ${message}".
44
+ * Only set on `deny`. switchroom uses it to make a TIMEOUT auto-deny (no
45
+ * operator response within the TTL) distinguishable from a deliberate
46
+ * operator denial — otherwise both render as the generic "Denied" and the
47
+ * model retries the identical call, re-raising an identical card 10 min
48
+ * later (marko Rentals-budget loop, 2026-06-17). When absent, claude falls
49
+ * back to its default "Denied", so this degrades safely on any claude that
50
+ * ignores the field.
51
+ */
52
+ message?: string;
41
53
  }
42
54
 
43
55
  export interface StatusEvent {
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Pure origin-recovery for a permission/approval card when the gateway's live
3
+ * `currentTurn` has already been nulled.
4
+ *
5
+ * Why this exists (marko Rentals-budget incident, 2026-06-17). A
6
+ * supergroup-owned agent that delivers its final answer as plain transcript
7
+ * text — never calling the `reply` tool — has its turn force-closed by the
8
+ * gateway's orphaned-reply backstop ~30s later, which nulls `currentTurn`. If
9
+ * the single claude session is still running and then calls a permission-gated
10
+ * tool (the real case: retrying `meta_ads_set_budget` after a first card had
11
+ * auto-denied), the gate fires with `currentTurn == null`. The card emitter
12
+ * then fell through to broadcasting the card to the operator-DM allowlist,
13
+ * thread-stripped — so the card never reached the forum topic the operator was
14
+ * working in. Unanswered there, it hit the 10-minute TTL and auto-denied, and
15
+ * an explicitly-approved budget change silently never ran.
16
+ *
17
+ * A switchroom agent runs exactly ONE claude session, so a tool permission can
18
+ * only belong to the turn that session most recently had open. We recover that
19
+ * origin from the bounded recently-started turn registry: the most-recently-
20
+ * started turn still within `maxAgeMs`. A turn force-closed by the backstop is,
21
+ * by construction, seconds-to-minutes old, so the freshness ceiling costs
22
+ * nothing for the incident class while keeping a long-idle agent's stale
23
+ * registry entry from mis-routing a much later permission into an old topic —
24
+ * beyond the ceiling we return null and the caller keeps the existing
25
+ * operator-DM fan-out. This only ever ADDS topic recovery; it never changes the
26
+ * idle/turn-less path.
27
+ */
28
+
29
+ /** The subset of a turn this recovery needs — kept structural so the gateway's
30
+ * richer `CurrentTurn` satisfies it without a cast. */
31
+ export interface RecoverableTurn {
32
+ sessionChatId: string
33
+ sessionThreadId: number | undefined
34
+ startedAt: number
35
+ }
36
+
37
+ export interface PermissionCardOrigin {
38
+ chatId: string
39
+ threadId: number | undefined
40
+ }
41
+
42
+ /**
43
+ * Pick the most-recently-started turn within the freshness window as the
44
+ * permission card's origin, or null when none qualifies (caller falls back to
45
+ * the operator-DM fan-out). Order-independent — selects by `startedAt`, not by
46
+ * the iteration order of the source registry, so it is robust to any
47
+ * out-of-order insertion.
48
+ */
49
+ export function pickRecoveredPermissionOrigin(
50
+ recentTurns: Iterable<RecoverableTurn>,
51
+ now: number,
52
+ maxAgeMs: number,
53
+ ): PermissionCardOrigin | null {
54
+ let best: RecoverableTurn | null = null
55
+ for (const t of recentTurns) {
56
+ if (now - t.startedAt > maxAgeMs) continue
57
+ if (best == null || t.startedAt >= best.startedAt) best = t
58
+ }
59
+ return best == null
60
+ ? null
61
+ : { chatId: best.sessionChatId, threadId: best.sessionThreadId }
62
+ }
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Pure helpers for permission-card TIMEOUT handling — making a "no operator
3
+ * responded" auto-deny distinguishable from a deliberate denial, and
4
+ * suppressing the duplicate card a model raises when it retries the identical
5
+ * call after such a timeout.
6
+ *
7
+ * Background (marko Rentals-budget loop, 2026-06-17). switchroom forwards a
8
+ * permission verdict to claude as `{ behavior, message? }`; with no `message`,
9
+ * claude renders the generic "the user said: Denied". A 10-minute TTL
10
+ * auto-deny was therefore indistinguishable from a real operator "Deny", so
11
+ * the model read it as transient and retried the SAME tool call — re-raising
12
+ * an identical card 10 minutes later, in a loop the operator never asked for.
13
+ *
14
+ * Two levers, both pure here and wired in gateway.ts:
15
+ * 1. `timeoutDenyMessage` — the `message` we attach ONLY to a TTL auto-deny,
16
+ * telling the model it was a timeout (not a denial) and not to retry.
17
+ * 2. `permissionSignature` + `isRecentTimeoutDuplicate` — recognise a retry of
18
+ * the exact same (tool, input) shortly after it timed out, so the gateway
19
+ * can short-circuit it (deny with `duplicateDenyMessage`) WITHOUT posting a
20
+ * second identical card. The suppression is reset on operator activity
21
+ * (handled gateway-side), so it only ever holds while the operator is
22
+ * genuinely absent — re-showing a card to an absent operator is the noise
23
+ * this removes.
24
+ */
25
+
26
+ // NUL — can appear in neither a tool name nor a rendered input preview, so it
27
+ // safely delimits the two halves of a signature (a printable separator could
28
+ // collide: ("a b","c") vs ("a","b c")). Built at runtime so the SOURCE file
29
+ // stays plain text (a literal NUL byte would make git treat it as binary).
30
+ const SIGNATURE_SEP = String.fromCharCode(0)
31
+
32
+ /**
33
+ * Stable identity for a permission request: the tool plus its input preview
34
+ * (the same string the card renders). Same tool + same preview ⇒ same action.
35
+ */
36
+ export function permissionSignature(toolName: string, inputPreview: string): string {
37
+ return toolName + SIGNATURE_SEP + inputPreview
38
+ }
39
+
40
+ /** The `message` attached to a TTL auto-deny so the model treats it as a
41
+ * timeout, not a denial, and does not retry the identical call. */
42
+ export function timeoutDenyMessage(timeoutMinutes: number): string {
43
+ return (
44
+ `No operator responded within ${timeoutMinutes} minutes, so this request timed out. ` +
45
+ `This is a TIMEOUT, not a denial — the operator is likely away. ` +
46
+ `Do NOT retry this exact action automatically. Tell the user it is still ` +
47
+ `awaiting their approval, then continue with other work or stop.`
48
+ )
49
+ }
50
+
51
+ /** The `message` attached when we short-circuit a duplicate retry of an
52
+ * already-timed-out request (no new card posted). */
53
+ export const duplicateDenyMessage =
54
+ `This exact action already timed out awaiting the operator, and they have not ` +
55
+ `responded since. Do NOT keep re-requesting it — tell the user it needs their ` +
56
+ `approval when they are back, and move on to other work or stop.`
57
+
58
+ /**
59
+ * True when `sig` timed out within `windowMs` of `now` (so a fresh request for
60
+ * it is a retry to suppress). `timeouts` maps signature → last-timeout epoch ms.
61
+ */
62
+ export function isRecentTimeoutDuplicate(
63
+ timeouts: ReadonlyMap<string, number>,
64
+ sig: string,
65
+ now: number,
66
+ windowMs: number,
67
+ ): boolean {
68
+ const at = timeouts.get(sig)
69
+ return at != null && now - at <= windowMs
70
+ }
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * Prefix-cache warmup turn — opt-in cold-start TTFO optimization.
3
3
  *
4
- * Per cold-start TTFO RFC (docs/rfcs/cold-start-ttfo.md, PR #1589),
4
+ * Per cold-start TTFO RFC (reference/rfcs/cold-start-ttfo.md, PR #1589),
5
5
  * Option A. On every bridge-up after a restart, synthesize a synthetic
6
6
  * inbound (`__WARMUP_PING__`, meta.source="warmup") and deliver it to
7
7
  * the just-registered bridge. Claude processes the message — paying
@@ -1,6 +1,6 @@
1
1
  /**
2
2
  * Tests for the peercred-gated webhook ingest UDS server
3
- * (RFC docs/rfcs/webhook-via-gateway-socket.md).
3
+ * (RFC reference/rfcs/webhook-via-gateway-socket.md).
4
4
  *
5
5
  * MUST run under `bun test`: the peer-credential gate calls
6
6
  * `getPeerCred` (bun:ffi getsockopt SO_PEERCRED), which returns null
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Webhook ingest UDS server (RFC docs/rfcs/webhook-via-gateway-socket.md).
2
+ * Webhook ingest UDS server (RFC reference/rfcs/webhook-via-gateway-socket.md).
3
3
  *
4
4
  * A dedicated, peercred-gated Unix socket the host-side web receiver
5
5
  * forwards verified webhook events to. It is deliberately SEPARATE from
@@ -20,7 +20,7 @@
20
20
  * writing to a registry.db nobody read, leaving every bg sub-agent
21
21
  * invisible to the watcher. Surfaced by
22
22
  * bg-sub-agent-dispatch-dm.test.ts; see RFC Phase 2 §Bug 2 in
23
- * reference/sub-agent-visibility-rfc.md.
23
+ * reference/rfcs/sub-agent-visibility.md.
24
24
  * 3. process.cwd() (legacy fallback for ad-hoc invocations).
25
25
  *
26
26
  * Performance: the actual DB write is deferred via setImmediate (Node 22+
@@ -1,5 +1,5 @@
1
1
  /**
2
- * `!`-prefix interrupt marker — closes #575 / part of `reference/steer-or-queue-mid-flight.md`.
2
+ * `!`-prefix interrupt marker — closes #575 / part of `reference/jobs/steer-or-queue-mid-flight.md`.
3
3
  *
4
4
  * The product contract: when the user starts a Telegram message with
5
5
  * `!`, they're saying "drop what you're doing and handle this