@agfpd/iapeer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agfpd/iapeer",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "Foundation core for the IAPeer multi-agent ecosystem: identity, registry, storage, codec.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -21,7 +21,14 @@
21
21
  },
22
22
  "scripts": {
23
23
  "test": "IAPEER_TEST_SANDBOX=1 bun test",
24
- "typecheck": "tsc --noEmit"
24
+ "typecheck": "tsc --noEmit",
25
+ "release": "npm version patch && npm publish && git push --follow-tags",
26
+ "release:minor": "npm version minor && npm publish && git push --follow-tags",
27
+ "release:major": "npm version major && npm publish && git push --follow-tags",
28
+ "prepublishOnly": "test -z \"$(git status --porcelain)\" || (echo 'release: working tree is dirty — commit or stash before release' >&2 && exit 1)"
29
+ },
30
+ "publishConfig": {
31
+ "access": "public"
25
32
  },
26
33
  "dependencies": {
27
34
  "@modelcontextprotocol/sdk": "1.29.0",
@@ -7,8 +7,8 @@ import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
7
7
  import { mkdtempSync, rmSync, writeFileSync } from 'fs'
8
8
  import { tmpdir } from 'os'
9
9
  import { join } from 'path'
10
- import { formatListTable, listPeers, parseArgs, sendMessage, startPeer, stopPeer } from './index.ts'
11
- import { upsertPeer } from '../registry/index.ts'
10
+ import { formatListTable, listPeers, parseArgs, removePeerCli, sendMessage, startPeer, stopPeer } from './index.ts'
11
+ import { findPeer, readPeersIndex, upsertPeer } from '../registry/index.ts'
12
12
  import { isStopped, loadLifecycleConfig, setStopped } from '../lifecycle/index.ts'
13
13
  import { launchdPlistPath } from '../launch/launchd.ts'
14
14
 
@@ -90,6 +90,27 @@ describe('FLEET GUARD (H4) — foreign persistent-peer launchd plist is off-limi
90
90
  })
91
91
  })
92
92
 
93
+ describe('remove (registry record via the locked writer)', () => {
94
+ test('removes a registered peer through registry.removePeer', async () => {
95
+ await register('zombie')
96
+ const e = env()
97
+ expect(findPeer(readPeersIndex({ env: e }), 'zombie')).not.toBeNull()
98
+ const o = await removePeerCli('zombie', { env: e })
99
+ expect(o.action).toBe('removed')
100
+ expect(findPeer(readPeersIndex({ env: e }), 'zombie')).toBeNull()
101
+ })
102
+ test('removing an absent peer is an idempotent no-op (not an error)', async () => {
103
+ const o = await removePeerCli('never-existed', { env: env() })
104
+ expect(o.action).toBe('absent')
105
+ })
106
+ test('a second remove of the same peer is also a no-op', async () => {
107
+ await register('twice')
108
+ const e = env()
109
+ expect((await removePeerCli('twice', { env: e })).action).toBe('removed')
110
+ expect((await removePeerCli('twice', { env: e })).action).toBe('absent')
111
+ })
112
+ })
113
+
93
114
  describe('send validation', () => {
94
115
  test('invalid --from identity → throws', async () => {
95
116
  await register('alpha')
package/src/cli/index.ts CHANGED
@@ -21,7 +21,7 @@ import {
21
21
  } from '../core/constants.ts'
22
22
  import { buildProcessAddress, buildSocketPath, parseSessionName } from '../core/socket.ts'
23
23
  import { ensureGlobalIapScaffold } from '../storage/index.ts'
24
- import { findPeer, readPeersIndex, type PeerRecord } from '../registry/index.ts'
24
+ import { findPeer, readPeersIndex, removePeer, type PeerRecord } from '../registry/index.ts'
25
25
  import { isPeerLive, routeControl, routeSend, type WakeFn } from '../transport/index.ts'
26
26
  import {
27
27
  attachPeer,
@@ -226,6 +226,51 @@ export function startPeer(personality: string, runtime: string | undefined, opts
226
226
  return out
227
227
  }
228
228
 
229
+ // ─────────────────────────────────────────────────────────────────────────────
230
+ // remove — delete a peer's record from the registry through the LOCKED writer
231
+ // (registry.removePeer). Direct edits of peers-profiles.json are refused at
232
+ // storage.ts:304 (locked-writer invariant); this is the operator path that used
233
+ // to require dropping into `bun -e removePeer(...)`. The use case is reaping the
234
+ // ephemeral zombie records a retired spawn leaves behind.
235
+ // ─────────────────────────────────────────────────────────────────────────────
236
+
237
+ export interface RemoveOutcome {
238
+ personality: string
239
+ action: 'removed' | 'absent' | 'refused-live'
240
+ reason?: string
241
+ }
242
+
243
+ /**
244
+ * remove <peer> [--force]: drop the registry record via the locked writer.
245
+ * IDEMPOTENT — an absent peer is a no-op success (`absent`), never an error.
246
+ * SAFETY: refuses a peer that is currently LIVE on any runtime — deleting a
247
+ * running session's record would orphan it from routing (resolveCallerIdentity /
248
+ * findPeer would no longer resolve it while it still runs). --force overrides.
249
+ * A zombie record is dead by definition, so the guard never blocks the cleanup
250
+ * it exists for.
251
+ */
252
+ export async function removePeerCli(
253
+ personality: string,
254
+ opts: CliEnvOptions & { force?: boolean } = {},
255
+ ): Promise<RemoveOutcome> {
256
+ const env = opts.env ?? process.env
257
+ const peer = findPeer(readPeersIndex({ env }), personality)
258
+ if (!peer) return { personality, action: 'absent' }
259
+ if (!opts.force) {
260
+ const cfg = loadLifecycleConfig(env)
261
+ const liveRt = peer.runtimes.find(rt => isPeerLive(rt, personality, cfg.sockDir))
262
+ if (liveRt) {
263
+ return {
264
+ personality,
265
+ action: 'refused-live',
266
+ reason: `"${personality}" is LIVE on ${liveRt} — removing its registry record would orphan the running session from routing; stop it first or pass --force`,
267
+ }
268
+ }
269
+ }
270
+ await removePeer(personality, { env })
271
+ return { personality, action: 'removed' }
272
+ }
273
+
229
274
  // ─────────────────────────────────────────────────────────────────────────────
230
275
  // send — manual IAP send fallback (contract Примитивы §send). Goes through the
231
276
  // same router path as send_to_peer (resolve → deliver / wake), in-process so it
@@ -309,6 +354,7 @@ const USAGE = `usage: iapeer <verb> [args]
309
354
  list [--json] registered peers + per-runtime liveness
310
355
  stop <peer> [runtime] | --all durable-stop a warm peer / bootout an always-on one
311
356
  start <peer> [runtime] re-enable a stopped peer / bootstrap an always-on one
357
+ remove <peer> [--force] delete a peer's registry record (locked writer); refuses a LIVE peer unless --force
312
358
  send <target> (--message <text> | --message-file <f|->) [--from <id>] [--attachment <p>]… [--topic <t>] manual IAP send (fallback)
313
359
  <runtime> launch the cwd's peer (ALWAYS fresh)
314
360
  enable <plugin> [peer] [--no-setup] install + enable an agfpd capability for a peer
@@ -459,6 +505,17 @@ export async function runCli(argv: string[], env: NodeJS.ProcessEnv = process.en
459
505
  for (const o of outcomes) out(`${o.personality} (${o.runtime}): ${o.action}${o.reason ? ` — ${o.reason}` : ''}\n`)
460
506
  return outcomes.some(o => o.action === 'refused-foreign-launchd') ? 1 : 0
461
507
  }
508
+ case 'remove': {
509
+ // Reap a registry record through the locked writer (the operator path over
510
+ // registry.removePeer). Idempotent on an absent peer (exit 0). Refuses a LIVE
511
+ // peer unless --force (orphaning a running session from routing is the risk).
512
+ if (!positionals[0]) return usage(errOut)
513
+ const o = await removePeerCli(positionals[0], { force: flags.force === true, env })
514
+ if (o.action === 'removed') out(`removed "${o.personality}" from the registry\n`)
515
+ else if (o.action === 'absent') out(`"${o.personality}" not registered — no-op\n`)
516
+ else errOut(`remove: ${o.reason}\n`)
517
+ return o.action === 'refused-live' ? 1 : 0
518
+ }
462
519
  case 'send': {
463
520
  // Message body from EITHER --message <text> OR --message-file <f> (f='-' →
464
521
  // stdin). The runtime packages (telegram/notifier) + monitor deliver via
@@ -13,7 +13,13 @@ export const SUPPORTED_LOCAL_RUNTIMES = ['claude', 'codex'] as const
13
13
  export type SupportedLocalRuntime = (typeof SUPPORTED_LOCAL_RUNTIMES)[number]
14
14
 
15
15
  export const PEERS_SCHEMA_VERSION = 2
16
- export const MAX_DESCRIPTION_LEN = 250
16
+ // 450 (was 250): self-documenting API-peer descriptions (notifier timer/watcher)
17
+ // must fit "who the peer is + registration format + a live example" — dense full
18
+ // texts run to ~408 chars; 250 cut them mid-word so the caller could not compose
19
+ // the call. Bumped with Arthur's sanction (2026-06-08). NB: this is COMPILE-TIME
20
+ // baked — the live daemon re-clamps descriptions on read (registry parsePeerRecord),
21
+ // so the running router keeps the OLD limit until restarted onto the new binary.
22
+ export const MAX_DESCRIPTION_LEN = 450
17
23
 
18
24
  // Contract vocabulary (docs/Идентичность, Артур 05.06): the nature of the
19
25
  // intelligence expressing itself through a runtime.
@@ -101,6 +101,11 @@ export async function startConfiguredDaemon(opts: ConfiguredDaemonOptions = {}):
101
101
  intervalMs: opts.superviseIntervalMs ?? DEFAULT_SUPERVISE_INTERVAL_MS,
102
102
  // idle-reap / zombie-sweep, THEN the eager fresh re-launch for any peer whose
103
103
  // session died carrying a .new-eager mark (owner /new; async, best-effort).
104
+ // The DURABLE decision trace (which peer, what outcome, when, why) is emitted
105
+ // INSIDE superviseTick (lifecycle/eventlog.ts → logs/iapeer/lifecycle.log), so
106
+ // every reap is recorded regardless of entry point (this timer AND the heal-at-
107
+ // wake superviseTick inside wakeOrSpawn). The outcomes array drives only the
108
+ // eager relaunch here; the trace does not depend on consuming it.
104
109
  tick: async () => {
105
110
  const outcomes = superviseTick(cfg, { env })
106
111
  await processEagerRelaunches(cfg, outcomes, { env })
@@ -0,0 +1,114 @@
1
+ // eventlog — the daemon's durable, rotated lifecycle decision log. Tests the pure
2
+ // logfmt formatter, the append path (into an explicit temp logDir — never the real
3
+ // ~/.iapeer), and the size-rotation chain. No daemon, no tmux — pure FS.
4
+
5
+ import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
6
+ import { existsSync, mkdtempSync, readFileSync, rmSync, statSync } from 'fs'
7
+ import { tmpdir } from 'os'
8
+ import { join } from 'path'
9
+ import {
10
+ appendLifecycleEvent,
11
+ fmtValue,
12
+ formatEventLine,
13
+ lifecycleLogPath,
14
+ } from './eventlog.ts'
15
+
16
+ const TS = 1_749_470_400_000 // fixed epoch-ms → a stable ISO for golden lines
17
+ const ISO = new Date(TS).toISOString()
18
+
19
+ describe('fmtValue (logfmt escaping)', () => {
20
+ test('bare token stays bare', () => {
21
+ expect(fmtValue('reaped-idle')).toBe('reaped-idle')
22
+ expect(fmtValue('claude-boris')).toBe('claude-boris')
23
+ expect(fmtValue(42)).toBe('42')
24
+ })
25
+ test('empty string → ""', () => {
26
+ expect(fmtValue('')).toBe('""')
27
+ })
28
+ test('whitespace / = / " force quoting and escape', () => {
29
+ expect(fmtValue('session no longer live')).toBe('"session no longer live"')
30
+ expect(fmtValue('a=b')).toBe('"a=b"')
31
+ expect(fmtValue('say "hi"')).toBe('"say \\"hi\\""')
32
+ expect(fmtValue('back\\slash here')).toBe('"back\\\\slash here"')
33
+ })
34
+ })
35
+
36
+ describe('formatEventLine', () => {
37
+ test('ts is first; fields keep insertion order; undefined skipped', () => {
38
+ const line = formatEventLine(TS, {
39
+ ev: 'supervise',
40
+ identity: 'claude-boris',
41
+ action: 'reaped-gone',
42
+ reason: 'session no longer live',
43
+ ref: undefined, // dropped
44
+ outcome: 'fresh-next-msg',
45
+ })
46
+ expect(line).toBe(
47
+ `ts=${ISO} ev=supervise identity=claude-boris action=reaped-gone reason="session no longer live" outcome=fresh-next-msg`,
48
+ )
49
+ })
50
+ test('age field renders as a bare token', () => {
51
+ const line = formatEventLine(TS, { ev: 'supervise', identity: 'claude-x', action: 'reaped-idle', age: '4230s' })
52
+ expect(line).toBe(`ts=${ISO} ev=supervise identity=claude-x action=reaped-idle age=4230s`)
53
+ })
54
+ })
55
+
56
+ describe('appendLifecycleEvent', () => {
57
+ let dir: string
58
+
59
+ beforeEach(() => {
60
+ dir = mkdtempSync(join(tmpdir(), 'iapeer-eventlog-'))
61
+ })
62
+ afterEach(() => {
63
+ rmSync(dir, { recursive: true, force: true })
64
+ })
65
+
66
+ test('falsy logDir → no-op (a partial cfg never writes / never resolves a real path)', () => {
67
+ expect(() => appendLifecycleEvent(undefined, { ev: 'supervise', identity: 'x' }, { nowMs: TS })).not.toThrow()
68
+ expect(() => appendLifecycleEvent('', { ev: 'supervise', identity: 'x' }, { nowMs: TS })).not.toThrow()
69
+ })
70
+
71
+ test('writes one logfmt line per call, appended in order', () => {
72
+ appendLifecycleEvent(dir, { ev: 'wake', personality: 'boris', mode: 'fresh', cause: 'crash-or-self-close' }, { nowMs: TS })
73
+ appendLifecycleEvent(dir, { ev: 'supervise', identity: 'claude-doc', action: 'reaped-gone' }, { nowMs: TS + 1000 })
74
+ const body = readFileSync(lifecycleLogPath(dir), 'utf8')
75
+ const lines = body.trimEnd().split('\n')
76
+ expect(lines).toHaveLength(2)
77
+ expect(lines[0]).toBe(`ts=${ISO} ev=wake personality=boris mode=fresh cause=crash-or-self-close`)
78
+ expect(lines[1]).toContain('ev=supervise identity=claude-doc action=reaped-gone')
79
+ })
80
+
81
+ test('creates the log dir if absent', () => {
82
+ const nested = join(dir, 'logs', 'iapeer')
83
+ appendLifecycleEvent(nested, { ev: 'supervise', identity: 'x' }, { nowMs: TS })
84
+ expect(existsSync(lifecycleLogPath(nested))).toBe(true)
85
+ })
86
+
87
+ test('size rotation: base → .1, oldest dropped past keep', () => {
88
+ const env = { IAPEER_LIFECYCLE_LOG_MAX_BYTES: '120', IAPEER_LIFECYCLE_LOG_KEEP: '2' }
89
+ const path = lifecycleLogPath(dir)
90
+ for (let i = 0; i < 6; i++) {
91
+ appendLifecycleEvent(dir, { ev: 'supervise', identity: `claude-peer${i}`, action: 'reaped-gone', n: i }, { env, nowMs: TS + i })
92
+ }
93
+ expect(existsSync(path)).toBe(true)
94
+ expect(existsSync(`${path}.1`)).toBe(true)
95
+ expect(existsSync(`${path}.2`)).toBe(true)
96
+ expect(existsSync(`${path}.3`)).toBe(false) // keep=2 → never a .3
97
+ expect(statSync(path).size).toBeLessThanOrEqual(200)
98
+ expect(readFileSync(path, 'utf8')).toContain('claude-peer5') // newest in the live base file
99
+ })
100
+
101
+ test('rotation preserves chronological order across files (.N oldest, base newest)', () => {
102
+ const env = { IAPEER_LIFECYCLE_LOG_MAX_BYTES: '90', IAPEER_LIFECYCLE_LOG_KEEP: '3' }
103
+ const path = lifecycleLogPath(dir)
104
+ for (let i = 0; i < 4; i++) {
105
+ appendLifecycleEvent(dir, { ev: 'supervise', identity: `claude-p${i}` }, { env, nowMs: TS + i })
106
+ }
107
+ const ordered = ['.3', '.2', '.1', '']
108
+ .map(suf => (existsSync(path + suf) ? readFileSync(path + suf, 'utf8') : ''))
109
+ .join('')
110
+ const seen = [...ordered.matchAll(/identity=claude-p(\d)/g)].map(m => Number(m[1]))
111
+ expect(seen).toEqual([...seen].sort((a, b) => a - b))
112
+ expect(seen[seen.length - 1]).toBe(3) // newest line is p3, in the base file
113
+ })
114
+ })
@@ -0,0 +1,133 @@
1
+ // Lifecycle event log — the daemon's DURABLE, ROTATED trace of every lifecycle
2
+ // DECISION it makes. This is the observability gap the boris-fresh incident hit:
3
+ // a peer woke fresh and there was NO record of when/how its prior session ended,
4
+ // nor of the daemon's fresh-vs-resume reasoning, because superviseTick's outcomes
5
+ // were dropped and the daemon never wrote a decision line anywhere.
6
+ //
7
+ // Design:
8
+ // • One line per decision, logfmt (`key=value`, values quoted iff they contain
9
+ // whitespace/quotes/`=`). Human-greppable AND machine-parseable. The state
10
+ // markers (.idle-reaped / .deaths) are CONSUMED on the next wake — this log is
11
+ // the part that survives, so a postmortem can reconstruct the death even after
12
+ // the marker is gone.
13
+ // • Append-only, app-managed SIZE rotation (NOT launchd's stdout/stderr, which
14
+ // are unbounded and truncated on restart). lifecycle.log → .1 … .N.
15
+ // • The target directory is passed IN (cfg.eventLogDir), NOT re-resolved from
16
+ // env — so it is isolated by the SAME cfg the rest of lifecycle routes through
17
+ // (a test that sandboxes cfg.stateDir also sandboxes this log; no leak to the
18
+ // real ~/.iapeer). A falsy dir → no-op (a partial test cfg never writes).
19
+ // • Best-effort throughout: a write/rotate failure is swallowed. Observability
20
+ // must never take down the daemon or fail a wake/reap.
21
+ //
22
+ // Lifted-out-able: the rotate-append primitive is path-parameterized, so the
23
+ // adjacent "log rotation" phase can promote it to storage/ and point other log
24
+ // producers at it without touching this module's call sites.
25
+
26
+ import { appendFileSync, mkdirSync, renameSync, rmSync, statSync } from 'fs'
27
+ import { join } from 'path'
28
+
29
+ /** Default cap per lifecycle.log file before it rotates to lifecycle.log.1. */
30
+ const DEFAULT_MAX_BYTES = 5 * 1024 * 1024 // 5 MiB
31
+ /** Default number of rotated backups kept (lifecycle.log.1 … .KEEP). */
32
+ const DEFAULT_KEEP = 5
33
+
34
+ /** The durable lifecycle decision log inside `logDir` (cfg.eventLogDir). */
35
+ export function lifecycleLogPath(logDir: string): string {
36
+ return join(logDir, 'lifecycle.log')
37
+ }
38
+
39
/**
 * Parse a positive-integer tuning knob from an environment variable.
 *
 * Only a plain decimal integer (optionally `+`-prefixed, surrounding whitespace
 * ignored) is accepted. Anything else — unset, empty, zero, negative, fractional,
 * or a number with a unit suffix such as `5MB` — yields `dflt`. A bare `parseInt`
 * would read `5MB` as 5 and `1e6` as 1, silently turning a typo into a tiny
 * rotation cap, so the whole string is validated before conversion.
 *
 * @param raw  The raw environment value, possibly undefined.
 * @param dflt The fallback used when `raw` is not a valid positive integer.
 * @returns The parsed positive integer, or `dflt`.
 */
function envPosInt(raw: string | undefined, dflt: number): number {
  const text = (raw ?? '').trim()
  if (!/^\+?\d+$/.test(text)) return dflt
  const n = Number(text)
  // Reject 0 and values too large to be represented exactly.
  return Number.isSafeInteger(n) && n > 0 ? n : dflt
}
43
+
44
+ /** Whether to also log the steady-state non-decisions (alive / skipped-launchd).
45
+ * Off by default — they fire every tick per live/launchd peer and would bury the
46
+ * actual decisions (reap / wake) under heartbeat noise. */
47
+ export function superviseLogVerbose(env: NodeJS.ProcessEnv = process.env): boolean {
48
+ const v = env.IAPEER_SUPERVISE_LOG_VERBOSE?.trim().toLowerCase()
49
+ return v === '1' || v === 'true' || v === 'yes'
50
+ }
51
+
52
/**
 * Render a single logfmt value.
 *
 * - The empty string becomes `""`.
 * - A value without whitespace, `=` or `"` is emitted bare.
 * - Anything else is double-quoted, with `\` and `"` backslash-escaped and
 *   newline / carriage return / tab written as the two-character sequences
 *   `\n`, `\r`, `\t`. Escaping those keeps every event on ONE physical line even
 *   when a reason string carries a line break, so line-oriented tools (grep,
 *   tail, split on newline) never see a torn record.
 *
 * @param v The raw value; numbers are stringified first.
 * @returns The logfmt-safe representation of `v`.
 */
export function fmtValue(v: string | number): string {
  const s = String(v)
  if (s === '') return '""'
  if (!/[\s"=]/.test(s)) return s
  const escaped = s
    // Backslashes first, so the escapes added below are not doubled again.
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\n/g, '\\n')
    .replace(/\r/g, '\\r')
    .replace(/\t/g, '\\t')
  return `"${escaped}"`
}
60
+
61
+ /** Render one logfmt line (ts first, then fields in insertion order; undefined
62
+ * fields are skipped). No trailing newline. Pure — unit-testable. */
63
+ export function formatEventLine(nowMs: number, fields: Record<string, string | number | undefined>): string {
64
+ const parts = [`ts=${new Date(nowMs).toISOString()}`]
65
+ for (const [k, v] of Object.entries(fields)) {
66
+ if (v === undefined) continue
67
+ parts.push(`${k}=${fmtValue(v)}`)
68
+ }
69
+ return parts.join(' ')
70
+ }
71
+
72
+ /** Size-rotate `path` (and its .1 … .keep backups) when the next line would push
73
+ * it over `maxBytes`. Drops the oldest, shifts each backup up by one, base→.1.
74
+ * Best-effort: any fs hiccup leaves the chain as-is (we then just append). */
75
+ function rotateIfNeeded(path: string, lineLen: number, maxBytes: number, keep: number): void {
76
+ let size: number
77
+ try {
78
+ size = statSync(path).size
79
+ } catch {
80
+ return // no file yet → nothing to rotate
81
+ }
82
+ if (size + lineLen <= maxBytes) return
83
+ try {
84
+ rmSync(`${path}.${keep}`, { force: true })
85
+ } catch {
86
+ /* best-effort */
87
+ }
88
+ for (let i = keep - 1; i >= 1; i--) {
89
+ try {
90
+ renameSync(`${path}.${i}`, `${path}.${i + 1}`)
91
+ } catch {
92
+ /* that backup may not exist yet */
93
+ }
94
+ }
95
+ try {
96
+ renameSync(path, `${path}.1`)
97
+ } catch {
98
+ /* best-effort */
99
+ }
100
+ }
101
+
102
+ export interface AppendEventOptions {
103
+ /** Reads the rotation knobs IAPEER_LIFECYCLE_LOG_MAX_BYTES / _KEEP. */
104
+ env?: NodeJS.ProcessEnv
105
+ /** Stamp the line with this epoch-ms (superviseTick passes its own tick clock so
106
+ * the log timestamp agrees with the death/idle accounting). Default Date.now(). */
107
+ nowMs?: number
108
+ }
109
+
110
/**
 * Append one lifecycle decision line to `logDir`/lifecycle.log.
 *
 * A falsy `logDir` is a no-op, so a partial test cfg without eventLogDir never
 * writes and never resolves a real path. The rotation knobs
 * IAPEER_LIFECYCLE_LOG_MAX_BYTES / IAPEER_LIFECYCLE_LOG_KEEP are read from
 * `opts.env` (default `process.env`) on every call.
 *
 * Fully best-effort — never throws. Line formatting happens INSIDE the guarded
 * region as well: `new Date(nowMs).toISOString()` raises a RangeError for a
 * non-finite `nowMs`, and that must not escape into a wake/reap either.
 *
 * @param logDir Directory holding lifecycle.log; created if absent.
 * @param fields Event fields, rendered in insertion order (undefined skipped).
 * @param opts   Optional env override and explicit timestamp.
 */
export function appendLifecycleEvent(
  logDir: string | undefined,
  fields: Record<string, string | number | undefined>,
  opts: AppendEventOptions = {},
): void {
  if (!logDir) return
  try {
    const env = opts.env ?? process.env
    const path = lifecycleLogPath(logDir)
    const line = `${formatEventLine(opts.nowMs ?? Date.now(), fields)}\n`
    const maxBytes = envPosInt(env.IAPEER_LIFECYCLE_LOG_MAX_BYTES, DEFAULT_MAX_BYTES)
    const keep = envPosInt(env.IAPEER_LIFECYCLE_LOG_KEEP, DEFAULT_KEEP)
    mkdirSync(logDir, { recursive: true, mode: 0o700 })
    // The on-disk size is measured in BYTES, so the pending line must be too:
    // `line.length` counts UTF-16 code units and undercounts any non-ASCII text.
    rotateIfNeeded(path, Buffer.byteLength(line, 'utf8'), maxBytes, keep)
    appendFileSync(path, line, { mode: 0o600 })
  } catch {
    /* observability is best-effort — a log failure must never break a wake/reap */
  }
}
@@ -40,6 +40,7 @@ import {
40
40
  type LaunchSpec,
41
41
  } from '../launch/index.ts'
42
42
  import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
43
+ import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
43
44
 
44
45
  // ─────────────────────────────────────────────────────────────────────────────
45
46
  // Config
@@ -51,6 +52,11 @@ export interface LifecycleConfig {
51
52
  sockDir: string
52
53
  stateDir: string // ~/.iapeer/state/lifecycle
53
54
  logDir: string // ~/.iapeer/logs/lifecycle
55
+ /** Where the durable lifecycle DECISION log (lifecycle.log) is written
56
+ * (~/.iapeer/logs/iapeer — next to daemon-stdout/stderr.log, where the first
57
+ * investigator looks). Routed through cfg — NOT re-resolved from env — so it is
58
+ * isolated by the same sandbox as stateDir (eventlog.ts). */
59
+ eventLogDir: string
54
60
  bootDeadlineSecs: number
55
61
  readyGateSecs: number
56
62
  idleSecs: number
@@ -74,6 +80,7 @@ export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): Lifec
74
80
  sockDir: resolveSockDir(env),
75
81
  stateDir: join(root, STATE_DIR, 'lifecycle'),
76
82
  logDir: join(root, LOGS_DIR, 'lifecycle'),
83
+ eventLogDir: join(root, LOGS_DIR, 'iapeer'),
77
84
  bootDeadlineSecs: num(env.IAPEER_BOOT_DEADLINE_SECS, 240),
78
85
  readyGateSecs: num(env.IAPEER_READY_GATE_SECS, 120),
79
86
  idleSecs: num(env.IAPEER_IDLE_SECS, 3600),
@@ -331,6 +338,10 @@ export interface WakeMode {
331
338
  /** Set ONLY for an EXPLICIT resume request that found nothing to resume — the
332
339
  * caller must fail loud (never a silent fresh fallback). */
333
340
  failReason?: string
341
+ /** Which decision branch fired — the durable "why fresh / why resume" reason.
342
+ * Logged by wakeOrSpawn: the .idle-reaped marker is CONSUMED inside this
343
+ * function (branch 3b), so this cause is the only surviving record of it. */
344
+ cause?: string
334
345
  }
335
346
 
336
347
  /**
@@ -371,29 +382,33 @@ export function resolveWakeMode(
371
382
  incomingTopic?: string,
372
383
  ): WakeMode {
373
384
  // 1. folder-launch → always fresh.
374
- if (argsResume === false) return { resume: false }
385
+ if (argsResume === false) return { resume: false, cause: 'folder-launch' }
375
386
  // 2. attach → always resume, fail-loud if nothing to resume.
376
387
  if (argsResume === true) {
377
388
  const r = resolveResume(cwd)
378
- if (!r.ok) return { resume: false, failReason: r.reason ?? 'resume requested but nothing to resume' }
379
- return { resume: true, resumeRef: r.ref }
389
+ if (!r.ok) return { resume: false, failReason: r.reason ?? 'resume requested but nothing to resume', cause: 'attach-nothing-to-resume' }
390
+ return { resume: true, resumeRef: r.ref, cause: 'attach' }
380
391
  }
381
392
  // 3. default (a message woke a dead/asleep peer): decide by the death cause.
382
393
  // 3a. NOT idle-reaped → it died on its own (crash / self-close) → clean FRESH.
383
- if (!hasIdleReaped(cfg, identity)) return { resume: false }
394
+ if (!hasIdleReaped(cfg, identity)) return { resume: false, cause: 'crash-or-self-close' }
384
395
  // 3b. idle-reaped → resume-eligible. Consume the marker now (it has done its job).
385
396
  clearIdleReaped(cfg, identity)
386
397
  // human-conversational dialogue never auto-freshes; only an explicit /new resets it.
387
398
  if (isHumanConversational(cwd)) {
388
399
  const r = resolveResume(cwd)
389
- return r.ok ? { resume: true, resumeRef: r.ref } : { resume: false }
400
+ return r.ok
401
+ ? { resume: true, resumeRef: r.ref, cause: 'idle-reaped-human' }
402
+ : { resume: false, cause: 'idle-reaped-human-no-resume' }
390
403
  }
391
404
  // executor: a NEW topic (non-empty and differing from the stored one) means new
392
405
  // work → FRESH; same topic, or no incoming topic → continue the work → RESUME.
393
406
  const topic = incomingTopic?.trim() ?? ''
394
- if (topic && topic !== readTopic(cfg, identity)) return { resume: false }
407
+ if (topic && topic !== readTopic(cfg, identity)) return { resume: false, cause: 'idle-reaped-new-topic' }
395
408
  const r = resolveResume(cwd)
396
- return r.ok ? { resume: true, resumeRef: r.ref } : { resume: false }
409
+ return r.ok
410
+ ? { resume: true, resumeRef: r.ref, cause: 'idle-reaped-resume' }
411
+ : { resume: false, cause: 'idle-reaped-no-resume' }
397
412
  }
398
413
 
399
414
  export function readSessionStates(cfg: LifecycleConfig): SessionState[] {
@@ -636,6 +651,13 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
636
651
  const env = deps.env ?? process.env
637
652
  const cfg = deps.cfg ?? loadLifecycleConfig(env)
638
653
 
654
+ // Durable wake-decision trace (eventlog.ts): one line per bring-up decision —
655
+ // fresh / resume (with the resolveWakeMode cause) or a refusal (stopped / crash-
656
+ // loop / launchd). This is the direct answer to "why did peer X come up fresh",
657
+ // and the only surviving record of the .idle-reaped marker resolveWakeMode consumes.
658
+ const logWake = (fields: Record<string, string | number | undefined>): void =>
659
+ appendLifecycleEvent(cfg.eventLogDir, { ev: 'wake', personality: args.personality, ...fields }, { env })
660
+
639
661
  // Heal strays before launching — the sweep-at-spawn-start. This is the SAME
640
662
  // H4-guarded superviseTick the daemon timer runs, so both reap entry points
641
663
  // (timer + wake) go through one guarded path. Best-effort: never block a wake.
@@ -651,6 +673,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
651
673
 
652
674
  // H4 — never wake a launchd-managed peer (launchd KeepAlive owns it).
653
675
  if (isLaunchdManaged(args.personality, env)) {
676
+ logWake({ runtime: args.runtime, mode: 'refused', cause: 'launchd-managed' })
654
677
  return {
655
678
  status: 'FAILED',
656
679
  woke: false,
@@ -681,6 +704,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
681
704
  // halt: refuse with stopped:true so the sender gets an explicit "stopped" error,
682
705
  // not a generic "offline" — and no message is queued. `start` clears the flag.
683
706
  if (isStopped(cfg, identity)) {
707
+ logWake({ identity, runtime, mode: 'refused', cause: 'stopped' })
684
708
  return {
685
709
  status: 'FAILED',
686
710
  woke: false,
@@ -698,9 +722,11 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
698
722
  // refusal). A stop racing DURING the spawn is a narrower window the wake-lock does not
699
723
  // cover (stop does not take this lock).
700
724
  if (isStopped(cfg, identity)) {
725
+ logWake({ identity, runtime, mode: 'refused', cause: 'stopped-mid-wake' })
701
726
  return { status: 'FAILED', woke: false, runtime, stopped: true, reason: `"${args.personality}" (${runtime}) is stopped and not accepting messages; start it to resume` }
702
727
  }
703
728
  if (isLaunchdManaged(args.personality, env)) {
729
+ logWake({ identity, runtime, mode: 'refused', cause: 'launchd-managed-mid-wake' })
704
730
  return { status: 'FAILED', woke: false, runtime, reason: `"${args.personality}" became launchd-managed mid-wake; the daemon does not wake it` }
705
731
  }
706
732
  // Idempotent fast path inside the lock: a live session wins (a concurrent
@@ -719,6 +745,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
719
745
  // trims the ring, so the guard only fires on a genuine tight loop.
720
746
  const recentDeaths = countRecentDeaths(cfg, identity, cfg.crashLoopWindowSecs, Date.now())
721
747
  if (recentDeaths >= cfg.crashLoopMax) {
748
+ logWake({ identity, runtime, mode: 'refused', cause: 'crash-loop', reason: `${recentDeaths} deaths in ${cfg.crashLoopWindowSecs}s` })
722
749
  return {
723
750
  status: 'FAILED',
724
751
  woke: false,
@@ -732,6 +759,17 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
732
759
  // that finds nothing to resume fails loud. incomingTopic (args.topic) is the
733
760
  // executor discriminator.
734
761
  const mode = resolveWakeMode(cfg, identity, cwd, args.resume, c => adapter.resolveResume(c), args.topic)
762
+ // The bring-up decision is the durable trace — log it BEFORE launch (the decision
763
+ // stands regardless of whether the subsequent launch succeeds). resolveWakeMode has
764
+ // already consumed any .idle-reaped marker, so `cause` is now its only record.
765
+ logWake({
766
+ identity,
767
+ runtime,
768
+ mode: mode.failReason ? 'fail' : mode.resume ? 'resume' : 'fresh',
769
+ cause: mode.cause,
770
+ ref: mode.resumeRef,
771
+ reason: mode.failReason,
772
+ })
735
773
  if (mode.failReason) return { status: 'FAILED', woke: false, runtime, reason: mode.failReason }
736
774
  const resume = mode.resume
737
775
  const resumeRef = mode.resumeRef
@@ -844,11 +882,19 @@ export interface SuperviseDeps {
844
882
  export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): SuperviseOutcome[] {
845
883
  const env = deps.env ?? process.env
846
884
  const nowMs = deps.nowMs ?? Date.now()
885
+ const verbose = superviseLogVerbose(env)
886
+ // Durable decision trace (eventlog.ts): every reap/death/eager-fresh gets a line
887
+ // so a postmortem can answer "when & how did peer X's prior session end" even
888
+ // after the .idle-reaped / .deaths markers are consumed. alive / skipped-launchd
889
+ // are steady-state non-decisions → logged only under IAPEER_SUPERVISE_LOG_VERBOSE.
890
+ const trace = (fields: Record<string, string | number | undefined>): void =>
891
+ appendLifecycleEvent(cfg.eventLogDir, { ev: 'supervise', ...fields }, { env, nowMs })
847
892
  const out: SuperviseOutcome[] = []
848
893
  for (const s of readSessionStates(cfg)) {
849
894
  // H4 — FIRST, before any reap. A launchd-managed peer is read-only.
850
895
  if (isLaunchdManaged(s.personality, env)) {
851
896
  out.push({ identity: s.identity, action: 'skipped-launchd' })
897
+ if (verbose) trace({ identity: s.identity, action: 'skipped-launchd', outcome: 'read-only-h4' })
852
898
  continue
853
899
  }
854
900
  const sock = buildSocketPath(s.runtime, s.personality, cfg.sockDir)
@@ -869,11 +915,13 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
869
915
  personality: s.personality,
870
916
  runtime: s.runtime,
871
917
  })
918
+ trace({ identity: s.identity, action: 'needs-eager-fresh', reason: '/new eager mark', outcome: 'eager-fresh' })
872
919
  continue
873
920
  }
874
921
  // Crash / self-close: NO marker written, NO eager relaunch — the peer stays
875
922
  // asleep and wakes FRESH lazily on the next message (resolveWakeMode branch 3a).
876
923
  out.push({ identity: s.identity, action: 'reaped-gone', reason: 'session no longer live' })
924
+ trace({ identity: s.identity, action: 'reaped-gone', reason: 'session no longer live', outcome: 'fresh-next-msg' })
877
925
  continue
878
926
  }
879
927
  // Idle accounting via the runtime adapter's activity proxy (claude transcript
@@ -896,8 +944,10 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
896
944
  setIdleReaped(cfg, s.identity)
897
945
  removeSessionState(cfg, s.identity)
898
946
  out.push({ identity: s.identity, action: 'reaped-idle', reason: `idle ${ageSecs}s` })
947
+ trace({ identity: s.identity, action: 'reaped-idle', age: `${ageSecs}s`, outcome: 'resume-eligible' })
899
948
  } else {
900
949
  out.push({ identity: s.identity, action: 'alive' })
950
+ if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
901
951
  }
902
952
  }
903
953
  return out
@@ -1,5 +1,5 @@
1
1
  import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
2
- import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'fs'
2
+ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'fs'
3
3
  import { tmpdir } from 'os'
4
4
  import { join } from 'path'
5
5
  import {
@@ -176,6 +176,7 @@ describe('superviseTick H4 guard', () => {
176
176
  sockDir: '/tmp',
177
177
  stateDir,
178
178
  logDir: stateDir,
179
+ eventLogDir: stateDir, // isolate the decision log into the temp dir (no real-root leak)
179
180
  bootDeadlineSecs: 1,
180
181
  readyGateSecs: 1,
181
182
  idleSecs: 1,
@@ -204,11 +205,17 @@ describe('superviseTick H4 guard', () => {
204
205
  })
205
206
 
206
207
  test('a no-plist peer with a dead session → reaped-gone, state removed', () => {
208
+ const c = cfg()
207
209
  const id = writeState('iapeer-supgone') // no plist, no live tmux session
208
- const out = superviseTick(cfg(), { env: env(), nowMs: Date.now() })
210
+ const out = superviseTick(c, { env: env(), nowMs: Date.now() })
209
211
  const o = out.find(x => x.identity === id)
210
212
  expect(o?.action).toBe('reaped-gone')
211
213
  expect(existsSync(join(stateDir, `${id}.session`))).toBe(false)
214
+ // the decision leaves a DURABLE trace line (the observability contract) — and it
215
+ // lands in the SANDBOXED eventLogDir, never the real ~/.iapeer.
216
+ const logged = readFileSync(join(c.eventLogDir, 'lifecycle.log'), 'utf8')
217
+ expect(logged).toContain(`ev=supervise identity=${id} action=reaped-gone`)
218
+ expect(logged).toContain('outcome=fresh-next-msg')
212
219
  })
213
220
 
214
221
  test('empty state dir → no outcomes', () => {
@@ -390,10 +397,10 @@ describe('resolveWakeMode (TARGET: death-cause + peer-type/topic)', () => {
390
397
 
391
398
  // ── branch 1/2: explicit fresh / explicit resume (unchanged) ────────────────
392
399
  test('argsResume=false (folder-launch) → FRESH', () => {
393
- expect(resolveWakeMode(cfg(), 'claude-p', cwd(), false, hasTranscript)).toEqual({ resume: false })
400
+ expect(resolveWakeMode(cfg(), 'claude-p', cwd(), false, hasTranscript)).toEqual({ resume: false, cause: 'folder-launch' })
394
401
  })
395
402
  test('argsResume=true (attach) + transcript → RESUME', () => {
396
- expect(resolveWakeMode(cfg(), 'claude-p', cwd(), true, hasTranscript)).toEqual({ resume: true, resumeRef: 'uuid-1' })
403
+ expect(resolveWakeMode(cfg(), 'claude-p', cwd(), true, hasTranscript)).toEqual({ resume: true, resumeRef: 'uuid-1', cause: 'attach' })
397
404
  })
398
405
  test('argsResume=true + nothing to resume → FAIL-LOUD (failReason, no silent fresh)', () => {
399
406
  const m = resolveWakeMode(cfg(), 'claude-p', cwd(), true, noTranscript)
@@ -405,7 +412,7 @@ describe('resolveWakeMode (TARGET: death-cause + peer-type/topic)', () => {
405
412
  test('DEFAULT + NOT idle-reaped (crash/self-close) → FRESH even when a transcript exists', () => {
406
413
  // INVERSION of the old polarity: absence of the daemon's idle-reaped marker = died
407
414
  // on its own = clean fresh, NOT a resume of a possibly-broken context.
408
- expect(resolveWakeMode(cfg(), 'claude-p', cwd(), undefined, hasTranscript)).toEqual({ resume: false })
415
+ expect(resolveWakeMode(cfg(), 'claude-p', cwd(), undefined, hasTranscript)).toEqual({ resume: false, cause: 'crash-or-self-close' })
409
416
  })
410
417
 
411
418
  // ── branch 3b: default + idle-reaped → resume-eligible, CONSUME the marker ───
@@ -413,25 +420,25 @@ describe('resolveWakeMode (TARGET: death-cause + peer-type/topic)', () => {
413
420
  const c = cfg()
414
421
  setIdleReaped(c, 'claude-p')
415
422
  const human = cwd(true)
416
- expect(resolveWakeMode(c, 'claude-p', human, undefined, hasTranscript)).toEqual({ resume: true, resumeRef: 'uuid-1' })
423
+ expect(resolveWakeMode(c, 'claude-p', human, undefined, hasTranscript)).toEqual({ resume: true, resumeRef: 'uuid-1', cause: 'idle-reaped-human' })
417
424
  expect(hasIdleReaped(c, 'claude-p')).toBe(false) // consumed
418
425
  })
419
426
  test('DEFAULT + idle-reaped + executor + NO incoming topic → RESUME (continue the work)', () => {
420
427
  const c = cfg()
421
428
  setIdleReaped(c, 'claude-p')
422
- expect(resolveWakeMode(c, 'claude-p', cwd(false), undefined, hasTranscript)).toEqual({ resume: true, resumeRef: 'uuid-1' })
429
+ expect(resolveWakeMode(c, 'claude-p', cwd(false), undefined, hasTranscript)).toEqual({ resume: true, resumeRef: 'uuid-1', cause: 'idle-reaped-resume' })
423
430
  })
424
431
  test('DEFAULT + idle-reaped + executor + SAME topic → RESUME', () => {
425
432
  const c = cfg()
426
433
  setIdleReaped(c, 'claude-p')
427
434
  writeTopic(c, 'claude-p', 'deploy')
428
- expect(resolveWakeMode(c, 'claude-p', cwd(false), undefined, hasTranscript, 'deploy')).toEqual({ resume: true, resumeRef: 'uuid-1' })
435
+ expect(resolveWakeMode(c, 'claude-p', cwd(false), undefined, hasTranscript, 'deploy')).toEqual({ resume: true, resumeRef: 'uuid-1', cause: 'idle-reaped-resume' })
429
436
  })
430
437
  test('DEFAULT + idle-reaped + executor + DIFFERENT topic → FRESH (new work), marker consumed', () => {
431
438
  const c = cfg()
432
439
  setIdleReaped(c, 'claude-p')
433
440
  writeTopic(c, 'claude-p', 'deploy')
434
- expect(resolveWakeMode(c, 'claude-p', cwd(false), undefined, hasTranscript, 'unrelated-bug')).toEqual({ resume: false })
441
+ expect(resolveWakeMode(c, 'claude-p', cwd(false), undefined, hasTranscript, 'unrelated-bug')).toEqual({ resume: false, cause: 'idle-reaped-new-topic' })
435
442
  expect(hasIdleReaped(c, 'claude-p')).toBe(false) // consumed even on the fresh executor branch
436
443
  })
437
444
  })
@@ -3,6 +3,7 @@ import { mkdtempSync, rmSync, readFileSync, writeFileSync } from 'fs'
3
3
  import { tmpdir } from 'os'
4
4
  import { join } from 'path'
5
5
  import {
6
+ clampDescription,
6
7
  findPeer,
7
8
  readPeersIndex,
8
9
  removePeer,
@@ -10,7 +11,7 @@ import {
10
11
  withPeersLock,
11
12
  type PeerRecord,
12
13
  } from './index.ts'
13
- import { defaultIntelligenceForRuntime, type Intelligence } from '../core/constants.ts'
14
+ import { MAX_DESCRIPTION_LEN, defaultIntelligenceForRuntime, type Intelligence } from '../core/constants.ts'
14
15
  import { writeFileAtomic, resolvePeersPaths } from '../storage/index.ts'
15
16
 
16
17
  let root: string
@@ -398,3 +399,34 @@ describe('Companion fix — withPeersLock fail-closed sandbox isolation', () =>
398
399
  expect(out).toBe('ok')
399
400
  })
400
401
  })
402
+
403
+ // ─────────────────────────────────────────────────────────────────────────────
404
+ // clampDescription boundary — the limit was raised 250 → 450 so self-documenting
405
+ // API-peer descriptions (notifier timer/watcher, ~408 chars) survive intact. The
406
+ // boundary is exact: length == MAX passes untouched, length == MAX+1 truncates.
407
+ // ─────────────────────────────────────────────────────────────────────────────
408
+
409
+ describe('clampDescription — MAX_DESCRIPTION_LEN boundary (450)', () => {
410
+ test('the limit is 450', () => {
411
+ expect(MAX_DESCRIPTION_LEN).toBe(450)
412
+ })
413
+ test('a 450-char description passes through untouched', () => {
414
+ const at = 'x'.repeat(450)
415
+ const r = clampDescription(at)
416
+ expect(r.truncated).toBe(false)
417
+ expect(r.description).toBe(at)
418
+ expect(r.description.length).toBe(450)
419
+ })
420
+ test('a 451-char description is truncated to 450', () => {
421
+ const over = 'y'.repeat(451)
422
+ const r = clampDescription(over)
423
+ expect(r.truncated).toBe(true)
424
+ expect(r.description.length).toBe(450)
425
+ expect(r.description).toBe('y'.repeat(450))
426
+ })
427
+ test('upsertPeer persists a full 450-char description (no clamp at the boundary)', async () => {
428
+ const desc = 'z'.repeat(450)
429
+ await upsertPeer({ personality: 'verbose', runtime: 'claude', cwd: '/tmp/verbose', description: desc }, opts())
430
+ expect(findPeer(readPeersIndex(opts()), 'verbose')!.description).toBe(desc)
431
+ })
432
+ })