@agfpd/iapeer 0.2.20 → 0.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agfpd/iapeer",
3
- "version": "0.2.20",
3
+ "version": "0.2.22",
4
4
  "description": "Foundation core for the iapeer multi-agent ecosystem: identity, registry, storage, codec.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -153,6 +153,37 @@ describe('send validation', () => {
153
153
  })
154
154
  })
155
155
 
156
+ describe('send → ephemeral target: M3 FIFO parity with the daemon path (iapeer-memory ask)', () => {
157
+ test('a CLI send to a wake_policy:ephemeral peer ENQUEUES (queued ack), no live/miss bypass', async () => {
158
+ // an ephemeral worker cwd (profile declares wake_policy) registered in the index
159
+ const cwd = mkdtempSync(join(tmpdir(), 'iapeer-cli-eph-'))
160
+ mkdirSync(join(cwd, '.iapeer'), { recursive: true })
161
+ writeFileSync(
162
+ join(cwd, '.iapeer', 'peer-profile.json'),
163
+ JSON.stringify({ personality: 'ephw', runtime: 'claude', runtimes: ['claude'], intelligence: 'artificial', wake_policy: 'ephemeral' }),
164
+ )
165
+ const e = env()
166
+ // routeSend resolves the peers index from the PROCESS env (transport reads
167
+ // readPeersIndex() bare) — point the process-level root at the sandbox too.
168
+ const prevRoot = process.env.IAPEER_ROOT
169
+ process.env.IAPEER_ROOT = root
170
+ try {
171
+ await upsertPeer({ personality: 'ephw', runtime: 'claude', cwd, intelligence: 'artificial' }, { rootDir: root })
172
+ await register('sender')
173
+ const r = await sendMessage({ from: 'claude-sender', target: 'ephw', message: 'task', env: e })
174
+ expect(r.queued).toBe(true) // serialized via the disk FIFO, exactly like the daemon path
175
+ expect(r.queueDepth).toBe(1)
176
+ // the task is durably on disk for the daemon tick to drain
177
+ const qdir = join(loadLifecycleConfig(e).stateDir, 'claude-ephw.queue')
178
+ expect(existsSync(qdir)).toBe(true)
179
+ } finally {
180
+ if (prevRoot !== undefined) process.env.IAPEER_ROOT = prevRoot
181
+ else delete process.env.IAPEER_ROOT
182
+ rmSync(cwd, { recursive: true, force: true })
183
+ }
184
+ })
185
+ })
186
+
156
187
  describe('--help/-h global intercept (CLI hygiene — usage printed, NOTHING executed)', () => {
157
188
  let captured: string
158
189
  let origWrite: typeof process.stdout.write
package/src/cli/index.ts CHANGED
@@ -323,9 +323,21 @@ export interface SendOptions extends CliEnvOptions {
323
323
  const cliWake: WakeFn = req =>
324
324
  wakeOrSpawn({ personality: req.personality, runtime: req.runtime, topic: req.topic, task: req.task })
325
325
 
326
- export async function sendMessage(opts: SendOptions): Promise<{ ok: true; delivered_to: { personality: string; runtime: string } }> {
326
+ export async function sendMessage(
327
+ opts: SendOptions,
328
+ ): Promise<{ ok: true; delivered_to: { personality: string; runtime: string }; queued?: boolean; queueDepth?: number }> {
327
329
  const env = opts.env ?? process.env
328
330
  const caller = resolveCallerIdentity(parseIdentity(opts.from), readPeersIndex({ env }))
331
+ // wake_policy:ephemeral M3 parity (iapeer-memory ask, 10.06): the CLI path used
332
+ // to route an ephemeral target through the normal live/miss path — a notifier
333
+ // burst landed as TURNS in one live worker session instead of serializing
334
+ // through the disk FIFO the daemon path uses. Same seam, ONE difference: the
335
+ // drain kick is a NOOP here — a CLI process exits right after the ack, so an
336
+ // unawaited in-process wake would die with it; the daemon's supervise-tick
337
+ // drain scan (≤60 s) picks the queue up — the EXISTING retry path for failed
338
+ // kicks, not a new mechanism.
339
+ const { makeEphemeralRouteDeps } = await import('../daemon/main.ts')
340
+ const cfg = loadLifecycleConfig(env)
329
341
  const result = await routeSend(
330
342
  caller,
331
343
  {
@@ -335,10 +347,15 @@ export async function sendMessage(opts: SendOptions): Promise<{ ok: true; delive
335
347
  topic: opts.topic,
336
348
  attachments: opts.attachments,
337
349
  },
338
- { wake: cliWake },
350
+ { wake: cliWake, ephemeral: makeEphemeralRouteDeps(cfg, env, () => {}) },
339
351
  )
340
352
  if (!result.ok) throw new Error(result.error.message)
341
- return { ok: true, delivered_to: result.value.delivered_to }
353
+ return {
354
+ ok: true,
355
+ delivered_to: result.value.delivered_to,
356
+ queued: result.value.queued,
357
+ queueDepth: result.value.queueDepth,
358
+ }
342
359
  }
343
360
 
344
361
  function parseIdentity(identity: string): { personality: string; runtime: Runtime } {
@@ -714,7 +731,11 @@ export async function runCli(argv: string[], env: NodeJS.ProcessEnv = process.en
714
731
  attachments: attachments.length ? attachments : undefined,
715
732
  env,
716
733
  })
717
- out(`delivered to ${r.delivered_to.personality} (${r.delivered_to.runtime})\n`)
734
+ out(
735
+ r.queued
736
+ ? `queued for ${r.delivered_to.personality} (${r.delivered_to.runtime}), depth ${r.queueDepth ?? '?'} — the daemon tick drains it\n`
737
+ : `delivered to ${r.delivered_to.personality} (${r.delivered_to.runtime})\n`,
738
+ )
718
739
  return 0
719
740
  }
720
741
  case 'version':
@@ -26,6 +26,18 @@ import { tmpdir } from 'os'
26
26
  import { join } from 'path'
27
27
  import { spawnSync } from 'child_process'
28
28
 
29
+ /** CROSS-PRODUCT CONTRACT (agreed with iapeer-memory, 10.06): this CN is the
30
+ * SHARED signing identity of the whole agfpd stack. Each product signs with its
31
+ * OWN --identifier (foundation: com.agfpd.iapeer; memory: com.agfpd.iapeer-memory),
32
+ * so TCC subjects stay separate while the host carries ONE key (one keychain
33
+ * prompt ever). Creation is first-needs-creates with the IDENTICAL profile (EKU
34
+ * codeSigning, system LibreSSL p12, import -T /usr/bin/codesign) on both sides.
35
+ * Changing the CN or the creation profile is a COORDINATED change across repos.
36
+ * Known shared costs: re-creating the identity (deleted/expired — cert is 10 y)
37
+ * migrates the TCC grants of EVERY stack product at once; a concurrent
38
+ * first-creation by two installers could duplicate the CN (codesign would then
39
+ * report an ambiguous identity) — installs are operator-sequential, residual
40
+ * risk accepted. */
29
41
  export const SIGNING_IDENTITY_CN = 'iapeer Local Codesign'
30
42
  export const SIGNING_IDENTIFIER = 'com.agfpd.iapeer'
31
43
 
@@ -0,0 +1,173 @@
1
+ // Server-death canary — pure script-builder shape + live-tmux behavior:
2
+ // dirty server death (SIGKILL) → ev=server-exit line + forensics snapshot;
3
+ // clean teardown (signal / killSession) → silent, no record. Live tests follow
4
+ // the suite's test.if(tmuxAvailable) sandbox-socket pattern (lifecycle.test.ts).
5
+
6
+ import { afterAll, describe, expect, test } from 'bun:test'
7
+ import { spawnSync } from 'child_process'
8
+ import { existsSync, mkdtempSync, readFileSync, readdirSync, rmSync } from 'fs'
9
+ import { tmpdir } from 'os'
10
+ import { join } from 'path'
11
+ import {
12
+ canaryChannel,
13
+ canaryScript,
14
+ ensureServerCanary,
15
+ exitLogPath,
16
+ serverDeathsDir,
17
+ signalCanaryClean,
18
+ } from './canary.ts'
19
+ import { killSession } from '../lifecycle/index.ts'
20
+
21
+ const tmuxAvailable = spawnSync('tmux', ['-V'], { stdio: 'ignore' }).status === 0
22
+
23
/** Resolve after `ms` milliseconds (timer-based pause for the polling helper). */
const sleep = (ms: number) =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })

/**
 * Poll `cond` roughly every 100 ms until it reports true or the time budget
 * runs out.
 *
 * @param cond Synchronous predicate to evaluate on every poll.
 * @param ms Time budget in milliseconds.
 * @returns `true` as soon as a poll succeeds; otherwise the result of one
 *   final evaluation of `cond` made after the budget has elapsed.
 */
async function waitFor(cond: () => boolean, ms: number): Promise<boolean> {
  const stopAt = Date.now() + ms
  for (;;) {
    // Budget spent: the last word goes to one more evaluation of the predicate.
    if (Date.now() >= stopAt) return cond()
    if (cond()) return true
    await sleep(100)
  }
}
34
+
35
/**
 * Report whether a process whose command line matches this identity's canary
 * `wait-for` channel is currently running.
 *
 * @param identity Peer identity whose canary channel is probed.
 * @returns `true` when `pgrep -f` exits with status 0 (at least one match).
 */
function canaryRunning(identity: string): boolean {
  // The trailing group anchors the channel name so a longer identity sharing
  // this one as a prefix does not count as a match.
  const pattern = `wait-for.*${canaryChannel(identity)}([^a-z0-9-]|$)`
  const probe = spawnSync('pgrep', ['-f', pattern], { stdio: 'ignore' })
  return probe.status === 0
}
40
+
41
/**
 * Ask the tmux server behind `sock` for its own process id.
 *
 * @param sock Path of the tmux server socket.
 * @returns The pid printed by `display-message -p '#{pid}'`; `0` when tmux
 *   produced no output (an empty string converts to 0).
 */
function serverPid(sock: string): number {
  const { stdout } = spawnSync('tmux', ['-S', sock, 'display-message', '-p', '#{pid}'], { encoding: 'utf8' })
  const printed = (stdout ?? '').trim()
  return Number(printed)
}
45
+
46
+ describe('canary script (pure)', () => {
47
+ test('channel is per-identity', () => {
48
+ expect(canaryChannel('claude-bob')).toBe('iap-canary-claude-bob')
49
+ })
50
+
51
+ test('exitLogPath → exits.log inside the dir', () => {
52
+ expect(exitLogPath('/r/logs/iapeer')).toBe('/r/logs/iapeer/exits.log')
53
+ })
54
+
55
+ test('script carries the protocol: wait-for channel, silent-exit guards, record line, forensics', () => {
56
+ const s = canaryScript({
57
+ identity: 'claude-bob',
58
+ sock: '/tmp/x.sock',
59
+ tmuxBin: '/opt/homebrew/bin/tmux',
60
+ exitLogFile: '/r/logs/iapeer/exits.log',
61
+ forensicsDir: '/r/logs/iapeer/server-deaths',
62
+ })
63
+ expect(s).toContain(`wait-for 'iap-canary-claude-bob'`) // the blocking client
64
+ expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // signaled canary = silent
65
+ expect(s).toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]`) // clean/signaled guards
66
+ expect(s).toContain('ev=server-exit identity=claude-bob') // the exits.log record
67
+ expect(s).toContain(`>> '/r/logs/iapeer/exits.log'`)
68
+ expect(s).toContain('/r/logs/iapeer/server-deaths/claude-bob') // forensics file
69
+ expect(s).toContain('vm_stat') // memory evidence (OOM hypothesis)
70
+ expect(s).toContain(`'/opt/homebrew/bin/tmux' -S '/tmp/x.sock'`) // abs tmux, quoted
71
+ })
72
+
73
+ test('ensureServerCanary without exitLogDir → skipped (observability off)', () => {
74
+ expect(ensureServerCanary({ identity: 'claude-none', sock: '/tmp/none.sock' })).toBe('skipped')
75
+ })
76
+ })
77
+
78
+ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
79
+ const dir = mkdtempSync(join(tmpdir(), 'iap-canary-'))
80
+ const socks: string[] = []
81
+
82
+ function bringUp(identity: string): { sock: string; logDir: string } {
83
+ const sock = join(dir, `${identity}.sock`)
84
+ socks.push(sock)
85
+ const r = spawnSync('tmux', ['-S', sock, 'new-session', '-d', '-s', identity, 'sleep', '120'], {
86
+ stdio: 'ignore',
87
+ })
88
+ expect(r.status).toBe(0)
89
+ return { sock, logDir: join(dir, `logs-${identity}`) }
90
+ }
91
+
92
+ afterAll(() => {
93
+ for (const sock of socks) {
94
+ // teardown is DELIBERATE → signal each canary before killing its server
95
+ for (const id of ['claude-canadirty', 'claude-canaclean', 'claude-canakill']) {
96
+ signalCanaryClean(sock, id)
97
+ }
98
+ spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
99
+ }
100
+ rmSync(dir, { recursive: true, force: true })
101
+ })
102
+
103
+ test(
104
+ 'dirty server death (SIGKILL) → ev=server-exit + forensics snapshot',
105
+ async () => {
106
+ const identity = 'claude-canadirty'
107
+ const { sock, logDir } = bringUp(identity)
108
+ const pid = serverPid(sock)
109
+ expect(pid).toBeGreaterThan(0)
110
+
111
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
112
+ // idempotency: a second ensure must NOT double-spawn
113
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
114
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('already')
115
+ await sleep(500) // let the wait-for client register on the channel
116
+
117
+ process.kill(pid, 'SIGKILL') // the silent server-killer class
118
+ const log = exitLogPath(logDir)
119
+ expect(await waitFor(() => existsSync(log) && readFileSync(log, 'utf8').includes('ev=server-exit'), 8000)).toBe(
120
+ true,
121
+ )
122
+ const line = readFileSync(log, 'utf8')
123
+ expect(line).toContain(`ev=server-exit identity=${identity}`)
124
+ expect(line).toContain(`server_pid=${pid}`)
125
+ const snaps = readdirSync(serverDeathsDir(logDir))
126
+ expect(snaps.length).toBe(1)
127
+ const snap = readFileSync(join(serverDeathsDir(logDir), snaps[0]!), 'utf8')
128
+ expect(snap).toContain(`server-death identity=${identity}`)
129
+ // canary is one-shot: after firing it must be gone
130
+ expect(await waitFor(() => !canaryRunning(identity), 3000)).toBe(true)
131
+ },
132
+ 20000,
133
+ )
134
+
135
+ test(
136
+ 'clean teardown (signal, then kill-server) → silent, no record',
137
+ async () => {
138
+ const identity = 'claude-canaclean'
139
+ const { sock, logDir } = bringUp(identity)
140
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
141
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
142
+ await sleep(500) // let the wait-for client register before signaling
143
+
144
+ signalCanaryClean(sock, identity)
145
+ expect(await waitFor(() => !canaryRunning(identity), 3000)).toBe(true) // exited silently
146
+ spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
147
+ await sleep(300)
148
+ expect(existsSync(exitLogPath(logDir)) && readFileSync(exitLogPath(logDir), 'utf8').includes('ev=server-exit')).toBe(
149
+ false,
150
+ )
151
+ },
152
+ 20000,
153
+ )
154
+
155
+ test(
156
+ 'killSession (lifecycle clean reap) signals the canary before kill-server → no record',
157
+ async () => {
158
+ const identity = 'claude-canakill'
159
+ const { sock, logDir } = bringUp(identity)
160
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
161
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
162
+ await sleep(500)
163
+
164
+ killSession(sock, identity) // last session → kills the SERVER, signal-first
165
+ expect(await waitFor(() => !canaryRunning(identity), 3000)).toBe(true)
166
+ await sleep(300)
167
+ expect(existsSync(exitLogPath(logDir)) && readFileSync(exitLogPath(logDir), 'utf8').includes('ev=server-exit')).toBe(
168
+ false,
169
+ )
170
+ },
171
+ 20000,
172
+ )
173
+ })
@@ -0,0 +1,152 @@
1
+ // Server-death canary — the SERVER-level catcher for the exit-cause gap the
2
+ // pane-died hook structurally cannot close (контракт: pane-died needs a living
3
+ // tmux event loop; SIGKILL/OOM to the tmux SERVER runs no hook). Raised from the
4
+ // deferred backlog after the SECOND live case of the class (iapeer-memory 10.06
5
+ // 01:43, boris 10.06 11:41 — both: a healthy session with active Bash
6
+ // subprocessing, the whole server gone silently, zero system traces).
7
+ //
8
+ // Mechanism: one tiny detached `sh` per LIVE tmux server, holding a blocking
9
+ // client `tmux -S <sock> wait-for iap-canary-<identity>` — a process OUTSIDE the
10
+ // dying server, connected via its socket, so the server's death (any cause,
11
+ // including SIGKILL) is observed the moment the connection drops. Protocol:
12
+ // • clean teardown (idle-reap / stop / pre-clean / killServerIfEmpty) SIGNALS
13
+ // the channel first (`wait-for -S`) → the canary exits 0, silently;
14
+ // • the canary itself signaled (TERM/HUP/INT, e.g. launch pre-clean pkill) →
15
+ // rc ≥ 128 / trap → exit silently (someone manages it deliberately);
16
+ // • anything else (wait-for returns an error = the server vanished under us)
17
+ // → ONE logfmt line `ev=server-exit` into exits.log (the per-peer death-cause
18
+ // home, next to pane-died's `ev=session-exit`) + a forensics snapshot file
19
+ // (vm_stat / swap / top-RSS ps / fresh DiagnosticReports) captured WITHIN
20
+ // SECONDS of the death — the evidence the 60 s supervise tick can never
21
+ // recover ("системных следов ноль" was the recurring investigation outcome).
22
+ //
23
+ // The canary is pure observability: it never wakes, reaps, restarts or otherwise
24
+ // manages anything (H4-compatible by construction), it fires at most once, and
25
+ // every failure to spawn/record is swallowed (never load-bearing). The daemon's
26
+ // reaped-gone death-class (classifyGoneSession) remains the detection backstop
27
+ // when no canary was running.
28
+
29
+ import { spawn, spawnSync } from 'child_process'
30
+ import { mkdirSync } from 'fs'
31
+ import { join } from 'path'
32
+ import { resolveExecutable } from './launchd.ts'
33
+
34
/**
 * Resolve the exit-cause log file that lives inside `exitLogDir`, as a sibling
 * of lifecycle.log.
 *
 * Defined in this module (not index.ts) so canary and launch never form an
 * import cycle; index.ts re-exports it, keeping the public import surface
 * unchanged.
 *
 * @param exitLogDir Directory that holds the exit-cause log.
 * @returns `exitLogDir` followed by `/exits.log` — plain concatenation, the
 *   directory string is not normalized.
 */
export function exitLogPath(exitLogDir: string): string {
  const fileName = 'exits.log'
  return [exitLogDir, fileName].join('/')
}
40
+
41
/**
 * Name of the per-identity tmux `wait-for` channel that the clean-teardown
 * paths signal.
 *
 * @param identity Peer identity the channel belongs to.
 * @returns The identity prefixed with `iap-canary-`.
 */
export function canaryChannel(identity: string): string {
  const prefix = 'iap-canary-'
  return prefix + identity
}
45
+
46
/**
 * Directory for forensics snapshots, located next to exits.log:
 * `<exitLogDir>/server-deaths/`.
 *
 * @param exitLogDir Directory that holds the exit-cause log.
 * @returns `exitLogDir` joined with `server-deaths`.
 */
export function serverDeathsDir(exitLogDir: string): string {
  const subdirectory = 'server-deaths'
  return join(exitLogDir, subdirectory)
}
50
+
51
export interface CanaryScriptOptions {
  identity: string
  sock: string
  /** Absolute tmux path — the detached canary may outlive any rich PATH. */
  tmuxBin: string
  exitLogFile: string
  forensicsDir: string
}

/**
 * Wrap `value` in single quotes for POSIX `sh`, escaping any embedded single
 * quote as `'\''`. A value without single quotes comes out as `'value'`.
 */
function shellSingleQuote(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`
}

/**
 * Build the canary shell script (PURE — unit-testable).
 *
 * The script captures the tmux server pid, blocks on
 * `tmux -S <sock> wait-for <channel>`, and then:
 *  - exits 0 silently when `wait-for` returned 0 or a status ≥ 128, or when
 *    the script itself receives HUP/INT/TERM;
 *  - otherwise writes a forensics snapshot file and appends one
 *    `ev=server-exit` logfmt line to `exitLogFile`.
 *
 * Quoting: every path (`tmuxBin`, `sock`, `exitLogFile`, `forensicsDir`) and
 * the channel name is single-quote escaped, so a path containing `'` can
 * neither break the script nor inject shell. `identity` is also interpolated
 * into a double-quoted `echo` and into the `printf` format string, where it is
 * still assumed to be a runtime-personality (`[a-z0-9-]`). All forensic tools
 * are /usr/bin- or /bin-resident, so the script survives launchd's minimal
 * PATH; only tmux needs the baked absolute path.
 *
 * @param o Identity, socket, absolute tmux binary and output locations.
 * @returns The script text, one command per line, for `sh -c`.
 */
export function canaryScript(o: CanaryScriptOptions): string {
  const tmux = `${shellSingleQuote(o.tmuxBin)} -S ${shellSingleQuote(o.sock)}`
  const channel = shellSingleQuote(canaryChannel(o.identity))
  const snapshotStem = shellSingleQuote(`${o.forensicsDir}/${o.identity}`)
  return [
    // Server PID captured while alive — the postmortem grep key for system logs.
    `SPID="$(${tmux} display-message -p '#{pid}' 2>/dev/null)"`,
    // A signal to the CANARY is deliberate management (pre-clean pkill) → silent.
    `trap 'exit 0' HUP INT TERM`,
    `${tmux} wait-for ${channel}`,
    `rc=$?`,
    // rc=0 → clean-teardown signal; rc≥128 → the tmux client itself was signaled.
    `if [ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0; fi`,
    // The server vanished under us — record, within seconds of the death.
    `ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"`,
    `f=${snapshotStem}-"$(date +%s)".txt`,
    `{`,
    ` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc"`,
    ` echo '--- vm_stat'; vm_stat`,
    ` echo '--- swapusage'; sysctl -n vm.swapusage`,
    ` echo '--- ps-top-rss'; ps axo pid,ppid,rss,etime,command | sort -rn -k3 | head -25`,
    ` echo '--- diagnosticreports-user'; ls -t "$HOME/Library/Logs/DiagnosticReports" 2>/dev/null | head`,
    ` echo '--- diagnosticreports-system'; ls -t /Library/Logs/DiagnosticReports 2>/dev/null | head`,
    `} > "$f" 2>&1`,
    `printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$f" >> ${shellSingleQuote(o.exitLogFile)}`,
  ].join('\n')
}
92
+
93
export type CanaryEnsureState = 'spawned' | 'already' | 'skipped' | 'failed'

export interface EnsureCanaryOptions {
  identity: string
  sock: string
  /** Same gate as installExitHook: falsy → observability off, no canary. */
  exitLogDir?: string
  env?: NodeJS.ProcessEnv
}

/**
 * Ensure ONE canary is watching this identity's tmux server. Idempotent via a
 * pgrep on the wait-for channel (anchored so an identity can never match another
 * identity's prefix). Called from launch (newborn server) and from the supervise
 * tick's alive-branch (retrofits canaries onto a fleet launched by older code).
 * Best-effort: every failure returns a state, never throws — including spawn
 * failures, which Node reports asynchronously through the child's `'error'`
 * event rather than by throwing.
 *
 * @param o Identity, socket, optional log directory gate and environment
 *   (defaults to `process.env`).
 * @returns `'skipped'` when `exitLogDir` is falsy; `'already'` when pgrep finds
 *   a canary on the channel; `'failed'` when tmux cannot be resolved, the
 *   child got no pid, or anything threw; `'spawned'` otherwise.
 */
export function ensureServerCanary(o: EnsureCanaryOptions): CanaryEnsureState {
  if (!o.exitLogDir) return 'skipped'
  const env = o.env ?? process.env
  try {
    const probe = spawnSync('pgrep', ['-f', `wait-for.*${canaryChannel(o.identity)}([^a-z0-9-]|$)`], {
      stdio: 'ignore',
      env: env as Record<string, string>,
    })
    if (probe.status === 0) return 'already'
    const tmuxBin = resolveExecutable('tmux', env)
    if (!tmuxBin) return 'failed'
    const forensicsDir = serverDeathsDir(o.exitLogDir)
    mkdirSync(forensicsDir, { recursive: true, mode: 0o700 })
    mkdirSync(o.exitLogDir, { recursive: true, mode: 0o700 })
    const script = canaryScript({
      identity: o.identity,
      sock: o.sock,
      tmuxBin,
      exitLogFile: exitLogPath(o.exitLogDir),
      forensicsDir,
    })
    const child = spawn('/bin/sh', ['-c', script], { detached: true, stdio: 'ignore', env })
    // A failed spawn surfaces as an asynchronous 'error' event, which the
    // enclosing try/catch cannot see; an emitter without an 'error' listener
    // turns it into an uncaught exception. Swallow it — the canary is never
    // load-bearing.
    child.on('error', () => {})
    // No pid means the process never started: report that, not 'spawned'.
    if (child.pid === undefined) return 'failed'
    child.unref()
    return 'spawned'
  } catch {
    return 'failed' // observability is best-effort — never block a launch/tick
  }
}
139
+
140
/**
 * Tell the canary that the upcoming server teardown is DELIBERATE (idle-reap /
 * stop / launch pre-clean / empty-server kill): signalling the channel lets it
 * exit silently instead of recording a false server-death.
 *
 * Best-effort — with no server or no canary on the channel this is a harmless
 * no-op, and any exception is swallowed.
 *
 * @param sock Path of the tmux server socket.
 * @param identity Peer identity whose canary channel is signalled.
 */
export function signalCanaryClean(sock: string, identity: string): void {
  try {
    const channel = canaryChannel(identity)
    const args = ['-S', sock, 'wait-for', '-S', channel]
    spawnSync('tmux', args, { stdio: 'ignore' })
  } catch {
    /* best-effort */
  }
}
@@ -19,6 +19,7 @@ import { dirname } from 'path'
19
19
  import { spawnSync } from 'child_process'
20
20
  import { CODEX_BEARER_ENV_VAR, CODEX_DUMMY_BEARER, type Runtime } from '../core/constants.ts'
21
21
  import { readLaunchEnv } from '../storage/index.ts'
22
+ import { ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
22
23
  import { claudeAdapter } from './adapters/claude.ts'
23
24
  import { codexAdapter } from './adapters/codex.ts'
24
25
  import { telegramAdapter } from './adapters/telegram.ts'
@@ -127,14 +128,15 @@ function ready(identity: string): LaunchResult {
127
128
  // • SIGTERM/SIGKILL/crash to the PROCESS → `dead_status= dead_signal=<name>`
128
129
  // • daemon-initiated `kill-session` (idle-reap / self-TTL / stop) does NOT fire
129
130
  // pane-died → NO line here (those are already in lifecycle.log — no double-log).
130
- // IRREDUCIBLE GAP: SIGKILL to the tmux SERVER process itself runs no hook (the
131
- // event loop is gone); only the daemon's post-factum reaped-gone catches that.
131
+ // SERVER-LEVEL GAP: SIGKILL to the tmux SERVER process itself runs no hook (the
132
+ // event loop is gone). Closed one level up by the server-death canary
133
+ // (canary.ts — `ev=server-exit` + forensics snapshot); the daemon's post-factum
134
+ // reaped-gone death-class remains the detection backstop.
132
135
  // ─────────────────────────────────────────────────────────────────────────────
133
136
 
134
- /** The exit-cause log file inside `exitLogDir` (sibling to lifecycle.log). */
135
- export function exitLogPath(exitLogDir: string): string {
136
- return `${exitLogDir}/exits.log`
137
- }
137
+ // The exit-cause log path lives in canary.ts (no import cycle); re-exported here
138
+ // so the public surface (`launch/index.ts → exitLogPath`) is unchanged.
139
+ export { exitLogPath } from './canary.ts'
138
140
 
139
141
  /**
140
142
  * Build the tmux `pane-died` hook command string (the value of `set-hook -t <id>
@@ -210,6 +212,11 @@ export const launch: LaunchFn = async (
210
212
  mkdirSync(dirname(sock), { recursive: true })
211
213
 
212
214
  // (1) Pre-clean any stale tmux server on this socket, then launch detached.
215
+ // Signal the server-death canary FIRST: this teardown is deliberate, the
216
+ // canary must exit silently, not record a false server-death (the pkill
217
+ // below also matches the canary's cmdline — by design, it sweeps a stale
218
+ // canary along with the stale server; a signaled canary is already gone).
219
+ signalCanaryClean(sock, identity)
213
220
  spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
214
221
  tmux(sock, 'kill-server')
215
222
 
@@ -256,6 +263,13 @@ export const launch: LaunchFn = async (
256
263
  // boot leaves a cause. No-op without cfg.exitLogDir (remain-on-exit off).
257
264
  installExitHook(sock, identity, cfg.exitLogDir)
258
265
 
266
+ // (2.6) Server-death canary (canary.ts): the SERVER-level catcher pane-died
267
+ // structurally cannot be — a detached wait-for client OUTSIDE the server
268
+ // that records `ev=server-exit` + a forensics snapshot when the whole
269
+ // server dies dirty (SIGKILL/OOM class). Same gate as the exit hook;
270
+ // best-effort by construction (every failure → a state, never a throw).
271
+ ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
272
+
259
273
  // (3) pipe-pane the session output to the per-identity log.
260
274
  mkdirSync(cfg.logDir, { recursive: true, mode: 0o700 })
261
275
  const paneLog = `${cfg.logDir}/${identity}.log`
@@ -46,6 +46,7 @@ import {
46
46
  type LaunchSpec,
47
47
  } from '../launch/index.ts'
48
48
  import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
49
+ import { ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
49
50
  import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
50
51
 
51
52
  // ─────────────────────────────────────────────────────────────────────────────
@@ -594,7 +595,8 @@ function sessionAlive(sock: string, identity: string): boolean {
594
595
  * exits.log should carry the cause.
595
596
  * - `server-dead` — the server itself is gone: the socket file is missing, or it
596
597
  * exists but nothing serves it (stale socket — the SIGKILL/OOM class). pane-died
597
- * could never fire, so the lifecycle.log line is the only durable trace. */
598
+ * could never fire; the server-death canary (launch/canary.ts) is the recorder
599
+ * for this class (`ev=server-exit` + forensics), this line is the backstop. */
598
600
  export function classifyGoneSession(sock: string): { death: 'server-dead' | 'session-gone'; reason: string } {
599
601
  if (!existsSync(sock)) {
600
602
  return { death: 'server-dead', reason: 'tmux server gone (socket file missing)' }
@@ -603,7 +605,7 @@ export function classifyGoneSession(sock: string): { death: 'server-dead' | 'ses
603
605
  // proves the server is alive and merely lost this session.
604
606
  return tmux(sock, 'list-sessions').ok
605
607
  ? { death: 'session-gone', reason: 'session gone, tmux server alive (exit cause should be in exits.log)' }
606
- : { death: 'server-dead', reason: 'tmux server dead — stale socket (SIGKILL/OOM class; exits.log has no entry)' }
608
+ : { death: 'server-dead', reason: 'tmux server dead — stale socket (SIGKILL/OOM class; check exits.log ev=server-exit canary record)' }
607
609
  }
608
610
 
609
611
  // ─────────────────────────────────────────────────────────────────────────────
@@ -958,6 +960,9 @@ export function killSession(sock: string, identity: string): void {
958
960
  tmux(sock, 'kill-session', '-t', identity)
959
961
  const sessions = tmux(sock, 'list-sessions', '-F', '#{session_name}').out
960
962
  if (!sessions.trim()) {
963
+ // Deliberate server teardown → signal the server-death canary FIRST so it
964
+ // exits silently instead of recording a false `ev=server-exit` (canary.ts).
965
+ signalCanaryClean(sock, identity)
961
966
  tmux(sock, 'kill-server')
962
967
  try {
963
968
  rmSync(sock, { force: true })
@@ -1108,6 +1113,11 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
1108
1113
  out.push({ identity: s.identity, action: 'reaped-idle', reason: `idle ${ageSecs}s` })
1109
1114
  trace({ identity: s.identity, action: 'reaped-idle', age: `${ageSecs}s`, outcome: 'resume-eligible' })
1110
1115
  } else {
1116
+ // Server-death canary retrofit (canary.ts): ensure a canary is watching
1117
+ // every ALIVE daemon-owned server — covers a fleet launched by older code
1118
+ // within one tick of a deploy, no session restarts. Idempotent (pgrep on
1119
+ // the per-identity wait-for channel), best-effort, pure observability.
1120
+ ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
1111
1121
  out.push({ identity: s.identity, action: 'alive' })
1112
1122
  if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
1113
1123
  }