@agfpd/iapeer 0.2.21 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli/cli.test.ts +26 -1
- package/src/cli/index.ts +18 -1
- package/src/launch/canary.test.ts +173 -0
- package/src/launch/canary.ts +152 -0
- package/src/launch/index.ts +20 -6
- package/src/lifecycle/index.ts +50 -2
package/package.json
CHANGED
package/src/cli/cli.test.ts
CHANGED
|
@@ -9,7 +9,7 @@ import { tmpdir } from 'os'
|
|
|
9
9
|
import { join } from 'path'
|
|
10
10
|
import { formatListTable, listPeers, parseArgs, removePeerCli, runCli, sendMessage, startPeer, stopPeer } from './index.ts'
|
|
11
11
|
import { findPeer, readPeersIndex, upsertPeer } from '../registry/index.ts'
|
|
12
|
-
import { hasIdleReaped, isStopped, loadLifecycleConfig, setStopped } from '../lifecycle/index.ts'
|
|
12
|
+
import { hasIdleReaped, isStopped, loadLifecycleConfig, setIdleReaped, setStopped } from '../lifecycle/index.ts'
|
|
13
13
|
import { launchdPlistPath } from '../launch/launchd.ts'
|
|
14
14
|
|
|
15
15
|
let root: string
|
|
@@ -130,6 +130,31 @@ describe('remove (registry record via the locked writer)', () => {
|
|
|
130
130
|
expect((await removePeerCli('twice', { env: e })).action).toBe('removed')
|
|
131
131
|
expect((await removePeerCli('twice', { env: e })).action).toBe('absent')
|
|
132
132
|
})
|
|
133
|
+
test('purges identity-keyed lifecycle state with the record — a namesake newborn must not inherit a dead peer\'s parking (boris 10.06)', async () => {
  await register('reborn')
  const testEnv = env()
  const lifecycle = loadLifecycleConfig(testEnv)
  // Resolve an entry name inside the lifecycle state directory.
  const inState = (entry: string): string => join(lifecycle.stateDir, entry)

  // Leave behind what a dead peer leaves: both parking markers, a topic file
  // and a queue directory, all keyed by the identity `claude-reborn`.
  setStopped(lifecycle, 'claude-reborn')
  setIdleReaped(lifecycle, 'claude-reborn')
  writeFileSync(inState('claude-reborn.topic'), 'old-topic')
  mkdirSync(inState('claude-reborn.queue'), { recursive: true })
  // A neighbouring identity that only shares the name as a PREFIX — it must
  // come out of the remove untouched.
  setStopped(lifecycle, 'claude-reborn2')

  const outcome = await removePeerCli('reborn', { env: testEnv })

  expect(outcome.action).toBe('removed')
  const purgedNames = outcome.purgedState?.sort()
  expect(purgedNames).toEqual([
    'claude-reborn.idle-reaped',
    'claude-reborn.queue',
    'claude-reborn.stopped',
    'claude-reborn.topic',
  ])
  // The removed identity is no longer parked, so a namesake newborn can wake.
  expect(isStopped(lifecycle, 'claude-reborn')).toBe(false)
  expect(existsSync(inState('claude-reborn.queue'))).toBe(false)
  // Dot-delimited matching: the prefix-sharing neighbour keeps its marker.
  expect(isStopped(lifecycle, 'claude-reborn2')).toBe(true)
})
|
|
133
158
|
})
|
|
134
159
|
|
|
135
160
|
describe('send validation', () => {
|
package/src/cli/index.ts
CHANGED
|
@@ -33,6 +33,7 @@ import {
|
|
|
33
33
|
isStopped,
|
|
34
34
|
killSession,
|
|
35
35
|
loadLifecycleConfig,
|
|
36
|
+
purgeIdentityState,
|
|
36
37
|
removeSessionState,
|
|
37
38
|
setIdleReaped,
|
|
38
39
|
setNewEager,
|
|
@@ -271,6 +272,11 @@ export interface RemoveOutcome {
|
|
|
271
272
|
* deliberately keeps the folder — user data is never deleted by a registry reap
|
|
272
273
|
* (boris's finding 10.06: say so in the output instead of leaving silent orphans). */
|
|
273
274
|
cwd?: string
|
|
275
|
+
/** Identity-keyed lifecycle artifacts purged with the record (state/lifecycle/
|
|
276
|
+
* `<identity>.*` per runtime). Without this purge a NEWBORN peer reusing the
|
|
277
|
+
* personality inherits the dead namesake's parking (live defect, boris 10.06:
|
|
278
|
+
* stale .stopped → `mode=refused cause=stopped` on a freshly-created peer). */
|
|
279
|
+
purgedState?: string[]
|
|
274
280
|
}
|
|
275
281
|
|
|
276
282
|
/**
|
|
@@ -301,7 +307,13 @@ export async function removePeerCli(
|
|
|
301
307
|
}
|
|
302
308
|
}
|
|
303
309
|
await removePeer(personality, { env })
|
|
304
|
-
|
|
310
|
+
// Purge identity-keyed lifecycle state WITH the record (per runtime): stale
|
|
311
|
+
// .stopped/.idle-reaped/... must never outlive the peer and ambush a future
|
|
312
|
+
// namesake (purgeIdentityState doc). After the registry write, so a failed
|
|
313
|
+
// remove never half-purges a still-registered peer.
|
|
314
|
+
const cfg = loadLifecycleConfig(env)
|
|
315
|
+
const purgedState = peer.runtimes.flatMap(rt => purgeIdentityState(cfg, buildProcessAddress(rt, personality)))
|
|
316
|
+
return { personality, action: 'removed', cwd: peer.cwd, purgedState }
|
|
305
317
|
}
|
|
306
318
|
|
|
307
319
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -692,6 +704,11 @@ export async function runCli(argv: string[], env: NodeJS.ProcessEnv = process.en
|
|
|
692
704
|
const o = await removePeerCli(positionals[0], { force: flags.force === true, env })
|
|
693
705
|
if (o.action === 'removed') {
|
|
694
706
|
out(`removed "${o.personality}" from the registry\n`)
|
|
707
|
+
// Stale identity-keyed markers must die with the record (boris 10.06: a
|
|
708
|
+
// namesake newborn inherited a dead peer's .stopped → refused to wake).
|
|
709
|
+
if (o.purgedState?.length) {
|
|
710
|
+
out(`lifecycle state purged: ${o.purgedState.join(', ')}\n`)
|
|
711
|
+
}
|
|
695
712
|
// Deliberate: the registry reap never deletes user data — but SAY so, or
|
|
696
713
|
// the default-location peers leave silent orphan folders (boris 10.06).
|
|
697
714
|
if (o.cwd && existsSync(o.cwd)) {
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
// Server-death canary — pure script-builder shape + live-tmux behavior:
|
|
2
|
+
// dirty server death (SIGKILL) → ev=server-exit line + forensics snapshot;
|
|
3
|
+
// clean teardown (signal / killSession) → silent, no record. Live tests follow
|
|
4
|
+
// the suite's test.if(tmuxAvailable) sandbox-socket pattern (lifecycle.test.ts).
|
|
5
|
+
|
|
6
|
+
import { afterAll, describe, expect, test } from 'bun:test'
|
|
7
|
+
import { spawnSync } from 'child_process'
|
|
8
|
+
import { existsSync, mkdtempSync, readFileSync, readdirSync, rmSync } from 'fs'
|
|
9
|
+
import { tmpdir } from 'os'
|
|
10
|
+
import { join } from 'path'
|
|
11
|
+
import {
|
|
12
|
+
canaryChannel,
|
|
13
|
+
canaryScript,
|
|
14
|
+
ensureServerCanary,
|
|
15
|
+
exitLogPath,
|
|
16
|
+
serverDeathsDir,
|
|
17
|
+
signalCanaryClean,
|
|
18
|
+
} from './canary.ts'
|
|
19
|
+
import { killSession } from '../lifecycle/index.ts'
|
|
20
|
+
|
|
21
|
+
// Gate for the live tests: true only when `tmux -V` can be run and exits 0.
const tmuxProbe = spawnSync('tmux', ['-V'], { stdio: 'ignore' })
const tmuxAvailable = tmuxProbe.status === 0

/** Resolve after `ms` milliseconds. */
const sleep = (ms: number) => new Promise(resolve => setTimeout(resolve, ms))

/**
 * Poll `cond` every 100 ms until it returns true or `ms` milliseconds elapsed.
 * Once the deadline has passed, `cond` is evaluated one final time and that
 * result is returned, so a zero timeout still performs exactly one check.
 */
async function waitFor(cond: () => boolean, ms: number): Promise<boolean> {
  const stopAt = Date.now() + ms
  for (;;) {
    if (Date.now() >= stopAt) return cond()
    if (cond()) return true
    await sleep(100)
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * True when `pgrep -f` finds a process whose command line contains a
 * `wait-for` followed by this identity's canary channel. The trailing
 * `([^a-z0-9-]|$)` keeps a longer identity that merely starts with this one
 * from matching.
 */
function canaryRunning(identity: string): boolean {
  const pattern = `wait-for.*${canaryChannel(identity)}([^a-z0-9-]|$)`
  const { status } = spawnSync('pgrep', ['-f', pattern], { stdio: 'ignore' })
  return status === 0
}
|
|
40
|
+
|
|
41
|
+
/**
 * Ask the tmux server behind `sock` for its own PID (`display-message -p
 * '#{pid}'`). Missing or empty output converts to 0, so callers can treat a
 * non-positive result as "no server answered".
 */
function serverPid(sock: string): number {
  const { stdout } = spawnSync('tmux', ['-S', sock, 'display-message', '-p', '#{pid}'], { encoding: 'utf8' })
  const text = stdout ?? ''
  return Number(text.trim())
}
|
|
45
|
+
|
|
46
|
+
// Pure checks: no tmux, no processes — only the strings the builders produce
// and the early "skipped" exit of ensureServerCanary.
describe('canary script (pure)', () => {
  test('channel is per-identity', () => {
    const channel = canaryChannel('claude-bob')
    expect(channel).toBe('iap-canary-claude-bob')
  })

  test('exitLogPath → exits.log inside the dir', () => {
    const logFile = exitLogPath('/r/logs/iapeer')
    expect(logFile).toBe('/r/logs/iapeer/exits.log')
  })

  test('script carries the protocol: wait-for channel, silent-exit guards, record line, forensics', () => {
    const script = canaryScript({
      identity: 'claude-bob',
      sock: '/tmp/x.sock',
      tmuxBin: '/opt/homebrew/bin/tmux',
      exitLogFile: '/r/logs/iapeer/exits.log',
      forensicsDir: '/r/logs/iapeer/server-deaths',
    })
    const requiredFragments = [
      `wait-for 'iap-canary-claude-bob'`, // the blocking client
      `trap 'exit 0' HUP INT TERM`, // signaled canary = silent
      `[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]`, // clean/signaled guards
      'ev=server-exit identity=claude-bob', // the exits.log record
      `>> '/r/logs/iapeer/exits.log'`,
      '/r/logs/iapeer/server-deaths/claude-bob', // forensics file
      'vm_stat', // memory evidence (OOM hypothesis)
      `'/opt/homebrew/bin/tmux' -S '/tmp/x.sock'`, // abs tmux, quoted
    ]
    for (const fragment of requiredFragments) {
      expect(script).toContain(fragment)
    }
  })

  test('ensureServerCanary without exitLogDir → skipped (observability off)', () => {
    const state = ensureServerCanary({ identity: 'claude-none', sock: '/tmp/none.sock' })
    expect(state).toBe('skipped')
  })
})
|
|
77
|
+
|
|
78
|
+
// Live behavior against throw-away tmux servers, each on its own socket inside
// a temp dir. Only registered when a tmux binary is available.
describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
  const dir = mkdtempSync(join(tmpdir(), 'iap-canary-'))
  // Every sandbox server started by bringUp, remembered WITH its identity so
  // teardown signals exactly the channel that belongs to each socket (no
  // hand-maintained identity list to keep in sync with the tests below).
  const servers: Array<{ sock: string; identity: string }> = []

  /** Start a detached tmux server on a fresh socket with one `sleep 120` session. */
  function bringUp(identity: string): { sock: string; logDir: string } {
    const sock = join(dir, `${identity}.sock`)
    servers.push({ sock, identity })
    const r = spawnSync('tmux', ['-S', sock, 'new-session', '-d', '-s', identity, 'sleep', '120'], {
      stdio: 'ignore',
    })
    expect(r.status).toBe(0)
    return { sock, logDir: join(dir, `logs-${identity}`) }
  }

  /** True when exits.log exists under `logDir` and holds an `ev=server-exit` record. */
  function hasServerExitRecord(logDir: string): boolean {
    const log = exitLogPath(logDir)
    return existsSync(log) && readFileSync(log, 'utf8').includes('ev=server-exit')
  }

  afterAll(() => {
    for (const { sock, identity } of servers) {
      // teardown is DELIBERATE → signal the canary before killing its server
      signalCanaryClean(sock, identity)
      spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
    }
    rmSync(dir, { recursive: true, force: true })
  })

  test(
    'dirty server death (SIGKILL) → ev=server-exit + forensics snapshot',
    async () => {
      const identity = 'claude-canadirty'
      const { sock, logDir } = bringUp(identity)
      const pid = serverPid(sock)
      expect(pid).toBeGreaterThan(0)

      expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
      expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
      // idempotency: a second ensure must NOT double-spawn
      expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('already')
      await sleep(500) // let the wait-for client register on the channel

      process.kill(pid, 'SIGKILL') // the silent server-killer class
      expect(await waitFor(() => hasServerExitRecord(logDir), 8000)).toBe(true)
      const line = readFileSync(exitLogPath(logDir), 'utf8')
      expect(line).toContain(`ev=server-exit identity=${identity}`)
      expect(line).toContain(`server_pid=${pid}`)
      const snaps = readdirSync(serverDeathsDir(logDir))
      expect(snaps.length).toBe(1)
      const snap = readFileSync(join(serverDeathsDir(logDir), snaps[0]!), 'utf8')
      expect(snap).toContain(`server-death identity=${identity}`)
      // canary is one-shot: after firing it must be gone
      expect(await waitFor(() => !canaryRunning(identity), 3000)).toBe(true)
    },
    20000,
  )

  test(
    'clean teardown (signal, then kill-server) → silent, no record',
    async () => {
      const identity = 'claude-canaclean'
      const { sock, logDir } = bringUp(identity)
      expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
      expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
      await sleep(500) // let the wait-for client register before signaling

      signalCanaryClean(sock, identity)
      expect(await waitFor(() => !canaryRunning(identity), 3000)).toBe(true) // exited silently
      spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
      await sleep(300)
      expect(hasServerExitRecord(logDir)).toBe(false)
    },
    20000,
  )

  test(
    'killSession (lifecycle clean reap) signals the canary before kill-server → no record',
    async () => {
      const identity = 'claude-canakill'
      const { sock, logDir } = bringUp(identity)
      expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
      expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
      await sleep(500)

      killSession(sock, identity) // last session → kills the SERVER, signal-first
      expect(await waitFor(() => !canaryRunning(identity), 3000)).toBe(true)
      await sleep(300)
      expect(hasServerExitRecord(logDir)).toBe(false)
    },
    20000,
  )
})
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
// Server-death canary — the SERVER-level catcher for the exit-cause gap the
|
|
2
|
+
// pane-died hook structurally cannot close (contract: pane-died needs a living
|
|
3
|
+
// tmux event loop; SIGKILL/OOM to the tmux SERVER runs no hook). Raised from the
|
|
4
|
+
// deferred backlog after the SECOND live case of the class (iapeer-memory 10.06
|
|
5
|
+
// 01:43, boris 10.06 11:41 — both: a healthy session with active Bash
|
|
6
|
+
// subprocessing, the whole server gone silently, zero system traces).
|
|
7
|
+
//
|
|
8
|
+
// Mechanism: one tiny detached `sh` per LIVE tmux server, holding a blocking
|
|
9
|
+
// client `tmux -S <sock> wait-for iap-canary-<identity>` — a process OUTSIDE the
|
|
10
|
+
// dying server, connected via its socket, so the server's death (any cause,
|
|
11
|
+
// including SIGKILL) is observed the moment the connection drops. Protocol:
|
|
12
|
+
// • clean teardown (idle-reap / stop / pre-clean / killServerIfEmpty) SIGNALS
|
|
13
|
+
// the channel first (`wait-for -S`) → the canary exits 0, silently;
|
|
14
|
+
// • the canary itself signaled (TERM/HUP/INT, e.g. launch pre-clean pkill) →
|
|
15
|
+
// rc ≥ 128 / trap → exit silently (someone manages it deliberately);
|
|
16
|
+
// • anything else (wait-for returns an error = the server vanished under us)
|
|
17
|
+
// → ONE logfmt line `ev=server-exit` into exits.log (the per-peer death-cause
|
|
18
|
+
// home, next to pane-died's `ev=session-exit`) + a forensics snapshot file
|
|
19
|
+
// (vm_stat / swap / top-RSS ps / fresh DiagnosticReports) captured WITHIN
|
|
20
|
+
// SECONDS of the death — the evidence the 60 s supervise tick can never
|
|
21
|
+
// recover ("zero system traces" was the recurring investigation outcome).
|
|
22
|
+
//
|
|
23
|
+
// The canary is pure observability: it never wakes, reaps, restarts or otherwise
|
|
24
|
+
// manages anything (H4-compatible by construction), it fires at most once, and
|
|
25
|
+
// every failure to spawn/record is swallowed (never load-bearing). The daemon's
|
|
26
|
+
// reaped-gone death-class (classifyGoneSession) remains the detection backstop
|
|
27
|
+
// when no canary was running.
|
|
28
|
+
|
|
29
|
+
import { spawn, spawnSync } from 'child_process'
|
|
30
|
+
import { mkdirSync } from 'fs'
|
|
31
|
+
import { join } from 'path'
|
|
32
|
+
import { resolveExecutable } from './launchd.ts'
|
|
33
|
+
|
|
34
|
+
/**
 * Path of the exit-cause log: `exits.log` directly inside `exitLogDir`.
 * Defined in this module (not launch/index.ts) so canary and launch never form
 * an import cycle; launch/index.ts re-exports it so the public import surface
 * stays the same.
 */
export function exitLogPath(exitLogDir: string): string {
  const parts = [exitLogDir, 'exits.log']
  return parts.join('/')
}
|
|
40
|
+
|
|
41
|
+
/**
 * Name of the tmux `wait-for` channel belonging to one identity: the fixed
 * `iap-canary-` prefix followed by the identity. Clean-teardown paths signal
 * this channel.
 */
export function canaryChannel(identity: string): string {
  const prefix = 'iap-canary-'
  return prefix + identity
}
|
|
45
|
+
|
|
46
|
+
/**
 * Directory that receives the forensics snapshots: the `server-deaths`
 * sub-directory of `exitLogDir`, i.e. a sibling of exits.log.
 */
export function serverDeathsDir(exitLogDir: string): string {
  const snapshotsDir = join(exitLogDir, 'server-deaths')
  return snapshotsDir
}
|
|
50
|
+
|
|
51
|
+
export interface CanaryScriptOptions {
  /** Identity the canary watches; embedded in the channel name, the log record
   *  and the forensics file name. */
  identity: string
  /** tmux socket path, passed to every tmux invocation as `-S <sock>`. */
  sock: string
  /** Absolute tmux path — the detached canary may outlive any rich PATH. */
  tmuxBin: string
  /** File the `ev=server-exit` line is appended to. */
  exitLogFile: string
  /** Directory the forensics snapshot file is written into. */
  forensicsDir: string
}

/**
 * Quote `value` as ONE POSIX-shell word: wrap it in single quotes and spell
 * every embedded single quote as `'\''`. For a value without single quotes the
 * result is simply `'value'`.
 */
function shQuote(value: string): string {
  return "'" + value.replace(/'/g, "'\\''") + "'"
}

/**
 * Build the canary shell script (PURE — unit-testable).
 *
 * Script flow: capture the server PID while the server is alive, block on
 * `tmux wait-for <channel>`, then exit silently when the wait returned 0 (the
 * channel was signaled) or ≥ 128 (the tmux client was killed by a signal), or
 * when the shell itself receives HUP/INT/TERM. Any other return code is treated
 * as the server vanishing: a forensics snapshot is written and one
 * `ev=server-exit` logfmt line is appended to the exit log.
 *
 * Quoting: every path (tmux binary, socket, forensics file, exit log) and the
 * channel name are shell-quoted with `shQuote`, so a single quote in a path can
 * neither break the script nor inject commands. `identity` is additionally
 * interpolated verbatim into an `echo "…"` string and the `printf` format; that
 * is safe only under the stated assumption that an identity is a
 * runtime-personality made of `[a-z0-9-]`. All forensic tools are /usr/bin- or
 * /bin-resident, so the script survives launchd's minimal PATH; only tmux needs
 * the baked absolute path.
 *
 * @param o identity, socket, tmux binary and output locations
 * @returns the script source, one command per line, for `sh -c`
 */
export function canaryScript(o: CanaryScriptOptions): string {
  const channel = shQuote(canaryChannel(o.identity))
  const tmux = `${shQuote(o.tmuxBin)} -S ${shQuote(o.sock)}`
  const forensicsStem = shQuote(`${o.forensicsDir}/${o.identity}`)
  return [
    // Server PID captured while alive — the postmortem grep key for system logs.
    `SPID="$(${tmux} display-message -p '#{pid}' 2>/dev/null)"`,
    // A signal to the CANARY is deliberate management (pre-clean pkill) → silent.
    `trap 'exit 0' HUP INT TERM`,
    `${tmux} wait-for ${channel}`,
    `rc=$?`,
    // rc=0 → clean-teardown signal; rc≥128 → the tmux client itself was signaled.
    `if [ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0; fi`,
    // The server vanished under us — record, within seconds of the death.
    `ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"`,
    `f=${forensicsStem}-"$(date +%s)".txt`,
    `{`,
    `  echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc"`,
    `  echo '--- vm_stat'; vm_stat`,
    `  echo '--- swapusage'; sysctl -n vm.swapusage`,
    `  echo '--- ps-top-rss'; ps axo pid,ppid,rss,etime,command | sort -rn -k3 | head -25`,
    `  echo '--- diagnosticreports-user'; ls -t "$HOME/Library/Logs/DiagnosticReports" 2>/dev/null | head`,
    `  echo '--- diagnosticreports-system'; ls -t /Library/Logs/DiagnosticReports 2>/dev/null | head`,
    `} > "$f" 2>&1`,
    `printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$f" >> ${shQuote(o.exitLogFile)}`,
  ].join('\n')
}
|
|
92
|
+
|
|
93
|
+
/**
 * Result of `ensureServerCanary`:
 *  - `spawned` — a new canary shell was started;
 *  - `already` — pgrep found a canary on this identity's channel;
 *  - `skipped` — no `exitLogDir` was given (observability off);
 *  - `failed`  — tmux could not be resolved, or a synchronous step threw.
 */
export type CanaryEnsureState = 'spawned' | 'already' | 'skipped' | 'failed'

export interface EnsureCanaryOptions {
  /** Identity whose canary channel is probed and watched. */
  identity: string
  /** tmux socket of the server to watch. */
  sock: string
  /** Same gate as installExitHook: falsy → observability off, no canary. */
  exitLogDir?: string
  /** Environment for the pgrep probe, the tmux lookup and the canary shell;
   *  defaults to `process.env`. */
  env?: NodeJS.ProcessEnv
}

/**
 * Ensure ONE canary is watching this identity's tmux server. Idempotent via a
 * pgrep on the wait-for channel (anchored so an identity can never match another
 * identity's prefix). Called from launch (newborn server) and from the supervise
 * tick's alive-branch (retrofits canaries onto a fleet launched by older code —
 * coverage within one tick of a deploy, no session restarts).
 *
 * Best-effort: every failure returns a state, never throws. That includes
 * failures reported AFTER this function returned — a spawn that fails
 * asynchronously is delivered as an `'error'` event on the child, which would
 * be thrown as an uncaught exception if nobody listened. A no-op listener
 * swallows it, so `spawned` means "spawn was attempted", not "the canary is
 * confirmed running".
 *
 * @returns the resulting `CanaryEnsureState`
 */
export function ensureServerCanary(o: EnsureCanaryOptions): CanaryEnsureState {
  if (!o.exitLogDir) return 'skipped'
  const env = o.env ?? process.env
  try {
    const probe = spawnSync('pgrep', ['-f', `wait-for.*${canaryChannel(o.identity)}([^a-z0-9-]|$)`], {
      stdio: 'ignore',
      env: env as Record<string, string>,
    })
    if (probe.status === 0) return 'already'
    const tmuxBin = resolveExecutable('tmux', env)
    if (!tmuxBin) return 'failed'
    const forensicsDir = serverDeathsDir(o.exitLogDir)
    mkdirSync(forensicsDir, { recursive: true, mode: 0o700 })
    mkdirSync(o.exitLogDir, { recursive: true, mode: 0o700 })
    const script = canaryScript({
      identity: o.identity,
      sock: o.sock,
      tmuxBin,
      exitLogFile: exitLogPath(o.exitLogDir),
      forensicsDir,
    })
    const child = spawn('/bin/sh', ['-c', script], { detached: true, stdio: 'ignore', env })
    // An asynchronous spawn failure arrives as an 'error' event; without a
    // listener it would crash the launch / supervise tick that called us.
    child.on('error', () => {
      /* best-effort — observability must never take the caller down */
    })
    child.unref()
    return 'spawned'
  } catch {
    return 'failed' // observability is best-effort — never block a launch/tick
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Tell the canary that the server teardown about to happen is DELIBERATE
 * (idle-reap / stop / launch pre-clean / empty-server kill): signal its
 * wait-for channel with `tmux wait-for -S` so it exits silently rather than
 * recording a false server-death.
 *
 * Best-effort by design: the tmux exit status is ignored and any thrown error
 * is swallowed, so a missing server or a missing canary is a harmless no-op.
 */
export function signalCanaryClean(sock: string, identity: string): void {
  try {
    const tmuxArgs = ['-S', sock, 'wait-for', '-S', canaryChannel(identity)]
    spawnSync('tmux', tmuxArgs, { stdio: 'ignore' })
  } catch {
    // best-effort — nothing to do when signaling is impossible
  }
}
|
package/src/launch/index.ts
CHANGED
|
@@ -19,6 +19,7 @@ import { dirname } from 'path'
|
|
|
19
19
|
import { spawnSync } from 'child_process'
|
|
20
20
|
import { CODEX_BEARER_ENV_VAR, CODEX_DUMMY_BEARER, type Runtime } from '../core/constants.ts'
|
|
21
21
|
import { readLaunchEnv } from '../storage/index.ts'
|
|
22
|
+
import { ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
|
|
22
23
|
import { claudeAdapter } from './adapters/claude.ts'
|
|
23
24
|
import { codexAdapter } from './adapters/codex.ts'
|
|
24
25
|
import { telegramAdapter } from './adapters/telegram.ts'
|
|
@@ -127,14 +128,15 @@ function ready(identity: string): LaunchResult {
|
|
|
127
128
|
// • SIGTERM/SIGKILL/crash to the PROCESS → `dead_status= dead_signal=<name>`
|
|
128
129
|
// • daemon-initiated `kill-session` (idle-reap / self-TTL / stop) does NOT fire
|
|
129
130
|
// pane-died → NO line here (those are already in lifecycle.log — no double-log).
|
|
130
|
-
//
|
|
131
|
-
// event loop is gone)
|
|
131
|
+
// SERVER-LEVEL GAP: SIGKILL to the tmux SERVER process itself runs no hook (the
|
|
132
|
+
// event loop is gone). Closed one level up by the server-death canary
|
|
133
|
+
// (canary.ts — `ev=server-exit` + forensics snapshot); the daemon's post-factum
|
|
134
|
+
// reaped-gone death-class remains the detection backstop.
|
|
132
135
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
133
136
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
}
|
|
137
|
+
// The exit-cause log path lives in canary.ts (no import cycle); re-exported here
|
|
138
|
+
// so the public surface (`launch/index.ts → exitLogPath`) is unchanged.
|
|
139
|
+
export { exitLogPath } from './canary.ts'
|
|
138
140
|
|
|
139
141
|
/**
|
|
140
142
|
* Build the tmux `pane-died` hook command string (the value of `set-hook -t <id>
|
|
@@ -210,6 +212,11 @@ export const launch: LaunchFn = async (
|
|
|
210
212
|
mkdirSync(dirname(sock), { recursive: true })
|
|
211
213
|
|
|
212
214
|
// (1) Pre-clean any stale tmux server on this socket, then launch detached.
|
|
215
|
+
// Signal the server-death canary FIRST: this teardown is deliberate, the
|
|
216
|
+
// canary must exit silently, not record a false server-death (the pkill
|
|
217
|
+
// below also matches the canary's cmdline — by design, it sweeps a stale
|
|
218
|
+
// canary along with the stale server; a signaled canary is already gone).
|
|
219
|
+
signalCanaryClean(sock, identity)
|
|
213
220
|
spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
|
|
214
221
|
tmux(sock, 'kill-server')
|
|
215
222
|
|
|
@@ -256,6 +263,13 @@ export const launch: LaunchFn = async (
|
|
|
256
263
|
// boot leaves a cause. No-op without cfg.exitLogDir (remain-on-exit off).
|
|
257
264
|
installExitHook(sock, identity, cfg.exitLogDir)
|
|
258
265
|
|
|
266
|
+
// (2.6) Server-death canary (canary.ts): the SERVER-level catcher pane-died
|
|
267
|
+
// structurally cannot be — a detached wait-for client OUTSIDE the server
|
|
268
|
+
// that records `ev=server-exit` + a forensics snapshot when the whole
|
|
269
|
+
// server dies dirty (SIGKILL/OOM class). Same gate as the exit hook;
|
|
270
|
+
// best-effort by construction (every failure → a state, never a throw).
|
|
271
|
+
ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
|
|
272
|
+
|
|
259
273
|
// (3) pipe-pane the session output to the per-identity log.
|
|
260
274
|
mkdirSync(cfg.logDir, { recursive: true, mode: 0o700 })
|
|
261
275
|
const paneLog = `${cfg.logDir}/${identity}.log`
|
package/src/lifecycle/index.ts
CHANGED
|
@@ -46,6 +46,7 @@ import {
|
|
|
46
46
|
type LaunchSpec,
|
|
47
47
|
} from '../launch/index.ts'
|
|
48
48
|
import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
|
|
49
|
+
import { ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
|
|
49
50
|
import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
|
|
50
51
|
|
|
51
52
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -379,6 +380,44 @@ export function writeTopic(cfg: LifecycleConfig, identity: string, topic: string
|
|
|
379
380
|
}
|
|
380
381
|
}
|
|
381
382
|
|
|
383
|
+
/**
 * Purge EVERY identity-keyed lifecycle artifact of `<identity>` from stateDir:
 * the marker files (`.stopped` / `.idle-reaped` / `.deaths` / `.topic` /
 * `.new-eager` / `.ephemeral-armed`), the supervise `.session`, the `.wake.lock`
 * and the M3 `.queue/` dir — everything matching `<identity>.*` (the dot
 * delimiter keeps `claude-bob` from ever matching `claude-bob2.*`).
 *
 * Consumer: `iapeer remove` (live defect, boris 10.06 cutover): a removed peer's
 * stale `.stopped`/`.idle-reaped` survived in state/lifecycle, and a NEWBORN peer
 * REUSING the personality inherited the dead namesake's parking — the daemon
 * refused to wake it (`mode=refused cause=stopped`). Identity-keyed state must
 * die with the registry record. Deliberately NOT called at birth: provision runs
 * on EXISTING peers too (init re-runs), and purging there would erase a parked
 * peer's `.idle-reaped` → its next wake comes up FRESH instead of RESUME
 * (violates the "no context loss on resume" guarantee).
 *
 * An empty `identity` purges nothing: the match prefix would degenerate to a
 * bare `.` and recursively delete every dot-entry in stateDir.
 *
 * @param cfg lifecycle config; only `stateDir` is read here
 * @param identity identity whose `<identity>.*` entries are removed
 * @returns the removed entry names (for the verb's output); never throws
 */
export function purgeIdentityState(cfg: LifecycleConfig, identity: string): string[] {
  const removed: string[] = []
  if (!identity) return removed // guard the destructive prefix match (see doc)
  let entries: string[]
  try {
    entries = readdirSync(cfg.stateDir)
  } catch {
    return removed // no state dir → nothing to purge
  }
  const prefix = `${identity}.`
  for (const name of entries) {
    if (!name.startsWith(prefix)) continue
    try {
      // recursive: the queue entry is a directory; force: tolerate an entry
      // that vanished between the listing and the removal
      rmSync(join(cfg.stateDir, name), { recursive: true, force: true })
      removed.push(name)
    } catch {
      /* best-effort — a locked/vanished entry must not fail the remove */
    }
  }
  return removed
}
|
|
420
|
+
|
|
382
421
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
383
422
|
// resolveWakeMode — the resume-vs-fresh decision (TARGET redesign). The DAEMON
|
|
384
423
|
// decides by the DEATH CAUSE it tracks (.idle-reaped marker), plus peer-type /
|
|
@@ -594,7 +633,8 @@ function sessionAlive(sock: string, identity: string): boolean {
|
|
|
594
633
|
* exits.log should carry the cause.
|
|
595
634
|
* - `server-dead` — the server itself is gone: the socket file is missing, or it
|
|
596
635
|
* exists but nothing serves it (stale socket — the SIGKILL/OOM class). pane-died
|
|
597
|
-
* could never fire
|
|
636
|
+
* could never fire; the server-death canary (launch/canary.ts) is the recorder
|
|
637
|
+
* for this class (`ev=server-exit` + forensics), this line is the backstop. */
|
|
598
638
|
export function classifyGoneSession(sock: string): { death: 'server-dead' | 'session-gone'; reason: string } {
|
|
599
639
|
if (!existsSync(sock)) {
|
|
600
640
|
return { death: 'server-dead', reason: 'tmux server gone (socket file missing)' }
|
|
@@ -603,7 +643,7 @@ export function classifyGoneSession(sock: string): { death: 'server-dead' | 'ses
|
|
|
603
643
|
// proves the server is alive and merely lost this session.
|
|
604
644
|
return tmux(sock, 'list-sessions').ok
|
|
605
645
|
? { death: 'session-gone', reason: 'session gone, tmux server alive (exit cause should be in exits.log)' }
|
|
606
|
-
: { death: 'server-dead', reason: 'tmux server dead — stale socket (SIGKILL/OOM class; exits.log
|
|
646
|
+
: { death: 'server-dead', reason: 'tmux server dead — stale socket (SIGKILL/OOM class; check exits.log ev=server-exit canary record)' }
|
|
607
647
|
}
|
|
608
648
|
|
|
609
649
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -958,6 +998,9 @@ export function killSession(sock: string, identity: string): void {
|
|
|
958
998
|
tmux(sock, 'kill-session', '-t', identity)
|
|
959
999
|
const sessions = tmux(sock, 'list-sessions', '-F', '#{session_name}').out
|
|
960
1000
|
if (!sessions.trim()) {
|
|
1001
|
+
// Deliberate server teardown → signal the server-death canary FIRST so it
|
|
1002
|
+
// exits silently instead of recording a false `ev=server-exit` (canary.ts).
|
|
1003
|
+
signalCanaryClean(sock, identity)
|
|
961
1004
|
tmux(sock, 'kill-server')
|
|
962
1005
|
try {
|
|
963
1006
|
rmSync(sock, { force: true })
|
|
@@ -1108,6 +1151,11 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
|
|
|
1108
1151
|
out.push({ identity: s.identity, action: 'reaped-idle', reason: `idle ${ageSecs}s` })
|
|
1109
1152
|
trace({ identity: s.identity, action: 'reaped-idle', age: `${ageSecs}s`, outcome: 'resume-eligible' })
|
|
1110
1153
|
} else {
|
|
1154
|
+
// Server-death canary retrofit (canary.ts): ensure a canary is watching
|
|
1155
|
+
// every ALIVE daemon-owned server — covers a fleet launched by older code
|
|
1156
|
+
// within one tick of a deploy, no session restarts. Idempotent (pgrep on
|
|
1157
|
+
// the per-identity wait-for channel), best-effort, pure observability.
|
|
1158
|
+
ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
|
|
1111
1159
|
out.push({ identity: s.identity, action: 'alive' })
|
|
1112
1160
|
if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
|
|
1113
1161
|
}
|