@agfpd/iapeer 0.2.25 → 0.2.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli/cli.test.ts +11 -0
- package/src/cli/index.ts +68 -1
- package/src/launch/canary.test.ts +93 -5
- package/src/launch/canary.ts +76 -16
- package/src/launch/index.ts +30 -7
- package/src/launch/launch.test.ts +8 -0
- package/src/launch/launchdRun.ts +4 -1
- package/src/lifecycle/index.ts +49 -4
- package/src/lifecycle/lifecycle.test.ts +63 -0
package/package.json
CHANGED
package/src/cli/cli.test.ts
CHANGED
|
@@ -130,6 +130,17 @@ describe('remove (registry record via the locked writer)', () => {
|
|
|
130
130
|
expect((await removePeerCli('twice', { env: e })).action).toBe('removed')
|
|
131
131
|
expect((await removePeerCli('twice', { env: e })).action).toBe('absent')
|
|
132
132
|
})
|
|
133
|
+
test('self-done arms the caller\'s own quiet-reap (non-waking silent finish); refuses without PEER_IDENTITY', async () => {
|
|
134
|
+
const e = env()
|
|
135
|
+
// no PEER_IDENTITY → self-call refusal
|
|
136
|
+
expect(await runCli(['self-done'], e)).toBe(1)
|
|
137
|
+
// with PEER_IDENTITY → marker set, exit 0, nobody contacted
|
|
138
|
+
const e2 = { ...e, PEER_IDENTITY: 'claude-silentworker' }
|
|
139
|
+
expect(await runCli(['self-done'], e2)).toBe(0)
|
|
140
|
+
const { hasEphemeralArmed } = await import('../lifecycle/index.ts')
|
|
141
|
+
expect(hasEphemeralArmed(loadLifecycleConfig(e2), 'claude-silentworker')).toBe(true)
|
|
142
|
+
})
|
|
143
|
+
|
|
133
144
|
test('purges identity-keyed lifecycle state with the record — a namesake newborn must not inherit a dead peer\'s parking (boris 10.06)', async () => {
|
|
134
145
|
await register('reborn')
|
|
135
146
|
const e = env()
|
package/src/cli/index.ts
CHANGED
|
@@ -30,11 +30,13 @@ import {
|
|
|
30
30
|
clearStopped,
|
|
31
31
|
folderLaunch,
|
|
32
32
|
isLaunchdManaged,
|
|
33
|
+
isEphemeralPeer,
|
|
33
34
|
isStopped,
|
|
34
35
|
killSession,
|
|
35
36
|
loadLifecycleConfig,
|
|
36
37
|
purgeIdentityState,
|
|
37
38
|
removeSessionState,
|
|
39
|
+
setEphemeralArmed,
|
|
38
40
|
setIdleReaped,
|
|
39
41
|
setNewEager,
|
|
40
42
|
setStopped,
|
|
@@ -348,8 +350,9 @@ export async function sendMessage(
|
|
|
348
350
|
// unawaited in-process wake would die with it; the daemon's supervise-tick
|
|
349
351
|
// drain scan (≤60 s) picks the queue up — the EXISTING retry path for failed
|
|
350
352
|
// kicks, not a new mechanism.
|
|
351
|
-
const { makeEphemeralRouteDeps } = await import('../daemon/main.ts')
|
|
353
|
+
const { makeArmEphemeralOnDelivered, makeEphemeralRouteDeps } = await import('../daemon/main.ts')
|
|
352
354
|
const cfg = loadLifecycleConfig(env)
|
|
355
|
+
const t0 = Date.now()
|
|
353
356
|
const result = await routeSend(
|
|
354
357
|
caller,
|
|
355
358
|
{
|
|
@@ -361,7 +364,39 @@ export async function sendMessage(
|
|
|
361
364
|
},
|
|
362
365
|
{ wake: cliWake, ephemeral: makeEphemeralRouteDeps(cfg, env, () => {}) },
|
|
363
366
|
)
|
|
367
|
+
// delivery.log sink — CLI-path parity (boris's observability gap 10.06: enqueues
|
|
368
|
+
// routed through the CLI left to=<peer> at ZERO for the day while real wakes
|
|
369
|
+
// happened; the daemon tool-path logs, this path was blind). Same fields, plus
|
|
370
|
+
// path=cli so the two entry points are distinguishable. Both branches logged.
|
|
371
|
+
const { appendDeliveryEvent } = await import('../daemon/deliverylog.ts')
|
|
372
|
+
appendDeliveryEvent(cfg.eventLogDir, {
|
|
373
|
+
ev: 'delivery',
|
|
374
|
+
path: 'cli',
|
|
375
|
+
caller: caller.address,
|
|
376
|
+
to: opts.target,
|
|
377
|
+
rt: opts.runtime,
|
|
378
|
+
ok: String(result.ok),
|
|
379
|
+
via: result.ok ? `${result.value.delivered_to.runtime}-${result.value.delivered_to.personality}` : undefined,
|
|
380
|
+
woke: result.ok ? String(result.value.woke) : undefined,
|
|
381
|
+
queued: result.ok && result.value.queued ? 'true' : undefined,
|
|
382
|
+
qd: result.ok ? result.value.queueDepth : undefined,
|
|
383
|
+
ms: Date.now() - t0,
|
|
384
|
+
len: opts.message.length,
|
|
385
|
+
att: opts.attachments?.length || undefined,
|
|
386
|
+
topic: opts.topic,
|
|
387
|
+
err: result.ok ? undefined : result.error.message,
|
|
388
|
+
})
|
|
364
389
|
if (!result.ok) throw new Error(result.error.message)
|
|
390
|
+
// M2 arm-on-outbound — CLI-path parity (live gap 10.06: an ephemeral worker's
|
|
391
|
+
// final reply sent through the CLI fallback — e.g. inside a daemon-restart
|
|
392
|
+
// window, four deploys that day — never armed, so the worker idled to the
|
|
393
|
+
// unarmed bound and stalled its FIFO). Same hook the daemon path uses; ONLY on
|
|
394
|
+
// an ok outcome, errors swallowed (arming is best-effort, never fails the send).
|
|
395
|
+
try {
|
|
396
|
+
makeArmEphemeralOnDelivered(cfg)(caller)
|
|
397
|
+
} catch {
|
|
398
|
+
/* best-effort */
|
|
399
|
+
}
|
|
365
400
|
return {
|
|
366
401
|
ok: true,
|
|
367
402
|
delivered_to: result.value.delivered_to,
|
|
@@ -432,6 +467,7 @@ const USAGE = `usage: iapeer <verb> [args]
|
|
|
432
467
|
interrupt <peer> [runtime] interrupt the current turn (Escape) — context intact
|
|
433
468
|
compact <peer> [runtime] compact the peer's context (/compact)
|
|
434
469
|
self-fresh (agent self-call) mark /new eager-fresh + self-kill — the daemon relaunches fresh
|
|
470
|
+
self-done (agent self-call, ephemeral) silent finish: arm own quiet-reap, wake no one
|
|
435
471
|
native-memory <off|on> (--peer <p> | --all) gate/restore runtimes' native memory (canonized lever; контракт «Слот памяти»)
|
|
436
472
|
memory-plugin <on|off> (--peer <p> | --all) install/remove the slot-declared provider plugin (claude per-peer, codex host-global)
|
|
437
473
|
`
|
|
@@ -913,6 +949,37 @@ export async function runCli(argv: string[], env: NodeJS.ProcessEnv = process.en
|
|
|
913
949
|
if (!positionals[0] || !positionals[1]) return usage(errOut)
|
|
914
950
|
return await runAlwaysOn(positionals[0], positionals[1], process.cwd())
|
|
915
951
|
}
|
|
952
|
+
case 'self-done': {
|
|
953
|
+
// SILENT-FINISH self-call for an ephemeral worker (контракт ЖЦ §wake_policy;
|
|
954
|
+
// развилка boris 10.06): a worker whose task produced NOTHING to send must
|
|
955
|
+
// still release its M3 FIFO — but an EMPTY report would violate Артур's
|
|
956
|
+
// invariant «событие-всё-отфильтровано = тишина» (no empty wakes of the
|
|
957
|
+
// target). This verb is the non-waking arm: it sets the worker's OWN
|
|
958
|
+
// .ephemeral-armed (same marker the ok-outbound hook sets), so the quiet
|
|
959
|
+
// window reaps it within seconds and the drain feeds the next task — nobody
|
|
960
|
+
// is woken. Doctrine for silent finishers: «нечего отправлять → iapeer
|
|
961
|
+
// self-done вместо ответа». The unarmed idle bound (ephemeralUnarmedIdleSecs)
|
|
962
|
+
// remains the backstop for workers that do neither. On a NON-ephemeral peer
|
|
963
|
+
// the marker is inert (quiet-reap keys on wake_policy) — warn, exit 0.
|
|
964
|
+
const identity = env.PEER_IDENTITY?.trim()
|
|
965
|
+
if (!identity) {
|
|
966
|
+
errOut('self-done: PEER_IDENTITY is not set — this verb is an agent self-call from inside a session\n')
|
|
967
|
+
return 1
|
|
968
|
+
}
|
|
969
|
+
if (!parseSessionName(identity)) {
|
|
970
|
+
errOut(`self-done: invalid PEER_IDENTITY "${identity}" — expected <runtime>-<personality>\n`)
|
|
971
|
+
return 1
|
|
972
|
+
}
|
|
973
|
+
const cfg = loadLifecycleConfig(env)
|
|
974
|
+
setEphemeralArmed(cfg, identity)
|
|
975
|
+
const ephemeral = isEphemeralPeer(process.cwd())
|
|
976
|
+
out(
|
|
977
|
+
`self-done: armed ${identity} for the quiet-window reap (no one woken)` +
|
|
978
|
+
(ephemeral ? '' : ' — NOTE: this peer is not wake_policy:ephemeral, the marker is inert') +
|
|
979
|
+
'\n',
|
|
980
|
+
)
|
|
981
|
+
return 0
|
|
982
|
+
}
|
|
916
983
|
case 'self-fresh': {
|
|
917
984
|
// /new AGENT-FACING TRIGGER (TARGET redesign). Run BY the agent itself as the
|
|
918
985
|
// FINAL step of a /new graceful wind-down (the owner triggers it via a per-peer
|
package/src/launch/canary.test.ts
CHANGED
|
@@ -10,7 +10,9 @@ import { tmpdir } from 'os'
|
|
|
10
10
|
import { join } from 'path'
|
|
11
11
|
import {
|
|
12
12
|
canaryChannel,
|
|
13
|
+
canaryProcessPattern,
|
|
13
14
|
canaryScript,
|
|
15
|
+
dismissCanary,
|
|
14
16
|
ensureServerCanary,
|
|
15
17
|
exitLogPath,
|
|
16
18
|
serverDeathsDir,
|
|
@@ -53,7 +55,7 @@ describe('canary script (pure)', () => {
|
|
|
53
55
|
expect(exitLogPath('/r/logs/iapeer')).toBe('/r/logs/iapeer/exits.log')
|
|
54
56
|
})
|
|
55
57
|
|
|
56
|
-
test('script carries the protocol: wait-for channel,
|
|
58
|
+
test('script carries the v2 protocol: wait-for channel, deliberate-silence guards, liveness probe, record line, forensics', () => {
|
|
57
59
|
const s = canaryScript({
|
|
58
60
|
identity: 'claude-bob',
|
|
59
61
|
sock: '/tmp/x.sock',
|
|
@@ -62,8 +64,15 @@ describe('canary script (pure)', () => {
|
|
|
62
64
|
forensicsDir: '/r/logs/iapeer/server-deaths',
|
|
63
65
|
})
|
|
64
66
|
expect(s).toContain(`wait-for 'iap-canary-claude-bob'`) // the blocking client
|
|
65
|
-
expect(s).toContain(`trap 'exit 0' HUP INT TERM`) //
|
|
66
|
-
|
|
67
|
+
expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // dismissed sh = silent (the ONLY deliberate silencer)
|
|
68
|
+
// v2: NO exit code is trusted as deliberate (a TERMed client returns rc=0 —
|
|
69
|
+
// proven live); the SERVER's liveness decides, after a dismissal grace sleep.
|
|
70
|
+
expect(s).not.toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0`) // the v1 hole
|
|
71
|
+
expect(s).toContain('sleep 2') // dismissal grace window
|
|
72
|
+
expect(s).toContain(`has-session 2>/dev/null; then exit 0`) // server alive → nothing to record
|
|
73
|
+
expect(s).toContain('cause=server-vanished') // connection drop (SIGKILL/OOM class)
|
|
74
|
+
expect(s).toContain('cause=signaled-server-gone') // rc=0: channel/client-TERM, server died
|
|
75
|
+
expect(s).toContain('cause=client-killed-server-gone') // rc≥128: client took a hard kill
|
|
67
76
|
expect(s).toContain('ev=server-exit identity=claude-bob') // the exits.log record
|
|
68
77
|
expect(s).toContain(`>> '/r/logs/iapeer/exits.log'`)
|
|
69
78
|
expect(s).toContain('/r/logs/iapeer/server-deaths/claude-bob') // forensics file
|
|
@@ -74,6 +83,19 @@ describe('canary script (pure)', () => {
|
|
|
74
83
|
test('ensureServerCanary without exitLogDir → skipped (observability off)', () => {
|
|
75
84
|
expect(ensureServerCanary({ identity: 'claude-none', sock: '/tmp/none.sock' })).toBe('skipped')
|
|
76
85
|
})
|
|
86
|
+
|
|
87
|
+
test('canaryProcessPattern is identity-anchored (no prefix bleed) and matches both canary processes', () => {
|
|
88
|
+
const p = canaryProcessPattern('claude-iap')
|
|
89
|
+
expect(p).toBe('iap-canary-claude-iap([^a-z0-9-]|$)')
|
|
90
|
+
const re = new RegExp(p)
|
|
91
|
+
expect(re.test(`/opt/homebrew/bin/tmux -S /tmp/x.sock wait-for iap-canary-claude-iap`)).toBe(true) // client argv
|
|
92
|
+
expect(re.test(`/bin/sh -c ... wait-for 'iap-canary-claude-iap'\n...`)).toBe(true) // sh -c script (quoted channel)
|
|
93
|
+
expect(re.test(`tmux wait-for iap-canary-claude-iap-memory`)).toBe(false) // prefix identity must NOT match
|
|
94
|
+
})
|
|
95
|
+
|
|
96
|
+
test('dismissCanary is a harmless no-op when no canary runs', () => {
|
|
97
|
+
expect(() => dismissCanary('claude-nobody-here')).not.toThrow()
|
|
98
|
+
})
|
|
77
99
|
})
|
|
78
100
|
|
|
79
101
|
describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
|
|
@@ -90,11 +112,22 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
|
|
|
90
112
|
return { sock, logDir: join(dir, `logs-${identity}`) }
|
|
91
113
|
}
|
|
92
114
|
|
|
115
|
+
const allIds = [
|
|
116
|
+
'claude-canadirty',
|
|
117
|
+
'claude-canaclean',
|
|
118
|
+
'claude-canakill',
|
|
119
|
+
'notifier-canatear',
|
|
120
|
+
'claude-canasweep',
|
|
121
|
+
'claude-canaclient',
|
|
122
|
+
]
|
|
123
|
+
|
|
93
124
|
afterAll(() => {
|
|
94
125
|
for (const sock of socks) {
|
|
95
|
-
// teardown is DELIBERATE →
|
|
96
|
-
|
|
126
|
+
// teardown is DELIBERATE → silence each canary (signal + dismiss) before
|
|
127
|
+
// killing its server — the v2 contract for every deliberate path.
|
|
128
|
+
for (const id of allIds) {
|
|
97
129
|
signalCanaryClean(sock, id)
|
|
130
|
+
dismissCanary(id)
|
|
98
131
|
}
|
|
99
132
|
spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
|
|
100
133
|
}
|
|
@@ -197,4 +230,59 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
|
|
|
197
230
|
},
|
|
198
231
|
20000,
|
|
199
232
|
)
|
|
233
|
+
|
|
234
|
+
test(
|
|
235
|
+
'death-#4 shape: external killer sweeps server AND canary client → ev=server-exit cause=signaled-server-gone',
|
|
236
|
+
async () => {
|
|
237
|
+
const identity = 'claude-canasweep'
|
|
238
|
+
const { sock, logDir } = bringUp(identity)
|
|
239
|
+
const pid = serverPid(sock)
|
|
240
|
+
expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
|
|
241
|
+
expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
|
|
242
|
+
await sleep(500)
|
|
243
|
+
|
|
244
|
+
// The pre-clean-shaped external killer: one pattern takes the server AND
|
|
245
|
+
// the canary CLIENT (both argv contain `tmux -S <sock> `), the sh recorder
|
|
246
|
+
// survives. The TERMed client returns rc=0 (proven live) — v1 read that as
|
|
247
|
+
// a clean channel signal and stayed silent; exactly how deaths #4–#6
|
|
248
|
+
// (10.06) left zero records. v2 probes the server instead: dead → record.
|
|
249
|
+
spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
|
|
250
|
+
const log = exitLogPath(logDir)
|
|
251
|
+
expect(
|
|
252
|
+
await waitFor(() => existsSync(log) && readFileSync(log, 'utf8').includes('ev=server-exit'), 10000),
|
|
253
|
+
).toBe(true)
|
|
254
|
+
const line = readFileSync(log, 'utf8')
|
|
255
|
+
expect(line).toContain(`ev=server-exit identity=${identity}`)
|
|
256
|
+
expect(line).toContain('cause=signaled-server-gone') // rc=0 shape, server found dead
|
|
257
|
+
expect(line).toContain(`server_pid=${pid}`)
|
|
258
|
+
expect(readdirSync(serverDeathsDir(logDir)).length).toBe(1) // forensics captured
|
|
259
|
+
},
|
|
260
|
+
20000,
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
test(
|
|
264
|
+
'canary client killed while server lives → silent (no false record), canary gone for the retrofit to re-arm',
|
|
265
|
+
async () => {
|
|
266
|
+
const identity = 'claude-canaclient'
|
|
267
|
+
const { sock, logDir } = bringUp(identity)
|
|
268
|
+
expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
|
|
269
|
+
expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
|
|
270
|
+
await sleep(500)
|
|
271
|
+
|
|
272
|
+
// TERM the CLIENT only (argv ends with the bare channel — the sh's quoted
|
|
273
|
+
// form does not match this $-anchored pattern), server stays up.
|
|
274
|
+
spawnSync('pkill', ['-f', `wait-for ${canaryChannel(identity)}$`], { stdio: 'ignore' })
|
|
275
|
+
// the sh probes (≈2 s), finds the server ALIVE → exits silently
|
|
276
|
+
expect(await waitFor(() => !canaryRunning(identity), 6000)).toBe(true)
|
|
277
|
+
await sleep(300)
|
|
278
|
+
expect(
|
|
279
|
+
existsSync(exitLogPath(logDir)) && readFileSync(exitLogPath(logDir), 'utf8').includes('ev=server-exit'),
|
|
280
|
+
).toBe(false) // no false server-death while the server lives
|
|
281
|
+
// server must still be alive — and ensure re-arms a fresh canary (the
|
|
282
|
+
// retrofit path; in prod the supervise tick does this and logs it)
|
|
283
|
+
expect(spawnSync('tmux', ['-S', sock, 'has-session', '-t', identity], { stdio: 'ignore' }).status).toBe(0)
|
|
284
|
+
expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
|
|
285
|
+
},
|
|
286
|
+
20000,
|
|
287
|
+
)
|
|
200
288
|
})
|
package/src/launch/canary.ts
CHANGED
|
@@ -8,17 +8,32 @@
|
|
|
8
8
|
// Mechanism: one tiny detached `sh` per LIVE tmux server, holding a blocking
|
|
9
9
|
// client `tmux -S <sock> wait-for iap-canary-<identity>` — a process OUTSIDE the
|
|
10
10
|
// dying server, connected via its socket, so the server's death (any cause,
|
|
11
|
-
// including SIGKILL) is observed the moment the connection drops. Protocol
|
|
12
|
-
//
|
|
13
|
-
//
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
// •
|
|
17
|
-
//
|
|
18
|
-
//
|
|
19
|
-
// (
|
|
20
|
-
//
|
|
11
|
+
// including SIGKILL) is observed the moment the connection drops. Protocol (v2 —
|
|
12
|
+
// death #4 10.06 15:42Z proved v1's exit-code trust wrong twice over: the killer
|
|
13
|
+
// took the SERVER and the canary CLIENT together, AND a TERMed tmux client
|
|
14
|
+
// returns rc=0 — indistinguishable from a clean channel signal — so the rc-based
|
|
15
|
+
// `0 || ≥128 → silent` guard silenced a real death):
|
|
16
|
+
// • DELIBERATE silence is an explicit act, never an exit-code inference:
|
|
17
|
+
// every deliberate teardown (idle-reap / stop / pre-clean / pane-died hook /
|
|
18
|
+
// bootout teardown) signals the channel (`wait-for -S`) AND dismisses the
|
|
19
|
+
// sh recorder (`dismissCanary` → TERM → trap → silent exit). POSIX trap
|
|
20
|
+
// semantics make this race-free: the trap runs before any recording.
|
|
21
|
+
// • When wait-for returns — ANY code — the script sleeps 2 s (a concurrent
|
|
22
|
+
// dismissal TERM wins here), then probes SERVER LIVENESS: alive → exit
|
|
23
|
+
// silently (a lost canary is re-armed and logged by the supervise retrofit
|
|
24
|
+
// within a tick); dead with nobody having dismissed us → the death is real
|
|
25
|
+
// and unclaimed → ONE logfmt line `ev=server-exit` into exits.log (the
|
|
26
|
+
// per-peer death-cause home, next to pane-died's `ev=session-exit`) + a
|
|
27
|
+
// forensics snapshot (vm_stat / swap / top-RSS ps / fresh DiagnosticReports)
|
|
28
|
+
// captured within seconds — the evidence the 60 s supervise tick can never
|
|
21
29
|
// recover ("системных следов ноль" was the recurring investigation outcome).
|
|
30
|
+
// The raw wait_rc still ATTRIBUTES the death (cause=server-vanished /
|
|
31
|
+
// signaled-server-gone / client-killed-server-gone).
|
|
32
|
+
// Residual blind spot (structural): a killer that SIGKILLs the sh recorder
|
|
33
|
+
// itself leaves no in-process way to record. With v2 the ABSENCE of a record on
|
|
34
|
+
// a server-dead reap narrows the diagnosis to exactly that shape; the canary
|
|
35
|
+
// ensure-state lines in lifecycle.log (origin=launch/retrofit) evidence the
|
|
36
|
+
// churn post-hoc.
|
|
22
37
|
//
|
|
23
38
|
// The canary is pure observability: it never wakes, reaps, restarts or otherwise
|
|
24
39
|
// manages anything (H4-compatible by construction), it fires at most once, and
|
|
@@ -69,24 +84,45 @@ export function canaryScript(o: CanaryScriptOptions): string {
|
|
|
69
84
|
return [
|
|
70
85
|
// Server PID captured while alive — the postmortem grep key for system logs.
|
|
71
86
|
`SPID="$('${o.tmuxBin}' -S '${o.sock}' display-message -p '#{pid}' 2>/dev/null)"`,
|
|
72
|
-
// A signal to the
|
|
87
|
+
// A signal to the SH WRAPPER is deliberate dismissal (dismissCanary) or host
|
|
88
|
+
// shutdown → silent. POSIX: the trap runs after the foreground command
|
|
89
|
+
// completes — so a TERM delivered during wait-for/sleep always exits us
|
|
90
|
+
// BEFORE any recording below (the race-free deliberate-silence guarantee).
|
|
73
91
|
`trap 'exit 0' HUP INT TERM`,
|
|
74
92
|
`'${o.tmuxBin}' -S '${o.sock}' wait-for '${ch}'`,
|
|
75
93
|
`rc=$?`,
|
|
76
|
-
//
|
|
77
|
-
|
|
78
|
-
//
|
|
94
|
+
// NO wait-for exit code is trusted as "deliberate" by itself — PROVEN live
|
|
95
|
+
// (death-#4 postmortem): a TERM to the tmux CLIENT returns rc=0, identical
|
|
96
|
+
// to a clean channel signal, so an external killer sweeping server+client
|
|
97
|
+
// rides the clean-looking code straight past any rc-based guard (v1's
|
|
98
|
+
// `rc=0 || rc>=128 → silent` was exactly that hole). v2 contract instead:
|
|
99
|
+
// • deliberate teardowns DISMISS this sh (TERM → trap above) — the sleep
|
|
100
|
+
// below gives a concurrently-delivered dismissal time to win;
|
|
101
|
+
// • then the SERVER's liveness, not the exit code, decides: alive →
|
|
102
|
+
// nothing to record (a lost canary is re-armed and logged by the
|
|
103
|
+
// supervise retrofit within a tick); dead and nobody dismissed us →
|
|
104
|
+
// the death is real and unclaimed → record it.
|
|
105
|
+
`sleep 2`,
|
|
106
|
+
`if '${o.tmuxBin}' -S '${o.sock}' has-session 2>/dev/null; then exit 0; fi`,
|
|
107
|
+
// The exit code still ATTRIBUTES the recorded death (raw wait_rc is kept):
|
|
108
|
+
// rc=0 → signaled-server-gone (channel signal or client-TERM, server died)
|
|
109
|
+
// rc≥128 → client-killed-server-gone (client took a non-TERM kill)
|
|
110
|
+
// else → server-vanished (connection drop — SIGKILL/OOM class)
|
|
111
|
+
`cause=server-vanished`,
|
|
112
|
+
`if [ "$rc" -eq 0 ]; then cause=signaled-server-gone; fi`,
|
|
113
|
+
`if [ "$rc" -ge 128 ]; then cause=client-killed-server-gone; fi`,
|
|
114
|
+
// The server is gone under us — record, within seconds of the death.
|
|
79
115
|
`ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"`,
|
|
80
116
|
`f='${o.forensicsDir}/${o.identity}'-"$(date +%s)".txt`,
|
|
81
117
|
`{`,
|
|
82
|
-
` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc"`,
|
|
118
|
+
` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc cause=$cause"`,
|
|
83
119
|
` echo '--- vm_stat'; vm_stat`,
|
|
84
120
|
` echo '--- swapusage'; sysctl -n vm.swapusage`,
|
|
85
121
|
` echo '--- ps-top-rss'; ps axo pid,ppid,rss,etime,command | sort -rn -k3 | head -25`,
|
|
86
122
|
` echo '--- diagnosticreports-user'; ls -t "$HOME/Library/Logs/DiagnosticReports" 2>/dev/null | head`,
|
|
87
123
|
` echo '--- diagnosticreports-system'; ls -t /Library/Logs/DiagnosticReports 2>/dev/null | head`,
|
|
88
124
|
`} > "$f" 2>&1`,
|
|
89
|
-
`printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$f" >> '${o.exitLogFile}'`,
|
|
125
|
+
`printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s cause=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$cause" "$f" >> '${o.exitLogFile}'`,
|
|
90
126
|
].join('\n')
|
|
91
127
|
}
|
|
92
128
|
|
|
@@ -150,3 +186,27 @@ export function signalCanaryClean(sock: string, identity: string): void {
|
|
|
150
186
|
/* best-effort */
|
|
151
187
|
}
|
|
152
188
|
}
|
|
189
|
+
|
|
190
|
+
/** The pgrep/pkill ERE matching BOTH canary processes of ONE identity — the sh
|
|
191
|
+
* wrapper (its -c script quotes the channel: `…'iap-canary-<id>'…`) and the
|
|
192
|
+
* tmux client (argv ends with the bare channel). Anchored so an identity can
|
|
193
|
+
* never match another identity's prefix (claude-iapeer ≠ claude-iapeer-memory). */
|
|
194
|
+
export function canaryProcessPattern(identity: string): string {
|
|
195
|
+
return `${canaryChannel(identity)}([^a-z0-9-]|$)`
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Dismiss this identity's canary BEFORE a deliberate server teardown: TERM the
|
|
200
|
+
* sh wrapper (trap → silent exit, race-free — the trap always runs before the
|
|
201
|
+
* v2 recording branch) and the tmux client. The explicit counterpart of the
|
|
202
|
+
* channel signal: with v2 a signaled CLIENT alone is no longer read as
|
|
203
|
+
* deliberate, so every deliberate path must dismiss the RECORDER (the sh).
|
|
204
|
+
* Best-effort: no canary running → harmless no-op (pkill exits 1).
|
|
205
|
+
*/
|
|
206
|
+
export function dismissCanary(identity: string): void {
|
|
207
|
+
try {
|
|
208
|
+
spawnSync('pkill', ['-f', canaryProcessPattern(identity)], { stdio: 'ignore' })
|
|
209
|
+
} catch {
|
|
210
|
+
/* best-effort */
|
|
211
|
+
}
|
|
212
|
+
}
|
package/src/launch/index.ts
CHANGED
|
@@ -19,7 +19,8 @@ import { dirname } from 'path'
|
|
|
19
19
|
import { spawnSync } from 'child_process'
|
|
20
20
|
import { CODEX_BEARER_ENV_VAR, CODEX_DUMMY_BEARER, type Runtime } from '../core/constants.ts'
|
|
21
21
|
import { readLaunchEnv } from '../storage/index.ts'
|
|
22
|
-
import { ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
|
|
22
|
+
import { canaryChannel, dismissCanary, ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
|
|
23
|
+
import { appendLifecycleEvent } from '../lifecycle/eventlog.ts'
|
|
23
24
|
import { claudeAdapter } from './adapters/claude.ts'
|
|
24
25
|
import { codexAdapter } from './adapters/codex.ts'
|
|
25
26
|
import { telegramAdapter } from './adapters/telegram.ts'
|
|
@@ -155,7 +156,18 @@ export function exitCauseHook(identity: string, exitLogFile: string): string {
|
|
|
155
156
|
`dead_status=#{pane_dead_status} dead_signal=#{pane_dead_signal}\\n`
|
|
156
157
|
const log =
|
|
157
158
|
`printf "${line}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "${exitLogFile}"`
|
|
158
|
-
|
|
159
|
+
// Silence the server-death canary BEFORE kill-session: with a single-session
|
|
160
|
+
// server the kill empties the server and exit-empty takes it down — without
|
|
161
|
+
// explicit silence the canary would append a second, muddier `ev=server-exit`
|
|
162
|
+
// record for a death this very hook just captured (the session-exit line is
|
|
163
|
+
// the richer, authoritative one: it has the exit code/signal). v2 contract:
|
|
164
|
+
// deliberate = channel signal (tmux-NATIVE wait-for -S) + DISMISS the sh
|
|
165
|
+
// recorder (abs-path /usr/bin/pkill — survives the minimal launchd PATH;
|
|
166
|
+
// `\$` keeps the regex anchor literal through the sh double-quote layer; the
|
|
167
|
+
// `[i]` class keeps the pattern from matching its OWN occurrence in this
|
|
168
|
+
// hook-sh's cmdline — the pgrep self-match classic).
|
|
169
|
+
const dismiss = `/usr/bin/pkill -f "[i]${canaryChannel(identity).slice(1)}([^a-z0-9-]|\\$)"`
|
|
170
|
+
return `run-shell '${log} ; ${dismiss}' ; wait-for -S "${canaryChannel(identity)}" ; kill-session -t "${identity}"`
|
|
159
171
|
}
|
|
160
172
|
|
|
161
173
|
/** Install the exit-cause observability on a freshly-created session: ensure the
|
|
@@ -212,11 +224,15 @@ export const launch: LaunchFn = async (
|
|
|
212
224
|
mkdirSync(dirname(sock), { recursive: true })
|
|
213
225
|
|
|
214
226
|
// (1) Pre-clean any stale tmux server on this socket, then launch detached.
|
|
215
|
-
//
|
|
216
|
-
//
|
|
217
|
-
//
|
|
218
|
-
//
|
|
227
|
+
// Silence the server-death canary EXPLICITLY first — this teardown is
|
|
228
|
+
// deliberate: signal the channel (client exits 0) AND dismiss the sh
|
|
229
|
+
// recorder (TERM → trap → silent; canary v2 no longer reads a signaled
|
|
230
|
+
// client alone as deliberate — an external killer sweeping server+client
|
|
231
|
+
// was exactly the death-#4 silence). Only then sweep the server processes
|
|
232
|
+
// (the pkill also matches a leftover canary client — harmless, both
|
|
233
|
+
// canary processes are already dismissed).
|
|
219
234
|
signalCanaryClean(sock, identity)
|
|
235
|
+
dismissCanary(identity)
|
|
220
236
|
spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
|
|
221
237
|
tmux(sock, 'kill-server')
|
|
222
238
|
|
|
@@ -268,7 +284,14 @@ export const launch: LaunchFn = async (
|
|
|
268
284
|
// that records `ev=server-exit` + a forensics snapshot when the whole
|
|
269
285
|
// server dies dirty (SIGKILL/OOM class). Same gate as the exit hook;
|
|
270
286
|
// best-effort by construction (every failure → a state, never a throw).
|
|
271
|
-
|
|
287
|
+
// The ensure-state is logged (origin=launch) so the arming trail is
|
|
288
|
+
// complete in lifecycle.log: 'spawned' is the newborn-server norm here,
|
|
289
|
+
// 'failed' a newborn server starting its life UNWATCHED (death-#4
|
|
290
|
+
// postmortem hinged on this very question being unanswerable).
|
|
291
|
+
const canaryState = ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
|
|
292
|
+
if (canaryState !== 'skipped') {
|
|
293
|
+
appendLifecycleEvent(cfg.exitLogDir, { ev: 'canary', identity, state: canaryState, origin: 'launch' }, { env })
|
|
294
|
+
}
|
|
272
295
|
|
|
273
296
|
// (3) pipe-pane the session output to the per-identity log.
|
|
274
297
|
mkdirSync(cfg.logDir, { recursive: true, mode: 0o700 })
|
package/src/launch/launch.test.ts
CHANGED
|
@@ -227,6 +227,14 @@ describe('exitCauseHook (exit-cause observability)', () => {
|
|
|
227
227
|
// tmux-NATIVE kill-session (no shell `tmux`) → needs no PATH (launchd minimal env).
|
|
228
228
|
expect(hook).toContain('kill-session -t "claude-iapeer"')
|
|
229
229
|
})
|
|
230
|
+
test('silences the server-death canary (native wait-for -S) BEFORE kill-session — no double record', () => {
|
|
231
|
+
// kill-session on a single-session server → exit-empty takes the server down;
|
|
232
|
+
// without the signal the canary would add a second `ev=server-exit` record for
|
|
233
|
+
// a death this hook just captured (session-exit carries the code/signal).
|
|
234
|
+
expect(hook).toContain('wait-for -S "iap-canary-claude-iapeer"')
|
|
235
|
+
expect(hook.indexOf('wait-for -S')).toBeLessThan(hook.indexOf('kill-session'))
|
|
236
|
+
expect(hook.indexOf('run-shell')).toBeLessThan(hook.indexOf('wait-for -S')) // log first
|
|
237
|
+
})
|
|
230
238
|
test('quoting: single-quoted run-shell arg (tmux layer) wrapping double-quoted sh', () => {
|
|
231
239
|
expect(hook).toMatch(/run-shell '.*'/)
|
|
232
240
|
expect(hook).not.toContain("''") // no empty/again-collapsed single-quote pair
|
package/src/launch/launchdRun.ts
CHANGED
|
@@ -23,7 +23,7 @@ import { buildProcessAddress, buildSocketPath } from '../core/socket.ts'
|
|
|
23
23
|
import { peerLogsDir, pluginLogsDir } from '../storage/index.ts'
|
|
24
24
|
import { readPeerProfile } from '../identity/index.ts'
|
|
25
25
|
import { getAdapter, launch } from './index.ts'
|
|
26
|
-
import { signalCanaryClean } from './canary.ts'
|
|
26
|
+
import { dismissCanary, signalCanaryClean } from './canary.ts'
|
|
27
27
|
import type { LaunchConfig, LaunchSpec } from './types.ts'
|
|
28
28
|
|
|
29
29
|
/** Block-watch poll cadence — seconds, deliberately NOT a tight loop (the session
|
|
@@ -55,7 +55,10 @@ function sessionAlive(sock: string, identity: string): boolean {
|
|
|
55
55
|
* both paths now converge on the same end state.
|
|
56
56
|
*/
|
|
57
57
|
export function teardownAlwaysOnSession(sock: string, identity: string): void {
|
|
58
|
+
// Explicit canary silence (v2): channel signal (client exits 0) + dismiss the
|
|
59
|
+
// sh recorder — a signaled client alone is no longer read as deliberate.
|
|
58
60
|
signalCanaryClean(sock, identity)
|
|
61
|
+
dismissCanary(identity)
|
|
59
62
|
spawnSync('tmux', ['-S', sock, 'kill-session', '-t', identity], { stdio: 'ignore' })
|
|
60
63
|
const ls = spawnSync('tmux', ['-S', sock, 'list-sessions', '-F', '#{session_name}'], { encoding: 'utf8' })
|
|
61
64
|
if (!(ls.stdout ?? '').trim()) {
|
package/src/lifecycle/index.ts
CHANGED
|
@@ -46,7 +46,7 @@ import {
|
|
|
46
46
|
type LaunchSpec,
|
|
47
47
|
} from '../launch/index.ts'
|
|
48
48
|
import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
|
|
49
|
-
import { ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
|
|
49
|
+
import { dismissCanary, ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
|
|
50
50
|
import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
|
|
51
51
|
|
|
52
52
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -79,6 +79,16 @@ export interface LifecycleConfig {
|
|
|
79
79
|
* transcript mtime is a LIVENESS proxy — "no longer writing" — not a semantic
|
|
80
80
|
* "done" signal. */
|
|
81
81
|
ephemeralQuietSecs: number
|
|
82
|
+
/** wake_policy:ephemeral — the UNARMED idle bound (seconds): an ephemeral session
|
|
83
|
+
* that never armed (finished silently / lost its arm to a daemon-restart window)
|
|
84
|
+
* is reaped after this much activity-proxy silence. Live case (scriber 10.06):
|
|
85
|
+
* a worker that ended quietly without its final outbound stalled its M3 FIFO for
|
|
86
|
+
* the FULL generic idleSecs (1 h) — with serial drain that blocks the whole
|
|
87
|
+
* conveyor per silent worker. Bound chosen ≫ the legitimate silent-tool case
|
|
88
|
+
* (sleep-180) and ≪ idleSecs. DOCUMENTED RISK: an ephemeral worker whose tool
|
|
89
|
+
* stays silent longer than this MID-TASK is reaped and its consumed queue item
|
|
90
|
+
* is lost — ephemeral workers must emit activity (or their reply) within it. */
|
|
91
|
+
ephemeralUnarmedIdleSecs: number
|
|
82
92
|
}
|
|
83
93
|
|
|
84
94
|
export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): LifecycleConfig {
|
|
@@ -102,6 +112,7 @@ export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): Lifec
|
|
|
102
112
|
crashLoopMax: num(env.IAPEER_CRASHLOOP_MAX, 3),
|
|
103
113
|
crashLoopWindowSecs: num(env.IAPEER_CRASHLOOP_WINDOW_SECS, 300),
|
|
104
114
|
ephemeralQuietSecs: num(env.IAPEER_EPHEMERAL_QUIET_SECS, 20),
|
|
115
|
+
ephemeralUnarmedIdleSecs: num(env.IAPEER_EPHEMERAL_UNARMED_IDLE_SECS, 600),
|
|
105
116
|
}
|
|
106
117
|
}
|
|
107
118
|
|
|
@@ -998,9 +1009,11 @@ export function killSession(sock: string, identity: string): void {
|
|
|
998
1009
|
tmux(sock, 'kill-session', '-t', identity)
|
|
999
1010
|
const sessions = tmux(sock, 'list-sessions', '-F', '#{session_name}').out
|
|
1000
1011
|
if (!sessions.trim()) {
|
|
1001
|
-
// Deliberate server teardown →
|
|
1002
|
-
// exits
|
|
1012
|
+
// Deliberate server teardown → silence the canary EXPLICITLY first: signal
|
|
1013
|
+
// the channel (client exits 0) AND dismiss the sh recorder (canary v2 no
|
|
1014
|
+
// longer reads a signaled client alone as deliberate — see canary.ts).
|
|
1003
1015
|
signalCanaryClean(sock, identity)
|
|
1016
|
+
dismissCanary(identity)
|
|
1004
1017
|
tmux(sock, 'kill-server')
|
|
1005
1018
|
try {
|
|
1006
1019
|
rmSync(sock, { force: true })
|
|
@@ -1140,6 +1153,30 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
|
|
|
1140
1153
|
trace({ identity: s.identity, action: 'reaped-ephemeral', age: `${ageSecs}s`, outcome: 'ephemeral-done' })
|
|
1141
1154
|
continue
|
|
1142
1155
|
}
|
|
1156
|
+
// UNARMED ephemeral idle bound (live case scriber 10.06): a worker that ended
|
|
1157
|
+
// SILENTLY (no final outbound → never armed; or its arm was lost to a CLI/
|
|
1158
|
+
// daemon-restart window) used to wait out the FULL generic idleSecs (1 h) —
|
|
1159
|
+
// and the M3 serial drain waits for the session's death, so ONE silent worker
|
|
1160
|
+
// stalled its whole conveyor. This bound is the defense-in-depth backstop:
|
|
1161
|
+
// ≫ the legitimate silent-tool case (sleep-180), ≪ idleSecs. The STANDARD
|
|
1162
|
+
// silent-finish path is `iapeer self-done` (arm without waking anyone —
|
|
1163
|
+
// Artur's invariant "no empty wake-ups" stays intact); this branch only
|
|
1164
|
+
// bounds the damage when a worker does neither. Policy reap: NO .idle-reaped
|
|
1165
|
+
// (ephemeral never resumes), NO recordDeath (the ring counts faults).
|
|
1166
|
+
if (isEphemeralPeer(s.cwd) && ageSecs > cfg.ephemeralUnarmedIdleSecs) {
|
|
1167
|
+
killSession(sock, s.identity)
|
|
1168
|
+
clearEphemeralArmed(cfg, s.identity)
|
|
1169
|
+
removeSessionState(cfg, s.identity)
|
|
1170
|
+
out.push({
|
|
1171
|
+
identity: s.identity,
|
|
1172
|
+
action: 'reaped-ephemeral',
|
|
1173
|
+
reason: `unarmed idle ${ageSecs}s (silent-finish backstop; штатный путь — iapeer self-done)`,
|
|
1174
|
+
personality: s.personality,
|
|
1175
|
+
runtime: s.runtime,
|
|
1176
|
+
})
|
|
1177
|
+
trace({ identity: s.identity, action: 'reaped-ephemeral', age: `${ageSecs}s`, outcome: 'ephemeral-unarmed-bound' })
|
|
1178
|
+
continue
|
|
1179
|
+
}
|
|
1143
1180
|
if (ageSecs > cfg.idleSecs) {
|
|
1144
1181
|
// THE ONLY place .idle-reaped is written: this is the one death the daemon
|
|
1145
1182
|
// INITIATES. Its presence on the next wake = the session was parked cleanly =
|
|
@@ -1155,7 +1192,15 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
|
|
|
1155
1192
|
// every ALIVE daemon-owned server — covers a fleet launched by older code
|
|
1156
1193
|
// within one tick of a deploy, no session restarts. Idempotent (pgrep on
|
|
1157
1194
|
// the per-identity wait-for channel), best-effort, pure observability.
|
|
1158
|
-
|
|
1195
|
+
// A non-'already' state IS a decision-grade event (not verbose-gated):
|
|
1196
|
+
// post-0.2.22 every launch arms a canary, so a retrofit 'spawned' on an
|
|
1197
|
+
// alive server means the previous canary VANISHED mid-watch (the death-#4
|
|
1198
|
+
// blind spot was exactly this churn being invisible); 'failed' means the
|
|
1199
|
+
// server is currently UNWATCHED. At most one line per loss, not per tick.
|
|
1200
|
+
const canaryState = ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
|
|
1201
|
+
if (canaryState !== 'already') {
|
|
1202
|
+
trace({ identity: s.identity, action: 'canary', state: canaryState, origin: 'retrofit' })
|
|
1203
|
+
}
|
|
1159
1204
|
out.push({ identity: s.identity, action: 'alive' })
|
|
1160
1205
|
if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
|
|
1161
1206
|
}
|
|
@@ -802,6 +802,69 @@ describe('superviseTick quiet-reap (M2 die-after-reply, real tmux)', () => {
|
|
|
802
802
|
},
|
|
803
803
|
30000,
|
|
804
804
|
)
|
|
805
|
+
|
|
806
|
+
test.if(tmuxAvailable)(
|
|
807
|
+
'UNARMED ephemeral past the unarmed idle bound → reaped-ephemeral (silent-finish backstop; live case scriber 10.06)',
|
|
808
|
+
() => {
|
|
809
|
+
const root = mkdtempSync(join(tmpdir(), 'iapeer-eu-root-'))
|
|
810
|
+
const laDir = mkdtempSync(join(tmpdir(), 'iapeer-eu-la-'))
|
|
811
|
+
const cwd = profileCwd(false, true) // ephemeral worker profile
|
|
812
|
+
const env = {
|
|
813
|
+
...process.env,
|
|
814
|
+
IAPEER_ROOT: root,
|
|
815
|
+
IAPEER_LAUNCHAGENTS_DIR: laDir,
|
|
816
|
+
IAPEER_SOCK_DIR: join(root, 'socks'),
|
|
817
|
+
IAPEER_EPHEMERAL_UNARMED_IDLE_SECS: '30', // ≪ the 60s age below, ≫ quiet 20s
|
|
818
|
+
}
|
|
819
|
+
const cfg = loadLifecycleConfig(env)
|
|
820
|
+
const identity = 'claude-eu'
|
|
821
|
+
const sock = join(root, 'socks', 'tmux-iap-claude-eu.sock')
|
|
822
|
+
const alive = () => spawnSync('tmux', ['-S', sock, 'has-session', '-t', identity]).status === 0
|
|
823
|
+
try {
|
|
824
|
+
mkdirSync(join(root, 'socks'), { recursive: true })
|
|
825
|
+
spawnSync('tmux', ['-S', sock, 'new-session', '-d', '-s', identity, 'sleep', '300'])
|
|
826
|
+
expect(alive()).toBe(true)
|
|
827
|
+
mkdirSync(cfg.stateDir, { recursive: true })
|
|
828
|
+
writeFileSync(
|
|
829
|
+
join(cfg.stateDir, `${identity}.session`),
|
|
830
|
+
JSON.stringify({ identity, runtime: 'claude', personality: 'eu', cwd, wokeAt: Date.now() - 60_000 }),
|
|
831
|
+
)
|
|
832
|
+
// NOT armed (the worker ended silently) — past the unarmed bound → policy reap
|
|
833
|
+
const o = superviseTick(cfg, { env }).find(x => x.identity === identity)
|
|
834
|
+
expect(o?.action).toBe('reaped-ephemeral')
|
|
835
|
+
expect(o?.reason).toContain('unarmed idle')
|
|
836
|
+
expect(o?.personality).toBe('eu') // M3 drain fields present → queue feeds next
|
|
837
|
+
expect(alive()).toBe(false)
|
|
838
|
+
// policy death: no resume-eligibility, no crash-loop count
|
|
839
|
+
expect(hasIdleReaped(cfg, identity)).toBe(false)
|
|
840
|
+
expect(readDeaths(cfg, identity).length).toBe(0)
|
|
841
|
+
const logged = readFileSync(join(cfg.eventLogDir, 'lifecycle.log'), 'utf8')
|
|
842
|
+
expect(logged).toContain('outcome=ephemeral-unarmed-bound')
|
|
843
|
+
// a NON-ephemeral peer with the same age is untouched by this bound
|
|
844
|
+
// (its session lives on ITS OWN identity-derived socket — the tick keys
|
|
845
|
+
// sockets on runtime-personality, not on the test's prior sock)
|
|
846
|
+
const plainCwd = profileCwd(false, false)
|
|
847
|
+
const eupSock = join(root, 'socks', 'tmux-iap-claude-eup.sock')
|
|
848
|
+
try {
|
|
849
|
+
writeFileSync(
|
|
850
|
+
join(cfg.stateDir, `claude-eup.session`),
|
|
851
|
+
JSON.stringify({ identity: 'claude-eup', runtime: 'claude', personality: 'eup', cwd: plainCwd, wokeAt: Date.now() - 60_000 }),
|
|
852
|
+
)
|
|
853
|
+
spawnSync('tmux', ['-S', eupSock, 'new-session', '-d', '-s', 'claude-eup', 'sleep', '300'])
|
|
854
|
+
expect(superviseTick(cfg, { env }).find(x => x.identity === 'claude-eup')?.action).toBe('alive')
|
|
855
|
+
} finally {
|
|
856
|
+
spawnSync('tmux', ['-S', eupSock, 'kill-server'], { stdio: 'ignore' })
|
|
857
|
+
rmSync(plainCwd, { recursive: true, force: true })
|
|
858
|
+
}
|
|
859
|
+
} finally {
|
|
860
|
+
spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
|
|
861
|
+
rmSync(root, { recursive: true, force: true })
|
|
862
|
+
rmSync(laDir, { recursive: true, force: true })
|
|
863
|
+
rmSync(cwd, { recursive: true, force: true })
|
|
864
|
+
}
|
|
865
|
+
},
|
|
866
|
+
30000,
|
|
867
|
+
)
|
|
805
868
|
})
|
|
806
869
|
|
|
807
870
|
// ─────────────────────────────────────────────────────────────────────────────
|