@agfpd/iapeer 0.2.25 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agfpd/iapeer",
3
- "version": "0.2.25",
3
+ "version": "0.2.27",
4
4
  "description": "Foundation core for the iapeer multi-agent ecosystem: identity, registry, storage, codec.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -130,6 +130,17 @@ describe('remove (registry record via the locked writer)', () => {
130
130
  expect((await removePeerCli('twice', { env: e })).action).toBe('removed')
131
131
  expect((await removePeerCli('twice', { env: e })).action).toBe('absent')
132
132
  })
133
+ test('self-done arms the caller\'s own quiet-reap (non-waking silent finish); refuses without PEER_IDENTITY', async () => {
134
+ const e = env()
135
+ // no PEER_IDENTITY → self-call refusal
136
+ expect(await runCli(['self-done'], e)).toBe(1)
137
+ // with PEER_IDENTITY → marker set, exit 0, nobody contacted
138
+ const e2 = { ...e, PEER_IDENTITY: 'claude-silentworker' }
139
+ expect(await runCli(['self-done'], e2)).toBe(0)
140
+ const { hasEphemeralArmed } = await import('../lifecycle/index.ts')
141
+ expect(hasEphemeralArmed(loadLifecycleConfig(e2), 'claude-silentworker')).toBe(true)
142
+ })
143
+
133
144
  test('purges identity-keyed lifecycle state with the record — a namesake newborn must not inherit a dead peer\'s parking (boris 10.06)', async () => {
134
145
  await register('reborn')
135
146
  const e = env()
package/src/cli/index.ts CHANGED
@@ -30,11 +30,13 @@ import {
30
30
  clearStopped,
31
31
  folderLaunch,
32
32
  isLaunchdManaged,
33
+ isEphemeralPeer,
33
34
  isStopped,
34
35
  killSession,
35
36
  loadLifecycleConfig,
36
37
  purgeIdentityState,
37
38
  removeSessionState,
39
+ setEphemeralArmed,
38
40
  setIdleReaped,
39
41
  setNewEager,
40
42
  setStopped,
@@ -348,8 +350,9 @@ export async function sendMessage(
348
350
  // unawaited in-process wake would die with it; the daemon's supervise-tick
349
351
  // drain scan (≤60 s) picks the queue up — the EXISTING retry path for failed
350
352
  // kicks, not a new mechanism.
351
- const { makeEphemeralRouteDeps } = await import('../daemon/main.ts')
353
+ const { makeArmEphemeralOnDelivered, makeEphemeralRouteDeps } = await import('../daemon/main.ts')
352
354
  const cfg = loadLifecycleConfig(env)
355
+ const t0 = Date.now()
353
356
  const result = await routeSend(
354
357
  caller,
355
358
  {
@@ -361,7 +364,39 @@ export async function sendMessage(
361
364
  },
362
365
  { wake: cliWake, ephemeral: makeEphemeralRouteDeps(cfg, env, () => {}) },
363
366
  )
367
+ // delivery.log sink — CLI-path parity (boris's observability gap 10.06: enqueues
368
+ // routed through the CLI left to=<peer> at ZERO for the day while real wakes
369
+ // happened; the daemon tool-path logs, this path was blind). Same fields, plus
370
+ // path=cli so the two entry points are distinguishable. Both branches logged.
371
+ const { appendDeliveryEvent } = await import('../daemon/deliverylog.ts')
372
+ appendDeliveryEvent(cfg.eventLogDir, {
373
+ ev: 'delivery',
374
+ path: 'cli',
375
+ caller: caller.address,
376
+ to: opts.target,
377
+ rt: opts.runtime,
378
+ ok: String(result.ok),
379
+ via: result.ok ? `${result.value.delivered_to.runtime}-${result.value.delivered_to.personality}` : undefined,
380
+ woke: result.ok ? String(result.value.woke) : undefined,
381
+ queued: result.ok && result.value.queued ? 'true' : undefined,
382
+ qd: result.ok ? result.value.queueDepth : undefined,
383
+ ms: Date.now() - t0,
384
+ len: opts.message.length,
385
+ att: opts.attachments?.length || undefined,
386
+ topic: opts.topic,
387
+ err: result.ok ? undefined : result.error.message,
388
+ })
364
389
  if (!result.ok) throw new Error(result.error.message)
390
+ // M2 arm-on-outbound — CLI-path parity (live gap 10.06: an ephemeral worker's
391
+ // final reply sent through the CLI fallback — e.g. inside a daemon-restart
392
+ // window, four deploys that day — never armed, so the worker idled to the
393
+ // unarmed bound and stalled its FIFO). Same hook the daemon path uses; ONLY on
394
+ // an ok outcome, errors swallowed (arming is best-effort, never fails the send).
395
+ try {
396
+ makeArmEphemeralOnDelivered(cfg)(caller)
397
+ } catch {
398
+ /* best-effort */
399
+ }
365
400
  return {
366
401
  ok: true,
367
402
  delivered_to: result.value.delivered_to,
@@ -432,6 +467,7 @@ const USAGE = `usage: iapeer <verb> [args]
432
467
  interrupt <peer> [runtime] interrupt the current turn (Escape) — context intact
433
468
  compact <peer> [runtime] compact the peer's context (/compact)
434
469
  self-fresh (agent self-call) mark /new eager-fresh + self-kill — the daemon relaunches fresh
470
+ self-done (agent self-call, ephemeral) silent finish: arm own quiet-reap, wake no one
435
471
  native-memory <off|on> (--peer <p> | --all) gate/restore runtimes' native memory (canonized lever; контракт «Слот памяти»)
436
472
  memory-plugin <on|off> (--peer <p> | --all) install/remove the slot-declared provider plugin (claude per-peer, codex host-global)
437
473
  `
@@ -913,6 +949,37 @@ export async function runCli(argv: string[], env: NodeJS.ProcessEnv = process.en
913
949
  if (!positionals[0] || !positionals[1]) return usage(errOut)
914
950
  return await runAlwaysOn(positionals[0], positionals[1], process.cwd())
915
951
  }
952
+ case 'self-done': {
953
+ // SILENT-FINISH self-call for an ephemeral worker (контракт ЖЦ §wake_policy;
954
+ // развилка boris 10.06): a worker whose task produced NOTHING to send must
955
+ // still release its M3 FIFO — but an EMPTY report would violate Артур's
956
+ // invariant «событие-всё-отфильтровано = тишина» (no empty wakes of the
957
+ // target). This verb is the non-waking arm: it sets the worker's OWN
958
+ // .ephemeral-armed (same marker the ok-outbound hook sets), so the quiet
959
+ // window reaps it within seconds and the drain feeds the next task — nobody
960
+ // is woken. Doctrine for silent finishers: «нечего отправлять → iapeer
961
+ // self-done вместо ответа». The unarmed idle bound (ephemeralUnarmedIdleSecs)
962
+ // remains the backstop for workers that do neither. On a NON-ephemeral peer
963
+ // the marker is inert (quiet-reap keys on wake_policy) — warn, exit 0.
964
+ const identity = env.PEER_IDENTITY?.trim()
965
+ if (!identity) {
966
+ errOut('self-done: PEER_IDENTITY is not set — this verb is an agent self-call from inside a session\n')
967
+ return 1
968
+ }
969
+ if (!parseSessionName(identity)) {
970
+ errOut(`self-done: invalid PEER_IDENTITY "${identity}" — expected <runtime>-<personality>\n`)
971
+ return 1
972
+ }
973
+ const cfg = loadLifecycleConfig(env)
974
+ setEphemeralArmed(cfg, identity)
975
+ const ephemeral = isEphemeralPeer(process.cwd())
976
+ out(
977
+ `self-done: armed ${identity} for the quiet-window reap (no one woken)` +
978
+ (ephemeral ? '' : ' — NOTE: this peer is not wake_policy:ephemeral, the marker is inert') +
979
+ '\n',
980
+ )
981
+ return 0
982
+ }
916
983
  case 'self-fresh': {
917
984
  // /new AGENT-FACING TRIGGER (TARGET redesign). Run BY the agent itself as the
918
985
  // FINAL step of a /new graceful wind-down (the owner triggers it via a per-peer
@@ -10,7 +10,9 @@ import { tmpdir } from 'os'
10
10
  import { join } from 'path'
11
11
  import {
12
12
  canaryChannel,
13
+ canaryProcessPattern,
13
14
  canaryScript,
15
+ dismissCanary,
14
16
  ensureServerCanary,
15
17
  exitLogPath,
16
18
  serverDeathsDir,
@@ -53,7 +55,7 @@ describe('canary script (pure)', () => {
53
55
  expect(exitLogPath('/r/logs/iapeer')).toBe('/r/logs/iapeer/exits.log')
54
56
  })
55
57
 
56
- test('script carries the protocol: wait-for channel, silent-exit guards, record line, forensics', () => {
58
+ test('script carries the v2 protocol: wait-for channel, deliberate-silence guards, liveness probe, record line, forensics', () => {
57
59
  const s = canaryScript({
58
60
  identity: 'claude-bob',
59
61
  sock: '/tmp/x.sock',
@@ -62,8 +64,15 @@ describe('canary script (pure)', () => {
62
64
  forensicsDir: '/r/logs/iapeer/server-deaths',
63
65
  })
64
66
  expect(s).toContain(`wait-for 'iap-canary-claude-bob'`) // the blocking client
65
- expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // signaled canary = silent
66
- expect(s).toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]`) // clean/signaled guards
67
+ expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // dismissed sh = silent (the ONLY deliberate silencer)
68
+ // v2: NO exit code is trusted as deliberate (a TERMed client returns rc=0 —
69
+ // proven live); the SERVER's liveness decides, after a dismissal grace sleep.
70
+ expect(s).not.toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0`) // the v1 hole
71
+ expect(s).toContain('sleep 2') // dismissal grace window
72
+ expect(s).toContain(`has-session 2>/dev/null; then exit 0`) // server alive → nothing to record
73
+ expect(s).toContain('cause=server-vanished') // connection drop (SIGKILL/OOM class)
74
+ expect(s).toContain('cause=signaled-server-gone') // rc=0: channel/client-TERM, server died
75
+ expect(s).toContain('cause=client-killed-server-gone') // rc≥128: client took a hard kill
67
76
  expect(s).toContain('ev=server-exit identity=claude-bob') // the exits.log record
68
77
  expect(s).toContain(`>> '/r/logs/iapeer/exits.log'`)
69
78
  expect(s).toContain('/r/logs/iapeer/server-deaths/claude-bob') // forensics file
@@ -74,6 +83,19 @@ describe('canary script (pure)', () => {
74
83
  test('ensureServerCanary without exitLogDir → skipped (observability off)', () => {
75
84
  expect(ensureServerCanary({ identity: 'claude-none', sock: '/tmp/none.sock' })).toBe('skipped')
76
85
  })
86
+
87
+ test('canaryProcessPattern is identity-anchored (no prefix bleed) and matches both canary processes', () => {
88
+ const p = canaryProcessPattern('claude-iap')
89
+ expect(p).toBe('iap-canary-claude-iap([^a-z0-9-]|$)')
90
+ const re = new RegExp(p)
91
+ expect(re.test(`/opt/homebrew/bin/tmux -S /tmp/x.sock wait-for iap-canary-claude-iap`)).toBe(true) // client argv
92
+ expect(re.test(`/bin/sh -c ... wait-for 'iap-canary-claude-iap'\n...`)).toBe(true) // sh -c script (quoted channel)
93
+ expect(re.test(`tmux wait-for iap-canary-claude-iap-memory`)).toBe(false) // prefix identity must NOT match
94
+ })
95
+
96
+ test('dismissCanary is a harmless no-op when no canary runs', () => {
97
+ expect(() => dismissCanary('claude-nobody-here')).not.toThrow()
98
+ })
77
99
  })
78
100
 
79
101
  describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
@@ -90,11 +112,22 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
90
112
  return { sock, logDir: join(dir, `logs-${identity}`) }
91
113
  }
92
114
 
115
+ const allIds = [
116
+ 'claude-canadirty',
117
+ 'claude-canaclean',
118
+ 'claude-canakill',
119
+ 'notifier-canatear',
120
+ 'claude-canasweep',
121
+ 'claude-canaclient',
122
+ ]
123
+
93
124
  afterAll(() => {
94
125
  for (const sock of socks) {
95
- // teardown is DELIBERATE → signal each canary before killing its server
96
- for (const id of ['claude-canadirty', 'claude-canaclean', 'claude-canakill', 'notifier-canatear']) {
126
+ // teardown is DELIBERATE → silence each canary (signal + dismiss) before
127
+ // killing its server — the v2 contract for every deliberate path.
128
+ for (const id of allIds) {
97
129
  signalCanaryClean(sock, id)
130
+ dismissCanary(id)
98
131
  }
99
132
  spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
100
133
  }
@@ -197,4 +230,59 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
197
230
  },
198
231
  20000,
199
232
  )
233
+
234
+ test(
235
+ 'death-#4 shape: external killer sweeps server AND canary client → ev=server-exit cause=client-signaled',
236
+ async () => {
237
+ const identity = 'claude-canasweep'
238
+ const { sock, logDir } = bringUp(identity)
239
+ const pid = serverPid(sock)
240
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
241
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
242
+ await sleep(500)
243
+
244
+ // The pre-clean-shaped external killer: one pattern takes the server AND
245
+ // the canary CLIENT (both argv contain `tmux -S <sock> `), the sh recorder
246
+ // survives. The TERMed client returns rc=0 (proven live) — v1 read that as
247
+ // a clean channel signal and stayed silent; exactly how deaths #4–#6
248
+ // (10.06) left zero records. v2 probes the server instead: dead → record.
249
+ spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
250
+ const log = exitLogPath(logDir)
251
+ expect(
252
+ await waitFor(() => existsSync(log) && readFileSync(log, 'utf8').includes('ev=server-exit'), 10000),
253
+ ).toBe(true)
254
+ const line = readFileSync(log, 'utf8')
255
+ expect(line).toContain(`ev=server-exit identity=${identity}`)
256
+ expect(line).toContain('cause=signaled-server-gone') // rc=0 shape, server found dead
257
+ expect(line).toContain(`server_pid=${pid}`)
258
+ expect(readdirSync(serverDeathsDir(logDir)).length).toBe(1) // forensics captured
259
+ },
260
+ 20000,
261
+ )
262
+
263
+ test(
264
+ 'canary client killed while server lives → silent (no false record), canary gone for the retrofit to re-arm',
265
+ async () => {
266
+ const identity = 'claude-canaclient'
267
+ const { sock, logDir } = bringUp(identity)
268
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
269
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
270
+ await sleep(500)
271
+
272
+ // TERM the CLIENT only (argv ends with the bare channel — the sh's quoted
273
+ // form does not match this $-anchored pattern), server stays up.
274
+ spawnSync('pkill', ['-f', `wait-for ${canaryChannel(identity)}$`], { stdio: 'ignore' })
275
+ // the sh probes (≈2 s), finds the server ALIVE → exits silently
276
+ expect(await waitFor(() => !canaryRunning(identity), 6000)).toBe(true)
277
+ await sleep(300)
278
+ expect(
279
+ existsSync(exitLogPath(logDir)) && readFileSync(exitLogPath(logDir), 'utf8').includes('ev=server-exit'),
280
+ ).toBe(false) // no false server-death while the server lives
281
+ // server must still be alive — and ensure re-arms a fresh canary (the
282
+ // retrofit path; in prod the supervise tick does this and logs it)
283
+ expect(spawnSync('tmux', ['-S', sock, 'has-session', '-t', identity], { stdio: 'ignore' }).status).toBe(0)
284
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
285
+ },
286
+ 20000,
287
+ )
200
288
  })
@@ -8,17 +8,32 @@
8
8
  // Mechanism: one tiny detached `sh` per LIVE tmux server, holding a blocking
9
9
  // client `tmux -S <sock> wait-for iap-canary-<identity>` — a process OUTSIDE the
10
10
  // dying server, connected via its socket, so the server's death (any cause,
11
- // including SIGKILL) is observed the moment the connection drops. Protocol:
12
- // clean teardown (idle-reap / stop / pre-clean / killServerIfEmpty) SIGNALS
13
- // the channel first (`wait-for -S`) → the canary exits 0, silently;
14
- // the canary itself signaled (TERM/HUP/INT, e.g. launch pre-clean pkill)
15
- // rc 128 / trap exit silently (someone manages it deliberately);
16
- // • anything else (wait-for returns an error = the server vanished under us)
17
- // ONE logfmt line `ev=server-exit` into exits.log (the per-peer death-cause
18
- // home, next to pane-died's `ev=session-exit`) + a forensics snapshot file
19
- // (vm_stat / swap / top-RSS ps / fresh DiagnosticReports) captured WITHIN
20
- // SECONDS of the death — the evidence the 60 s supervise tick can never
11
+ // including SIGKILL) is observed the moment the connection drops. Protocol (v2 —
12
+ // death #4 10.06 15:42Z proved v1's exit-code trust wrong twice over: the killer
13
+ // took the SERVER and the canary CLIENT together, AND a TERMed tmux client
14
+ // returns rc=0 indistinguishable from a clean channel signal — so the rc-based
15
+ // `0 || ≥128 → silent` guard silenced a real death):
16
+ // • DELIBERATE silence is an explicit act, never an exit-code inference:
17
+ // every deliberate teardown (idle-reap / stop / pre-clean / pane-died hook /
18
+ // bootout teardown) signals the channel (`wait-for -S`) AND dismisses the
19
+ // sh recorder (`dismissCanary` → TERM → trap → silent exit). POSIX trap
20
+ // semantics make this race-free: the trap runs before any recording.
21
+ // • When wait-for returns — ANY code — the script sleeps 2 s (a concurrent
22
+ // dismissal TERM wins here), then probes SERVER LIVENESS: alive → exit
23
+ // silently (a lost canary is re-armed and logged by the supervise retrofit
24
+ // within a tick); dead with nobody having dismissed us → the death is real
25
+ // and unclaimed → ONE logfmt line `ev=server-exit` into exits.log (the
26
+ // per-peer death-cause home, next to pane-died's `ev=session-exit`) + a
27
+ // forensics snapshot (vm_stat / swap / top-RSS ps / fresh DiagnosticReports)
28
+ // captured within seconds — the evidence the 60 s supervise tick can never
21
29
  // recover ("системных следов ноль" was the recurring investigation outcome).
30
+ // The raw wait_rc still ATTRIBUTES the death (cause=server-vanished /
31
+ // signaled-server-gone / client-killed-server-gone).
32
+ // Residual blind spot (structural): a killer that SIGKILLs the sh recorder
33
+ // itself leaves no in-process way to record. With v2 the ABSENCE of a record on
34
+ // a server-dead reap narrows the diagnosis to exactly that shape; the canary
35
+ // ensure-state lines in lifecycle.log (origin=launch/retrofit) evidence the
36
+ // churn post-hoc.
22
37
  //
23
38
  // The canary is pure observability: it never wakes, reaps, restarts or otherwise
24
39
  // manages anything (H4-compatible by construction), it fires at most once, and
@@ -69,24 +84,45 @@ export function canaryScript(o: CanaryScriptOptions): string {
69
84
  return [
70
85
  // Server PID captured while alive — the postmortem grep key for system logs.
71
86
  `SPID="$('${o.tmuxBin}' -S '${o.sock}' display-message -p '#{pid}' 2>/dev/null)"`,
72
- // A signal to the CANARY is deliberate management (pre-clean pkill) → silent.
87
+ // A signal to the SH WRAPPER is deliberate dismissal (dismissCanary) or host
88
+ // shutdown → silent. POSIX: the trap runs after the foreground command
89
+ // completes — so a TERM delivered during wait-for/sleep always exits us
90
+ // BEFORE any recording below (the race-free deliberate-silence guarantee).
73
91
  `trap 'exit 0' HUP INT TERM`,
74
92
  `'${o.tmuxBin}' -S '${o.sock}' wait-for '${ch}'`,
75
93
  `rc=$?`,
76
- // rc=0 → clean-teardown signal; rc≥128 → the tmux client itself was signaled.
77
- `if [ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0; fi`,
78
- // The server vanished under us — record, within seconds of the death.
94
+ // NO wait-for exit code is trusted as "deliberate" by itself — PROVEN live
95
+ // (death-#4 postmortem): a TERM to the tmux CLIENT returns rc=0, identical
96
+ // to a clean channel signal, so an external killer sweeping server+client
97
+ // rides the clean-looking code straight past any rc-based guard (v1's
98
+ // `rc=0 || rc>=128 → silent` was exactly that hole). v2 contract instead:
99
+ // • deliberate teardowns DISMISS this sh (TERM → trap above) — the sleep
100
+ // below gives a concurrently-delivered dismissal time to win;
101
+ // • then the SERVER's liveness, not the exit code, decides: alive →
102
+ // nothing to record (a lost canary is re-armed and logged by the
103
+ // supervise retrofit within a tick); dead and nobody dismissed us →
104
+ // the death is real and unclaimed → record it.
105
+ `sleep 2`,
106
+ `if '${o.tmuxBin}' -S '${o.sock}' has-session 2>/dev/null; then exit 0; fi`,
107
+ // The exit code still ATTRIBUTES the recorded death (raw wait_rc is kept):
108
+ // rc=0 → signaled-server-gone (channel signal or client-TERM, server died)
109
+ // rc≥128 → client-killed-server-gone (client took a non-TERM kill)
110
+ // else → server-vanished (connection drop — SIGKILL/OOM class)
111
+ `cause=server-vanished`,
112
+ `if [ "$rc" -eq 0 ]; then cause=signaled-server-gone; fi`,
113
+ `if [ "$rc" -ge 128 ]; then cause=client-killed-server-gone; fi`,
114
+ // The server is gone under us — record, within seconds of the death.
79
115
  `ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"`,
80
116
  `f='${o.forensicsDir}/${o.identity}'-"$(date +%s)".txt`,
81
117
  `{`,
82
- ` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc"`,
118
+ ` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc cause=$cause"`,
83
119
  ` echo '--- vm_stat'; vm_stat`,
84
120
  ` echo '--- swapusage'; sysctl -n vm.swapusage`,
85
121
  ` echo '--- ps-top-rss'; ps axo pid,ppid,rss,etime,command | sort -rn -k3 | head -25`,
86
122
  ` echo '--- diagnosticreports-user'; ls -t "$HOME/Library/Logs/DiagnosticReports" 2>/dev/null | head`,
87
123
  ` echo '--- diagnosticreports-system'; ls -t /Library/Logs/DiagnosticReports 2>/dev/null | head`,
88
124
  `} > "$f" 2>&1`,
89
- `printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$f" >> '${o.exitLogFile}'`,
125
+ `printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s cause=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$cause" "$f" >> '${o.exitLogFile}'`,
90
126
  ].join('\n')
91
127
  }
92
128
 
@@ -150,3 +186,27 @@ export function signalCanaryClean(sock: string, identity: string): void {
150
186
  /* best-effort */
151
187
  }
152
188
  }
189
+
190
+ /** The pgrep/pkill ERE matching BOTH canary processes of ONE identity — the sh
191
+ * wrapper (its -c script quotes the channel: `…'iap-canary-<id>'…`) and the
192
+ * tmux client (argv ends with the bare channel). Anchored so an identity can
193
+ * never match another identity's prefix (claude-iapeer ≠ claude-iapeer-memory). */
194
+ export function canaryProcessPattern(identity: string): string {
195
+ return `${canaryChannel(identity)}([^a-z0-9-]|$)`
196
+ }
197
+
198
+ /**
199
+ * Dismiss this identity's canary BEFORE a deliberate server teardown: TERM the
200
+ * sh wrapper (trap → silent exit, race-free — the trap always runs before the
201
+ * v2 recording branch) and the tmux client. The explicit counterpart of the
202
+ * channel signal: with v2 a signaled CLIENT alone is no longer read as
203
+ * deliberate, so every deliberate path must dismiss the RECORDER (the sh).
204
+ * Best-effort: no canary running → harmless no-op (pkill exits 1).
205
+ */
206
+ export function dismissCanary(identity: string): void {
207
+ try {
208
+ spawnSync('pkill', ['-f', canaryProcessPattern(identity)], { stdio: 'ignore' })
209
+ } catch {
210
+ /* best-effort */
211
+ }
212
+ }
@@ -19,7 +19,8 @@ import { dirname } from 'path'
19
19
  import { spawnSync } from 'child_process'
20
20
  import { CODEX_BEARER_ENV_VAR, CODEX_DUMMY_BEARER, type Runtime } from '../core/constants.ts'
21
21
  import { readLaunchEnv } from '../storage/index.ts'
22
- import { ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
22
+ import { canaryChannel, dismissCanary, ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
23
+ import { appendLifecycleEvent } from '../lifecycle/eventlog.ts'
23
24
  import { claudeAdapter } from './adapters/claude.ts'
24
25
  import { codexAdapter } from './adapters/codex.ts'
25
26
  import { telegramAdapter } from './adapters/telegram.ts'
@@ -155,7 +156,18 @@ export function exitCauseHook(identity: string, exitLogFile: string): string {
155
156
  `dead_status=#{pane_dead_status} dead_signal=#{pane_dead_signal}\\n`
156
157
  const log =
157
158
  `printf "${line}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "${exitLogFile}"`
158
- return `run-shell '${log}' ; kill-session -t "${identity}"`
159
+ // Silence the server-death canary BEFORE kill-session: with a single-session
160
+ // server the kill empties the server and exit-empty takes it down — without
161
+ // explicit silence the canary would append a second, muddier `ev=server-exit`
162
+ // record for a death this very hook just captured (the session-exit line is
163
+ // the richer, authoritative one: it has the exit code/signal). v2 contract:
164
+ // deliberate = channel signal (tmux-NATIVE wait-for -S) + DISMISS the sh
165
+ // recorder (abs-path /usr/bin/pkill — survives the minimal launchd PATH;
166
+ // `\$` keeps the regex anchor literal through the sh double-quote layer; the
167
+ // `[i]` class keeps the pattern from matching its OWN occurrence in this
168
+ // hook-sh's cmdline — the pgrep self-match classic).
169
+ const dismiss = `/usr/bin/pkill -f "[i]${canaryChannel(identity).slice(1)}([^a-z0-9-]|\\$)"`
170
+ return `run-shell '${log} ; ${dismiss}' ; wait-for -S "${canaryChannel(identity)}" ; kill-session -t "${identity}"`
159
171
  }
160
172
 
161
173
  /** Install the exit-cause observability on a freshly-created session: ensure the
@@ -212,11 +224,15 @@ export const launch: LaunchFn = async (
212
224
  mkdirSync(dirname(sock), { recursive: true })
213
225
 
214
226
  // (1) Pre-clean any stale tmux server on this socket, then launch detached.
215
- // Signal the server-death canary FIRST: this teardown is deliberate, the
216
- // canary must exit silently, not record a false server-death (the pkill
217
- // below also matches the canary's cmdline by design, it sweeps a stale
218
- // canary along with the stale server; a signaled canary is already gone).
227
+ // Silence the server-death canary EXPLICITLY first — this teardown is
228
+ // deliberate: signal the channel (client exits 0) AND dismiss the sh
229
+ // recorder (TERM → trap → silent; canary v2 no longer reads a signaled
230
+ // client alone as deliberate — an external killer sweeping server+client
231
+ // was exactly the death-#4 silence). Only then sweep the server processes
232
+ // (the pkill also matches a leftover canary client — harmless, both
233
+ // canary processes are already dismissed).
219
234
  signalCanaryClean(sock, identity)
235
+ dismissCanary(identity)
220
236
  spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
221
237
  tmux(sock, 'kill-server')
222
238
 
@@ -268,7 +284,14 @@ export const launch: LaunchFn = async (
268
284
  // that records `ev=server-exit` + a forensics snapshot when the whole
269
285
  // server dies dirty (SIGKILL/OOM class). Same gate as the exit hook;
270
286
  // best-effort by construction (every failure → a state, never a throw).
271
- ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
287
+ // The ensure-state is logged (origin=launch) so the arming trail is
288
+ // complete in lifecycle.log: 'spawned' is the newborn-server norm here,
289
+ // 'failed' a newborn server starting its life UNWATCHED (death-#4
290
+ // postmortem hinged on this very question being unanswerable).
291
+ const canaryState = ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
292
+ if (canaryState !== 'skipped') {
293
+ appendLifecycleEvent(cfg.exitLogDir, { ev: 'canary', identity, state: canaryState, origin: 'launch' }, { env })
294
+ }
272
295
 
273
296
  // (3) pipe-pane the session output to the per-identity log.
274
297
  mkdirSync(cfg.logDir, { recursive: true, mode: 0o700 })
@@ -227,6 +227,14 @@ describe('exitCauseHook (exit-cause observability)', () => {
227
227
  // tmux-NATIVE kill-session (no shell `tmux`) → needs no PATH (launchd minimal env).
228
228
  expect(hook).toContain('kill-session -t "claude-iapeer"')
229
229
  })
230
+ test('silences the server-death canary (native wait-for -S) BEFORE kill-session — no double record', () => {
231
+ // kill-session on a single-session server → exit-empty takes the server down;
232
+ // without the signal the canary would add a second `ev=server-exit` record for
233
+ // a death this hook just captured (session-exit carries the code/signal).
234
+ expect(hook).toContain('wait-for -S "iap-canary-claude-iapeer"')
235
+ expect(hook.indexOf('wait-for -S')).toBeLessThan(hook.indexOf('kill-session'))
236
+ expect(hook.indexOf('run-shell')).toBeLessThan(hook.indexOf('wait-for -S')) // log first
237
+ })
230
238
  test('quoting: single-quoted run-shell arg (tmux layer) wrapping double-quoted sh', () => {
231
239
  expect(hook).toMatch(/run-shell '.*'/)
232
240
  expect(hook).not.toContain("''") // no empty/again-collapsed single-quote pair
@@ -23,7 +23,7 @@ import { buildProcessAddress, buildSocketPath } from '../core/socket.ts'
23
23
  import { peerLogsDir, pluginLogsDir } from '../storage/index.ts'
24
24
  import { readPeerProfile } from '../identity/index.ts'
25
25
  import { getAdapter, launch } from './index.ts'
26
- import { signalCanaryClean } from './canary.ts'
26
+ import { dismissCanary, signalCanaryClean } from './canary.ts'
27
27
  import type { LaunchConfig, LaunchSpec } from './types.ts'
28
28
 
29
29
  /** Block-watch poll cadence — seconds, deliberately NOT a tight loop (the session
@@ -55,7 +55,10 @@ function sessionAlive(sock: string, identity: string): boolean {
55
55
  * both paths now converge on the same end state.
56
56
  */
57
57
  export function teardownAlwaysOnSession(sock: string, identity: string): void {
58
+ // Explicit canary silence (v2): channel signal (client exits 0) + dismiss the
59
+ // sh recorder — a signaled client alone is no longer read as deliberate.
58
60
  signalCanaryClean(sock, identity)
61
+ dismissCanary(identity)
59
62
  spawnSync('tmux', ['-S', sock, 'kill-session', '-t', identity], { stdio: 'ignore' })
60
63
  const ls = spawnSync('tmux', ['-S', sock, 'list-sessions', '-F', '#{session_name}'], { encoding: 'utf8' })
61
64
  if (!(ls.stdout ?? '').trim()) {
@@ -46,7 +46,7 @@ import {
46
46
  type LaunchSpec,
47
47
  } from '../launch/index.ts'
48
48
  import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
49
- import { ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
49
+ import { dismissCanary, ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
50
50
  import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
51
51
 
52
52
  // ─────────────────────────────────────────────────────────────────────────────
@@ -79,6 +79,16 @@ export interface LifecycleConfig {
79
79
  * transcript mtime is a LIVENESS proxy — "no longer writing" — not a semantic
80
80
  * "done" signal. */
81
81
  ephemeralQuietSecs: number
82
+ /** wake_policy:ephemeral — the UNARMED idle bound (seconds): an ephemeral session
83
+ * that never armed (finished silently / lost its arm to a daemon-restart window)
84
+ * is reaped after this much activity-proxy silence. Live case (scriber 10.06):
85
+ * a worker that ended «тихо» without its final outbound stalled its M3 FIFO for
86
+ * the FULL generic idleSecs (1 h) — with serial drain that blocks the whole
87
+ * conveyor per silent worker. Bound chosen ≫ the legitimate silent-tool case
88
+ * (sleep-180) and ≪ idleSecs. DOCUMENTED RISK: an ephemeral worker whose tool
89
+ * stays silent longer than this MID-TASK is reaped and its consumed queue item
90
+ * is lost — ephemeral workers must emit activity (or their reply) within it. */
91
+ ephemeralUnarmedIdleSecs: number
82
92
  }
83
93
 
84
94
  export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): LifecycleConfig {
@@ -102,6 +112,7 @@ export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): Lifec
102
112
  crashLoopMax: num(env.IAPEER_CRASHLOOP_MAX, 3),
103
113
  crashLoopWindowSecs: num(env.IAPEER_CRASHLOOP_WINDOW_SECS, 300),
104
114
  ephemeralQuietSecs: num(env.IAPEER_EPHEMERAL_QUIET_SECS, 20),
115
+ ephemeralUnarmedIdleSecs: num(env.IAPEER_EPHEMERAL_UNARMED_IDLE_SECS, 600),
105
116
  }
106
117
  }
107
118
 
@@ -998,9 +1009,11 @@ export function killSession(sock: string, identity: string): void {
998
1009
  tmux(sock, 'kill-session', '-t', identity)
999
1010
  const sessions = tmux(sock, 'list-sessions', '-F', '#{session_name}').out
1000
1011
  if (!sessions.trim()) {
1001
- // Deliberate server teardown → signal the server-death canary FIRST so it
1002
- // exits silently instead of recording a false `ev=server-exit` (canary.ts).
1012
+ // Deliberate server teardown → silence the canary EXPLICITLY first: signal
1013
+ // the channel (client exits 0) AND dismiss the sh recorder (canary v2 no
1014
+ // longer reads a signaled client alone as deliberate — see canary.ts).
1003
1015
  signalCanaryClean(sock, identity)
1016
+ dismissCanary(identity)
1004
1017
  tmux(sock, 'kill-server')
1005
1018
  try {
1006
1019
  rmSync(sock, { force: true })
@@ -1140,6 +1153,30 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
1140
1153
  trace({ identity: s.identity, action: 'reaped-ephemeral', age: `${ageSecs}s`, outcome: 'ephemeral-done' })
1141
1154
  continue
1142
1155
  }
1156
+ // UNARMED ephemeral idle bound (live case scriber 10.06): a worker that ended
1157
+ // SILENTLY (no final outbound → never armed; or its arm was lost to a CLI/
1158
+ // daemon-restart window) used to wait out the FULL generic idleSecs (1 h) —
1159
+ // and the M3 serial drain waits for the session's death, so ONE silent worker
1160
+ // stalled its whole conveyor. This bound is the defense-in-depth backstop:
1161
+ // ≫ the legitimate silent-tool case (sleep-180), ≪ idleSecs. The STANDARD
1162
+ // silent-finish path is `iapeer self-done` (arm without waking anyone —
1163
+ // Artur's invariant "no empty wake-ups" stays intact); this branch only
1164
+ // bounds the damage when a worker does neither. Policy reap: NO .idle-reaped
1165
+ // (ephemeral never resumes), NO recordDeath (the ring counts faults).
1166
+ if (isEphemeralPeer(s.cwd) && ageSecs > cfg.ephemeralUnarmedIdleSecs) {
1167
+ killSession(sock, s.identity)
1168
+ clearEphemeralArmed(cfg, s.identity)
1169
+ removeSessionState(cfg, s.identity)
1170
+ out.push({
1171
+ identity: s.identity,
1172
+ action: 'reaped-ephemeral',
1173
+ reason: `unarmed idle ${ageSecs}s (silent-finish backstop; штатный путь — iapeer self-done)`,
1174
+ personality: s.personality,
1175
+ runtime: s.runtime,
1176
+ })
1177
+ trace({ identity: s.identity, action: 'reaped-ephemeral', age: `${ageSecs}s`, outcome: 'ephemeral-unarmed-bound' })
1178
+ continue
1179
+ }
1143
1180
  if (ageSecs > cfg.idleSecs) {
1144
1181
  // THE ONLY place .idle-reaped is written: this is the one death the daemon
1145
1182
  // INITIATES. Its presence on the next wake = the session was parked cleanly =
@@ -1155,7 +1192,15 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
1155
1192
  // every ALIVE daemon-owned server — covers a fleet launched by older code
1156
1193
  // within one tick of a deploy, no session restarts. Idempotent (pgrep on
1157
1194
  // the per-identity wait-for channel), best-effort, pure observability.
1158
- ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
1195
+ // A non-'already' state IS a decision-grade event (not verbose-gated):
1196
+ // post-0.2.22 every launch arms a canary, so a retrofit 'spawned' on an
1197
+ // alive server means the previous canary VANISHED mid-watch (the death-#4
1198
+ // blind spot was exactly this churn being invisible); 'failed' means the
1199
+ // server is currently UNWATCHED. At most one line per loss, not per tick.
1200
+ const canaryState = ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
1201
+ if (canaryState !== 'already') {
1202
+ trace({ identity: s.identity, action: 'canary', state: canaryState, origin: 'retrofit' })
1203
+ }
1159
1204
  out.push({ identity: s.identity, action: 'alive' })
1160
1205
  if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
1161
1206
  }
@@ -802,6 +802,69 @@ describe('superviseTick quiet-reap (M2 die-after-reply, real tmux)', () => {
802
802
  },
803
803
  30000,
804
804
  )
805
+
806
+ test.if(tmuxAvailable)(
807
+ 'UNARMED ephemeral past the unarmed idle bound → reaped-ephemeral (silent-finish backstop; live case scriber 10.06)',
808
+ () => {
809
+ const root = mkdtempSync(join(tmpdir(), 'iapeer-eu-root-'))
810
+ const laDir = mkdtempSync(join(tmpdir(), 'iapeer-eu-la-'))
811
+ const cwd = profileCwd(false, true) // ephemeral worker profile
812
+ const env = {
813
+ ...process.env,
814
+ IAPEER_ROOT: root,
815
+ IAPEER_LAUNCHAGENTS_DIR: laDir,
816
+ IAPEER_SOCK_DIR: join(root, 'socks'),
817
+ IAPEER_EPHEMERAL_UNARMED_IDLE_SECS: '30', // < the 60s session age below, > the 20s quiet-reap default
818
+ }
819
+ const cfg = loadLifecycleConfig(env)
820
+ const identity = 'claude-eu'
821
+ const sock = join(root, 'socks', 'tmux-iap-claude-eu.sock')
822
+ const alive = () => spawnSync('tmux', ['-S', sock, 'has-session', '-t', identity]).status === 0
823
+ try {
824
+ mkdirSync(join(root, 'socks'), { recursive: true })
825
+ spawnSync('tmux', ['-S', sock, 'new-session', '-d', '-s', identity, 'sleep', '300'])
826
+ expect(alive()).toBe(true)
827
+ mkdirSync(cfg.stateDir, { recursive: true })
828
+ writeFileSync(
829
+ join(cfg.stateDir, `${identity}.session`),
830
+ JSON.stringify({ identity, runtime: 'claude', personality: 'eu', cwd, wokeAt: Date.now() - 60_000 }),
831
+ )
832
+ // NOT armed (the worker ended silently) — past the unarmed bound → policy reap
833
+ const o = superviseTick(cfg, { env }).find(x => x.identity === identity)
834
+ expect(o?.action).toBe('reaped-ephemeral')
835
+ expect(o?.reason).toContain('unarmed idle')
836
+ expect(o?.personality).toBe('eu') // M3 drain fields present → queue feeds next
837
+ expect(alive()).toBe(false)
838
+ // policy death: no resume-eligibility, no crash-loop count
839
+ expect(hasIdleReaped(cfg, identity)).toBe(false)
840
+ expect(readDeaths(cfg, identity).length).toBe(0)
841
+ const logged = readFileSync(join(cfg.eventLogDir, 'lifecycle.log'), 'utf8')
842
+ expect(logged).toContain('outcome=ephemeral-unarmed-bound')
843
+ // a NON-ephemeral peer with the same age is untouched by this bound
844
+ // (its session lives on ITS OWN identity-derived socket — the tick keys
845
+ // sockets on runtime-personality, not on the test's prior sock)
846
+ const plainCwd = profileCwd(false, false)
847
+ const eupSock = join(root, 'socks', 'tmux-iap-claude-eup.sock')
848
+ try {
849
+ writeFileSync(
850
+ join(cfg.stateDir, `claude-eup.session`),
851
+ JSON.stringify({ identity: 'claude-eup', runtime: 'claude', personality: 'eup', cwd: plainCwd, wokeAt: Date.now() - 60_000 }),
852
+ )
853
+ spawnSync('tmux', ['-S', eupSock, 'new-session', '-d', '-s', 'claude-eup', 'sleep', '300'])
854
+ expect(superviseTick(cfg, { env }).find(x => x.identity === 'claude-eup')?.action).toBe('alive')
855
+ } finally {
856
+ spawnSync('tmux', ['-S', eupSock, 'kill-server'], { stdio: 'ignore' })
857
+ rmSync(plainCwd, { recursive: true, force: true })
858
+ }
859
+ } finally {
860
+ spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
861
+ rmSync(root, { recursive: true, force: true })
862
+ rmSync(laDir, { recursive: true, force: true })
863
+ rmSync(cwd, { recursive: true, force: true })
864
+ }
865
+ },
866
+ 30000,
867
+ )
805
868
  })
806
869
 
807
870
  // ─────────────────────────────────────────────────────────────────────────────