@agfpd/iapeer 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agfpd/iapeer",
3
- "version": "0.2.26",
3
+ "version": "0.2.27",
4
4
  "description": "Foundation core for the iapeer multi-agent ecosystem: identity, registry, storage, codec.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -10,7 +10,9 @@ import { tmpdir } from 'os'
10
10
  import { join } from 'path'
11
11
  import {
12
12
  canaryChannel,
13
+ canaryProcessPattern,
13
14
  canaryScript,
15
+ dismissCanary,
14
16
  ensureServerCanary,
15
17
  exitLogPath,
16
18
  serverDeathsDir,
@@ -53,7 +55,7 @@ describe('canary script (pure)', () => {
53
55
  expect(exitLogPath('/r/logs/iapeer')).toBe('/r/logs/iapeer/exits.log')
54
56
  })
55
57
 
56
- test('script carries the protocol: wait-for channel, silent-exit guards, record line, forensics', () => {
58
+ test('script carries the v2 protocol: wait-for channel, deliberate-silence guards, liveness probe, record line, forensics', () => {
57
59
  const s = canaryScript({
58
60
  identity: 'claude-bob',
59
61
  sock: '/tmp/x.sock',
@@ -62,8 +64,15 @@ describe('canary script (pure)', () => {
62
64
  forensicsDir: '/r/logs/iapeer/server-deaths',
63
65
  })
64
66
  expect(s).toContain(`wait-for 'iap-canary-claude-bob'`) // the blocking client
65
- expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // signaled canary = silent
66
- expect(s).toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]`) // clean/signaled guards
67
+ expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // dismissed sh = silent (the ONLY deliberate silencer)
68
+ // v2: NO exit code is trusted as deliberate (a TERMed client returns rc=0 —
69
+ // proven live); the SERVER's liveness decides, after a dismissal grace sleep.
70
+ expect(s).not.toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0`) // the v1 hole
71
+ expect(s).toContain('sleep 2') // dismissal grace window
72
+ expect(s).toContain(`has-session 2>/dev/null; then exit 0`) // server alive → nothing to record
73
+ expect(s).toContain('cause=server-vanished') // connection drop (SIGKILL/OOM class)
74
+ expect(s).toContain('cause=signaled-server-gone') // rc=0: channel/client-TERM, server died
75
+ expect(s).toContain('cause=client-killed-server-gone') // rc≥128: client took a hard kill
67
76
  expect(s).toContain('ev=server-exit identity=claude-bob') // the exits.log record
68
77
  expect(s).toContain(`>> '/r/logs/iapeer/exits.log'`)
69
78
  expect(s).toContain('/r/logs/iapeer/server-deaths/claude-bob') // forensics file
@@ -74,6 +83,19 @@ describe('canary script (pure)', () => {
74
83
  test('ensureServerCanary without exitLogDir → skipped (observability off)', () => {
75
84
  expect(ensureServerCanary({ identity: 'claude-none', sock: '/tmp/none.sock' })).toBe('skipped')
76
85
  })
86
+
87
+ test('canaryProcessPattern is identity-anchored (no prefix bleed) and matches both canary processes', () => {
88
+ const p = canaryProcessPattern('claude-iap')
89
+ expect(p).toBe('iap-canary-claude-iap([^a-z0-9-]|$)')
90
+ const re = new RegExp(p)
91
+ expect(re.test(`/opt/homebrew/bin/tmux -S /tmp/x.sock wait-for iap-canary-claude-iap`)).toBe(true) // client argv
92
+ expect(re.test(`/bin/sh -c ... wait-for 'iap-canary-claude-iap'\n...`)).toBe(true) // sh -c script (quoted channel)
93
+ expect(re.test(`tmux wait-for iap-canary-claude-iap-memory`)).toBe(false) // prefix identity must NOT match
94
+ })
95
+
96
+ test('dismissCanary is a harmless no-op when no canary runs', () => {
97
+ expect(() => dismissCanary('claude-nobody-here')).not.toThrow()
98
+ })
77
99
  })
78
100
 
79
101
  describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
@@ -90,11 +112,22 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
90
112
  return { sock, logDir: join(dir, `logs-${identity}`) }
91
113
  }
92
114
 
115
+ const allIds = [
116
+ 'claude-canadirty',
117
+ 'claude-canaclean',
118
+ 'claude-canakill',
119
+ 'notifier-canatear',
120
+ 'claude-canasweep',
121
+ 'claude-canaclient',
122
+ ]
123
+
93
124
  afterAll(() => {
94
125
  for (const sock of socks) {
95
- // teardown is DELIBERATE → signal each canary before killing its server
96
- for (const id of ['claude-canadirty', 'claude-canaclean', 'claude-canakill', 'notifier-canatear']) {
126
+ // teardown is DELIBERATE → silence each canary (signal + dismiss) before
127
+ // killing its server — the v2 contract for every deliberate path.
128
+ for (const id of allIds) {
97
129
  signalCanaryClean(sock, id)
130
+ dismissCanary(id)
98
131
  }
99
132
  spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
100
133
  }
@@ -197,4 +230,59 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
197
230
  },
198
231
  20000,
199
232
  )
233
+
234
+ test(
235
+ 'death-#4 shape: external killer sweeps server AND canary client → ev=server-exit cause=client-signaled',
236
+ async () => {
237
+ const identity = 'claude-canasweep'
238
+ const { sock, logDir } = bringUp(identity)
239
+ const pid = serverPid(sock)
240
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
241
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
242
+ await sleep(500)
243
+
244
+ // The pre-clean-shaped external killer: one pattern takes the server AND
245
+ // the canary CLIENT (both argv contain `tmux -S <sock> `), the sh recorder
246
+ // survives. The TERMed client returns rc=0 (proven live) — v1 read that as
247
+ // a clean channel signal and stayed silent; exactly how deaths #4–#6
248
+ // (10.06) left zero records. v2 probes the server instead: dead → record.
249
+ spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
250
+ const log = exitLogPath(logDir)
251
+ expect(
252
+ await waitFor(() => existsSync(log) && readFileSync(log, 'utf8').includes('ev=server-exit'), 10000),
253
+ ).toBe(true)
254
+ const line = readFileSync(log, 'utf8')
255
+ expect(line).toContain(`ev=server-exit identity=${identity}`)
256
+ expect(line).toContain('cause=signaled-server-gone') // rc=0 shape, server found dead
257
+ expect(line).toContain(`server_pid=${pid}`)
258
+ expect(readdirSync(serverDeathsDir(logDir)).length).toBe(1) // forensics captured
259
+ },
260
+ 20000,
261
+ )
262
+
263
+ test(
264
+ 'canary client killed while server lives → silent (no false record), canary gone for the retrofit to re-arm',
265
+ async () => {
266
+ const identity = 'claude-canaclient'
267
+ const { sock, logDir } = bringUp(identity)
268
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
269
+ expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
270
+ await sleep(500)
271
+
272
+ // TERM the CLIENT only (argv ends with the bare channel — the sh's quoted
273
+ // form does not match this $-anchored pattern), server stays up.
274
+ spawnSync('pkill', ['-f', `wait-for ${canaryChannel(identity)}$`], { stdio: 'ignore' })
275
+ // the sh probes (≈2 s), finds the server ALIVE → exits silently
276
+ expect(await waitFor(() => !canaryRunning(identity), 6000)).toBe(true)
277
+ await sleep(300)
278
+ expect(
279
+ existsSync(exitLogPath(logDir)) && readFileSync(exitLogPath(logDir), 'utf8').includes('ev=server-exit'),
280
+ ).toBe(false) // no false server-death while the server lives
281
+ // server must still be alive — and ensure re-arms a fresh canary (the
282
+ // retrofit path; in prod the supervise tick does this and logs it)
283
+ expect(spawnSync('tmux', ['-S', sock, 'has-session', '-t', identity], { stdio: 'ignore' }).status).toBe(0)
284
+ expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
285
+ },
286
+ 20000,
287
+ )
200
288
  })
@@ -8,17 +8,32 @@
8
8
  // Mechanism: one tiny detached `sh` per LIVE tmux server, holding a blocking
9
9
  // client `tmux -S <sock> wait-for iap-canary-<identity>` — a process OUTSIDE the
10
10
  // dying server, connected via its socket, so the server's death (any cause,
11
- // including SIGKILL) is observed the moment the connection drops. Protocol:
12
- // • clean teardown (idle-reap / stop / pre-clean / killServerIfEmpty) SIGNALS
13
- // the channel first (`wait-for -S`) → the canary exits 0, silently;
14
- // • the canary itself signaled (TERM/HUP/INT, e.g. launch pre-clean pkill)
15
- // → rc ≥ 128 / trap → exit silently (someone manages it deliberately);
16
- // • anything else (wait-for returns an error = the server vanished under us)
17
- // → ONE logfmt line `ev=server-exit` into exits.log (the per-peer death-cause
18
- // home, next to pane-died's `ev=session-exit`) + a forensics snapshot file
19
- // (vm_stat / swap / top-RSS ps / fresh DiagnosticReports) captured WITHIN
20
- // SECONDS of the death — the evidence the 60 s supervise tick can never
11
+ // including SIGKILL) is observed the moment the connection drops. Protocol (v2 —
12
+ // death #4 10.06 15:42Z proved v1's exit-code trust wrong twice over: the killer
13
+ // took the SERVER and the canary CLIENT together, AND a TERMed tmux client
14
+ // returns rc=0 indistinguishable from a clean channel signal — so the rc-based
15
+ // `0 || ≥128 → silent` guard silenced a real death):
16
+ // • DELIBERATE silence is an explicit act, never an exit-code inference:
17
+ // every deliberate teardown (idle-reap / stop / pre-clean / pane-died hook /
18
+ // bootout teardown) signals the channel (`wait-for -S`) AND dismisses the
19
+ // sh recorder (`dismissCanary` → TERM → trap → silent exit). POSIX trap
20
+ // semantics make this race-free: the trap runs before any recording.
21
+ // • When wait-for returns — ANY code — the script sleeps 2 s (a concurrent
22
+ // dismissal TERM wins here), then probes SERVER LIVENESS: alive → exit
23
+ // silently (a lost canary is re-armed and logged by the supervise retrofit
24
+ // within a tick); dead with nobody having dismissed us → the death is real
25
+ // and unclaimed → ONE logfmt line `ev=server-exit` into exits.log (the
26
+ // per-peer death-cause home, next to pane-died's `ev=session-exit`) + a
27
+ // forensics snapshot (vm_stat / swap / top-RSS ps / fresh DiagnosticReports)
28
+ // captured within seconds — the evidence the 60 s supervise tick can never
21
29
  // recover ("системных следов ноль" was the recurring investigation outcome).
30
+ // The raw wait_rc still ATTRIBUTES the death (cause=server-vanished /
31
+ // signaled-server-gone / client-killed-server-gone).
32
+ // Residual blind spot (structural): a killer that SIGKILLs the sh recorder
33
+ // itself leaves no in-process way to record. With v2 the ABSENCE of a record on
34
+ // a server-dead reap narrows the diagnosis to exactly that shape; the canary
35
+ // ensure-state lines in lifecycle.log (origin=launch/retrofit) evidence the
36
+ // churn post-hoc.
22
37
  //
23
38
  // The canary is pure observability: it never wakes, reaps, restarts or otherwise
24
39
  // manages anything (H4-compatible by construction), it fires at most once, and
@@ -69,24 +84,45 @@ export function canaryScript(o: CanaryScriptOptions): string {
69
84
  return [
70
85
  // Server PID captured while alive — the postmortem grep key for system logs.
71
86
  `SPID="$('${o.tmuxBin}' -S '${o.sock}' display-message -p '#{pid}' 2>/dev/null)"`,
72
- // A signal to the CANARY is deliberate management (pre-clean pkill) → silent.
87
+ // A signal to the SH WRAPPER is deliberate dismissal (dismissCanary) or host
88
+ // shutdown → silent. POSIX: the trap runs after the foreground command
89
+ // completes — so a TERM delivered during wait-for/sleep always exits us
90
+ // BEFORE any recording below (the race-free deliberate-silence guarantee).
73
91
  `trap 'exit 0' HUP INT TERM`,
74
92
  `'${o.tmuxBin}' -S '${o.sock}' wait-for '${ch}'`,
75
93
  `rc=$?`,
76
- // rc=0 → clean-teardown signal; rc≥128 → the tmux client itself was signaled.
77
- `if [ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0; fi`,
78
- // The server vanished under us — record, within seconds of the death.
94
+ // NO wait-for exit code is trusted as "deliberate" by itself — PROVEN live
95
+ // (death-#4 postmortem): a TERM to the tmux CLIENT returns rc=0, identical
96
+ // to a clean channel signal, so an external killer sweeping server+client
97
+ // rides the clean-looking code straight past any rc-based guard (v1's
98
+ // `rc=0 || rc>=128 → silent` was exactly that hole). v2 contract instead:
99
+ // • deliberate teardowns DISMISS this sh (TERM → trap above) — the sleep
100
+ // below gives a concurrently-delivered dismissal time to win;
101
+ // • then the SERVER's liveness, not the exit code, decides: alive →
102
+ // nothing to record (a lost canary is re-armed and logged by the
103
+ // supervise retrofit within a tick); dead and nobody dismissed us →
104
+ // the death is real and unclaimed → record it.
105
+ `sleep 2`,
106
+ `if '${o.tmuxBin}' -S '${o.sock}' has-session 2>/dev/null; then exit 0; fi`,
107
+ // The exit code still ATTRIBUTES the recorded death (raw wait_rc is kept):
108
+ // rc=0 → signaled-server-gone (channel signal or client-TERM, server died)
109
+ // rc≥128 → client-killed-server-gone (client took a non-TERM kill)
110
+ // else → server-vanished (connection drop — SIGKILL/OOM class)
111
+ `cause=server-vanished`,
112
+ `if [ "$rc" -eq 0 ]; then cause=signaled-server-gone; fi`,
113
+ `if [ "$rc" -ge 128 ]; then cause=client-killed-server-gone; fi`,
114
+ // The server is gone under us — record, within seconds of the death.
79
115
  `ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"`,
80
116
  `f='${o.forensicsDir}/${o.identity}'-"$(date +%s)".txt`,
81
117
  `{`,
82
- ` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc"`,
118
+ ` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc cause=$cause"`,
83
119
  ` echo '--- vm_stat'; vm_stat`,
84
120
  ` echo '--- swapusage'; sysctl -n vm.swapusage`,
85
121
  ` echo '--- ps-top-rss'; ps axo pid,ppid,rss,etime,command | sort -rn -k3 | head -25`,
86
122
  ` echo '--- diagnosticreports-user'; ls -t "$HOME/Library/Logs/DiagnosticReports" 2>/dev/null | head`,
87
123
  ` echo '--- diagnosticreports-system'; ls -t /Library/Logs/DiagnosticReports 2>/dev/null | head`,
88
124
  `} > "$f" 2>&1`,
89
- `printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$f" >> '${o.exitLogFile}'`,
125
+ `printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s cause=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$cause" "$f" >> '${o.exitLogFile}'`,
90
126
  ].join('\n')
91
127
  }
92
128
 
@@ -150,3 +186,27 @@ export function signalCanaryClean(sock: string, identity: string): void {
150
186
  /* best-effort */
151
187
  }
152
188
  }
189
+
190
+ /** The pgrep/pkill ERE matching BOTH canary processes of ONE identity — the sh
191
+ * wrapper (its -c script quotes the channel: `…'iap-canary-<id>'…`) and the
192
+ * tmux client (argv ends with the bare channel). Anchored so an identity can
193
+ * never match another identity's prefix (claude-iapeer ≠ claude-iapeer-memory). */
194
+ export function canaryProcessPattern(identity: string): string {
195
+ return `${canaryChannel(identity)}([^a-z0-9-]|$)`
196
+ }
197
+
198
+ /**
199
+ * Dismiss this identity's canary BEFORE a deliberate server teardown: TERM the
200
+ * sh wrapper (trap → silent exit, race-free — the trap always runs before the
201
+ * v2 recording branch) and the tmux client. The explicit counterpart of the
202
+ * channel signal: with v2 a signaled CLIENT alone is no longer read as
203
+ * deliberate, so every deliberate path must dismiss the RECORDER (the sh).
204
+ * Best-effort: no canary running → harmless no-op (pkill exits 1).
205
+ */
206
+ export function dismissCanary(identity: string): void {
207
+ try {
208
+ spawnSync('pkill', ['-f', canaryProcessPattern(identity)], { stdio: 'ignore' })
209
+ } catch {
210
+ /* best-effort */
211
+ }
212
+ }
@@ -19,7 +19,8 @@ import { dirname } from 'path'
19
19
  import { spawnSync } from 'child_process'
20
20
  import { CODEX_BEARER_ENV_VAR, CODEX_DUMMY_BEARER, type Runtime } from '../core/constants.ts'
21
21
  import { readLaunchEnv } from '../storage/index.ts'
22
- import { ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
22
+ import { canaryChannel, dismissCanary, ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
23
+ import { appendLifecycleEvent } from '../lifecycle/eventlog.ts'
23
24
  import { claudeAdapter } from './adapters/claude.ts'
24
25
  import { codexAdapter } from './adapters/codex.ts'
25
26
  import { telegramAdapter } from './adapters/telegram.ts'
@@ -155,7 +156,18 @@ export function exitCauseHook(identity: string, exitLogFile: string): string {
155
156
  `dead_status=#{pane_dead_status} dead_signal=#{pane_dead_signal}\\n`
156
157
  const log =
157
158
  `printf "${line}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "${exitLogFile}"`
158
- return `run-shell '${log}' ; kill-session -t "${identity}"`
159
+ // Silence the server-death canary BEFORE kill-session: with a single-session
160
+ // server the kill empties the server and exit-empty takes it down — without
161
+ // explicit silence the canary would append a second, muddier `ev=server-exit`
162
+ // record for a death this very hook just captured (the session-exit line is
163
+ // the richer, authoritative one: it has the exit code/signal). v2 contract:
164
+ // deliberate = channel signal (tmux-NATIVE wait-for -S) + DISMISS the sh
165
+ // recorder (abs-path /usr/bin/pkill — survives the minimal launchd PATH;
166
+ // `\$` keeps the regex anchor literal through the sh double-quote layer; the
167
+ // `[i]` class keeps the pattern from matching its OWN occurrence in this
168
+ // hook-sh's cmdline — the pgrep self-match classic).
169
+ const dismiss = `/usr/bin/pkill -f "[i]${canaryChannel(identity).slice(1)}([^a-z0-9-]|\\$)"`
170
+ return `run-shell '${log} ; ${dismiss}' ; wait-for -S "${canaryChannel(identity)}" ; kill-session -t "${identity}"`
159
171
  }
160
172
 
161
173
  /** Install the exit-cause observability on a freshly-created session: ensure the
@@ -212,11 +224,15 @@ export const launch: LaunchFn = async (
212
224
  mkdirSync(dirname(sock), { recursive: true })
213
225
 
214
226
  // (1) Pre-clean any stale tmux server on this socket, then launch detached.
215
- // Signal the server-death canary FIRST: this teardown is deliberate, the
216
- // canary must exit silently, not record a false server-death (the pkill
217
- // below also matches the canary's cmdline by design, it sweeps a stale
218
- // canary along with the stale server; a signaled canary is already gone).
227
+ // Silence the server-death canary EXPLICITLY first — this teardown is
228
+ // deliberate: signal the channel (client exits 0) AND dismiss the sh
229
+ // recorder (TERM → trap → silent; canary v2 no longer reads a signaled
230
+ // client alone as deliberate — an external killer sweeping server+client
231
+ // was exactly the death-#4 silence). Only then sweep the server processes
232
+ // (the pkill also matches a leftover canary client — harmless, both
233
+ // canary processes are already dismissed).
219
234
  signalCanaryClean(sock, identity)
235
+ dismissCanary(identity)
220
236
  spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
221
237
  tmux(sock, 'kill-server')
222
238
 
@@ -268,7 +284,14 @@ export const launch: LaunchFn = async (
268
284
  // that records `ev=server-exit` + a forensics snapshot when the whole
269
285
  // server dies dirty (SIGKILL/OOM class). Same gate as the exit hook;
270
286
  // best-effort by construction (every failure → a state, never a throw).
271
- ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
287
+ // The ensure-state is logged (origin=launch) so the arming trail is
288
+ // complete in lifecycle.log: 'spawned' is the newborn-server norm here,
289
+ // 'failed' a newborn server starting its life UNWATCHED (death-#4
290
+ // postmortem hinged on this very question being unanswerable).
291
+ const canaryState = ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
292
+ if (canaryState !== 'skipped') {
293
+ appendLifecycleEvent(cfg.exitLogDir, { ev: 'canary', identity, state: canaryState, origin: 'launch' }, { env })
294
+ }
272
295
 
273
296
  // (3) pipe-pane the session output to the per-identity log.
274
297
  mkdirSync(cfg.logDir, { recursive: true, mode: 0o700 })
@@ -227,6 +227,14 @@ describe('exitCauseHook (exit-cause observability)', () => {
227
227
  // tmux-NATIVE kill-session (no shell `tmux`) → needs no PATH (launchd minimal env).
228
228
  expect(hook).toContain('kill-session -t "claude-iapeer"')
229
229
  })
230
+ test('silences the server-death canary (native wait-for -S) BEFORE kill-session — no double record', () => {
231
+ // kill-session on a single-session server → exit-empty takes the server down;
232
+ // without the signal the canary would add a second `ev=server-exit` record for
233
+ // a death this hook just captured (session-exit carries the code/signal).
234
+ expect(hook).toContain('wait-for -S "iap-canary-claude-iapeer"')
235
+ expect(hook.indexOf('wait-for -S')).toBeLessThan(hook.indexOf('kill-session'))
236
+ expect(hook.indexOf('run-shell')).toBeLessThan(hook.indexOf('wait-for -S')) // log first
237
+ })
230
238
  test('quoting: single-quoted run-shell arg (tmux layer) wrapping double-quoted sh', () => {
231
239
  expect(hook).toMatch(/run-shell '.*'/)
232
240
  expect(hook).not.toContain("''") // no empty/again-collapsed single-quote pair
@@ -23,7 +23,7 @@ import { buildProcessAddress, buildSocketPath } from '../core/socket.ts'
23
23
  import { peerLogsDir, pluginLogsDir } from '../storage/index.ts'
24
24
  import { readPeerProfile } from '../identity/index.ts'
25
25
  import { getAdapter, launch } from './index.ts'
26
- import { signalCanaryClean } from './canary.ts'
26
+ import { dismissCanary, signalCanaryClean } from './canary.ts'
27
27
  import type { LaunchConfig, LaunchSpec } from './types.ts'
28
28
 
29
29
  /** Block-watch poll cadence — seconds, deliberately NOT a tight loop (the session
@@ -55,7 +55,10 @@ function sessionAlive(sock: string, identity: string): boolean {
55
55
  * both paths now converge on the same end state.
56
56
  */
57
57
  export function teardownAlwaysOnSession(sock: string, identity: string): void {
58
+ // Explicit canary silence (v2): channel signal (client exits 0) + dismiss the
59
+ // sh recorder — a signaled client alone is no longer read as deliberate.
58
60
  signalCanaryClean(sock, identity)
61
+ dismissCanary(identity)
59
62
  spawnSync('tmux', ['-S', sock, 'kill-session', '-t', identity], { stdio: 'ignore' })
60
63
  const ls = spawnSync('tmux', ['-S', sock, 'list-sessions', '-F', '#{session_name}'], { encoding: 'utf8' })
61
64
  if (!(ls.stdout ?? '').trim()) {
@@ -46,7 +46,7 @@ import {
46
46
  type LaunchSpec,
47
47
  } from '../launch/index.ts'
48
48
  import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
49
- import { ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
49
+ import { dismissCanary, ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
50
50
  import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
51
51
 
52
52
  // ─────────────────────────────────────────────────────────────────────────────
@@ -1009,9 +1009,11 @@ export function killSession(sock: string, identity: string): void {
1009
1009
  tmux(sock, 'kill-session', '-t', identity)
1010
1010
  const sessions = tmux(sock, 'list-sessions', '-F', '#{session_name}').out
1011
1011
  if (!sessions.trim()) {
1012
- // Deliberate server teardown → signal the server-death canary FIRST so it
1013
- // exits silently instead of recording a false `ev=server-exit` (canary.ts).
1012
+ // Deliberate server teardown → silence the canary EXPLICITLY first: signal
1013
+ // the channel (client exits 0) AND dismiss the sh recorder (canary v2 no
1014
+ // longer reads a signaled client alone as deliberate — see canary.ts).
1014
1015
  signalCanaryClean(sock, identity)
1016
+ dismissCanary(identity)
1015
1017
  tmux(sock, 'kill-server')
1016
1018
  try {
1017
1019
  rmSync(sock, { force: true })
@@ -1190,7 +1192,15 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
1190
1192
  // every ALIVE daemon-owned server — covers a fleet launched by older code
1191
1193
  // within one tick of a deploy, no session restarts. Idempotent (pgrep on
1192
1194
  // the per-identity wait-for channel), best-effort, pure observability.
1193
- ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
1195
+ // A non-'already' state IS a decision-grade event (not verbose-gated):
1196
+ // post-0.2.22 every launch arms a canary, so a retrofit 'spawned' on an
1197
+ // alive server means the previous canary VANISHED mid-watch (the death-#4
1198
+ // blind spot was exactly this churn being invisible); 'failed' means the
1199
+ // server is currently UNWATCHED. At most one line per loss, not per tick.
1200
+ const canaryState = ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
1201
+ if (canaryState !== 'already') {
1202
+ trace({ identity: s.identity, action: 'canary', state: canaryState, origin: 'retrofit' })
1203
+ }
1194
1204
  out.push({ identity: s.identity, action: 'alive' })
1195
1205
  if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
1196
1206
  }