@agfpd/iapeer 0.2.26 → 0.2.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/launch/canary.test.ts +93 -5
- package/src/launch/canary.ts +76 -16
- package/src/launch/index.ts +30 -7
- package/src/launch/launch.test.ts +8 -0
- package/src/launch/launchdRun.ts +4 -1
- package/src/lifecycle/index.ts +14 -4
package/package.json
CHANGED
|
package/src/launch/canary.test.ts
CHANGED
|
@@ -10,7 +10,9 @@ import { tmpdir } from 'os'
|
|
|
10
10
|
import { join } from 'path'
|
|
11
11
|
import {
|
|
12
12
|
canaryChannel,
|
|
13
|
+
canaryProcessPattern,
|
|
13
14
|
canaryScript,
|
|
15
|
+
dismissCanary,
|
|
14
16
|
ensureServerCanary,
|
|
15
17
|
exitLogPath,
|
|
16
18
|
serverDeathsDir,
|
|
@@ -53,7 +55,7 @@ describe('canary script (pure)', () => {
|
|
|
53
55
|
expect(exitLogPath('/r/logs/iapeer')).toBe('/r/logs/iapeer/exits.log')
|
|
54
56
|
})
|
|
55
57
|
|
|
56
|
-
test('script carries the protocol: wait-for channel,
|
|
58
|
+
test('script carries the v2 protocol: wait-for channel, deliberate-silence guards, liveness probe, record line, forensics', () => {
|
|
57
59
|
const s = canaryScript({
|
|
58
60
|
identity: 'claude-bob',
|
|
59
61
|
sock: '/tmp/x.sock',
|
|
@@ -62,8 +64,15 @@ describe('canary script (pure)', () => {
|
|
|
62
64
|
forensicsDir: '/r/logs/iapeer/server-deaths',
|
|
63
65
|
})
|
|
64
66
|
expect(s).toContain(`wait-for 'iap-canary-claude-bob'`) // the blocking client
|
|
65
|
-
expect(s).toContain(`trap 'exit 0' HUP INT TERM`) //
|
|
66
|
-
|
|
67
|
+
expect(s).toContain(`trap 'exit 0' HUP INT TERM`) // dismissed sh = silent (the ONLY deliberate silencer)
|
|
68
|
+
// v2: NO exit code is trusted as deliberate (a TERMed client returns rc=0 —
|
|
69
|
+
// proven live); the SERVER's liveness decides, after a dismissal grace sleep.
|
|
70
|
+
expect(s).not.toContain(`[ "$rc" -eq 0 ] || [ "$rc" -ge 128 ]; then exit 0`) // the v1 hole
|
|
71
|
+
expect(s).toContain('sleep 2') // dismissal grace window
|
|
72
|
+
expect(s).toContain(`has-session 2>/dev/null; then exit 0`) // server alive → nothing to record
|
|
73
|
+
expect(s).toContain('cause=server-vanished') // connection drop (SIGKILL/OOM class)
|
|
74
|
+
expect(s).toContain('cause=signaled-server-gone') // rc=0: channel/client-TERM, server died
|
|
75
|
+
expect(s).toContain('cause=client-killed-server-gone') // rc≥128: client took a hard kill
|
|
67
76
|
expect(s).toContain('ev=server-exit identity=claude-bob') // the exits.log record
|
|
68
77
|
expect(s).toContain(`>> '/r/logs/iapeer/exits.log'`)
|
|
69
78
|
expect(s).toContain('/r/logs/iapeer/server-deaths/claude-bob') // forensics file
|
|
@@ -74,6 +83,19 @@ describe('canary script (pure)', () => {
|
|
|
74
83
|
test('ensureServerCanary without exitLogDir → skipped (observability off)', () => {
|
|
75
84
|
expect(ensureServerCanary({ identity: 'claude-none', sock: '/tmp/none.sock' })).toBe('skipped')
|
|
76
85
|
})
|
|
86
|
+
|
|
87
|
+
test('canaryProcessPattern is identity-anchored (no prefix bleed) and matches both canary processes', () => {
|
|
88
|
+
const p = canaryProcessPattern('claude-iap')
|
|
89
|
+
expect(p).toBe('iap-canary-claude-iap([^a-z0-9-]|$)')
|
|
90
|
+
const re = new RegExp(p)
|
|
91
|
+
expect(re.test(`/opt/homebrew/bin/tmux -S /tmp/x.sock wait-for iap-canary-claude-iap`)).toBe(true) // client argv
|
|
92
|
+
expect(re.test(`/bin/sh -c ... wait-for 'iap-canary-claude-iap'\n...`)).toBe(true) // sh -c script (quoted channel)
|
|
93
|
+
expect(re.test(`tmux wait-for iap-canary-claude-iap-memory`)).toBe(false) // prefix identity must NOT match
|
|
94
|
+
})
|
|
95
|
+
|
|
96
|
+
test('dismissCanary is a harmless no-op when no canary runs', () => {
|
|
97
|
+
expect(() => dismissCanary('claude-nobody-here')).not.toThrow()
|
|
98
|
+
})
|
|
77
99
|
})
|
|
78
100
|
|
|
79
101
|
describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
|
|
@@ -90,11 +112,22 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
|
|
|
90
112
|
return { sock, logDir: join(dir, `logs-${identity}`) }
|
|
91
113
|
}
|
|
92
114
|
|
|
115
|
+
const allIds = [
|
|
116
|
+
'claude-canadirty',
|
|
117
|
+
'claude-canaclean',
|
|
118
|
+
'claude-canakill',
|
|
119
|
+
'notifier-canatear',
|
|
120
|
+
'claude-canasweep',
|
|
121
|
+
'claude-canaclient',
|
|
122
|
+
]
|
|
123
|
+
|
|
93
124
|
afterAll(() => {
|
|
94
125
|
for (const sock of socks) {
|
|
95
|
-
// teardown is DELIBERATE →
|
|
96
|
-
|
|
126
|
+
// teardown is DELIBERATE → silence each canary (signal + dismiss) before
|
|
127
|
+
// killing its server — the v2 contract for every deliberate path.
|
|
128
|
+
for (const id of allIds) {
|
|
97
129
|
signalCanaryClean(sock, id)
|
|
130
|
+
dismissCanary(id)
|
|
98
131
|
}
|
|
99
132
|
spawnSync('tmux', ['-S', sock, 'kill-server'], { stdio: 'ignore' })
|
|
100
133
|
}
|
|
@@ -197,4 +230,59 @@ describe.if(tmuxAvailable)('canary live (sandbox tmux servers)', () => {
|
|
|
197
230
|
},
|
|
198
231
|
20000,
|
|
199
232
|
)
|
|
233
|
+
|
|
234
|
+
test(
|
|
235
|
+
'death-#4 shape: external killer sweeps server AND canary client → ev=server-exit cause=client-signaled',
|
|
236
|
+
async () => {
|
|
237
|
+
const identity = 'claude-canasweep'
|
|
238
|
+
const { sock, logDir } = bringUp(identity)
|
|
239
|
+
const pid = serverPid(sock)
|
|
240
|
+
expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
|
|
241
|
+
expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
|
|
242
|
+
await sleep(500)
|
|
243
|
+
|
|
244
|
+
// The pre-clean-shaped external killer: one pattern takes the server AND
|
|
245
|
+
// the canary CLIENT (both argv contain `tmux -S <sock> `), the sh recorder
|
|
246
|
+
// survives. The TERMed client returns rc=0 (proven live) — v1 read that as
|
|
247
|
+
// a clean channel signal and stayed silent; exactly how deaths #4–#6
|
|
248
|
+
// (10.06) left zero records. v2 probes the server instead: dead → record.
|
|
249
|
+
spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
|
|
250
|
+
const log = exitLogPath(logDir)
|
|
251
|
+
expect(
|
|
252
|
+
await waitFor(() => existsSync(log) && readFileSync(log, 'utf8').includes('ev=server-exit'), 10000),
|
|
253
|
+
).toBe(true)
|
|
254
|
+
const line = readFileSync(log, 'utf8')
|
|
255
|
+
expect(line).toContain(`ev=server-exit identity=${identity}`)
|
|
256
|
+
expect(line).toContain('cause=signaled-server-gone') // rc=0 shape, server found dead
|
|
257
|
+
expect(line).toContain(`server_pid=${pid}`)
|
|
258
|
+
expect(readdirSync(serverDeathsDir(logDir)).length).toBe(1) // forensics captured
|
|
259
|
+
},
|
|
260
|
+
20000,
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
test(
|
|
264
|
+
'canary client killed while server lives → silent (no false record), canary gone for the retrofit to re-arm',
|
|
265
|
+
async () => {
|
|
266
|
+
const identity = 'claude-canaclient'
|
|
267
|
+
const { sock, logDir } = bringUp(identity)
|
|
268
|
+
expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
|
|
269
|
+
expect(await waitFor(() => canaryRunning(identity), 3000)).toBe(true)
|
|
270
|
+
await sleep(500)
|
|
271
|
+
|
|
272
|
+
// TERM the CLIENT only (argv ends with the bare channel — the sh's quoted
|
|
273
|
+
// form does not match this $-anchored pattern), server stays up.
|
|
274
|
+
spawnSync('pkill', ['-f', `wait-for ${canaryChannel(identity)}$`], { stdio: 'ignore' })
|
|
275
|
+
// the sh probes (≈2 s), finds the server ALIVE → exits silently
|
|
276
|
+
expect(await waitFor(() => !canaryRunning(identity), 6000)).toBe(true)
|
|
277
|
+
await sleep(300)
|
|
278
|
+
expect(
|
|
279
|
+
existsSync(exitLogPath(logDir)) && readFileSync(exitLogPath(logDir), 'utf8').includes('ev=server-exit'),
|
|
280
|
+
).toBe(false) // no false server-death while the server lives
|
|
281
|
+
// server must still be alive — and ensure re-arms a fresh canary (the
|
|
282
|
+
// retrofit path; in prod the supervise tick does this and logs it)
|
|
283
|
+
expect(spawnSync('tmux', ['-S', sock, 'has-session', '-t', identity], { stdio: 'ignore' }).status).toBe(0)
|
|
284
|
+
expect(ensureServerCanary({ identity, sock, exitLogDir: logDir })).toBe('spawned')
|
|
285
|
+
},
|
|
286
|
+
20000,
|
|
287
|
+
)
|
|
200
288
|
})
|
package/src/launch/canary.ts
CHANGED
|
@@ -8,17 +8,32 @@
|
|
|
8
8
|
// Mechanism: one tiny detached `sh` per LIVE tmux server, holding a blocking
|
|
9
9
|
// client `tmux -S <sock> wait-for iap-canary-<identity>` — a process OUTSIDE the
|
|
10
10
|
// dying server, connected via its socket, so the server's death (any cause,
|
|
11
|
-
// including SIGKILL) is observed the moment the connection drops. Protocol
|
|
12
|
-
//
|
|
13
|
-
//
|
|
14
|
-
//
|
|
15
|
-
//
|
|
16
|
-
// •
|
|
17
|
-
//
|
|
18
|
-
//
|
|
19
|
-
// (
|
|
20
|
-
//
|
|
11
|
+
// including SIGKILL) is observed the moment the connection drops. Protocol (v2 —
|
|
12
|
+
// death #4 10.06 15:42Z proved v1's exit-code trust wrong twice over: the killer
|
|
13
|
+
// took the SERVER and the canary CLIENT together, AND a TERMed tmux client
|
|
14
|
+
// returns rc=0 — indistinguishable from a clean channel signal — so the rc-based
|
|
15
|
+
// `0 || ≥128 → silent` guard silenced a real death):
|
|
16
|
+
// • DELIBERATE silence is an explicit act, never an exit-code inference:
|
|
17
|
+
// every deliberate teardown (idle-reap / stop / pre-clean / pane-died hook /
|
|
18
|
+
// bootout teardown) signals the channel (`wait-for -S`) AND dismisses the
|
|
19
|
+
// sh recorder (`dismissCanary` → TERM → trap → silent exit). POSIX trap
|
|
20
|
+
// semantics make this race-free: the trap runs before any recording.
|
|
21
|
+
// • When wait-for returns — ANY code — the script sleeps 2 s (a concurrent
|
|
22
|
+
// dismissal TERM wins here), then probes SERVER LIVENESS: alive → exit
|
|
23
|
+
// silently (a lost canary is re-armed and logged by the supervise retrofit
|
|
24
|
+
// within a tick); dead with nobody having dismissed us → the death is real
|
|
25
|
+
// and unclaimed → ONE logfmt line `ev=server-exit` into exits.log (the
|
|
26
|
+
// per-peer death-cause home, next to pane-died's `ev=session-exit`) + a
|
|
27
|
+
// forensics snapshot (vm_stat / swap / top-RSS ps / fresh DiagnosticReports)
|
|
28
|
+
// captured within seconds — the evidence the 60 s supervise tick can never
|
|
21
29
|
// recover ("системных следов ноль" was the recurring investigation outcome).
|
|
30
|
+
// The raw wait_rc still ATTRIBUTES the death (cause=server-vanished /
|
|
31
|
+
// signaled-server-gone / client-killed-server-gone).
|
|
32
|
+
// Residual blind spot (structural): a killer that SIGKILLs the sh recorder
|
|
33
|
+
// itself leaves no in-process way to record. With v2 the ABSENCE of a record on
|
|
34
|
+
// a server-dead reap narrows the diagnosis to exactly that shape; the canary
|
|
35
|
+
// ensure-state lines in lifecycle.log (origin=launch/retrofit) evidence the
|
|
36
|
+
// churn post-hoc.
|
|
22
37
|
//
|
|
23
38
|
// The canary is pure observability: it never wakes, reaps, restarts or otherwise
|
|
24
39
|
// manages anything (H4-compatible by construction), it fires at most once, and
|
|
@@ -69,24 +84,45 @@ export function canaryScript(o: CanaryScriptOptions): string {
|
|
|
69
84
|
return [
|
|
70
85
|
// Server PID captured while alive — the postmortem grep key for system logs.
|
|
71
86
|
`SPID="$('${o.tmuxBin}' -S '${o.sock}' display-message -p '#{pid}' 2>/dev/null)"`,
|
|
72
|
-
// A signal to the
|
|
87
|
+
// A signal to the SH WRAPPER is deliberate dismissal (dismissCanary) or host
|
|
88
|
+
// shutdown → silent. POSIX: the trap runs after the foreground command
|
|
89
|
+
// completes — so a TERM delivered during wait-for/sleep always exits us
|
|
90
|
+
// BEFORE any recording below (the race-free deliberate-silence guarantee).
|
|
73
91
|
`trap 'exit 0' HUP INT TERM`,
|
|
74
92
|
`'${o.tmuxBin}' -S '${o.sock}' wait-for '${ch}'`,
|
|
75
93
|
`rc=$?`,
|
|
76
|
-
//
|
|
77
|
-
|
|
78
|
-
//
|
|
94
|
+
// NO wait-for exit code is trusted as "deliberate" by itself — PROVEN live
|
|
95
|
+
// (death-#4 postmortem): a TERM to the tmux CLIENT returns rc=0, identical
|
|
96
|
+
// to a clean channel signal, so an external killer sweeping server+client
|
|
97
|
+
// rides the clean-looking code straight past any rc-based guard (v1's
|
|
98
|
+
// `rc=0 || rc>=128 → silent` was exactly that hole). v2 contract instead:
|
|
99
|
+
// • deliberate teardowns DISMISS this sh (TERM → trap above) — the sleep
|
|
100
|
+
// below gives a concurrently-delivered dismissal time to win;
|
|
101
|
+
// • then the SERVER's liveness, not the exit code, decides: alive →
|
|
102
|
+
// nothing to record (a lost canary is re-armed and logged by the
|
|
103
|
+
// supervise retrofit within a tick); dead and nobody dismissed us →
|
|
104
|
+
// the death is real and unclaimed → record it.
|
|
105
|
+
`sleep 2`,
|
|
106
|
+
`if '${o.tmuxBin}' -S '${o.sock}' has-session 2>/dev/null; then exit 0; fi`,
|
|
107
|
+
// The exit code still ATTRIBUTES the recorded death (raw wait_rc is kept):
|
|
108
|
+
// rc=0 → signaled-server-gone (channel signal or client-TERM, server died)
|
|
109
|
+
// rc≥128 → client-killed-server-gone (client took a non-TERM kill)
|
|
110
|
+
// else → server-vanished (connection drop — SIGKILL/OOM class)
|
|
111
|
+
`cause=server-vanished`,
|
|
112
|
+
`if [ "$rc" -eq 0 ]; then cause=signaled-server-gone; fi`,
|
|
113
|
+
`if [ "$rc" -ge 128 ]; then cause=client-killed-server-gone; fi`,
|
|
114
|
+
// The server is gone under us — record, within seconds of the death.
|
|
79
115
|
`ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"`,
|
|
80
116
|
`f='${o.forensicsDir}/${o.identity}'-"$(date +%s)".txt`,
|
|
81
117
|
`{`,
|
|
82
|
-
` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc"`,
|
|
118
|
+
` echo "server-death identity=${o.identity} ts=$ts server_pid=$SPID wait_rc=$rc cause=$cause"`,
|
|
83
119
|
` echo '--- vm_stat'; vm_stat`,
|
|
84
120
|
` echo '--- swapusage'; sysctl -n vm.swapusage`,
|
|
85
121
|
` echo '--- ps-top-rss'; ps axo pid,ppid,rss,etime,command | sort -rn -k3 | head -25`,
|
|
86
122
|
` echo '--- diagnosticreports-user'; ls -t "$HOME/Library/Logs/DiagnosticReports" 2>/dev/null | head`,
|
|
87
123
|
` echo '--- diagnosticreports-system'; ls -t /Library/Logs/DiagnosticReports 2>/dev/null | head`,
|
|
88
124
|
`} > "$f" 2>&1`,
|
|
89
|
-
`printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$f" >> '${o.exitLogFile}'`,
|
|
125
|
+
`printf 'ts=%s ev=server-exit identity=${o.identity} server_pid=%s wait_rc=%s cause=%s forensics=%s\\n' "$ts" "$SPID" "$rc" "$cause" "$f" >> '${o.exitLogFile}'`,
|
|
90
126
|
].join('\n')
|
|
91
127
|
}
|
|
92
128
|
|
|
@@ -150,3 +186,27 @@ export function signalCanaryClean(sock: string, identity: string): void {
|
|
|
150
186
|
/* best-effort */
|
|
151
187
|
}
|
|
152
188
|
}
|
|
189
|
+
|
|
190
|
+
/** The pgrep/pkill ERE matching BOTH canary processes of ONE identity — the sh
|
|
191
|
+
* wrapper (its -c script quotes the channel: `…'iap-canary-<id>'…`) and the
|
|
192
|
+
* tmux client (argv ends with the bare channel). Anchored so an identity can
|
|
193
|
+
* never match another identity's prefix (claude-iapeer ≠ claude-iapeer-memory). */
|
|
194
|
+
export function canaryProcessPattern(identity: string): string {
|
|
195
|
+
return `${canaryChannel(identity)}([^a-z0-9-]|$)`
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/**
|
|
199
|
+
* Dismiss this identity's canary BEFORE a deliberate server teardown: TERM the
|
|
200
|
+
* sh wrapper (trap → silent exit, race-free — the trap always runs before the
|
|
201
|
+
* v2 recording branch) and the tmux client. The explicit counterpart of the
|
|
202
|
+
* channel signal: with v2 a signaled CLIENT alone is no longer read as
|
|
203
|
+
* deliberate, so every deliberate path must dismiss the RECORDER (the sh).
|
|
204
|
+
* Best-effort: no canary running → harmless no-op (pkill exits 1).
|
|
205
|
+
*/
|
|
206
|
+
export function dismissCanary(identity: string): void {
|
|
207
|
+
try {
|
|
208
|
+
spawnSync('pkill', ['-f', canaryProcessPattern(identity)], { stdio: 'ignore' })
|
|
209
|
+
} catch {
|
|
210
|
+
/* best-effort */
|
|
211
|
+
}
|
|
212
|
+
}
|
package/src/launch/index.ts
CHANGED
|
@@ -19,7 +19,8 @@ import { dirname } from 'path'
|
|
|
19
19
|
import { spawnSync } from 'child_process'
|
|
20
20
|
import { CODEX_BEARER_ENV_VAR, CODEX_DUMMY_BEARER, type Runtime } from '../core/constants.ts'
|
|
21
21
|
import { readLaunchEnv } from '../storage/index.ts'
|
|
22
|
-
import { ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
|
|
22
|
+
import { canaryChannel, dismissCanary, ensureServerCanary, exitLogPath, signalCanaryClean } from './canary.ts'
|
|
23
|
+
import { appendLifecycleEvent } from '../lifecycle/eventlog.ts'
|
|
23
24
|
import { claudeAdapter } from './adapters/claude.ts'
|
|
24
25
|
import { codexAdapter } from './adapters/codex.ts'
|
|
25
26
|
import { telegramAdapter } from './adapters/telegram.ts'
|
|
@@ -155,7 +156,18 @@ export function exitCauseHook(identity: string, exitLogFile: string): string {
|
|
|
155
156
|
`dead_status=#{pane_dead_status} dead_signal=#{pane_dead_signal}\\n`
|
|
156
157
|
const log =
|
|
157
158
|
`printf "${line}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "${exitLogFile}"`
|
|
158
|
-
|
|
159
|
+
// Silence the server-death canary BEFORE kill-session: with a single-session
|
|
160
|
+
// server the kill empties the server and exit-empty takes it down — without
|
|
161
|
+
// explicit silence the canary would append a second, muddier `ev=server-exit`
|
|
162
|
+
// record for a death this very hook just captured (the session-exit line is
|
|
163
|
+
// the richer, authoritative one: it has the exit code/signal). v2 contract:
|
|
164
|
+
// deliberate = channel signal (tmux-NATIVE wait-for -S) + DISMISS the sh
|
|
165
|
+
// recorder (abs-path /usr/bin/pkill — survives the minimal launchd PATH;
|
|
166
|
+
// `\$` keeps the regex anchor literal through the sh double-quote layer; the
|
|
167
|
+
// `[i]` class keeps the pattern from matching its OWN occurrence in this
|
|
168
|
+
// hook-sh's cmdline — the pgrep self-match classic).
|
|
169
|
+
const dismiss = `/usr/bin/pkill -f "[i]${canaryChannel(identity).slice(1)}([^a-z0-9-]|\\$)"`
|
|
170
|
+
return `run-shell '${log} ; ${dismiss}' ; wait-for -S "${canaryChannel(identity)}" ; kill-session -t "${identity}"`
|
|
159
171
|
}
|
|
160
172
|
|
|
161
173
|
/** Install the exit-cause observability on a freshly-created session: ensure the
|
|
@@ -212,11 +224,15 @@ export const launch: LaunchFn = async (
|
|
|
212
224
|
mkdirSync(dirname(sock), { recursive: true })
|
|
213
225
|
|
|
214
226
|
// (1) Pre-clean any stale tmux server on this socket, then launch detached.
|
|
215
|
-
//
|
|
216
|
-
//
|
|
217
|
-
//
|
|
218
|
-
//
|
|
227
|
+
// Silence the server-death canary EXPLICITLY first — this teardown is
|
|
228
|
+
// deliberate: signal the channel (client exits 0) AND dismiss the sh
|
|
229
|
+
// recorder (TERM → trap → silent; canary v2 no longer reads a signaled
|
|
230
|
+
// client alone as deliberate — an external killer sweeping server+client
|
|
231
|
+
// was exactly the death-#4 silence). Only then sweep the server processes
|
|
232
|
+
// (the pkill also matches a leftover canary client — harmless, both
|
|
233
|
+
// canary processes are already dismissed).
|
|
219
234
|
signalCanaryClean(sock, identity)
|
|
235
|
+
dismissCanary(identity)
|
|
220
236
|
spawnSync('pkill', ['-f', `tmux -S ${sock} `], { stdio: 'ignore' })
|
|
221
237
|
tmux(sock, 'kill-server')
|
|
222
238
|
|
|
@@ -268,7 +284,14 @@ export const launch: LaunchFn = async (
|
|
|
268
284
|
// that records `ev=server-exit` + a forensics snapshot when the whole
|
|
269
285
|
// server dies dirty (SIGKILL/OOM class). Same gate as the exit hook;
|
|
270
286
|
// best-effort by construction (every failure → a state, never a throw).
|
|
271
|
-
|
|
287
|
+
// The ensure-state is logged (origin=launch) so the arming trail is
|
|
288
|
+
// complete in lifecycle.log: 'spawned' is the newborn-server norm here,
|
|
289
|
+
// 'failed' a newborn server starting its life UNWATCHED (death-#4
|
|
290
|
+
// postmortem hinged on this very question being unanswerable).
|
|
291
|
+
const canaryState = ensureServerCanary({ identity, sock, exitLogDir: cfg.exitLogDir, env })
|
|
292
|
+
if (canaryState !== 'skipped') {
|
|
293
|
+
appendLifecycleEvent(cfg.exitLogDir, { ev: 'canary', identity, state: canaryState, origin: 'launch' }, { env })
|
|
294
|
+
}
|
|
272
295
|
|
|
273
296
|
// (3) pipe-pane the session output to the per-identity log.
|
|
274
297
|
mkdirSync(cfg.logDir, { recursive: true, mode: 0o700 })
|
package/src/launch/launch.test.ts
CHANGED
|
@@ -227,6 +227,14 @@ describe('exitCauseHook (exit-cause observability)', () => {
|
|
|
227
227
|
// tmux-NATIVE kill-session (no shell `tmux`) → needs no PATH (launchd minimal env).
|
|
228
228
|
expect(hook).toContain('kill-session -t "claude-iapeer"')
|
|
229
229
|
})
|
|
230
|
+
test('silences the server-death canary (native wait-for -S) BEFORE kill-session — no double record', () => {
|
|
231
|
+
// kill-session on a single-session server → exit-empty takes the server down;
|
|
232
|
+
// without the signal the canary would add a second `ev=server-exit` record for
|
|
233
|
+
// a death this hook just captured (session-exit carries the code/signal).
|
|
234
|
+
expect(hook).toContain('wait-for -S "iap-canary-claude-iapeer"')
|
|
235
|
+
expect(hook.indexOf('wait-for -S')).toBeLessThan(hook.indexOf('kill-session'))
|
|
236
|
+
expect(hook.indexOf('run-shell')).toBeLessThan(hook.indexOf('wait-for -S')) // log first
|
|
237
|
+
})
|
|
230
238
|
test('quoting: single-quoted run-shell arg (tmux layer) wrapping double-quoted sh', () => {
|
|
231
239
|
expect(hook).toMatch(/run-shell '.*'/)
|
|
232
240
|
expect(hook).not.toContain("''") // no empty/again-collapsed single-quote pair
|
package/src/launch/launchdRun.ts
CHANGED
|
@@ -23,7 +23,7 @@ import { buildProcessAddress, buildSocketPath } from '../core/socket.ts'
|
|
|
23
23
|
import { peerLogsDir, pluginLogsDir } from '../storage/index.ts'
|
|
24
24
|
import { readPeerProfile } from '../identity/index.ts'
|
|
25
25
|
import { getAdapter, launch } from './index.ts'
|
|
26
|
-
import { signalCanaryClean } from './canary.ts'
|
|
26
|
+
import { dismissCanary, signalCanaryClean } from './canary.ts'
|
|
27
27
|
import type { LaunchConfig, LaunchSpec } from './types.ts'
|
|
28
28
|
|
|
29
29
|
/** Block-watch poll cadence — seconds, deliberately NOT a tight loop (the session
|
|
@@ -55,7 +55,10 @@ function sessionAlive(sock: string, identity: string): boolean {
|
|
|
55
55
|
* both paths now converge on the same end state.
|
|
56
56
|
*/
|
|
57
57
|
export function teardownAlwaysOnSession(sock: string, identity: string): void {
|
|
58
|
+
// Explicit canary silence (v2): channel signal (client exits 0) + dismiss the
|
|
59
|
+
// sh recorder — a signaled client alone is no longer read as deliberate.
|
|
58
60
|
signalCanaryClean(sock, identity)
|
|
61
|
+
dismissCanary(identity)
|
|
59
62
|
spawnSync('tmux', ['-S', sock, 'kill-session', '-t', identity], { stdio: 'ignore' })
|
|
60
63
|
const ls = spawnSync('tmux', ['-S', sock, 'list-sessions', '-F', '#{session_name}'], { encoding: 'utf8' })
|
|
61
64
|
if (!(ls.stdout ?? '').trim()) {
|
package/src/lifecycle/index.ts
CHANGED
|
@@ -46,7 +46,7 @@ import {
|
|
|
46
46
|
type LaunchSpec,
|
|
47
47
|
} from '../launch/index.ts'
|
|
48
48
|
import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
|
|
49
|
-
import { ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
|
|
49
|
+
import { dismissCanary, ensureServerCanary, signalCanaryClean } from '../launch/canary.ts'
|
|
50
50
|
import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
|
|
51
51
|
|
|
52
52
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -1009,9 +1009,11 @@ export function killSession(sock: string, identity: string): void {
|
|
|
1009
1009
|
tmux(sock, 'kill-session', '-t', identity)
|
|
1010
1010
|
const sessions = tmux(sock, 'list-sessions', '-F', '#{session_name}').out
|
|
1011
1011
|
if (!sessions.trim()) {
|
|
1012
|
-
// Deliberate server teardown →
|
|
1013
|
-
// exits
|
|
1012
|
+
// Deliberate server teardown → silence the canary EXPLICITLY first: signal
|
|
1013
|
+
// the channel (client exits 0) AND dismiss the sh recorder (canary v2 no
|
|
1014
|
+
// longer reads a signaled client alone as deliberate — see canary.ts).
|
|
1014
1015
|
signalCanaryClean(sock, identity)
|
|
1016
|
+
dismissCanary(identity)
|
|
1015
1017
|
tmux(sock, 'kill-server')
|
|
1016
1018
|
try {
|
|
1017
1019
|
rmSync(sock, { force: true })
|
|
@@ -1190,7 +1192,15 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
|
|
|
1190
1192
|
// every ALIVE daemon-owned server — covers a fleet launched by older code
|
|
1191
1193
|
// within one tick of a deploy, no session restarts. Idempotent (pgrep on
|
|
1192
1194
|
// the per-identity wait-for channel), best-effort, pure observability.
|
|
1193
|
-
|
|
1195
|
+
// A non-'already' state IS a decision-grade event (not verbose-gated):
|
|
1196
|
+
// post-0.2.22 every launch arms a canary, so a retrofit 'spawned' on an
|
|
1197
|
+
// alive server means the previous canary VANISHED mid-watch (the death-#4
|
|
1198
|
+
// blind spot was exactly this churn being invisible); 'failed' means the
|
|
1199
|
+
// server is currently UNWATCHED. At most one line per loss, not per tick.
|
|
1200
|
+
const canaryState = ensureServerCanary({ identity: s.identity, sock, exitLogDir: cfg.eventLogDir, env })
|
|
1201
|
+
if (canaryState !== 'already') {
|
|
1202
|
+
trace({ identity: s.identity, action: 'canary', state: canaryState, origin: 'retrofit' })
|
|
1203
|
+
}
|
|
1194
1204
|
out.push({ identity: s.identity, action: 'alive' })
|
|
1195
1205
|
if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
|
|
1196
1206
|
}
|