@agfpd/iapeer 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -2
- package/src/cli/cli.test.ts +23 -2
- package/src/cli/index.ts +90 -2
- package/src/core/constants.ts +7 -1
- package/src/daemon/main.ts +7 -2
- package/src/lifecycle/eventlog.test.ts +114 -0
- package/src/lifecycle/eventlog.ts +133 -0
- package/src/lifecycle/index.ts +292 -56
- package/src/lifecycle/lifecycle.test.ts +208 -63
- package/src/registry/registry.test.ts +33 -1
package/src/lifecycle/index.ts
CHANGED
|
@@ -40,6 +40,7 @@ import {
|
|
|
40
40
|
type LaunchSpec,
|
|
41
41
|
} from '../launch/index.ts'
|
|
42
42
|
import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
|
|
43
|
+
import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
|
|
43
44
|
|
|
44
45
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
45
46
|
// Config
|
|
@@ -51,10 +52,19 @@ export interface LifecycleConfig {
|
|
|
51
52
|
sockDir: string
|
|
52
53
|
stateDir: string // ~/.iapeer/state/lifecycle
|
|
53
54
|
logDir: string // ~/.iapeer/logs/lifecycle
|
|
55
|
+
/** Where the durable lifecycle DECISION log (lifecycle.log) is written
|
|
56
|
+
* (~/.iapeer/logs/iapeer — next to daemon-stdout/stderr.log, where the first
|
|
57
|
+
* investigator looks). Routed through cfg — NOT re-resolved from env — so it is
|
|
58
|
+
* isolated by the same sandbox as stateDir (eventlog.ts). */
|
|
59
|
+
eventLogDir: string
|
|
54
60
|
bootDeadlineSecs: number
|
|
55
61
|
readyGateSecs: number
|
|
56
62
|
idleSecs: number
|
|
57
63
|
maxAgeSecs: number
|
|
64
|
+
/** Crash-loop guard: refuse to (re)launch after this many deaths within the window. */
|
|
65
|
+
crashLoopMax: number
|
|
66
|
+
/** Crash-loop guard: the sliding window (seconds) the death count is measured over. */
|
|
67
|
+
crashLoopWindowSecs: number
|
|
58
68
|
}
|
|
59
69
|
|
|
60
70
|
export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): LifecycleConfig {
|
|
@@ -70,10 +80,13 @@ export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): Lifec
|
|
|
70
80
|
sockDir: resolveSockDir(env),
|
|
71
81
|
stateDir: join(root, STATE_DIR, 'lifecycle'),
|
|
72
82
|
logDir: join(root, LOGS_DIR, 'lifecycle'),
|
|
83
|
+
eventLogDir: join(root, LOGS_DIR, 'iapeer'),
|
|
73
84
|
bootDeadlineSecs: num(env.IAPEER_BOOT_DEADLINE_SECS, 240),
|
|
74
85
|
readyGateSecs: num(env.IAPEER_READY_GATE_SECS, 120),
|
|
75
86
|
idleSecs: num(env.IAPEER_IDLE_SECS, 3600),
|
|
76
87
|
maxAgeSecs: num(env.IAPEER_MAX_AGE_SECS, 14400),
|
|
88
|
+
crashLoopMax: num(env.IAPEER_CRASHLOOP_MAX, 3),
|
|
89
|
+
crashLoopWindowSecs: num(env.IAPEER_CRASHLOOP_WINDOW_SECS, 300),
|
|
77
90
|
}
|
|
78
91
|
}
|
|
79
92
|
|
|
@@ -168,42 +181,155 @@ export function clearStopped(cfg: LifecycleConfig, identity: string): void {
|
|
|
168
181
|
}
|
|
169
182
|
|
|
170
183
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
171
|
-
//
|
|
172
|
-
//
|
|
173
|
-
//
|
|
174
|
-
//
|
|
175
|
-
//
|
|
176
|
-
//
|
|
184
|
+
// Lifecycle markers — the DAEMON decides fresh-vs-resume by the DEATH CAUSE it
|
|
185
|
+
// tracks itself (TARGET redesign). Plain files in state/lifecycle/<identity>.* :
|
|
186
|
+
//
|
|
187
|
+
// .idle-reaped : written ONLY when the daemon idle-reaps the session (the only
|
|
188
|
+
// death the daemon initiates). Presence on the next wake = session was parked
|
|
189
|
+
// cleanly = RESUME-eligible. ABSENT on a dead session = it died on its own
|
|
190
|
+
// (crash / self-close) = FRESH. (resolver branch 3.)
|
|
191
|
+
// .new-eager : set when /new is invoked (owner reset, via `iapeer self-fresh`).
|
|
192
|
+
// Presence on a dead session = the daemon EAGERLY relaunches FRESH (does NOT
|
|
193
|
+
// wait for a message) and injects initial_prompt. Consumed on the relaunch.
|
|
194
|
+
// .deaths : crash-loop guard — a small JSON ring of recent death epoch-ms.
|
|
195
|
+
// .topic : the topic tag of the current/last session (executor fresh-vs-
|
|
196
|
+
// resume discriminator).
|
|
197
|
+
//
|
|
198
|
+
// Boolean markers carry an ISO timestamp line (audit-friendly); .deaths is a JSON
|
|
199
|
+
// array; .topic is the raw topic string.
|
|
177
200
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
178
201
|
|
|
179
|
-
function
|
|
180
|
-
return join(cfg.stateDir, `${identity}.
|
|
202
|
+
/** Build the path of the `<identity>.idle-reaped` marker file inside `cfg.stateDir`. */
function idleReapedPath(cfg: LifecycleConfig, identity: string): string {
  const markerName = `${identity}.idle-reaped`
  return join(cfg.stateDir, markerName)
}
|
|
205
|
+
|
|
206
|
+
/**
 * Report whether the `.idle-reaped` marker file exists for `identity`.
 * Under this file's marker scheme, presence means the daemon idle-reaped the
 * session, which makes the next wake RESUME-eligible.
 */
export function hasIdleReaped(cfg: LifecycleConfig, identity: string): boolean {
  const marker = idleReapedPath(cfg, identity)
  return existsSync(marker)
}
|
|
210
|
+
|
|
211
|
+
/**
 * Create the `.idle-reaped` marker for `identity` (an ISO timestamp line).
 * Intended to be written only by the idle-reap path in superviseTick. Not
 * best-effort: a failure to create the state dir or the file propagates.
 */
export function setIdleReaped(cfg: LifecycleConfig, identity: string): void {
  // Owner-only state directory; `recursive` makes this a no-op when it exists.
  mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
  const marker = idleReapedPath(cfg, identity)
  const stamp = `${new Date().toISOString()}\n`
  writeFileSync(marker, stamp, { mode: 0o600 })
}
|
|
216
|
+
|
|
217
|
+
/**
 * Remove the `.idle-reaped` marker for `identity`, if any. resolveWakeMode calls
 * this when it acts on the marker. Never throws: any removal error is ignored.
 */
export function clearIdleReaped(cfg: LifecycleConfig, identity: string): void {
  try {
    const marker = idleReapedPath(cfg, identity)
    // `force` already tolerates a missing file; the catch covers everything else.
    rmSync(marker, { force: true })
  } catch {
    /* already gone */
  }
}
|
|
182
225
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
return existsSync(newMarkPath(cfg, identity))
|
|
226
|
+
/** Build the path of the `<identity>.new-eager` marker file inside `cfg.stateDir`. */
function newEagerPath(cfg: LifecycleConfig, identity: string): string {
  const markerName = `${identity}.new-eager`
  return join(cfg.stateDir, markerName)
}
|
|
187
229
|
|
|
188
|
-
/**
|
|
189
|
-
export function
|
|
230
|
+
/**
 * Report whether the `.new-eager` marker file exists for `identity`.
 * superviseTick uses this on a dead session to flag an eager fresh re-launch.
 */
export function hasNewEager(cfg: LifecycleConfig, identity: string): boolean {
  const marker = newEagerPath(cfg, identity)
  return existsSync(marker)
}
|
|
234
|
+
|
|
235
|
+
/**
 * Create the `.new-eager` marker for `identity` (an ISO timestamp line).
 * Per the file's marker notes, the `self-fresh` verb sets it before self-kill.
 * Not best-effort: directory or file write failures propagate to the caller.
 */
export function setNewEager(cfg: LifecycleConfig, identity: string): void {
  // Owner-only state directory; `recursive` makes this a no-op when it exists.
  mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
  const marker = newEagerPath(cfg, identity)
  const stamp = `${new Date().toISOString()}\n`
  writeFileSync(marker, stamp, { mode: 0o600 })
}
|
|
193
240
|
|
|
194
|
-
/** Consume the /new mark (the daemon does this on the eager
|
|
195
|
-
export function
|
|
241
|
+
/**
 * Remove the `.new-eager` marker for `identity`, if any. processEagerRelaunches
 * calls this before the eager relaunch. Never throws: removal errors are ignored.
 */
export function clearNewEager(cfg: LifecycleConfig, identity: string): void {
  try {
    const marker = newEagerPath(cfg, identity)
    // `force` already tolerates a missing file; the catch covers everything else.
    rmSync(marker, { force: true })
  } catch {
    /* already gone */
  }
}
|
|
202
249
|
|
|
250
|
+
/** Build the path of the `<identity>.deaths` crash-loop ring file inside `cfg.stateDir`. */
function deathsPath(cfg: LifecycleConfig, identity: string): string {
  const ringName = `${identity}.deaths`
  return join(cfg.stateDir, ringName)
}
|
|
253
|
+
|
|
254
|
+
/**
 * Load the crash-loop death ring for `identity`: a JSON array of epoch-ms stamps.
 *
 * @returns the finite numbers found in the array, in file order. A missing or
 *   unreadable file, invalid JSON, or a non-array value all yield `[]`;
 *   non-numeric / non-finite entries are dropped individually.
 */
export function readDeaths(cfg: LifecycleConfig, identity: string): number[] {
  try {
    const parsed = JSON.parse(readFileSync(deathsPath(cfg, identity), 'utf8'))
    if (!Array.isArray(parsed)) return []
    const stamps: number[] = []
    for (const entry of parsed) {
      if (typeof entry === 'number' && Number.isFinite(entry)) stamps.push(entry)
    }
    return stamps
  } catch {
    return []
  }
}
|
|
263
|
+
|
|
264
|
+
/**
 * Append a death timestamp to the crash-loop ring of `identity`.
 *
 * Strictly best-effort: this is accounting only and must never block a reap, so
 * EVERY filesystem step — including creating the state directory — is inside the
 * try. (Previously `mkdirSync` ran outside it, so a directory failure would throw
 * out of the caller.)
 *
 * @param cfg      lifecycle config; only `stateDir` is used here.
 * @param identity session identity the ring belongs to.
 * @param nowMs    death time in epoch milliseconds (defaults to `Date.now()`).
 */
export function recordDeath(cfg: LifecycleConfig, identity: string, nowMs: number = Date.now()): void {
  try {
    mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
    // Keep the ring small — only the 16 most recent deaths are retained.
    const next = [...readDeaths(cfg, identity), nowMs].slice(-16)
    writeFileSync(deathsPath(cfg, identity), JSON.stringify(next), { mode: 0o600 })
  } catch {
    /* best-effort accounting; never block a reap */
  }
}
|
|
275
|
+
|
|
276
|
+
/**
 * Count the recorded deaths of `identity` that fall inside the sliding window
 * ending at `nowMs` (the crash-loop guard input).
 *
 * @param windowSecs window length in seconds.
 * @param nowMs      window end in epoch milliseconds (defaults to `Date.now()`).
 * @returns number of ring entries with a timestamp >= `nowMs - windowSecs * 1000`.
 */
export function countRecentDeaths(
  cfg: LifecycleConfig,
  identity: string,
  windowSecs: number,
  nowMs: number = Date.now(),
): number {
  const oldestCounted = nowMs - windowSecs * 1000
  let recent = 0
  for (const stamp of readDeaths(cfg, identity)) {
    if (stamp >= oldestCounted) recent += 1
  }
  return recent
}
|
|
286
|
+
|
|
287
|
+
/**
 * Drop death-ring entries older than the window ending at `nowMs` (wakeOrSpawn
 * calls this after a successful wake).
 *
 * Entries INSIDE the window are kept, so this does not lower what
 * countRecentDeaths reports for the same window — it only prunes stale history.
 * When nothing survives, the ring file is removed; otherwise it is rewritten
 * with the survivors. Write/remove failures are ignored (best-effort).
 *
 * @param windowSecs window length in seconds.
 * @param nowMs      window end in epoch milliseconds (defaults to `Date.now()`).
 */
export function trimDeaths(
  cfg: LifecycleConfig,
  identity: string,
  windowSecs: number,
  nowMs: number = Date.now(),
): void {
  const cutoff = nowMs - windowSecs * 1000
  const survivors = readDeaths(cfg, identity).filter(stamp => stamp >= cutoff)
  try {
    const ringFile = deathsPath(cfg, identity)
    if (survivors.length > 0) {
      writeFileSync(ringFile, JSON.stringify(survivors), { mode: 0o600 })
    } else {
      rmSync(ringFile, { force: true })
    }
  } catch {
    /* best-effort */
  }
}
|
|
303
|
+
|
|
304
|
+
/** Build the path of the `<identity>.topic` file inside `cfg.stateDir`. */
function topicPath(cfg: LifecycleConfig, identity: string): string {
  const topicName = `${identity}.topic`
  return join(cfg.stateDir, topicName)
}
|
|
307
|
+
|
|
308
|
+
/**
 * Read the stored topic of the current/last session of `identity` (the executor
 * fresh-vs-resume discriminator).
 *
 * @returns the file content with surrounding whitespace trimmed, or `''` when
 *   the file is missing or cannot be read.
 */
export function readTopic(cfg: LifecycleConfig, identity: string): string {
  try {
    const raw = readFileSync(topicPath(cfg, identity), 'utf8')
    return raw.trim()
  } catch {
    return ''
  }
}
|
|
316
|
+
|
|
317
|
+
/**
 * Store `topic` (raw string, written as-is) as the topic of the established
 * session of `identity`.
 *
 * Strictly best-effort: the topic is only a discriminator hint and must never
 * block a wake, so EVERY filesystem step — including creating the state
 * directory — is inside the try. (Previously `mkdirSync` ran outside it, so a
 * directory failure would throw out of wakeOrSpawn after the session was
 * already launched.)
 */
export function writeTopic(cfg: LifecycleConfig, identity: string, topic: string): void {
  try {
    mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
    writeFileSync(topicPath(cfg, identity), topic, { mode: 0o600 })
  } catch {
    /* best-effort — topic is a discriminator hint, never blocks a wake */
  }
}
|
|
326
|
+
|
|
203
327
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
204
|
-
// resolveWakeMode
|
|
205
|
-
//
|
|
206
|
-
//
|
|
328
|
+
// resolveWakeMode — the resume-vs-fresh decision (TARGET redesign). The DAEMON
|
|
329
|
+
// decides by the DEATH CAUSE it tracks (.idle-reaped marker), plus peer-type /
|
|
330
|
+
// topic — NOT an agent-dropped fresh mark. Takes the adapter's resolveResume as a
|
|
331
|
+
// parameter so it is unit-testable without a runtime. The .idle-reaped marker is
|
|
332
|
+
// CONSUMED when the default branch acts on it (a wake side-effect).
|
|
207
333
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
208
334
|
|
|
209
335
|
export interface WakeMode {
|
|
@@ -212,17 +338,40 @@ export interface WakeMode {
|
|
|
212
338
|
/** Set ONLY for an EXPLICIT resume request that found nothing to resume — the
|
|
213
339
|
* caller must fail loud (never a silent fresh fallback). */
|
|
214
340
|
failReason?: string
|
|
341
|
+
/** Which decision branch fired — the durable "why fresh / why resume" reason.
|
|
342
|
+
* Logged by wakeOrSpawn: the .idle-reaped marker is CONSUMED inside this
|
|
343
|
+
* function (branch 3b), so this cause is the only surviving record of it. */
|
|
344
|
+
cause?: string
|
|
215
345
|
}
|
|
216
346
|
|
|
217
347
|
/**
 * Decide whether the peer at `cwd` is a human-conversational peer, i.e. its
 * local profile declares an `interfaces.telegram` binding (any non-null value).
 *
 * resolveWakeMode uses this so that such a peer never auto-freshes a
 * resume-eligible wake; only an explicit /new (eager) resets it.
 *
 * @returns `true` only when the profile was read and `interfaces.telegram` is
 *   neither `null` nor `undefined`. A missing profile, missing `interfaces`, or
 *   any error while reading the profile yields `false` (treated as an executor).
 */
function isHumanConversational(cwd: string): boolean {
  try {
    const profile = readPeerProfile(cwd)
    const telegram = profile?.interfaces?.telegram
    return telegram != null
  } catch {
    return false
  }
}
|
|
361
|
+
|
|
362
|
+
/**
|
|
363
|
+
* Decide resume vs fresh on a wake (TARGET redesign). Branch order:
|
|
364
|
+
* 1. argsResume === false (folder-launch `iapeer <runtime>`) → FRESH.
|
|
365
|
+
* 2. argsResume === true (attach) → RESUME; FAIL-LOUD via resolveResume if there
|
|
366
|
+
* is nothing to resume (failReason set; never a silent fresh fallback).
|
|
367
|
+
* 3. default (argsResume undefined — a message woke a dead/asleep peer):
|
|
368
|
+
* a. NOT hasIdleReaped → FRESH. (It died on its own: crash / self-close. A
|
|
369
|
+
* crash needs a CLEAN fresh, not a re-crashing resume of a broken context;
|
|
370
|
+
* durable handoff carries continuity.)
|
|
371
|
+
* b. hasIdleReaped (CONSUME the marker now) → resume-eligible, then by type:
|
|
372
|
+
* - human-conversational (interfaces.telegram present) → RESUME.
|
|
373
|
+
* - executor: incomingTopic non-empty AND differs from stored .topic →
|
|
374
|
+
* FRESH (new work); else (same topic, or no incoming topic) → RESUME.
|
|
226
375
|
*/
|
|
227
376
|
export function resolveWakeMode(
|
|
228
377
|
cfg: LifecycleConfig,
|
|
@@ -230,19 +379,36 @@ export function resolveWakeMode(
|
|
|
230
379
|
cwd: string,
|
|
231
380
|
argsResume: boolean | undefined,
|
|
232
381
|
resolveResume: (cwd: string) => { ok: boolean; ref?: string; reason?: string },
|
|
382
|
+
incomingTopic?: string,
|
|
233
383
|
): WakeMode {
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
}
|
|
238
|
-
if (argsResume === false) return { resume: false }
|
|
384
|
+
// 1. folder-launch → always fresh.
|
|
385
|
+
if (argsResume === false) return { resume: false, cause: 'folder-launch' }
|
|
386
|
+
// 2. attach → always resume, fail-loud if nothing to resume.
|
|
239
387
|
if (argsResume === true) {
|
|
240
388
|
const r = resolveResume(cwd)
|
|
241
|
-
if (!r.ok) return { resume: false, failReason: r.reason ?? 'resume requested but nothing to resume' }
|
|
242
|
-
return { resume: true, resumeRef: r.ref }
|
|
389
|
+
if (!r.ok) return { resume: false, failReason: r.reason ?? 'resume requested but nothing to resume', cause: 'attach-nothing-to-resume' }
|
|
390
|
+
return { resume: true, resumeRef: r.ref, cause: 'attach' }
|
|
243
391
|
}
|
|
392
|
+
// 3. default (a message woke a dead/asleep peer): decide by the death cause.
|
|
393
|
+
// 3a. NOT idle-reaped → it died on its own (crash / self-close) → clean FRESH.
|
|
394
|
+
if (!hasIdleReaped(cfg, identity)) return { resume: false, cause: 'crash-or-self-close' }
|
|
395
|
+
// 3b. idle-reaped → resume-eligible. Consume the marker now (it has done its job).
|
|
396
|
+
clearIdleReaped(cfg, identity)
|
|
397
|
+
// human-conversational dialogue never auto-freshes; only an explicit /new resets it.
|
|
398
|
+
if (isHumanConversational(cwd)) {
|
|
399
|
+
const r = resolveResume(cwd)
|
|
400
|
+
return r.ok
|
|
401
|
+
? { resume: true, resumeRef: r.ref, cause: 'idle-reaped-human' }
|
|
402
|
+
: { resume: false, cause: 'idle-reaped-human-no-resume' }
|
|
403
|
+
}
|
|
404
|
+
// executor: a NEW topic (non-empty and differing from the stored one) means new
|
|
405
|
+
// work → FRESH; same topic, or no incoming topic → continue the work → RESUME.
|
|
406
|
+
const topic = incomingTopic?.trim() ?? ''
|
|
407
|
+
if (topic && topic !== readTopic(cfg, identity)) return { resume: false, cause: 'idle-reaped-new-topic' }
|
|
244
408
|
const r = resolveResume(cwd)
|
|
245
|
-
return r.ok
|
|
409
|
+
return r.ok
|
|
410
|
+
? { resume: true, resumeRef: r.ref, cause: 'idle-reaped-resume' }
|
|
411
|
+
: { resume: false, cause: 'idle-reaped-no-resume' }
|
|
246
412
|
}
|
|
247
413
|
|
|
248
414
|
export function readSessionStates(cfg: LifecycleConfig): SessionState[] {
|
|
@@ -485,6 +651,13 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
|
|
|
485
651
|
const env = deps.env ?? process.env
|
|
486
652
|
const cfg = deps.cfg ?? loadLifecycleConfig(env)
|
|
487
653
|
|
|
654
|
+
// Durable wake-decision trace (eventlog.ts): one line per bring-up decision —
|
|
655
|
+
// fresh / resume (with the resolveWakeMode cause) or a refusal (stopped / crash-
|
|
656
|
+
// loop / launchd). This is the direct answer to "why did peer X come up fresh",
|
|
657
|
+
// and the only surviving record of the .idle-reaped marker resolveWakeMode consumes.
|
|
658
|
+
const logWake = (fields: Record<string, string | number | undefined>): void =>
|
|
659
|
+
appendLifecycleEvent(cfg.eventLogDir, { ev: 'wake', personality: args.personality, ...fields }, { env })
|
|
660
|
+
|
|
488
661
|
// Heal strays before launching — the sweep-at-spawn-start. This is the SAME
|
|
489
662
|
// H4-guarded superviseTick the daemon timer runs, so both reap entry points
|
|
490
663
|
// (timer + wake) go through one guarded path. Best-effort: never block a wake.
|
|
@@ -500,6 +673,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
|
|
|
500
673
|
|
|
501
674
|
// H4 — never wake a launchd-managed peer (launchd KeepAlive owns it).
|
|
502
675
|
if (isLaunchdManaged(args.personality, env)) {
|
|
676
|
+
logWake({ runtime: args.runtime, mode: 'refused', cause: 'launchd-managed' })
|
|
503
677
|
return {
|
|
504
678
|
status: 'FAILED',
|
|
505
679
|
woke: false,
|
|
@@ -530,6 +704,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
|
|
|
530
704
|
// halt: refuse with stopped:true so the sender gets an explicit "stopped" error,
|
|
531
705
|
// not a generic "offline" — and no message is queued. `start` clears the flag.
|
|
532
706
|
if (isStopped(cfg, identity)) {
|
|
707
|
+
logWake({ identity, runtime, mode: 'refused', cause: 'stopped' })
|
|
533
708
|
return {
|
|
534
709
|
status: 'FAILED',
|
|
535
710
|
woke: false,
|
|
@@ -547,9 +722,11 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
|
|
|
547
722
|
// refusal). A stop racing DURING the spawn is a narrower window the wake-lock does not
|
|
548
723
|
// cover (stop does not take this lock).
|
|
549
724
|
if (isStopped(cfg, identity)) {
|
|
725
|
+
logWake({ identity, runtime, mode: 'refused', cause: 'stopped-mid-wake' })
|
|
550
726
|
return { status: 'FAILED', woke: false, runtime, stopped: true, reason: `"${args.personality}" (${runtime}) is stopped and not accepting messages; start it to resume` }
|
|
551
727
|
}
|
|
552
728
|
if (isLaunchdManaged(args.personality, env)) {
|
|
729
|
+
logWake({ identity, runtime, mode: 'refused', cause: 'launchd-managed-mid-wake' })
|
|
553
730
|
return { status: 'FAILED', woke: false, runtime, reason: `"${args.personality}" became launchd-managed mid-wake; the daemon does not wake it` }
|
|
554
731
|
}
|
|
555
732
|
// Idempotent fast path inside the lock: a live session wins (a concurrent
|
|
@@ -562,9 +739,37 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
|
|
|
562
739
|
return { status: 'FAILED', woke: false, runtime, reason: `peer cwd does not exist: ${cwd}` }
|
|
563
740
|
}
|
|
564
741
|
|
|
565
|
-
//
|
|
566
|
-
//
|
|
567
|
-
|
|
742
|
+
// Crash-loop guard — BEFORE launching: if the peer has died crashLoopMax times
|
|
743
|
+
// within crashLoopWindowSecs, refuse to (re)launch and leave it asleep (a clear
|
|
744
|
+
// FAILED reason, not a silent fresh that re-crashes). A successful wake below
|
|
745
|
+
// only prunes ring entries older than the window; deaths inside the window still count toward the guard.
|
|
746
|
+
const recentDeaths = countRecentDeaths(cfg, identity, cfg.crashLoopWindowSecs, Date.now())
|
|
747
|
+
if (recentDeaths >= cfg.crashLoopMax) {
|
|
748
|
+
logWake({ identity, runtime, mode: 'refused', cause: 'crash-loop', reason: `${recentDeaths} deaths in ${cfg.crashLoopWindowSecs}s` })
|
|
749
|
+
return {
|
|
750
|
+
status: 'FAILED',
|
|
751
|
+
woke: false,
|
|
752
|
+
runtime,
|
|
753
|
+
reason: `crash-loop guard: ${recentDeaths} deaths in ${cfg.crashLoopWindowSecs}s, leaving asleep`,
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
// Resolve resume vs fresh (TARGET redesign resolveWakeMode): the daemon decides
|
|
758
|
+
// by the death cause (.idle-reaped marker) + peer-type/topic. An EXPLICIT resume
|
|
759
|
+
// that finds nothing to resume fails loud. incomingTopic (args.topic) is the
|
|
760
|
+
// executor discriminator.
|
|
761
|
+
const mode = resolveWakeMode(cfg, identity, cwd, args.resume, c => adapter.resolveResume(c), args.topic)
|
|
762
|
+
// The bring-up decision is the durable trace — log it BEFORE launch (the decision
|
|
763
|
+
// stands regardless of whether the subsequent launch succeeds). resolveWakeMode has
|
|
764
|
+
// already consumed any .idle-reaped marker, so `cause` is now its only record.
|
|
765
|
+
logWake({
|
|
766
|
+
identity,
|
|
767
|
+
runtime,
|
|
768
|
+
mode: mode.failReason ? 'fail' : mode.resume ? 'resume' : 'fresh',
|
|
769
|
+
cause: mode.cause,
|
|
770
|
+
ref: mode.resumeRef,
|
|
771
|
+
reason: mode.failReason,
|
|
772
|
+
})
|
|
568
773
|
if (mode.failReason) return { status: 'FAILED', woke: false, runtime, reason: mode.failReason }
|
|
569
774
|
const resume = mode.resume
|
|
570
775
|
const resumeRef = mode.resumeRef
|
|
@@ -622,6 +827,10 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
|
|
|
622
827
|
`[iapeer] WARN session-state write failed for ${identity} — session is live + TTL-bounded but not idle-reap-supervised: ${e instanceof Error ? e.message : String(e)}\n`,
|
|
623
828
|
)
|
|
624
829
|
}
|
|
830
|
+
// Establish the session's topic (executor discriminator) and reset the crash-loop
|
|
831
|
+
// ring — a successful wake means this is NOT a tight crash loop. Best-effort.
|
|
832
|
+
writeTopic(cfg, identity, args.topic?.trim() ?? '')
|
|
833
|
+
trimDeaths(cfg, identity, cfg.crashLoopWindowSecs, Date.now())
|
|
625
834
|
return { status: 'READY', woke: true, runtime, process_address: identity }
|
|
626
835
|
})
|
|
627
836
|
}
|
|
@@ -659,8 +868,8 @@ export interface SuperviseOutcome {
|
|
|
659
868
|
identity: string
|
|
660
869
|
action: 'reaped-idle' | 'reaped-gone' | 'skipped-launchd' | 'alive' | 'needs-eager-fresh'
|
|
661
870
|
reason?: string
|
|
662
|
-
/** For 'needs-eager-fresh'
|
|
663
|
-
*
|
|
871
|
+
/** For 'needs-eager-fresh': the peer to EAGERLY re-launch fresh (its session died
|
|
872
|
+
* carrying a .new-eager mark). The daemon timer drives the async relaunch. */
|
|
664
873
|
personality?: string
|
|
665
874
|
runtime?: Runtime
|
|
666
875
|
}
|
|
@@ -673,31 +882,46 @@ export interface SuperviseDeps {
|
|
|
673
882
|
export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): SuperviseOutcome[] {
|
|
674
883
|
const env = deps.env ?? process.env
|
|
675
884
|
const nowMs = deps.nowMs ?? Date.now()
|
|
885
|
+
const verbose = superviseLogVerbose(env)
|
|
886
|
+
// Durable decision trace (eventlog.ts): every reap/death/eager-fresh gets a line
|
|
887
|
+
// so a postmortem can answer "when & how did peer X's prior session end" even
|
|
888
|
+
// after the .idle-reaped / .deaths markers are consumed. alive / skipped-launchd
|
|
889
|
+
// are steady-state non-decisions → logged only under IAPEER_SUPERVISE_LOG_VERBOSE.
|
|
890
|
+
const trace = (fields: Record<string, string | number | undefined>): void =>
|
|
891
|
+
appendLifecycleEvent(cfg.eventLogDir, { ev: 'supervise', ...fields }, { env, nowMs })
|
|
676
892
|
const out: SuperviseOutcome[] = []
|
|
677
893
|
for (const s of readSessionStates(cfg)) {
|
|
678
894
|
// H4 — FIRST, before any reap. A launchd-managed peer is read-only.
|
|
679
895
|
if (isLaunchdManaged(s.personality, env)) {
|
|
680
896
|
out.push({ identity: s.identity, action: 'skipped-launchd' })
|
|
897
|
+
if (verbose) trace({ identity: s.identity, action: 'skipped-launchd', outcome: 'read-only-h4' })
|
|
681
898
|
continue
|
|
682
899
|
}
|
|
683
900
|
const sock = buildSocketPath(s.runtime, s.personality, cfg.sockDir)
|
|
684
901
|
if (!sessionAlive(sock, s.identity)) {
|
|
902
|
+
// A dead session: record a death for crash-loop accounting, then branch on the
|
|
903
|
+
// .new-eager mark. This death was NOT daemon-initiated (the daemon only initiates
|
|
904
|
+
// the idle-reap below) → it died on its own → do NOT write .idle-reaped here.
|
|
905
|
+
recordDeath(cfg, s.identity, nowMs)
|
|
685
906
|
removeSessionState(cfg, s.identity)
|
|
686
|
-
//
|
|
687
|
-
//
|
|
688
|
-
//
|
|
689
|
-
|
|
690
|
-
if (hasNewMark(cfg, s.identity)) {
|
|
907
|
+
// A session that died carrying a .new-eager mark is an owner /new: re-launch
|
|
908
|
+
// EAGERLY as fresh (not lazily on the next message). The mark is LEFT for the
|
|
909
|
+
// eager relaunch (processEagerRelaunches) to consume; the daemon timer drives it.
|
|
910
|
+
if (hasNewEager(cfg, s.identity)) {
|
|
691
911
|
out.push({
|
|
692
912
|
identity: s.identity,
|
|
693
913
|
action: 'needs-eager-fresh',
|
|
694
|
-
reason: '/new
|
|
914
|
+
reason: '/new eager mark — eager fresh re-launch',
|
|
695
915
|
personality: s.personality,
|
|
696
916
|
runtime: s.runtime,
|
|
697
917
|
})
|
|
918
|
+
trace({ identity: s.identity, action: 'needs-eager-fresh', reason: '/new eager mark', outcome: 'eager-fresh' })
|
|
698
919
|
continue
|
|
699
920
|
}
|
|
921
|
+
// Crash / self-close: NO marker written, NO eager relaunch — the peer stays
|
|
922
|
+
// asleep and wakes FRESH lazily on the next message (resolveWakeMode branch 3a).
|
|
700
923
|
out.push({ identity: s.identity, action: 'reaped-gone', reason: 'session no longer live' })
|
|
924
|
+
trace({ identity: s.identity, action: 'reaped-gone', reason: 'session no longer live', outcome: 'fresh-next-msg' })
|
|
701
925
|
continue
|
|
702
926
|
}
|
|
703
927
|
// Idle accounting via the runtime adapter's activity proxy (claude transcript
|
|
@@ -712,25 +936,33 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
|
|
|
712
936
|
}
|
|
713
937
|
const ageSecs = Math.floor((nowMs - mt) / 1000)
|
|
714
938
|
if (ageSecs > cfg.idleSecs) {
|
|
939
|
+
// THE ONLY place .idle-reaped is written: this is the one death the daemon
|
|
940
|
+
// INITIATES. Its presence on the next wake = the session was parked cleanly =
|
|
941
|
+
// RESUME-eligible (resolveWakeMode branch 3b). A crash/self-close (the dead
|
|
942
|
+
// branch above) never writes it → that wakes FRESH (branch 3a).
|
|
715
943
|
killSession(sock, s.identity)
|
|
944
|
+
setIdleReaped(cfg, s.identity)
|
|
716
945
|
removeSessionState(cfg, s.identity)
|
|
717
946
|
out.push({ identity: s.identity, action: 'reaped-idle', reason: `idle ${ageSecs}s` })
|
|
947
|
+
trace({ identity: s.identity, action: 'reaped-idle', age: `${ageSecs}s`, outcome: 'resume-eligible' })
|
|
718
948
|
} else {
|
|
719
949
|
out.push({ identity: s.identity, action: 'alive' })
|
|
950
|
+
if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
|
|
720
951
|
}
|
|
721
952
|
}
|
|
722
953
|
return out
|
|
723
954
|
}
|
|
724
955
|
|
|
725
956
|
/**
|
|
726
|
-
*
|
|
727
|
-
*
|
|
728
|
-
*
|
|
729
|
-
*
|
|
730
|
-
*
|
|
731
|
-
*
|
|
732
|
-
*
|
|
733
|
-
*
|
|
957
|
+
* Drive the EAGER fresh re-launch for peers superviseTick flagged 'needs-eager-fresh'
|
|
958
|
+
* (their session died carrying a .new-eager mark — an owner /new). Async + best-effort:
|
|
959
|
+
* task='' so the seed (initial_prompt) is self-sufficient (a /new has no incoming message
|
|
960
|
+
* — the agent auto-reports "I'm up" from the seed). The relaunch is FRESH BY CONSTRUCTION:
|
|
961
|
+
* we CONSUME .new-eager here and pass resume:false so wakeOrSpawn's resolveWakeMode takes
|
|
962
|
+
* the folder-launch fresh branch WITHOUT consulting the death-cause markers. The mark is
|
|
963
|
+
* consumed BEFORE the relaunch so a relaunch failure does not loop on the same eager mark
|
|
964
|
+
* (it then fresh-wakes lazily on its next message — branch 3a — never lost). NB: a /new'd
|
|
965
|
+
* peer is expected to carry an initial_prompt; without one the first turn delivers nothing.
|
|
734
966
|
*/
|
|
735
967
|
export async function processEagerRelaunches(
|
|
736
968
|
cfg: LifecycleConfig,
|
|
@@ -740,9 +972,13 @@ export async function processEagerRelaunches(
|
|
|
740
972
|
const results: WakeResult[] = []
|
|
741
973
|
for (const o of outcomes) {
|
|
742
974
|
if (o.action !== 'needs-eager-fresh' || !o.personality || !o.runtime) continue
|
|
975
|
+
clearNewEager(cfg, o.identity) // consume the eager mark — the relaunch is fresh by construction
|
|
743
976
|
try {
|
|
744
977
|
results.push(
|
|
745
|
-
await wakeOrSpawn(
|
|
978
|
+
await wakeOrSpawn(
|
|
979
|
+
{ personality: o.personality, runtime: o.runtime, task: '', resume: false },
|
|
980
|
+
{ cfg, env: deps.env },
|
|
981
|
+
),
|
|
746
982
|
)
|
|
747
983
|
} catch (e) {
|
|
748
984
|
results.push({ status: 'FAILED', woke: false, reason: e instanceof Error ? e.message : String(e) })
|