@agfpd/iapeer 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,6 +40,7 @@ import {
40
40
  type LaunchSpec,
41
41
  } from '../launch/index.ts'
42
42
  import { composeSystemPrompt, gatherPromptInput } from '../launch/composeSystemPrompt.ts'
43
+ import { appendLifecycleEvent, superviseLogVerbose } from './eventlog.ts'
43
44
 
44
45
  // ─────────────────────────────────────────────────────────────────────────────
45
46
  // Config
@@ -51,10 +52,19 @@ export interface LifecycleConfig {
51
52
  sockDir: string
52
53
  stateDir: string // ~/.iapeer/state/lifecycle
53
54
  logDir: string // ~/.iapeer/logs/lifecycle
55
+ /** Where the durable lifecycle DECISION log (lifecycle.log) is written
56
+ * (~/.iapeer/logs/iapeer — next to daemon-stdout/stderr.log, where the first
57
+ * investigator looks). Routed through cfg — NOT re-resolved from env — so it is
58
+ * isolated by the same sandbox as stateDir (eventlog.ts). */
59
+ eventLogDir: string
54
60
  bootDeadlineSecs: number
55
61
  readyGateSecs: number
56
62
  idleSecs: number
57
63
  maxAgeSecs: number
64
+ /** Crash-loop guard: refuse to (re)launch after this many deaths within the window. */
65
+ crashLoopMax: number
66
+ /** Crash-loop guard: the sliding window (seconds) the death count is measured over. */
67
+ crashLoopWindowSecs: number
58
68
  }
59
69
 
60
70
  export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): LifecycleConfig {
@@ -70,10 +80,13 @@ export function loadLifecycleConfig(env: NodeJS.ProcessEnv = process.env): Lifec
70
80
  sockDir: resolveSockDir(env),
71
81
  stateDir: join(root, STATE_DIR, 'lifecycle'),
72
82
  logDir: join(root, LOGS_DIR, 'lifecycle'),
83
+ eventLogDir: join(root, LOGS_DIR, 'iapeer'),
73
84
  bootDeadlineSecs: num(env.IAPEER_BOOT_DEADLINE_SECS, 240),
74
85
  readyGateSecs: num(env.IAPEER_READY_GATE_SECS, 120),
75
86
  idleSecs: num(env.IAPEER_IDLE_SECS, 3600),
76
87
  maxAgeSecs: num(env.IAPEER_MAX_AGE_SECS, 14400),
88
+ crashLoopMax: num(env.IAPEER_CRASHLOOP_MAX, 3),
89
+ crashLoopWindowSecs: num(env.IAPEER_CRASHLOOP_WINDOW_SECS, 300),
77
90
  }
78
91
  }
79
92
 
@@ -168,42 +181,155 @@ export function clearStopped(cfg: LifecycleConfig, identity: string): void {
168
181
  }
169
182
 
170
183
  // ─────────────────────────────────────────────────────────────────────────────
171
- // C4/new graceful mark (contract ЖЦ §/new). The AGENT, on an owner /new,
172
- // writes a handoff to durable memory, drops THIS mark, and self-kills. The daemon
173
- // detects the mark and re-launches EAGERLY as FRESH + initial_prompt (contrast:
174
- // idle-reap is markless lazy resume on the next message). The mark is consumed on
175
- // that fresh re-launch. The mark TEXT/agent-side (doctrine /new instruction) is a
176
- // separate deploy artifact; THIS is only the daemon side (detect fresh + seed).
184
+ // Lifecycle markers — the DAEMON decides fresh-vs-resume by the DEATH CAUSE it
185
+ // tracks itself (TARGET redesign). Plain files in state/lifecycle/<identity>.* :
186
+ //
187
+ // .idle-reaped : written ONLY when the daemon idle-reaps the session (the only
188
+ // death the daemon initiates). Presence on the next wake = session was parked
189
+ // cleanly = RESUME-eligible. ABSENT on a dead session = it died on its own
190
+ // (crash / self-close) = FRESH. (resolver branch 3.)
191
+ // .new-eager : set when /new is invoked (owner reset, via `iapeer self-fresh`).
192
+ // Presence on a dead session = the daemon EAGERLY relaunches FRESH (does NOT
193
+ // wait for a message) and injects initial_prompt. Consumed on the relaunch.
194
+ // .deaths : crash-loop guard — a small JSON ring of recent death epoch-ms.
195
+ // .topic : the topic tag of the current/last session (executor fresh-vs-
196
+ // resume discriminator).
197
+ //
198
+ // Boolean markers carry an ISO timestamp line (audit-friendly); .deaths is a JSON
199
+ // array; .topic is the raw topic string.
177
200
  // ─────────────────────────────────────────────────────────────────────────────
178
201
 
179
- function newMarkPath(cfg: LifecycleConfig, identity: string): string {
180
- return join(cfg.stateDir, `${identity}.new`)
202
+ function idleReapedPath(cfg: LifecycleConfig, identity: string): string {
203
+ return join(cfg.stateDir, `${identity}.idle-reaped`)
204
+ }
205
+
206
+ /** True iff the identity was idle-reaped by the daemon (→ RESUME-eligible). */
207
+ export function hasIdleReaped(cfg: LifecycleConfig, identity: string): boolean {
208
+ return existsSync(idleReapedPath(cfg, identity))
209
+ }
210
+
211
+ /** Write the idle-reaped marker — ONLY the idle-reap path in superviseTick does this. */
212
+ export function setIdleReaped(cfg: LifecycleConfig, identity: string): void {
213
+ mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
214
+ writeFileSync(idleReapedPath(cfg, identity), `${new Date().toISOString()}\n`, { mode: 0o600 })
215
+ }
216
+
217
+ /** Consume the idle-reaped marker (the resolver does this on the resume decision). */
218
+ export function clearIdleReaped(cfg: LifecycleConfig, identity: string): void {
219
+ try {
220
+ rmSync(idleReapedPath(cfg, identity), { force: true })
221
+ } catch {
222
+ /* already gone */
223
+ }
181
224
  }
182
225
 
183
- /** True iff the identity carries a /new graceful mark (→ eager fresh + seed). */
184
- export function hasNewMark(cfg: LifecycleConfig, identity: string): boolean {
185
- return existsSync(newMarkPath(cfg, identity))
226
+ function newEagerPath(cfg: LifecycleConfig, identity: string): string {
227
+ return join(cfg.stateDir, `${identity}.new-eager`)
186
228
  }
187
229
 
188
- /** Drop the /new mark (the agent's self-kill ritual does this before exiting). */
189
- export function setNewMark(cfg: LifecycleConfig, identity: string): void {
230
+ /** True iff the identity carries a /new eager-fresh mark (→ eager fresh + seed). */
231
+ export function hasNewEager(cfg: LifecycleConfig, identity: string): boolean {
232
+ return existsSync(newEagerPath(cfg, identity))
233
+ }
234
+
235
+ /** Set the /new eager-fresh mark (the `self-fresh` verb does this before self-kill). */
236
+ export function setNewEager(cfg: LifecycleConfig, identity: string): void {
190
237
  mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
191
- writeFileSync(newMarkPath(cfg, identity), `${new Date().toISOString()}\n`, { mode: 0o600 })
238
+ writeFileSync(newEagerPath(cfg, identity), `${new Date().toISOString()}\n`, { mode: 0o600 })
192
239
  }
193
240
 
194
- /** Consume the /new mark (the daemon does this on the eager fresh re-launch). */
195
- export function clearNewMark(cfg: LifecycleConfig, identity: string): void {
241
+ /** Consume the /new eager-fresh mark (the daemon does this on the eager relaunch). */
242
+ export function clearNewEager(cfg: LifecycleConfig, identity: string): void {
196
243
  try {
197
- rmSync(newMarkPath(cfg, identity), { force: true })
244
+ rmSync(newEagerPath(cfg, identity), { force: true })
198
245
  } catch {
199
246
  /* already gone */
200
247
  }
201
248
  }
202
249
 
250
+ function deathsPath(cfg: LifecycleConfig, identity: string): string {
251
+ return join(cfg.stateDir, `${identity}.deaths`)
252
+ }
253
+
254
+ /** Read the crash-loop death ring (epoch-ms timestamps). Garbage → empty. */
255
+ export function readDeaths(cfg: LifecycleConfig, identity: string): number[] {
256
+ try {
257
+ const arr = JSON.parse(readFileSync(deathsPath(cfg, identity), 'utf8'))
258
+ return Array.isArray(arr) ? arr.filter((n): n is number => typeof n === 'number' && Number.isFinite(n)) : []
259
+ } catch {
260
+ return []
261
+ }
262
+ }
263
+
264
+ /** Append a death epoch-ms to the crash-loop ring (best-effort, bounded). */
265
+ export function recordDeath(cfg: LifecycleConfig, identity: string, nowMs: number = Date.now()): void {
266
+ mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
267
+ // Keep the ring small — only the most recent matter for the window check.
268
+ const next = [...readDeaths(cfg, identity), nowMs].slice(-16)
269
+ try {
270
+ writeFileSync(deathsPath(cfg, identity), JSON.stringify(next), { mode: 0o600 })
271
+ } catch {
272
+ /* best-effort accounting; never block a reap */
273
+ }
274
+ }
275
+
276
+ /** Count deaths within `windowSecs` of `nowMs` (crash-loop guard input). */
277
+ export function countRecentDeaths(
278
+ cfg: LifecycleConfig,
279
+ identity: string,
280
+ windowSecs: number,
281
+ nowMs: number = Date.now(),
282
+ ): number {
283
+ const cutoff = nowMs - windowSecs * 1000
284
+ return readDeaths(cfg, identity).filter(t => t >= cutoff).length
285
+ }
286
+
287
+ /** Trim the death ring to the window (called on a successful wake to reset the loop). */
288
+ export function trimDeaths(
289
+ cfg: LifecycleConfig,
290
+ identity: string,
291
+ windowSecs: number,
292
+ nowMs: number = Date.now(),
293
+ ): void {
294
+ const cutoff = nowMs - windowSecs * 1000
295
+ const kept = readDeaths(cfg, identity).filter(t => t >= cutoff)
296
+ try {
297
+ if (kept.length === 0) rmSync(deathsPath(cfg, identity), { force: true })
298
+ else writeFileSync(deathsPath(cfg, identity), JSON.stringify(kept), { mode: 0o600 })
299
+ } catch {
300
+ /* best-effort */
301
+ }
302
+ }
303
+
304
+ function topicPath(cfg: LifecycleConfig, identity: string): string {
305
+ return join(cfg.stateDir, `${identity}.topic`)
306
+ }
307
+
308
+ /** Read the stored topic of the current/last session (executor discriminator). '' if none. */
309
+ export function readTopic(cfg: LifecycleConfig, identity: string): string {
310
+ try {
311
+ return readFileSync(topicPath(cfg, identity), 'utf8').trim()
312
+ } catch {
313
+ return ''
314
+ }
315
+ }
316
+
317
+ /** Store the incoming topic for the established session (raw string, best-effort). */
318
+ export function writeTopic(cfg: LifecycleConfig, identity: string, topic: string): void {
319
+ mkdirSync(cfg.stateDir, { recursive: true, mode: 0o700 })
320
+ try {
321
+ writeFileSync(topicPath(cfg, identity), topic, { mode: 0o600 })
322
+ } catch {
323
+ /* best-effort — topic is a discriminator hint, never blocks a wake */
324
+ }
325
+ }
326
+
203
327
  // ─────────────────────────────────────────────────────────────────────────────
204
- // resolveWakeMode (C3a + C4a) — the resume-vs-fresh decision, contract ЖЦ
205
- // §resume/fresh. Pure but for the /new-mark consume (a wake side-effect); takes the
206
- // adapter's resolveResume as a parameter so it is unit-testable without a runtime.
328
+ // resolveWakeMode — the resume-vs-fresh decision (TARGET redesign). The DAEMON
329
+ // decides by the DEATH CAUSE it tracks (.idle-reaped marker), plus peer-type /
330
+ // topic — NOT an agent-dropped fresh mark. Takes the adapter's resolveResume as a
331
+ // parameter so it is unit-testable without a runtime. The .idle-reaped marker is
332
+ // CONSUMED when the default branch acts on it (a wake side-effect).
207
333
  // ─────────────────────────────────────────────────────────────────────────────
208
334
 
209
335
  export interface WakeMode {
@@ -212,17 +338,40 @@ export interface WakeMode {
212
338
  /** Set ONLY for an EXPLICIT resume request that found nothing to resume — the
213
339
  * caller must fail loud (never a silent fresh fallback). */
214
340
  failReason?: string
341
+ /** Which decision branch fired — the durable "why fresh / why resume" reason.
342
+ * Logged by wakeOrSpawn: the .idle-reaped marker is CONSUMED inside this
343
+ * function (branch 3b), so this cause is the only surviving record of it. */
344
+ cause?: string
215
345
  }
216
346
 
217
347
  /**
218
- * Decide resume vs fresh on a wake. Priority (contract ЖЦ §resume/fresh, /new):
219
- * 1. /new-mark present → eager graceful re-launch: FRESH; consume the mark.
220
- * 2. explicit fresh (argsResume === false) → FRESH.
221
- * 3. explicit resume (argsResume === true, e.g. attach) → RESUME, FAIL-LOUD if the
222
- * preflight finds nothing (failReason set; never a silent fresh fallback).
223
- * 4. default (argsResume undefined) → warm-asleep RESUME when a transcript exists,
224
- * else FRESH (a first-ever launch has nothing to resume — NOT an error here).
225
- * Fixes the prior divergence (code: always fresh; contract: warm-asleep → resume).
348
+ * True iff the peer of `cwd` is a human-conversational peer — i.e. its local
349
+ * profile declares an `interfaces.telegram` binding (a telegram-fronted dialogue).
350
+ * Such a peer NEVER auto-freshes a resume-eligible wake; only an explicit /new
351
+ * (eager) resets it. A profile read hiccup → not-human (safe default: an executor).
352
+ */
353
+ function isHumanConversational(cwd: string): boolean {
354
+ try {
355
+ const ifaces = readPeerProfile(cwd)?.interfaces
356
+ return !!(ifaces && ifaces.telegram != null)
357
+ } catch {
358
+ return false
359
+ }
360
+ }
361
+
362
+ /**
363
+ * Decide resume vs fresh on a wake (TARGET redesign). Branch order:
364
+ * 1. argsResume === false (folder-launch `iapeer <runtime>`) → FRESH.
365
+ * 2. argsResume === true (attach) → RESUME; FAIL-LOUD via resolveResume if there
366
+ * is nothing to resume (failReason set; never a silent fresh fallback).
367
+ * 3. default (argsResume undefined — a message woke a dead/asleep peer):
368
+ * a. NOT hasIdleReaped → FRESH. (It died on its own: crash / self-close. A
369
+ * crash needs a CLEAN fresh, not a re-crashing resume of a broken context;
370
+ * durable handoff carries continuity.)
371
+ * b. hasIdleReaped (CONSUME the marker now) → resume-eligible, then by type:
372
+ * - human-conversational (interfaces.telegram present) → RESUME.
373
+ * - executor: incomingTopic non-empty AND differs from stored .topic →
374
+ * FRESH (new work); else (same topic, or no incoming topic) → RESUME.
226
375
  */
227
376
  export function resolveWakeMode(
228
377
  cfg: LifecycleConfig,
@@ -230,19 +379,36 @@ export function resolveWakeMode(
230
379
  cwd: string,
231
380
  argsResume: boolean | undefined,
232
381
  resolveResume: (cwd: string) => { ok: boolean; ref?: string; reason?: string },
382
+ incomingTopic?: string,
233
383
  ): WakeMode {
234
- if (hasNewMark(cfg, identity)) {
235
- clearNewMark(cfg, identity) // consume the graceful mark on the fresh re-launch
236
- return { resume: false }
237
- }
238
- if (argsResume === false) return { resume: false }
384
+ // 1. folder-launch → always fresh.
385
+ if (argsResume === false) return { resume: false, cause: 'folder-launch' }
386
+ // 2. attach → always resume, fail-loud if nothing to resume.
239
387
  if (argsResume === true) {
240
388
  const r = resolveResume(cwd)
241
- if (!r.ok) return { resume: false, failReason: r.reason ?? 'resume requested but nothing to resume' }
242
- return { resume: true, resumeRef: r.ref }
389
+ if (!r.ok) return { resume: false, failReason: r.reason ?? 'resume requested but nothing to resume', cause: 'attach-nothing-to-resume' }
390
+ return { resume: true, resumeRef: r.ref, cause: 'attach' }
243
391
  }
392
+ // 3. default (a message woke a dead/asleep peer): decide by the death cause.
393
+ // 3a. NOT idle-reaped → it died on its own (crash / self-close) → clean FRESH.
394
+ if (!hasIdleReaped(cfg, identity)) return { resume: false, cause: 'crash-or-self-close' }
395
+ // 3b. idle-reaped → resume-eligible. Consume the marker now (it has done its job).
396
+ clearIdleReaped(cfg, identity)
397
+ // human-conversational dialogue never auto-freshes; only an explicit /new resets it.
398
+ if (isHumanConversational(cwd)) {
399
+ const r = resolveResume(cwd)
400
+ return r.ok
401
+ ? { resume: true, resumeRef: r.ref, cause: 'idle-reaped-human' }
402
+ : { resume: false, cause: 'idle-reaped-human-no-resume' }
403
+ }
404
+ // executor: a NEW topic (non-empty and differing from the stored one) means new
405
+ // work → FRESH; same topic, or no incoming topic → continue the work → RESUME.
406
+ const topic = incomingTopic?.trim() ?? ''
407
+ if (topic && topic !== readTopic(cfg, identity)) return { resume: false, cause: 'idle-reaped-new-topic' }
244
408
  const r = resolveResume(cwd)
245
- return r.ok ? { resume: true, resumeRef: r.ref } : { resume: false }
409
+ return r.ok
410
+ ? { resume: true, resumeRef: r.ref, cause: 'idle-reaped-resume' }
411
+ : { resume: false, cause: 'idle-reaped-no-resume' }
246
412
  }
247
413
 
248
414
  export function readSessionStates(cfg: LifecycleConfig): SessionState[] {
@@ -485,6 +651,13 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
485
651
  const env = deps.env ?? process.env
486
652
  const cfg = deps.cfg ?? loadLifecycleConfig(env)
487
653
 
654
+ // Durable wake-decision trace (eventlog.ts): one line per bring-up decision —
655
+ // fresh / resume (with the resolveWakeMode cause) or a refusal (stopped / crash-
656
+ // loop / launchd). This is the direct answer to "why did peer X come up fresh",
657
+ // and the only surviving record of the .idle-reaped marker resolveWakeMode consumes.
658
+ const logWake = (fields: Record<string, string | number | undefined>): void =>
659
+ appendLifecycleEvent(cfg.eventLogDir, { ev: 'wake', personality: args.personality, ...fields }, { env })
660
+
488
661
  // Heal strays before launching — the sweep-at-spawn-start. This is the SAME
489
662
  // H4-guarded superviseTick the daemon timer runs, so both reap entry points
490
663
  // (timer + wake) go through one guarded path. Best-effort: never block a wake.
@@ -500,6 +673,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
500
673
 
501
674
  // H4 — never wake a launchd-managed peer (launchd KeepAlive owns it).
502
675
  if (isLaunchdManaged(args.personality, env)) {
676
+ logWake({ runtime: args.runtime, mode: 'refused', cause: 'launchd-managed' })
503
677
  return {
504
678
  status: 'FAILED',
505
679
  woke: false,
@@ -530,6 +704,7 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
530
704
  // halt: refuse with stopped:true so the sender gets an explicit "stopped" error,
531
705
  // not a generic "offline" — and no message is queued. `start` clears the flag.
532
706
  if (isStopped(cfg, identity)) {
707
+ logWake({ identity, runtime, mode: 'refused', cause: 'stopped' })
533
708
  return {
534
709
  status: 'FAILED',
535
710
  woke: false,
@@ -547,9 +722,11 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
547
722
  // refusal). A stop racing DURING the spawn is a narrower window the wake-lock does not
548
723
  // cover (stop does not take this lock).
549
724
  if (isStopped(cfg, identity)) {
725
+ logWake({ identity, runtime, mode: 'refused', cause: 'stopped-mid-wake' })
550
726
  return { status: 'FAILED', woke: false, runtime, stopped: true, reason: `"${args.personality}" (${runtime}) is stopped and not accepting messages; start it to resume` }
551
727
  }
552
728
  if (isLaunchdManaged(args.personality, env)) {
729
+ logWake({ identity, runtime, mode: 'refused', cause: 'launchd-managed-mid-wake' })
553
730
  return { status: 'FAILED', woke: false, runtime, reason: `"${args.personality}" became launchd-managed mid-wake; the daemon does not wake it` }
554
731
  }
555
732
  // Idempotent fast path inside the lock: a live session wins (a concurrent
@@ -562,9 +739,37 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
562
739
  return { status: 'FAILED', woke: false, runtime, reason: `peer cwd does not exist: ${cwd}` }
563
740
  }
564
741
 
565
- // C3a + C4a — resolve resume vs fresh (extracted resolveWakeMode, contract ЖЦ
566
- // §resume/fresh). An EXPLICIT resume that finds nothing to resume fails loud.
567
- const mode = resolveWakeMode(cfg, identity, cwd, args.resume, c => adapter.resolveResume(c))
742
+ // Crash-loop guard — BEFORE launching: if the peer has died crashLoopMax times
743
+ // within crashLoopWindowSecs, refuse to (re)launch and leave it asleep (a clear
744
+ // FAILED reason, not a silent fresh that re-crashes). A successful wake below
745
+ // trims the ring, so the guard only fires on a genuine tight loop.
746
+ const recentDeaths = countRecentDeaths(cfg, identity, cfg.crashLoopWindowSecs, Date.now())
747
+ if (recentDeaths >= cfg.crashLoopMax) {
748
+ logWake({ identity, runtime, mode: 'refused', cause: 'crash-loop', reason: `${recentDeaths} deaths in ${cfg.crashLoopWindowSecs}s` })
749
+ return {
750
+ status: 'FAILED',
751
+ woke: false,
752
+ runtime,
753
+ reason: `crash-loop guard: ${recentDeaths} deaths in ${cfg.crashLoopWindowSecs}s, leaving asleep`,
754
+ }
755
+ }
756
+
757
+ // Resolve resume vs fresh (TARGET redesign — resolveWakeMode): the daemon decides
758
+ // by the death cause (.idle-reaped marker) + peer-type/topic. An EXPLICIT resume
759
+ // that finds nothing to resume fails loud. incomingTopic (args.topic) is the
760
+ // executor discriminator.
761
+ const mode = resolveWakeMode(cfg, identity, cwd, args.resume, c => adapter.resolveResume(c), args.topic)
762
+ // The bring-up decision is the durable trace — log it BEFORE launch (the decision
763
+ // stands regardless of whether the subsequent launch succeeds). resolveWakeMode has
764
+ // already consumed any .idle-reaped marker, so `cause` is now its only record.
765
+ logWake({
766
+ identity,
767
+ runtime,
768
+ mode: mode.failReason ? 'fail' : mode.resume ? 'resume' : 'fresh',
769
+ cause: mode.cause,
770
+ ref: mode.resumeRef,
771
+ reason: mode.failReason,
772
+ })
568
773
  if (mode.failReason) return { status: 'FAILED', woke: false, runtime, reason: mode.failReason }
569
774
  const resume = mode.resume
570
775
  const resumeRef = mode.resumeRef
@@ -622,6 +827,10 @@ export async function wakeOrSpawn(args: WakeArgs, deps: WakeDeps = {}): Promise<
622
827
  `[iapeer] WARN session-state write failed for ${identity} — session is live + TTL-bounded but not idle-reap-supervised: ${e instanceof Error ? e.message : String(e)}\n`,
623
828
  )
624
829
  }
830
+ // Establish the session's topic (executor discriminator) and reset the crash-loop
831
+ // ring — a successful wake means this is NOT a tight crash loop. Best-effort.
832
+ writeTopic(cfg, identity, args.topic?.trim() ?? '')
833
+ trimDeaths(cfg, identity, cfg.crashLoopWindowSecs, Date.now())
625
834
  return { status: 'READY', woke: true, runtime, process_address: identity }
626
835
  })
627
836
  }
@@ -659,8 +868,8 @@ export interface SuperviseOutcome {
659
868
  identity: string
660
869
  action: 'reaped-idle' | 'reaped-gone' | 'skipped-launchd' | 'alive' | 'needs-eager-fresh'
661
870
  reason?: string
662
- /** For 'needs-eager-fresh' (C4b): the peer to EAGERLY re-launch fresh (its session
663
- * died carrying a /new-mark). The daemon timer drives the async relaunch. */
871
+ /** For 'needs-eager-fresh': the peer to EAGERLY re-launch fresh (its session died
872
+ * carrying a .new-eager mark). The daemon timer drives the async relaunch. */
664
873
  personality?: string
665
874
  runtime?: Runtime
666
875
  }
@@ -673,31 +882,46 @@ export interface SuperviseDeps {
673
882
  export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): SuperviseOutcome[] {
674
883
  const env = deps.env ?? process.env
675
884
  const nowMs = deps.nowMs ?? Date.now()
885
+ const verbose = superviseLogVerbose(env)
886
+ // Durable decision trace (eventlog.ts): every reap/death/eager-fresh gets a line
887
+ // so a postmortem can answer "when & how did peer X's prior session end" even
888
+ // after the .idle-reaped / .deaths markers are consumed. alive / skipped-launchd
889
+ // are steady-state non-decisions → logged only under IAPEER_SUPERVISE_LOG_VERBOSE.
890
+ const trace = (fields: Record<string, string | number | undefined>): void =>
891
+ appendLifecycleEvent(cfg.eventLogDir, { ev: 'supervise', ...fields }, { env, nowMs })
676
892
  const out: SuperviseOutcome[] = []
677
893
  for (const s of readSessionStates(cfg)) {
678
894
  // H4 — FIRST, before any reap. A launchd-managed peer is read-only.
679
895
  if (isLaunchdManaged(s.personality, env)) {
680
896
  out.push({ identity: s.identity, action: 'skipped-launchd' })
897
+ if (verbose) trace({ identity: s.identity, action: 'skipped-launchd', outcome: 'read-only-h4' })
681
898
  continue
682
899
  }
683
900
  const sock = buildSocketPath(s.runtime, s.personality, cfg.sockDir)
684
901
  if (!sessionAlive(sock, s.identity)) {
902
+ // A dead session: record a death for crash-loop accounting, then branch on the
903
+ // .new-eager mark. This death was NOT daemon-initiated (the daemon only initiates
904
+ // the idle-reap below) → it died on its own → do NOT write .idle-reaped here.
905
+ recordDeath(cfg, s.identity, nowMs)
685
906
  removeSessionState(cfg, s.identity)
686
- // C4b — a session that died carrying a /new-mark is a GRACEFUL termination by the
687
- // owner: re-launch EAGERLY as fresh (not lazily on the next message, the way a
688
- // markless idle-reap death resumes). The mark is LEFT for the relaunch's
689
- // resolveWakeMode to consume; the daemon timer drives the async wakeOrSpawn.
690
- if (hasNewMark(cfg, s.identity)) {
907
+ // A session that died carrying a .new-eager mark is an owner /new: re-launch
908
+ // EAGERLY as fresh (not lazily on the next message). The mark is LEFT for the
909
+ // eager relaunch (processEagerRelaunches) to consume; the daemon timer drives it.
910
+ if (hasNewEager(cfg, s.identity)) {
691
911
  out.push({
692
912
  identity: s.identity,
693
913
  action: 'needs-eager-fresh',
694
- reason: '/new graceful mark — eager fresh re-launch',
914
+ reason: '/new eager mark — eager fresh re-launch',
695
915
  personality: s.personality,
696
916
  runtime: s.runtime,
697
917
  })
918
+ trace({ identity: s.identity, action: 'needs-eager-fresh', reason: '/new eager mark', outcome: 'eager-fresh' })
698
919
  continue
699
920
  }
921
+ // Crash / self-close: NO marker written, NO eager relaunch — the peer stays
922
+ // asleep and wakes FRESH lazily on the next message (resolveWakeMode branch 3a).
700
923
  out.push({ identity: s.identity, action: 'reaped-gone', reason: 'session no longer live' })
924
+ trace({ identity: s.identity, action: 'reaped-gone', reason: 'session no longer live', outcome: 'fresh-next-msg' })
701
925
  continue
702
926
  }
703
927
  // Idle accounting via the runtime adapter's activity proxy (claude transcript
@@ -712,25 +936,33 @@ export function superviseTick(cfg: LifecycleConfig, deps: SuperviseDeps = {}): S
712
936
  }
713
937
  const ageSecs = Math.floor((nowMs - mt) / 1000)
714
938
  if (ageSecs > cfg.idleSecs) {
939
+ // THE ONLY place .idle-reaped is written: this is the one death the daemon
940
+ // INITIATES. Its presence on the next wake = the session was parked cleanly =
941
+ // RESUME-eligible (resolveWakeMode branch 3b). A crash/self-close (the dead
942
+ // branch above) never writes it → that wakes FRESH (branch 3a).
715
943
  killSession(sock, s.identity)
944
+ setIdleReaped(cfg, s.identity)
716
945
  removeSessionState(cfg, s.identity)
717
946
  out.push({ identity: s.identity, action: 'reaped-idle', reason: `idle ${ageSecs}s` })
947
+ trace({ identity: s.identity, action: 'reaped-idle', age: `${ageSecs}s`, outcome: 'resume-eligible' })
718
948
  } else {
719
949
  out.push({ identity: s.identity, action: 'alive' })
950
+ if (verbose) trace({ identity: s.identity, action: 'alive', age: `${ageSecs}s` })
720
951
  }
721
952
  }
722
953
  return out
723
954
  }
724
955
 
725
956
  /**
726
- * C4b — drive the EAGER fresh re-launch for peers superviseTick flagged
727
- * 'needs-eager-fresh' (their session died carrying a /new graceful mark). Async +
728
- * best-effort: task='' so the seed (initial_prompt) is self-sufficient (a /new has no
729
- * incoming message — the agent auto-reports "I'm up" from the seed). resolveWakeMode
730
- * consumes the /new-mark on the relaunch (→ fresh). A relaunch failure leaves the
731
- * mark, so the peer still fresh-wakes on its next message — graceful degrades to lazy,
732
- * never lost. NB: a /new'd peer is expected to carry an initial_prompt (the report
733
- * directive); without one the seed is empty and the first turn delivers nothing.
957
+ * Drive the EAGER fresh re-launch for peers superviseTick flagged 'needs-eager-fresh'
958
+ * (their session died carrying a .new-eager mark — an owner /new). Async + best-effort:
959
+ * task='' so the seed (initial_prompt) is self-sufficient (a /new has no incoming message
960
+ * — the agent auto-reports "I'm up" from the seed). The relaunch is FRESH BY CONSTRUCTION:
961
+ * we CONSUME .new-eager here and pass resume:false so wakeOrSpawn's resolveWakeMode takes
962
+ * the folder-launch fresh branch WITHOUT consulting the death-cause markers. The mark is
963
+ * consumed BEFORE the relaunch so a relaunch failure does not loop on the same eager mark
964
+ * (it then fresh-wakes lazily on its next message — branch 3a — never lost). NB: a /new'd
965
+ * peer is expected to carry an initial_prompt; without one the first turn delivers nothing.
734
966
  */
735
967
  export async function processEagerRelaunches(
736
968
  cfg: LifecycleConfig,
@@ -740,9 +972,13 @@ export async function processEagerRelaunches(
740
972
  const results: WakeResult[] = []
741
973
  for (const o of outcomes) {
742
974
  if (o.action !== 'needs-eager-fresh' || !o.personality || !o.runtime) continue
975
+ clearNewEager(cfg, o.identity) // consume the eager mark — the relaunch is fresh by construction
743
976
  try {
744
977
  results.push(
745
- await wakeOrSpawn({ personality: o.personality, runtime: o.runtime, task: '' }, { cfg, env: deps.env }),
978
+ await wakeOrSpawn(
979
+ { personality: o.personality, runtime: o.runtime, task: '', resume: false },
980
+ { cfg, env: deps.env },
981
+ ),
746
982
  )
747
983
  } catch (e) {
748
984
  results.push({ status: 'FAILED', woke: false, reason: e instanceof Error ? e.message : String(e) })