@adaptic/maestro 1.8.4 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,9 +43,10 @@
43
43
  * logger optional fn({ ts, level, …rest }) → void for tests.
44
44
  */
45
45
 
46
- import { existsSync, readFileSync, writeFileSync } from "node:fs";
46
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
47
47
  import { join } from "node:path";
48
48
  import { spawn } from "node:child_process";
49
+ import { homedir } from "node:os";
49
50
 
50
51
  import {
51
52
  ensureBusDirs,
@@ -74,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
74
75
  // preferable to thrashing Claude / hitting usage limits.
75
76
  const MAX_CONCURRENT_SUB_SESSIONS = 1;
76
77
 
78
// Retry policy. Cadence failures are mostly systemic (broken prompt, bad
// auth, transient API errors), so a large retry budget only amplifies the
// burn; a small budget combined with a growing back-off is preferred.
// NOTE(review): this value is handed to failTick() as `maxAttempts`; whether
// 2 means "two attempts in total" or "two retries" depends on failTick — confirm.
const DEFAULT_MAX_ATTEMPTS = 2;
// Cool-down indexed by the consecutive-failure count and clamped to the last
// entry: 1 failure → 30 s, 2 or more → 2 min.
const BACKOFF_SCHEDULE_MS = [0, 30 * 1_000, 2 * 60 * 1_000];

// Circuit breaker: once this many failures of the same cadence land in a
// row, stop spawning that cadence for the duration below. Prevents
// launchd-rate runaway.
const CIRCUIT_OPEN_THRESHOLD = 3;
const CIRCUIT_OPEN_DURATION_MS = 30 * 60 * 1_000;
88
+
77
89
  // ---------------------------------------------------------------------------
78
90
  // Helpers
79
91
  // ---------------------------------------------------------------------------
@@ -92,10 +104,49 @@ function defaultLogger(entry) {
92
104
  }
93
105
  }
94
106
 
107
/**
 * Resolve the command used to launch the Claude CLI.
 *
 * launchd's bare environment does not put ~/.local/bin or Homebrew on PATH,
 * so a plain `spawn("claude", …)` can fail with ENOENT. Resolution order:
 *
 *   1. $CLAUDE_BIN — a bare command name (no path separator) is returned
 *      untouched so spawn can look it up on PATH; a path is used only if it
 *      exists on disk.
 *   2. ~/.local/bin/claude       (default Claude Code install path)
 *   3. /opt/homebrew/bin/claude  (Homebrew on Apple Silicon)
 *   4. /usr/local/bin/claude     (Homebrew on Intel)
 *   5. /usr/bin/claude
 *
 * Only existence is checked, not the executable bit. A successful
 * resolution is cached for the life of the process. When nothing is found
 * the bare name "claude" is returned WITHOUT caching, so spawn's own ENOENT
 * stays informative and a CLI installed later is picked up on the next call.
 *
 * @returns {string} absolute path, or a bare command name for PATH lookup.
 */
let _resolvedClaude = null;
function resolveClaudeBin() {
  if (_resolvedClaude) return _resolvedClaude;

  const envOverride = process.env.CLAUDE_BIN;
  // A bare name cannot be probed with existsSync (it would be resolved
  // against the cwd), and dropping it would silently ignore the operator's
  // explicit override. Hand it to spawn as-is.
  if (envOverride && !/[\\/]/.test(envOverride)) {
    _resolvedClaude = envOverride;
    return _resolvedClaude;
  }

  const candidates = [
    envOverride,
    join(homedir(), ".local/bin/claude"),
    "/opt/homebrew/bin/claude",
    "/usr/local/bin/claude",
    "/usr/bin/claude",
  ].filter(Boolean);

  const found = candidates.find((candidate) => existsSync(candidate));
  if (found) {
    _resolvedClaude = found;
    return found;
  }
  // Last resort; spawn will report ENOENT. Deliberately not cached.
  return "claude";
}
140
+
95
141
  /**
96
142
  * Spawn a sub-session running the cadence's trigger prompt and resolve
97
- * with { exit_code, durationMs }. Reads the prompt at call time so the
98
- * latest version (possibly upgraded between ticks) is always used.
143
+ * with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
144
+ * time so the latest version (possibly upgraded between ticks) is always
145
+ * used.
146
+ *
147
+ * Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
148
+ * so non-zero exits remain diagnosable after the fact. The last ~4 KB of
149
+ * stderr is also captured in-memory and surfaced on the failure event.
99
150
  */
100
151
  function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
101
152
  return new Promise((resolveOut) => {
@@ -111,17 +162,50 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
111
162
  return;
112
163
  }
113
164
 
114
- const bin = process.env.CLAUDE_BIN || "claude";
165
+ const bin = resolveClaudeBin();
115
166
  const args = ["--print", "--dangerously-skip-permissions", body];
116
- const env = { ...process.env, AGENT_ROOT: agentRoot, AGENT_DIR: agentRoot };
167
+ // Augment PATH so any tool the subsession invokes (jq, node, etc.)
168
+ // can still be found. launchd's bare env strips /opt/homebrew/bin etc.
169
+ const augmentedPath = [
170
+ process.env.PATH || "",
171
+ `${homedir()}/.local/bin`,
172
+ "/opt/homebrew/bin",
173
+ "/opt/homebrew/sbin",
174
+ "/usr/local/bin",
175
+ "/usr/bin",
176
+ "/bin",
177
+ "/usr/sbin",
178
+ "/sbin",
179
+ ].filter(Boolean).join(":");
180
+ const env = {
181
+ ...process.env,
182
+ AGENT_ROOT: agentRoot,
183
+ AGENT_DIR: agentRoot,
184
+ PATH: augmentedPath,
185
+ };
117
186
  const started = Date.now();
118
187
 
119
- log({ level: "info", stage: "subsession_spawn", cadence, bin });
188
+ // Per-run log file. Pattern is short enough to be tail-friendly.
189
+ const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
190
+ mkdirSync(logsDir, { recursive: true });
191
+ const date = new Date().toISOString().slice(0, 10);
192
+ const stamp = new Date().toISOString().replace(/[:.]/g, "-");
193
+ const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
194
+ const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
195
+ const stdoutFd = openSync(stdoutPath, "a");
196
+ const stderrFd = openSync(stderrPath, "a");
197
+
198
+ log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });
120
199
 
121
200
  let child;
122
201
  try {
123
- child = spawn(bin, args, { cwd: agentRoot, env, stdio: "ignore" });
202
+ // stdio:
203
+ // 0 ignore (claude --print reads prompt from argv, not stdin)
204
+ // 1 → file (capture stdout for later inspection)
205
+ // 2 → file (capture stderr — critical for diagnosing exit-1)
206
+ child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
124
207
  } catch (err) {
208
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
125
209
  resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
126
210
  return;
127
211
  }
@@ -135,18 +219,63 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
135
219
 
136
220
  child.on("exit", (code, signal) => {
137
221
  clearTimeout(timer);
222
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
138
223
  const durationMs = Date.now() - started;
139
224
  const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
225
+
226
+ // Pull tail of stderr (and stdout if stderr empty) for the failure
227
+ // surface. Best-effort; we never block on file size.
228
+ let stderrTail = "";
229
+ try {
230
+ const body = readFileSync(stderrPath, "utf-8");
231
+ stderrTail = body.slice(-4096);
232
+ if (!stderrTail.trim()) {
233
+ const so = readFileSync(stdoutPath, "utf-8");
234
+ stderrTail = so.slice(-4096);
235
+ }
236
+ } catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
237
+
238
+ // Record cost-ledger row. Token counts are 0 until we parse the
239
+ // session's JSON output; for now exit-code + duration are enough
240
+ // to spot pathological retry loops.
241
+ try {
242
+ const trackerPath = join(agentRoot, "scripts/cost/track-claude-usage.mjs");
243
+ if (existsSync(trackerPath)) {
244
+ spawn(process.execPath, [
245
+ trackerPath, "record",
246
+ "--cadence", cadence,
247
+ "--source", "cadence-consumer",
248
+ "--model", "sonnet",
249
+ "--duration-ms", String(durationMs),
250
+ "--input-tokens", "0",
251
+ "--output-tokens", "0",
252
+ "--exit", String(exit_code),
253
+ ], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
254
+ }
255
+ } catch { /* cost tracking is best-effort */ }
256
+
257
+ // Clean up empty log files so the directory doesn't accumulate
258
+ // hundreds of zero-byte successes.
259
+ try {
260
+
261
+ if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
262
+ if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
263
+ } catch { /* */ }
264
+
140
265
  resolveOut({
141
266
  ok: exit_code === 0,
142
267
  exit_code,
143
268
  signal: signal || null,
144
269
  duration_ms: durationMs,
270
+ stderr_tail: stderrTail || null,
271
+ stdout_path: stdoutPath,
272
+ stderr_path: stderrPath,
145
273
  });
146
274
  });
147
275
 
148
276
  child.on("error", (err) => {
149
277
  clearTimeout(timer);
278
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
150
279
  const durationMs = Date.now() - started;
151
280
  resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
152
281
  });
@@ -171,6 +300,11 @@ export function startConsumer(opts = {}) {
171
300
  const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
172
301
  const spawnSession = opts.spawnSession || realSpawnSession;
173
302
  const userLogger = opts.logger;
303
+ // Test / tuning hooks for the reliability layer.
304
+ const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
305
+ const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
306
+ const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
307
+ const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
174
308
 
175
309
  const stats = {
176
310
  started_at: new Date().toISOString(),
@@ -178,6 +312,8 @@ export function startConsumer(opts = {}) {
178
312
  inline: 0,
179
313
  escalated: 0,
180
314
  skipped_emergency_stop: 0,
315
+ skipped_circuit_open: 0,
316
+ skipped_backoff: 0,
181
317
  dlq: 0,
182
318
  retries: 0,
183
319
  spawn_failures: 0,
@@ -190,6 +326,75 @@ export function startConsumer(opts = {}) {
190
326
  let timers = [];
191
327
  let activeSubSessions = 0;
192
328
 
329
// Reliability bookkeeping, one record per cadence: how many failures have
// landed in a row, until when the circuit is open, and the earliest moment
// another spawn is allowed. Kept in memory only — nothing here is persisted,
// so a daemon restart starts from a clean slate. That is intentional:
// operators expect a restart to mean "try again now".
const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }

// Return the record for `cadence`, creating a zeroed one on first use.
function getCadenceState(cadence) {
  const existing = cadenceState.get(cadence);
  if (existing) return existing;
  const fresh = { failures: 0, openUntil: 0, nextAllowedAt: 0 };
  cadenceState.set(cadence, fresh);
  return fresh;
}
341
+
342
// A successful sub-session wipes the cadence's slate: failure streak,
// circuit-open deadline and back-off deadline all return to zero.
function recordSubsessionSuccess(cadence) {
  Object.assign(getCadenceState(cadence), {
    failures: 0,
    openUntil: 0,
    nextAllowedAt: 0,
  });
}
348
+
349
// Register one more consecutive failure for `cadence`.
//
// Sets the back-off deadline from the (test-overridable) schedule, indexed
// by the failure count and clamped to the schedule's last entry. Once the
// streak reaches the circuit threshold the breaker is opened, logged, and
// the open-circuit snapshot is rewritten.
function recordSubsessionFailure(cadence) {
  const s = getCadenceState(cadence);
  const now = Date.now(); // one clock sample so both deadlines share a base
  s.failures += 1;

  const idx = Math.min(s.failures, backoffSchedule.length - 1);
  // An empty or malformed schedule override yields undefined/NaN here, which
  // would turn nextAllowedAt into NaN; treat that as "no back-off".
  const delayMs = Number(backoffSchedule[idx]);
  s.nextAllowedAt = now + (Number.isFinite(delayMs) && delayMs > 0 ? delayMs : 0);

  if (s.failures >= circuitThreshold) {
    s.openUntil = now + circuitDurationMs;
    log({ level: "error", stage: "circuit_opened", cadence, failures: s.failures, open_until: new Date(s.openUntil).toISOString() });
    writeCircuitFile();
  }
}
361
+
362
// Rewrite state/cadence-bus/circuit-open.json so doctor and the operator can
// see which cadences are currently held back without scraping logs. The file
// lists only cadences whose circuit is still open and is deleted when none
// are. All file-system errors are swallowed: the snapshot is best-effort.
function writeCircuitFile() {
  const open = {};
  cadenceState.forEach((state, cad) => {
    if (state.openUntil > Date.now()) {
      open[cad] = { failures: state.failures, open_until: new Date(state.openUntil).toISOString() };
    }
  });

  const path = join(agentRoot, "state/cadence-bus/circuit-open.json");
  const nothingOpen = Object.keys(open).length === 0;
  try {
    if (nothingOpen) {
      // Nothing is open any more — drop the file (it may already be gone).
      try { unlinkSync(path); } catch { /* */ }
      return;
    }
    const snapshot = { generated: new Date().toISOString(), open };
    writeFileSync(path, JSON.stringify(snapshot, null, 2) + "\n");
  } catch { /* best-effort */ }
}
382
+
383
// Decide whether a sub-session for `cadence` may be spawned right now.
// Returns { allowed: true }, or { allowed: false, reason, retry_at } where
// reason is "circuit-open" or "backoff" and retry_at is an epoch-ms deadline.
// An open circuit takes precedence over a plain back-off window.
function isCadenceAllowed(cadence) {
  const state = getCadenceState(cadence);
  const now = Date.now();
  const blocked = (reason, retry_at) => ({ allowed: false, reason, retry_at });

  if (state.openUntil > now) return blocked("circuit-open", state.openUntil);
  if (state.nextAllowedAt > now) return blocked("backoff", state.nextAllowedAt);

  // A non-zero deadline that is no longer in the future means the circuit
  // has just expired: close it, forget the streak, refresh the snapshot.
  const circuitExpired = Boolean(state.openUntil) && state.openUntil <= now;
  if (circuitExpired) {
    state.openUntil = 0;
    state.failures = 0;
    log({ level: "info", stage: "circuit_closed", cadence });
    writeCircuitFile();
  }
  return { allowed: true };
}
397
+
193
398
  function log(entry) {
194
399
  const enriched = { ts: new Date().toISOString(), ...entry };
195
400
  logBusEvent(agentRoot, enriched);
@@ -209,6 +414,32 @@ export function startConsumer(opts = {}) {
209
414
  }
210
415
 
211
416
  async function escalate(event) {
417
+ // Circuit-breaker / back-off gate. If this cadence is currently held
418
+ // back, requeue without spawning. The event keeps its attempt count
419
+ // because the failure was upstream (not a per-event problem).
420
+ const gate = isCadenceAllowed(event.cadence);
421
+ if (!gate.allowed) {
422
+ log({
423
+ level: "warn",
424
+ stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
425
+ id: event.id,
426
+ cadence: event.cadence,
427
+ retry_at: new Date(gate.retry_at).toISOString(),
428
+ });
429
+ if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
430
+ else stats.skipped_backoff += 1;
431
+ // Put the event back in inbox WITHOUT bumping attempts so it doesn't
432
+ // burn its retry budget while the circuit is open.
433
+ const paths2 = getBusPaths(agentRoot);
434
+ try {
435
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
436
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
437
+
438
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
439
+ } catch { /* best-effort */ }
440
+ return { ok: false, decision: gate.reason };
441
+ }
442
+
212
443
  if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
213
444
  // Re-queue and try again next tick. Single-owner cadence consumer
214
445
  // means this can only happen when a prior tick is still running —
@@ -220,7 +451,15 @@ export function startConsumer(opts = {}) {
220
451
  cadence: event.cadence,
221
452
  active_subsessions: activeSubSessions,
222
453
  });
223
- failTick(agentRoot, event.id, "deferred:concurrent-spawn", { maxAttempts: 10 });
454
+ // Re-queue without burning the retry budget — concurrent-spawn isn't
455
+ // a per-event failure.
456
+ const paths2 = getBusPaths(agentRoot);
457
+ try {
458
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
459
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
460
+
461
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
462
+ } catch { /* best-effort */ }
224
463
  stats.retries += 1;
225
464
  return { ok: false, decision: "deferred" };
226
465
  }
@@ -263,14 +502,31 @@ export function startConsumer(opts = {}) {
263
502
  prompt: promptPath,
264
503
  exit_code: result.exit_code,
265
504
  duration_ms: result.duration_ms,
505
+ stdout_path: result.stdout_path || null,
506
+ stderr_path: result.stderr_path || null,
266
507
  });
508
+ recordSubsessionSuccess(event.cadence);
267
509
  stats.escalated += 1;
268
510
  stats.last_decision = "escalated";
269
511
  return { ok: true, decision: "escalated", exit_code: result.exit_code };
270
512
  }
271
- log({ level: "error", stage: "subsession_failed", id: event.id, cadence: event.cadence, exit_code: result.exit_code, error: result.error || null });
513
+ // Failure path: log + cap retries low. The exact stderr tail comes
514
+ // from the spawn helper so we never DLQ "blind" again.
515
+ const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
516
+ log({
517
+ level: "error",
518
+ stage: "subsession_failed",
519
+ id: event.id,
520
+ cadence: event.cadence,
521
+ exit_code: result.exit_code,
522
+ duration_ms: result.duration_ms,
523
+ error: result.error || stderrTail || `exit ${result.exit_code}`,
524
+ stderr_path: result.stderr_path || null,
525
+ });
272
526
  stats.spawn_failures += 1;
273
- const outcome = failTick(agentRoot, event.id, result.error || `exit ${result.exit_code}`);
527
+ recordSubsessionFailure(event.cadence);
528
+ const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
529
+ const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
274
530
  if (outcome?.destination === "dlq") stats.dlq += 1;
275
531
  else stats.retries += 1;
276
532
  return { ok: false, decision: outcome?.destination || "failed" };
@@ -357,19 +613,38 @@ export function startConsumer(opts = {}) {
357
613
  recoverStaleClaims(agentRoot);
358
614
 
359
615
  let processed = 0;
360
- // Drain as much as the consumer can in one tick, but yield to the
361
- // event loop between events so heartbeats and stop signals fire.
616
+ let escalatedThisTick = 0;
617
+ // Drain inline events as much as the consumer can in one tick; cap
618
+ // sub-session escalations at 1 per tick so a fast-failing cadence
619
+ // can't burn a whole minute's worth of retries inside a single poll.
620
+ // The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
362
621
  while (!stopping) {
363
622
  const claim = claimNextTick(agentRoot);
364
623
  if (!claim) break;
365
624
  const event = claim.event;
366
625
  activeTick = event.id;
626
+ let didEscalate = false;
367
627
  try {
628
+ const def = getCadenceDef(event.cadence);
629
+ const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
630
+ // Roughly: if it's not a registry-inline cadence, we MAY escalate.
631
+ // We don't yet know if the guard will say inline; processEvent
632
+ // will tell us via stats. Use the escalated stats delta as the
633
+ // signal that an actual sub-session ran this iteration.
634
+ const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
368
635
  await processEvent(event);
636
+ const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
637
+ if (after > before) didEscalate = true;
638
+ // Silence unused var warning.
639
+ void willEscalate;
369
640
  } finally {
370
641
  activeTick = null;
371
642
  }
372
643
  processed += 1;
644
+ if (didEscalate) escalatedThisTick += 1;
645
+ // Hard cap: at most ONE sub-session spawn per tick. Inline ticks
646
+ // keep draining freely (they're cheap).
647
+ if (escalatedThisTick >= 1) break;
373
648
  if (processed >= 16) break; // soft batch cap
374
649
  }
375
650
  return { processed };
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
210
210
  test("spawn failure retries within the budget, then DLQs", async () => {
211
211
  const root = await makeAgentRoot();
212
212
  plantPrompt(root, "weekly-strategic-memo");
213
+ // Disable back-off + raise circuit threshold so the test exercises the
214
+ // retry-then-DLQ path without waiting for back-off windows. The
215
+ // real defaults (30s/2m back-off, 3-failure circuit) are exercised by
216
+ // dedicated tests below.
213
217
  const consumer = startConsumer({
214
218
  agentRoot: root,
215
219
  pollMs: 25,
220
+ backoffSchedule: [0, 0, 0],
221
+ circuitThreshold: 999,
222
+ maxAttempts: 2,
216
223
  spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
217
224
  });
218
225
  try {
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
226
233
  }
227
234
  });
228
235
 
236
test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");

  // Fake spawner: counts invocations and always reports exit 1.
  let spawnCount = 0;
  const alwaysFail = async () => {
    spawnCount++;
    return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 };
  };
  const pause = (ms) => new Promise((r) => setTimeout(r, ms));

  const consumer = startConsumer({
    agentRoot: root,
    pollMs: 20,
    backoffSchedule: [0, 0, 0], // no back-off, so only the breaker can hold events
    circuitThreshold: 2,
    circuitDurationMs: 60_000, // 1 min — long enough for the assertion window
    maxAttempts: 1, // each event DLQs on first failure so we don't conflate retry-counts
    spawnSession: alwaysFail,
  });

  try {
    // Five events queued; two failures should open the circuit and hold the rest.
    let remaining = 5;
    while (remaining-- > 0) {
      enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
    }

    const opened = await waitFor(() => consumer.getStats().skipped_circuit_open >= 1, { timeoutMs: 10_000 });
    assert.ok(opened, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);

    // With the circuit open the spawn counter has to stay put.
    const spawnsAtOpen = spawnCount;
    await pause(500);
    assert.equal(spawnCount, spawnsAtOpen, `spawns must stop once circuit opens (was ${spawnsAtOpen}, now ${spawnCount})`);
  } finally {
    await consumer.stop();
    await rmRoot(root);
  }
});
265
+
266
test("back-off skips re-spawning until the cooldown elapses", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");
  let spawnCount = 0;
  const consumer = startConsumer({
    agentRoot: root,
    pollMs: 20,
    backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
    circuitThreshold: 999, // keep the circuit breaker out of the picture
    maxAttempts: 1,
    spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
  });
  try {
    // Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
    // 2nd should be held back by the 300ms back-off window.
    enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
    enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });

    // Fail here, with context, if the first spawn never happens — otherwise
    // the later assertions would fail with a misleading message.
    const firstSpawned = await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
    assert.ok(firstSpawned, `first spawn should fire; stats=${JSON.stringify(consumer.getStats())}`);

    const spawnsBeforeWait = spawnCount;
    // During the back-off window no new spawn should fire.
    await new Promise((r) => setTimeout(r, 150));
    assert.equal(spawnCount, spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
    assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");

    // After the window passes, the next event should be processed.
    await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
    assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
  } finally {
    await consumer.stop();
    await rmRoot(root);
  }
});
297
+
229
298
  // ---------------------------------------------------------------------------
230
299
  // Emergency stop
231
300
  // ---------------------------------------------------------------------------
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * scripts/decisions/capture-decision.mjs — Record a decision into the
4
+ * agent's decision log.
5
+ *
6
+ * Decisions land in knowledge/decisions/DEC-YYYY-MM-DD-NNN.yaml and the
7
+ * index in knowledge/decisions/index.yaml is updated. Idempotent on the
8
+ * same decision-id (overwrites the file, leaves the index intact).
9
+ *
10
+ * Usage:
11
+ * node scripts/decisions/capture-decision.mjs \
12
+ * --title "Adopt cadence bus" \
13
+ * --domain "infrastructure" \
14
+ * --decision "Use a local file-backed event bus consumed by a single persistent main session." \
15
+ * --rationale "Reduces Claude Code spawn cost; enables centralised throttling." \
16
+ * --context "Inbox/cadence ticks were spawning fresh sessions per launchd interval." \
17
+ * [--alternatives "Per-tick spawn (status quo); CronCreate-based scheduler"]
18
+ * [--stakeholders "ravi, mehran"]
19
+ * [--status active]
20
+ * [--decision-maker "Ravi Patel"]
21
+ *
22
+ * Reads agent.json for the default decision-maker.
23
+ */
24
+
25
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "node:fs";
26
+ import { join, resolve, dirname } from "node:path";
27
+ import { fileURLToPath } from "node:url";
28
+
29
+ const __dirname = dirname(fileURLToPath(import.meta.url));
30
+ const AGENT_DIR = process.env.AGENT_ROOT || process.env.AGENT_DIR || resolve(__dirname, "..", "..");
31
+
32
/** Print a prefixed error to stderr and terminate with exit code 1. */
function fail(msg) {
  process.stderr.write(`[capture-decision] ${msg}\n`);
  process.exit(1);
}

// Minimal `--key value` parser. A flag that is followed by another flag, or
// by nothing at all, is stored as the string "true" and remembered in
// `valueless`. An explicit empty string (`--context ""`) is a real value and
// is kept as "" rather than being mistaken for a missing one. Tokens that do
// not start with "--" and were not consumed as a value are ignored.
const args = process.argv.slice(2);
const flags = {};
const valueless = new Set();
for (let i = 0; i < args.length; i++) {
  const a = args[i];
  if (!a.startsWith("--")) continue;
  const key = a.slice(2);
  const next = args[i + 1];
  if (next !== undefined && !next.startsWith("--")) {
    flags[key] = next;
    valueless.delete(key);
    i += 1;
  } else {
    flags[key] = "true";
    valueless.add(key);
  }
}

// A required flag passed without a value must not be recorded as the
// literal text "true".
for (const required of ["title", "decision"]) {
  if (valueless.has(required)) fail(`--${required} requires a value.`);
}
if (!flags.title || !flags.decision) fail("--title and --decision are required (see --help in source).");
45
+
46
/**
 * Default decision-maker taken from config/agent.json: fullName, else
 * firstName. Falls back to "the agent" when the file is missing, is not
 * valid JSON, or carries neither field.
 */
function readDefaultDecisionMaker() {
  let agent;
  try {
    agent = JSON.parse(readFileSync(join(AGENT_DIR, "config/agent.json"), "utf-8"));
    return agent.fullName || agent.firstName || "the agent";
  } catch {
    return "the agent";
  }
}

// An explicit --decision-maker wins; the config file is read only without it.
const decisionMaker = flags["decision-maker"] || readDefaultDecisionMaker();
52
+
53
// Decisions are filed per UTC calendar day under knowledge/decisions/.
const date = new Date().toISOString().slice(0, 10);
const decDir = join(AGENT_DIR, "knowledge/decisions");
mkdirSync(decDir, { recursive: true });

// Next free sequence number for today: one past the highest NNN already
// present among DEC-<today>-NNN.yaml / .yml files, starting at 1.
const todayPrefix = `DEC-${date}-`;
const takenToday = readdirSync(decDir)
  .filter((name) => name.startsWith(todayPrefix))
  .map((name) => name.replace(/^DEC-\d{4}-\d{2}-\d{2}-/, "").replace(/\.ya?ml$/, ""))
  .map((tail) => parseInt(tail, 10))
  .filter((m) => !Number.isNaN(m));
const n = takenToday.reduce((next, m) => (m >= next ? m + 1 : next), 1);

const id = `${todayPrefix}${String(n).padStart(3, "0")}`;
const filename = `${id}.yaml`;
69
+
70
/**
 * Escape a value for embedding between double quotes in YAML.
 * null/undefined become "". Backslashes are escaped first (otherwise
 * `C:\temp` would be read back with a tab, and `\q` would be invalid YAML),
 * then double quotes, then line breaks so the scalar stays on one line.
 */
const yamlEsc = (s) =>
  String(s ?? "")
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\r/g, "\\r")
    .replace(/\n/g, "\\n");
71
// Short scalars (status, stakeholder names, expiry) are written unquoted
// only when that is unambiguous YAML: space-separated words that start with
// an alphanumeric and contain no ": ", "#" or quote characters. Anything
// else is double-quoted and escaped, so unusual input cannot break the file.
const yamlPlainWord = "[\\w.+@/-]+(?::[\\w.+@/-]+)*";
const yamlPlainSafe = new RegExp(`^(?=[A-Za-z0-9])${yamlPlainWord}(?: ${yamlPlainWord})*$`);
const yamlScalar = (v) => {
  const text = String(v ?? "");
  return yamlPlainSafe.test(text) ? text : `"${yamlEsc(text)}"`;
};
const statusValue = yamlScalar(flags.status || "active");

// Body of the decision record, one YAML line per entry.
const lines = [
  `id: ${id}`,
  `date: ${date}`,
  `title: "${yamlEsc(flags.title)}"`,
  `domain: "${yamlEsc(flags.domain || "operational")}"`,
  `decision_maker: "${yamlEsc(decisionMaker)}"`,
  `decision_text: "${yamlEsc(flags.decision)}"`,
  `context: "${yamlEsc(flags.context || "")}"`,
  `rationale: "${yamlEsc(flags.rationale || "")}"`,
  `status: ${statusValue}`,
];
if (flags.alternatives) {
  // --alternatives is a ";"-separated list.
  lines.push("alternatives:");
  for (const alt of String(flags.alternatives).split(";").map((s) => s.trim()).filter(Boolean)) {
    lines.push(`  - "${yamlEsc(alt)}"`);
  }
}
if (flags.stakeholders) {
  // --stakeholders is a ","-separated list.
  lines.push("stakeholders:");
  for (const s of String(flags.stakeholders).split(",").map((s) => s.trim()).filter(Boolean)) {
    lines.push(`  - ${yamlScalar(s)}`);
  }
}
if (flags["expires-at"]) {
  lines.push(`expires_at: ${yamlScalar(flags["expires-at"])}`);
}
lines.push("");

writeFileSync(join(decDir, filename), lines.join("\n"));

// Update the index. Naive text edit; readers can re-sort. With an empty
// `decisions: []` or a `decisions:` key the new entry goes directly under
// the key (so ahead of older entries); without either it is appended at EOF.
const indexPath = join(decDir, "index.yaml");
let indexBody = existsSync(indexPath) ? readFileSync(indexPath, "utf-8") : "# Decision Index — auto-maintained\ndecisions: []\n";
if (!indexBody.includes(id)) {
  const entry = `  - id: ${id}\n    date: ${date}\n    title: "${yamlEsc(flags.title)}"\n    status: ${statusValue}`;
  // Replacer FUNCTION, not a replacement string: the title is user input and
  // sequences such as "$$" or "$&" would otherwise be interpreted by
  // String.prototype.replace and corrupt the entry.
  const underKey = () => `decisions:\n${entry}`;
  if (/decisions:\s*\[\s*\]\s*$/.test(indexBody.trim())) {
    indexBody = indexBody.replace(/decisions:\s*\[\s*\]/, underKey);
  } else if (/decisions:\s*$/m.test(indexBody)) {
    indexBody = indexBody.replace(/decisions:\s*$/m, underKey);
  } else {
    indexBody = `${indexBody.trimEnd()}\n${entry}\n`;
  }
  // The key-only match can swallow the final newline; keep the file terminated.
  if (!indexBody.endsWith("\n")) indexBody += "\n";
  writeFileSync(indexPath, indexBody);
}

process.stdout.write(JSON.stringify({ ok: true, id, file: join(decDir, filename) }, null, 2) + "\n");