@adaptic/maestro 1.9.0 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -16,11 +16,29 @@
16
16
  # The agent's reasoning engines. At minimum you need Anthropic (Claude).
17
17
  #
18
18
 
19
- # REQUIRED — Primary reasoning engine (Claude Code uses this)
20
- # Get your key: https://console.anthropic.com/settings/keys
21
- # Subscription: Anthropic API plan (pay-per-token) or Max subscription
19
+ # REQUIRED — Primary reasoning engine. Two ways to authenticate:
20
+ #
21
+ # Option A — API key (pay-per-token)
22
+ # Set ANTHROPIC_API_KEY below to a valid sk-ant-api03-... key.
23
+ # Get one: https://console.anthropic.com/settings/keys
24
+ #
25
+ # Option B — Claude Code subscription (Pro/Max, OAuth via Keychain)
26
+ # LEAVE ANTHROPIC_API_KEY EMPTY *and* set MAESTRO_PREFER_SUBSCRIPTION_AUTH=1.
27
+ # This tells the cadence consumer to strip ANTHROPIC_API_KEY from every
28
+ # sub-session spawn so claude --print falls back to the keychain OAuth
29
+ # token. Most agents on a Mac mini with a Claude Code subscription
30
+ # should use this option — routine cadence ticks cost zero API credits.
31
+ #
32
+ # Doctor validates the key against api.anthropic.com on every run; an
33
+ # invalid key here will cascade 401s through every sub-session spawn.
22
34
  ANTHROPIC_API_KEY=
23
35
 
36
+ # OPTIONAL — When set to 1, the cadence consumer strips ANTHROPIC_API_KEY
37
+ # from every claude --print sub-session env so claude falls back to
38
+ # Claude Code subscription auth (Keychain OAuth). Use this when the
39
+ # agent's Mac has a Claude Code Pro/Max subscription.
40
+ MAESTRO_PREFER_SUBSCRIPTION_AUTH=
41
+
24
42
  # OPTIONAL — Supplemental model access (GPT-4, embeddings)
25
43
  # Get your key: https://platform.openai.com/api-keys
26
44
  # Subscription: OpenAI API plan (pay-per-token)
package/bin/maestro.mjs CHANGED
@@ -1462,6 +1462,43 @@ function doctor() {
1462
1462
  check("ANTHROPIC_API_KEY", true);
1463
1463
  check("SLACK_USER_TOKEN", false);
1464
1464
  check("GMAIL_APP_PASSWORD", false);
1465
+
1466
+ // Auth validity: if ANTHROPIC_API_KEY is set, ping the API to
1467
+ // verify it works. An invalid key in .env will silently be sent
1468
+ // to every `claude --print` sub-session and cause cascading 401s
1469
+ // (exactly the ravi-ai inbox-processor runaway). Better to catch
1470
+ // it here. Skips the check if the user opted out via
1471
+ // MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 (subscription wins).
1472
+ const keyMatch = env.match(/^ANTHROPIC_API_KEY=(.+)$/m);
1473
+ const preferSubsMatch = env.match(/^MAESTRO_PREFER_SUBSCRIPTION_AUTH=(.+)$/m);
1474
+ const preferSubs = preferSubsMatch && /^(1|true|yes)$/i.test(preferSubsMatch[1].trim());
1475
+ if (keyMatch && !preferSubs) {
1476
+ const key = keyMatch[1].trim().replace(/^"|"$/g, "");
1477
+ try {
1478
+ const result = spawnSync("curl", [
1479
+ "-s", "-o", "/dev/null", "-w", "%{http_code}",
1480
+ "-X", "POST",
1481
+ "-H", `x-api-key: ${key}`,
1482
+ "-H", "anthropic-version: 2023-06-01",
1483
+ "-H", "content-type: application/json",
1484
+ "--max-time", "8",
1485
+ "https://api.anthropic.com/v1/messages",
1486
+ "-d", JSON.stringify({ model: "claude-haiku-4-5", max_tokens: 5, messages: [{ role: "user", content: "ping" }] }),
1487
+ ], { encoding: "utf-8" });
1488
+ const code = (result.stdout || "").trim();
1489
+ if (code === "200") ok("ANTHROPIC_API_KEY validated against api.anthropic.com");
1490
+ else if (code === "401") {
1491
+ warn(`ANTHROPIC_API_KEY is INVALID (HTTP 401 from api.anthropic.com).`);
1492
+ warn(` This will cause every sub-session spawn to fail. Either:`);
1493
+ warn(` 1. Replace the key in .env with a valid one, OR`);
1494
+ warn(` 2. Set MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 in .env to use Claude Code subscription auth.`);
1495
+ issues++;
1496
+ } else if (code) warn(`ANTHROPIC_API_KEY check returned HTTP ${code} (expected 200)`);
1497
+ else warn(`ANTHROPIC_API_KEY check skipped (no network / curl missing)`);
1498
+ } catch { warn("ANTHROPIC_API_KEY check failed (curl error)"); }
1499
+ } else if (preferSubs) {
1500
+ ok("MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 — using Claude Code subscription (Keychain OAuth)");
1501
+ }
1465
1502
  } else {
1466
1503
  fail(".env file not found — copy from .env.example");
1467
1504
  issues++;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adaptic/maestro",
3
- "version": "1.9.0",
3
+ "version": "1.9.2",
4
4
  "description": "Maestro — Autonomous AI agent operating system. Deploy AI employees on dedicated Mac minis.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -43,7 +43,7 @@
43
43
  * logger optional fn({ ts, level, …rest }) → void for tests.
44
44
  */
45
45
 
46
- import { existsSync, readFileSync, writeFileSync } from "node:fs";
46
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
47
47
  import { join } from "node:path";
48
48
  import { spawn } from "node:child_process";
49
49
  import { homedir } from "node:os";
@@ -75,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
75
75
  // preferable to thrashing Claude / hitting usage limits.
76
76
  const MAX_CONCURRENT_SUB_SESSIONS = 1;
77
77
 
78
+ // Retry policy. Most cadence failures are systemic (broken prompt, bad
79
+ // auth, transient API errors) — 5 retries doesn't help, it just amplifies
80
+ // the burn. 2 retries with exponential back-off is the right balance.
81
+ const DEFAULT_MAX_ATTEMPTS = 2;
82
+ const BACKOFF_SCHEDULE_MS = [0, 30_000, 120_000]; // 1st retry +30s, 2nd retry +2m
83
+
84
+ // Circuit breaker — when 3 same-cadence failures land in a row, stop
85
+ // spawning that cadence for 30 minutes. Prevents launchd-rate runaway.
86
+ const CIRCUIT_OPEN_THRESHOLD = 3;
87
+ const CIRCUIT_OPEN_DURATION_MS = 30 * 60_000;
88
+
78
89
  // ---------------------------------------------------------------------------
79
90
  // Helpers
80
91
  // ---------------------------------------------------------------------------
@@ -129,8 +140,13 @@ function resolveClaudeBin() {
129
140
 
130
141
  /**
131
142
  * Spawn a sub-session running the cadence's trigger prompt and resolve
132
- * with { exit_code, durationMs }. Reads the prompt at call time so the
133
- * latest version (possibly upgraded between ticks) is always used.
143
+ * with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
144
+ * time so the latest version (possibly upgraded between ticks) is always
145
+ * used.
146
+ *
147
+ * Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
148
+ * so non-zero exits remain diagnosable after the fact. The last ~4 KB of
149
+ * stderr is also captured in-memory and surfaced on the failure event.
134
150
  */
135
151
  function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
136
152
  return new Promise((resolveOut) => {
@@ -167,14 +183,43 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
167
183
  AGENT_DIR: agentRoot,
168
184
  PATH: augmentedPath,
169
185
  };
186
+ // Auth handling. Claude Code authenticates via macOS Keychain
187
+ // (OAuth from the user's Pro/Max subscription) when no API key is
188
+ // set, OR via the ANTHROPIC_API_KEY env var when one is present.
189
+ // If the env key is present BUT looks like a placeholder / empty
190
+ // string, we strip it so claude can fall back to Keychain OAuth.
191
+ // Set MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 in .env to always strip
192
+ // the API key (force subscription auth) — useful when the agent
193
+ // owns a Claude Code Pro/Max subscription and shouldn't burn API
194
+ // credits for routine ticks.
195
+ const preferSubscription = process.env.MAESTRO_PREFER_SUBSCRIPTION_AUTH === "1";
196
+ const apiKey = env.ANTHROPIC_API_KEY || "";
197
+ if (preferSubscription || !apiKey.trim() || /^(your-api-key|placeholder|xxx+|sk-ant-xxx)/i.test(apiKey)) {
198
+ delete env.ANTHROPIC_API_KEY;
199
+ }
170
200
  const started = Date.now();
171
201
 
172
- log({ level: "info", stage: "subsession_spawn", cadence, bin });
202
+ // Per-run log file. Pattern is short enough to be tail-friendly.
203
+ const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
204
+ mkdirSync(logsDir, { recursive: true });
205
+ const date = new Date().toISOString().slice(0, 10);
206
+ const stamp = new Date().toISOString().replace(/[:.]/g, "-");
207
+ const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
208
+ const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
209
+ const stdoutFd = openSync(stdoutPath, "a");
210
+ const stderrFd = openSync(stderrPath, "a");
211
+
212
+ log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });
173
213
 
174
214
  let child;
175
215
  try {
176
- child = spawn(bin, args, { cwd: agentRoot, env, stdio: "ignore" });
216
+ // stdio:
217
+ // 0 ignore (claude --print reads prompt from argv, not stdin)
218
+ // 1 → file (capture stdout for later inspection)
219
+ // 2 → file (capture stderr — critical for diagnosing exit-1)
220
+ child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
177
221
  } catch (err) {
222
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
178
223
  resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
179
224
  return;
180
225
  }
@@ -188,8 +233,22 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
188
233
 
189
234
  child.on("exit", (code, signal) => {
190
235
  clearTimeout(timer);
236
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
191
237
  const durationMs = Date.now() - started;
192
238
  const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
239
+
240
+ // Pull tail of stderr (and stdout if stderr empty) for the failure
241
+ // surface. Best-effort; we never block on file size.
242
+ let stderrTail = "";
243
+ try {
244
+ const body = readFileSync(stderrPath, "utf-8");
245
+ stderrTail = body.slice(-4096);
246
+ if (!stderrTail.trim()) {
247
+ const so = readFileSync(stdoutPath, "utf-8");
248
+ stderrTail = so.slice(-4096);
249
+ }
250
+ } catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
251
+
193
252
  // Record cost-ledger row. Token counts are 0 until we parse the
194
253
  // session's JSON output; for now exit-code + duration are enough
195
254
  // to spot pathological retry loops.
@@ -208,16 +267,29 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
208
267
  ], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
209
268
  }
210
269
  } catch { /* cost tracking is best-effort */ }
270
+
271
+ // Clean up empty log files so the directory doesn't accumulate
272
+ // hundreds of zero-byte successes.
273
+ try {
274
+
275
+ if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
276
+ if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
277
+ } catch { /* */ }
278
+
211
279
  resolveOut({
212
280
  ok: exit_code === 0,
213
281
  exit_code,
214
282
  signal: signal || null,
215
283
  duration_ms: durationMs,
284
+ stderr_tail: stderrTail || null,
285
+ stdout_path: stdoutPath,
286
+ stderr_path: stderrPath,
216
287
  });
217
288
  });
218
289
 
219
290
  child.on("error", (err) => {
220
291
  clearTimeout(timer);
292
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
221
293
  const durationMs = Date.now() - started;
222
294
  resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
223
295
  });
@@ -242,6 +314,11 @@ export function startConsumer(opts = {}) {
242
314
  const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
243
315
  const spawnSession = opts.spawnSession || realSpawnSession;
244
316
  const userLogger = opts.logger;
317
+ // Test / tuning hooks for the reliability layer.
318
+ const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
319
+ const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
320
+ const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
321
+ const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
245
322
 
246
323
  const stats = {
247
324
  started_at: new Date().toISOString(),
@@ -249,6 +326,8 @@ export function startConsumer(opts = {}) {
249
326
  inline: 0,
250
327
  escalated: 0,
251
328
  skipped_emergency_stop: 0,
329
+ skipped_circuit_open: 0,
330
+ skipped_backoff: 0,
252
331
  dlq: 0,
253
332
  retries: 0,
254
333
  spawn_failures: 0,
@@ -261,6 +340,75 @@ export function startConsumer(opts = {}) {
261
340
  let timers = [];
262
341
  let activeSubSessions = 0;
263
342
 
343
+ // Per-cadence reliability state. Tracks consecutive failure count and
344
+ // the earliest moment we'll allow another spawn for that cadence.
345
+ // Persists nothing — circuit state is in-memory only. On daemon restart
346
+ // we get a fresh slate; that's intentional (operators expect a restart
347
+ // to mean "try again now").
348
+ const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }
349
+
350
+ function getCadenceState(cadence) {
351
+ let s = cadenceState.get(cadence);
352
+ if (!s) { s = { failures: 0, openUntil: 0, nextAllowedAt: 0 }; cadenceState.set(cadence, s); }
353
+ return s;
354
+ }
355
+
356
+ function recordSubsessionSuccess(cadence) {
357
+ const s = getCadenceState(cadence);
358
+ s.failures = 0;
359
+ s.openUntil = 0;
360
+ s.nextAllowedAt = 0;
361
+ }
362
+
363
+ function recordSubsessionFailure(cadence) {
364
+ const s = getCadenceState(cadence);
365
+ s.failures += 1;
366
+ // Exponential back-off honouring the (test-overridable) schedule.
367
+ const idx = Math.min(s.failures, backoffSchedule.length - 1);
368
+ s.nextAllowedAt = Date.now() + backoffSchedule[idx];
369
+ if (s.failures >= circuitThreshold) {
370
+ s.openUntil = Date.now() + circuitDurationMs;
371
+ log({ level: "error", stage: "circuit_opened", cadence, failures: s.failures, open_until: new Date(s.openUntil).toISOString() });
372
+ writeCircuitFile();
373
+ }
374
+ }
375
+
376
+ function writeCircuitFile() {
377
+ // Persist the open-circuit snapshot so doctor + the operator can see
378
+ // which cadences are currently held back without scraping logs.
379
+ const open = {};
380
+ for (const [cad, s] of cadenceState.entries()) {
381
+ if (s.openUntil > Date.now()) {
382
+ open[cad] = { failures: s.failures, open_until: new Date(s.openUntil).toISOString() };
383
+ }
384
+ }
385
+ const path = join(agentRoot, "state/cadence-bus/circuit-open.json");
386
+ try {
387
+ if (Object.keys(open).length === 0) {
388
+ // Remove the file when nothing is open.
389
+
390
+ try { unlinkSync(path); } catch { /* */ }
391
+ } else {
392
+ writeFileSync(path, JSON.stringify({ generated: new Date().toISOString(), open }, null, 2) + "\n");
393
+ }
394
+ } catch { /* best-effort */ }
395
+ }
396
+
397
+ function isCadenceAllowed(cadence) {
398
+ const s = getCadenceState(cadence);
399
+ const now = Date.now();
400
+ if (s.openUntil > now) return { allowed: false, reason: "circuit-open", retry_at: s.openUntil };
401
+ if (s.nextAllowedAt > now) return { allowed: false, reason: "backoff", retry_at: s.nextAllowedAt };
402
+ // Circuit closes automatically when openUntil passes.
403
+ if (s.openUntil && s.openUntil <= now) {
404
+ s.openUntil = 0;
405
+ s.failures = 0;
406
+ log({ level: "info", stage: "circuit_closed", cadence });
407
+ writeCircuitFile();
408
+ }
409
+ return { allowed: true };
410
+ }
411
+
264
412
  function log(entry) {
265
413
  const enriched = { ts: new Date().toISOString(), ...entry };
266
414
  logBusEvent(agentRoot, enriched);
@@ -280,6 +428,32 @@ export function startConsumer(opts = {}) {
280
428
  }
281
429
 
282
430
  async function escalate(event) {
431
+ // Circuit-breaker / back-off gate. If this cadence is currently held
432
+ // back, requeue without spawning. The event keeps its attempt count
433
+ // because the failure was upstream (not a per-event problem).
434
+ const gate = isCadenceAllowed(event.cadence);
435
+ if (!gate.allowed) {
436
+ log({
437
+ level: "warn",
438
+ stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
439
+ id: event.id,
440
+ cadence: event.cadence,
441
+ retry_at: new Date(gate.retry_at).toISOString(),
442
+ });
443
+ if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
444
+ else stats.skipped_backoff += 1;
445
+ // Put the event back in inbox WITHOUT bumping attempts so it doesn't
446
+ // burn its retry budget while the circuit is open.
447
+ const paths2 = getBusPaths(agentRoot);
448
+ try {
449
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
450
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
451
+
452
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
453
+ } catch { /* best-effort */ }
454
+ return { ok: false, decision: gate.reason };
455
+ }
456
+
283
457
  if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
284
458
  // Re-queue and try again next tick. Single-owner cadence consumer
285
459
  // means this can only happen when a prior tick is still running —
@@ -291,7 +465,15 @@ export function startConsumer(opts = {}) {
291
465
  cadence: event.cadence,
292
466
  active_subsessions: activeSubSessions,
293
467
  });
294
- failTick(agentRoot, event.id, "deferred:concurrent-spawn", { maxAttempts: 10 });
468
+ // Re-queue without burning the retry budget — concurrent-spawn isn't
469
+ // a per-event failure.
470
+ const paths2 = getBusPaths(agentRoot);
471
+ try {
472
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
473
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
474
+
475
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
476
+ } catch { /* best-effort */ }
295
477
  stats.retries += 1;
296
478
  return { ok: false, decision: "deferred" };
297
479
  }
@@ -334,14 +516,31 @@ export function startConsumer(opts = {}) {
334
516
  prompt: promptPath,
335
517
  exit_code: result.exit_code,
336
518
  duration_ms: result.duration_ms,
519
+ stdout_path: result.stdout_path || null,
520
+ stderr_path: result.stderr_path || null,
337
521
  });
522
+ recordSubsessionSuccess(event.cadence);
338
523
  stats.escalated += 1;
339
524
  stats.last_decision = "escalated";
340
525
  return { ok: true, decision: "escalated", exit_code: result.exit_code };
341
526
  }
342
- log({ level: "error", stage: "subsession_failed", id: event.id, cadence: event.cadence, exit_code: result.exit_code, error: result.error || null });
527
+ // Failure path: log + cap retries low. The exact stderr tail comes
528
+ // from the spawn helper so we never DLQ "blind" again.
529
+ const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
530
+ log({
531
+ level: "error",
532
+ stage: "subsession_failed",
533
+ id: event.id,
534
+ cadence: event.cadence,
535
+ exit_code: result.exit_code,
536
+ duration_ms: result.duration_ms,
537
+ error: result.error || stderrTail || `exit ${result.exit_code}`,
538
+ stderr_path: result.stderr_path || null,
539
+ });
343
540
  stats.spawn_failures += 1;
344
- const outcome = failTick(agentRoot, event.id, result.error || `exit ${result.exit_code}`);
541
+ recordSubsessionFailure(event.cadence);
542
+ const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
543
+ const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
345
544
  if (outcome?.destination === "dlq") stats.dlq += 1;
346
545
  else stats.retries += 1;
347
546
  return { ok: false, decision: outcome?.destination || "failed" };
@@ -428,19 +627,38 @@ export function startConsumer(opts = {}) {
428
627
  recoverStaleClaims(agentRoot);
429
628
 
430
629
  let processed = 0;
431
- // Drain as much as the consumer can in one tick, but yield to the
432
- // event loop between events so heartbeats and stop signals fire.
630
+ let escalatedThisTick = 0;
631
+ // Drain inline events as much as the consumer can in one tick; cap
632
+ // sub-session escalations at 1 per tick so a fast-failing cadence
633
+ // can't burn a whole minute's worth of retries inside a single poll.
634
+ // The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
433
635
  while (!stopping) {
434
636
  const claim = claimNextTick(agentRoot);
435
637
  if (!claim) break;
436
638
  const event = claim.event;
437
639
  activeTick = event.id;
640
+ let didEscalate = false;
438
641
  try {
642
+ const def = getCadenceDef(event.cadence);
643
+ const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
644
+ // Roughly: if it's not a registry-inline cadence, we MAY escalate.
645
+ // We don't yet know if the guard will say inline; processEvent
646
+ // will tell us via stats. Use the escalated stats delta as the
647
+ // signal that an actual sub-session ran this iteration.
648
+ const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
439
649
  await processEvent(event);
650
+ const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
651
+ if (after > before) didEscalate = true;
652
+ // Silence unused var warning.
653
+ void willEscalate;
440
654
  } finally {
441
655
  activeTick = null;
442
656
  }
443
657
  processed += 1;
658
+ if (didEscalate) escalatedThisTick += 1;
659
+ // Hard cap: at most ONE sub-session spawn per tick. Inline ticks
660
+ // keep draining freely (they're cheap).
661
+ if (escalatedThisTick >= 1) break;
444
662
  if (processed >= 16) break; // soft batch cap
445
663
  }
446
664
  return { processed };
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
210
210
  test("spawn failure retries within the budget, then DLQs", async () => {
211
211
  const root = await makeAgentRoot();
212
212
  plantPrompt(root, "weekly-strategic-memo");
213
+ // Disable back-off + raise circuit threshold so the test exercises the
214
+ // retry-then-DLQ path without waiting for back-off windows. The
215
+ // real defaults (30s/2m back-off, 3-failure circuit) are exercised by
216
+ // dedicated tests below.
213
217
  const consumer = startConsumer({
214
218
  agentRoot: root,
215
219
  pollMs: 25,
220
+ backoffSchedule: [0, 0, 0],
221
+ circuitThreshold: 999,
222
+ maxAttempts: 2,
216
223
  spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
217
224
  });
218
225
  try {
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
226
233
  }
227
234
  });
228
235
 
236
+ test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
237
+ const root = await makeAgentRoot();
238
+ plantPrompt(root, "weekly-strategic-memo");
239
+ let spawnCount = 0;
240
+ const consumer = startConsumer({
241
+ agentRoot: root,
242
+ pollMs: 20,
243
+ backoffSchedule: [0, 0, 0],
244
+ circuitThreshold: 2,
245
+ circuitDurationMs: 60_000, // 1 min — long enough for the assertion window
246
+ maxAttempts: 1, // each event DLQs on first failure so we don't conflate retry-counts
247
+ spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
248
+ });
249
+ try {
250
+ // Enqueue 5 events; circuit should open after 2 failures, blocking the rest.
251
+ for (let i = 0; i < 5; i++) {
252
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
253
+ }
254
+ const opened = await waitFor(() => consumer.getStats().skipped_circuit_open >= 1, { timeoutMs: 10_000 });
255
+ assert.ok(opened, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);
256
+ // Spawn count must NOT keep climbing once the circuit is open.
257
+ const spawnsAtOpen = spawnCount;
258
+ await new Promise((r) => setTimeout(r, 500));
259
+ assert.equal(spawnCount, spawnsAtOpen, `spawns must stop once circuit opens (was ${spawnsAtOpen}, now ${spawnCount})`);
260
+ } finally {
261
+ await consumer.stop();
262
+ await rmRoot(root);
263
+ }
264
+ });
265
+
266
+ test("back-off skips re-spawning until the cooldown elapses", async () => {
267
+ const root = await makeAgentRoot();
268
+ plantPrompt(root, "weekly-strategic-memo");
269
+ let spawnCount = 0;
270
+ const consumer = startConsumer({
271
+ agentRoot: root,
272
+ pollMs: 20,
273
+ backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
274
+ circuitThreshold: 999,
275
+ maxAttempts: 1,
276
+ spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
277
+ });
278
+ try {
279
+ // Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
280
+ // 2nd should be held back by the 300ms back-off window.
281
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
282
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
283
+ await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
284
+ const spawnsBeforeWait = spawnCount;
285
+ // During the back-off window no new spawn should fire.
286
+ await new Promise((r) => setTimeout(r, 150));
287
+ assert.ok(spawnCount === spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
288
+ assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");
289
+ // After the window passes, the next event should be processed.
290
+ await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
291
+ assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
292
+ } finally {
293
+ await consumer.stop();
294
+ await rmRoot(root);
295
+ }
296
+ });
297
+
229
298
  // ---------------------------------------------------------------------------
230
299
  // Emergency stop
231
300
  // ---------------------------------------------------------------------------