@adaptic/maestro 1.9.0 → 1.9.1

This diff shows the contents of publicly released package versions from a supported registry. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in the public registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@adaptic/maestro",
- "version": "1.9.0",
+ "version": "1.9.1",
  "description": "Maestro — Autonomous AI agent operating system. Deploy AI employees on dedicated Mac minis.",
  "type": "module",
  "bin": {
@@ -43,7 +43,7 @@
  * logger optional fn({ ts, level, …rest }) → void for tests.
  */

- import { existsSync, readFileSync, writeFileSync } from "node:fs";
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
  import { join } from "node:path";
  import { spawn } from "node:child_process";
  import { homedir } from "node:os";
@@ -75,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
  // preferable to thrashing Claude / hitting usage limits.
  const MAX_CONCURRENT_SUB_SESSIONS = 1;

+ // Retry policy. Most cadence failures are systemic (broken prompt, bad
+ // auth, transient API errors) — 5 retries doesn't help, it just amplifies
+ // the burn. 2 retries with exponential back-off is the right balance.
+ const DEFAULT_MAX_ATTEMPTS = 2;
+ const BACKOFF_SCHEDULE_MS = [0, 30_000, 120_000]; // 1st retry +30s, 2nd retry +2m
+
+ // Circuit breaker — when 3 same-cadence failures land in a row, stop
+ // spawning that cadence for 30 minutes. Prevents launchd-rate runaway.
+ const CIRCUIT_OPEN_THRESHOLD = 3;
+ const CIRCUIT_OPEN_DURATION_MS = 30 * 60_000;
+
  // ---------------------------------------------------------------------------
  // Helpers
  // ---------------------------------------------------------------------------
@@ -129,8 +140,13 @@ function resolveClaudeBin() {

  /**
  * Spawn a sub-session running the cadence's trigger prompt and resolve
- * with { exit_code, durationMs }. Reads the prompt at call time so the
- * latest version (possibly upgraded between ticks) is always used.
+ * with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
+ * time so the latest version (possibly upgraded between ticks) is always
+ * used.
+ *
+ * Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
+ * so non-zero exits remain diagnosable after the fact. The last ~4 KB of
+ * stderr is also captured in-memory and surfaced on the failure event.
  */
  function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
  return new Promise((resolveOut) => {
@@ -169,12 +185,27 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
  };
  const started = Date.now();

- log({ level: "info", stage: "subsession_spawn", cadence, bin });
+ // Per-run log file. Pattern is short enough to be tail-friendly.
+ const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
+ mkdirSync(logsDir, { recursive: true });
+ const date = new Date().toISOString().slice(0, 10);
+ const stamp = new Date().toISOString().replace(/[:.]/g, "-");
+ const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
+ const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
+ const stdoutFd = openSync(stdoutPath, "a");
+ const stderrFd = openSync(stderrPath, "a");
+
+ log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });

  let child;
  try {
- child = spawn(bin, args, { cwd: agentRoot, env, stdio: "ignore" });
+ // stdio:
+ // 0 ignore (claude --print reads prompt from argv, not stdin)
+ // 1 → file (capture stdout for later inspection)
+ // 2 → file (capture stderr — critical for diagnosing exit-1)
+ child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
  } catch (err) {
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
  resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
  return;
  }
@@ -188,8 +219,22 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {

  child.on("exit", (code, signal) => {
  clearTimeout(timer);
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
  const durationMs = Date.now() - started;
  const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
+
+ // Pull tail of stderr (and stdout if stderr empty) for the failure
+ // surface. Best-effort; we never block on file size.
+ let stderrTail = "";
+ try {
+ const body = readFileSync(stderrPath, "utf-8");
+ stderrTail = body.slice(-4096);
+ if (!stderrTail.trim()) {
+ const so = readFileSync(stdoutPath, "utf-8");
+ stderrTail = so.slice(-4096);
+ }
+ } catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
+
  // Record cost-ledger row. Token counts are 0 until we parse the
  // session's JSON output; for now exit-code + duration are enough
  // to spot pathological retry loops.
@@ -208,16 +253,29 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
  ], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
  }
  } catch { /* cost tracking is best-effort */ }
+
+ // Clean up empty log files so the directory doesn't accumulate
+ // hundreds of zero-byte successes.
+ try {
+
+ if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
+ if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
+ } catch { /* */ }
+
  resolveOut({
  ok: exit_code === 0,
  exit_code,
  signal: signal || null,
  duration_ms: durationMs,
+ stderr_tail: stderrTail || null,
+ stdout_path: stdoutPath,
+ stderr_path: stderrPath,
  });
  });

  child.on("error", (err) => {
  clearTimeout(timer);
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
  const durationMs = Date.now() - started;
  resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
  });
@@ -242,6 +300,11 @@ export function startConsumer(opts = {}) {
  const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
  const spawnSession = opts.spawnSession || realSpawnSession;
  const userLogger = opts.logger;
+ // Test / tuning hooks for the reliability layer.
+ const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
+ const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
+ const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
+ const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;

  const stats = {
  started_at: new Date().toISOString(),
@@ -249,6 +312,8 @@ export function startConsumer(opts = {}) {
  inline: 0,
  escalated: 0,
  skipped_emergency_stop: 0,
+ skipped_circuit_open: 0,
+ skipped_backoff: 0,
  dlq: 0,
  retries: 0,
  spawn_failures: 0,
@@ -261,6 +326,75 @@ export function startConsumer(opts = {}) {
  let timers = [];
  let activeSubSessions = 0;

+ // Per-cadence reliability state. Tracks consecutive failure count and
+ // the earliest moment we'll allow another spawn for that cadence.
+ // Persists nothing — circuit state is in-memory only. On daemon restart
+ // we get a fresh slate; that's intentional (operators expect a restart
+ // to mean "try again now").
+ const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }
+
+ function getCadenceState(cadence) {
+ let s = cadenceState.get(cadence);
+ if (!s) { s = { failures: 0, openUntil: 0, nextAllowedAt: 0 }; cadenceState.set(cadence, s); }
+ return s;
+ }
+
+ function recordSubsessionSuccess(cadence) {
+ const s = getCadenceState(cadence);
+ s.failures = 0;
+ s.openUntil = 0;
+ s.nextAllowedAt = 0;
+ }
+
+ function recordSubsessionFailure(cadence) {
+ const s = getCadenceState(cadence);
+ s.failures += 1;
+ // Exponential back-off honouring the (test-overridable) schedule.
+ const idx = Math.min(s.failures, backoffSchedule.length - 1);
+ s.nextAllowedAt = Date.now() + backoffSchedule[idx];
+ if (s.failures >= circuitThreshold) {
+ s.openUntil = Date.now() + circuitDurationMs;
+ log({ level: "error", stage: "circuit_opened", cadence, failures: s.failures, open_until: new Date(s.openUntil).toISOString() });
+ writeCircuitFile();
+ }
+ }
+
+ function writeCircuitFile() {
+ // Persist the open-circuit snapshot so doctor + the operator can see
+ // which cadences are currently held back without scraping logs.
+ const open = {};
+ for (const [cad, s] of cadenceState.entries()) {
+ if (s.openUntil > Date.now()) {
+ open[cad] = { failures: s.failures, open_until: new Date(s.openUntil).toISOString() };
+ }
+ }
+ const path = join(agentRoot, "state/cadence-bus/circuit-open.json");
+ try {
+ if (Object.keys(open).length === 0) {
+ // Remove the file when nothing is open.
+
+ try { unlinkSync(path); } catch { /* */ }
+ } else {
+ writeFileSync(path, JSON.stringify({ generated: new Date().toISOString(), open }, null, 2) + "\n");
+ }
+ } catch { /* best-effort */ }
+ }
+
+ function isCadenceAllowed(cadence) {
+ const s = getCadenceState(cadence);
+ const now = Date.now();
+ if (s.openUntil > now) return { allowed: false, reason: "circuit-open", retry_at: s.openUntil };
+ if (s.nextAllowedAt > now) return { allowed: false, reason: "backoff", retry_at: s.nextAllowedAt };
+ // Circuit closes automatically when openUntil passes.
+ if (s.openUntil && s.openUntil <= now) {
+ s.openUntil = 0;
+ s.failures = 0;
+ log({ level: "info", stage: "circuit_closed", cadence });
+ writeCircuitFile();
+ }
+ return { allowed: true };
+ }
+
  function log(entry) {
  const enriched = { ts: new Date().toISOString(), ...entry };
  logBusEvent(agentRoot, enriched);
@@ -280,6 +414,32 @@ export function startConsumer(opts = {}) {
  }

  async function escalate(event) {
+ // Circuit-breaker / back-off gate. If this cadence is currently held
+ // back, requeue without spawning. The event keeps its attempt count
+ // because the failure was upstream (not a per-event problem).
+ const gate = isCadenceAllowed(event.cadence);
+ if (!gate.allowed) {
+ log({
+ level: "warn",
+ stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
+ id: event.id,
+ cadence: event.cadence,
+ retry_at: new Date(gate.retry_at).toISOString(),
+ });
+ if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
+ else stats.skipped_backoff += 1;
+ // Put the event back in inbox WITHOUT bumping attempts so it doesn't
+ // burn its retry budget while the circuit is open.
+ const paths2 = getBusPaths(agentRoot);
+ try {
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
+
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
+ } catch { /* best-effort */ }
+ return { ok: false, decision: gate.reason };
+ }
+
  if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
  // Re-queue and try again next tick. Single-owner cadence consumer
  // means this can only happen when a prior tick is still running —
@@ -291,7 +451,15 @@ export function startConsumer(opts = {}) {
  cadence: event.cadence,
  active_subsessions: activeSubSessions,
  });
- failTick(agentRoot, event.id, "deferred:concurrent-spawn", { maxAttempts: 10 });
+ // Re-queue without burning the retry budget — concurrent-spawn isn't
+ // a per-event failure.
+ const paths2 = getBusPaths(agentRoot);
+ try {
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
+
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
+ } catch { /* best-effort */ }
  stats.retries += 1;
  return { ok: false, decision: "deferred" };
  }
@@ -334,14 +502,31 @@ export function startConsumer(opts = {}) {
  prompt: promptPath,
  exit_code: result.exit_code,
  duration_ms: result.duration_ms,
+ stdout_path: result.stdout_path || null,
+ stderr_path: result.stderr_path || null,
  });
+ recordSubsessionSuccess(event.cadence);
  stats.escalated += 1;
  stats.last_decision = "escalated";
  return { ok: true, decision: "escalated", exit_code: result.exit_code };
  }
- log({ level: "error", stage: "subsession_failed", id: event.id, cadence: event.cadence, exit_code: result.exit_code, error: result.error || null });
+ // Failure path: log + cap retries low. The exact stderr tail comes
+ // from the spawn helper so we never DLQ "blind" again.
+ const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
+ log({
+ level: "error",
+ stage: "subsession_failed",
+ id: event.id,
+ cadence: event.cadence,
+ exit_code: result.exit_code,
+ duration_ms: result.duration_ms,
+ error: result.error || stderrTail || `exit ${result.exit_code}`,
+ stderr_path: result.stderr_path || null,
+ });
  stats.spawn_failures += 1;
- const outcome = failTick(agentRoot, event.id, result.error || `exit ${result.exit_code}`);
+ recordSubsessionFailure(event.cadence);
+ const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
+ const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
  if (outcome?.destination === "dlq") stats.dlq += 1;
  else stats.retries += 1;
  return { ok: false, decision: outcome?.destination || "failed" };
@@ -428,19 +613,38 @@ export function startConsumer(opts = {}) {
  recoverStaleClaims(agentRoot);

  let processed = 0;
- // Drain as much as the consumer can in one tick, but yield to the
- // event loop between events so heartbeats and stop signals fire.
+ let escalatedThisTick = 0;
+ // Drain inline events as much as the consumer can in one tick; cap
+ // sub-session escalations at 1 per tick so a fast-failing cadence
+ // can't burn a whole minute's worth of retries inside a single poll.
+ // The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
  while (!stopping) {
  const claim = claimNextTick(agentRoot);
  if (!claim) break;
  const event = claim.event;
  activeTick = event.id;
+ let didEscalate = false;
  try {
+ const def = getCadenceDef(event.cadence);
+ const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
+ // Roughly: if it's not a registry-inline cadence, we MAY escalate.
+ // We don't yet know if the guard will say inline; processEvent
+ // will tell us via stats. Use the escalated stats delta as the
+ // signal that an actual sub-session ran this iteration.
+ const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
  await processEvent(event);
+ const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
+ if (after > before) didEscalate = true;
+ // Silence unused var warning.
+ void willEscalate;
  } finally {
  activeTick = null;
  }
  processed += 1;
+ if (didEscalate) escalatedThisTick += 1;
+ // Hard cap: at most ONE sub-session spawn per tick. Inline ticks
+ // keep draining freely (they're cheap).
+ if (escalatedThisTick >= 1) break;
  if (processed >= 16) break; // soft batch cap
  }
  return { processed };
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
  test("spawn failure retries within the budget, then DLQs", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");
+ // Disable back-off + raise circuit threshold so the test exercises the
+ // retry-then-DLQ path without waiting for back-off windows. The
+ // real defaults (30s/2m back-off, 3-failure circuit) are exercised by
+ // dedicated tests below.
  const consumer = startConsumer({
  agentRoot: root,
  pollMs: 25,
+ backoffSchedule: [0, 0, 0],
+ circuitThreshold: 999,
+ maxAttempts: 2,
  spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
  });
  try {
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
  }
  });

+ test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
+ const root = await makeAgentRoot();
+ plantPrompt(root, "weekly-strategic-memo");
+ let spawnCount = 0;
+ const consumer = startConsumer({
+ agentRoot: root,
+ pollMs: 20,
+ backoffSchedule: [0, 0, 0],
+ circuitThreshold: 2,
+ circuitDurationMs: 60_000, // 1 min — long enough for the assertion window
+ maxAttempts: 1, // each event DLQs on first failure so we don't conflate retry-counts
+ spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
+ });
+ try {
+ // Enqueue 5 events; circuit should open after 2 failures, blocking the rest.
+ for (let i = 0; i < 5; i++) {
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
+ }
+ const opened = await waitFor(() => consumer.getStats().skipped_circuit_open >= 1, { timeoutMs: 10_000 });
+ assert.ok(opened, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);
+ // Spawn count must NOT keep climbing once the circuit is open.
+ const spawnsAtOpen = spawnCount;
+ await new Promise((r) => setTimeout(r, 500));
+ assert.equal(spawnCount, spawnsAtOpen, `spawns must stop once circuit opens (was ${spawnsAtOpen}, now ${spawnCount})`);
+ } finally {
+ await consumer.stop();
+ await rmRoot(root);
+ }
+ });
+
+ test("back-off skips re-spawning until the cooldown elapses", async () => {
+ const root = await makeAgentRoot();
+ plantPrompt(root, "weekly-strategic-memo");
+ let spawnCount = 0;
+ const consumer = startConsumer({
+ agentRoot: root,
+ pollMs: 20,
+ backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
+ circuitThreshold: 999,
+ maxAttempts: 1,
+ spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
+ });
+ try {
+ // Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
+ // 2nd should be held back by the 300ms back-off window.
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
+ await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
+ const spawnsBeforeWait = spawnCount;
+ // During the back-off window no new spawn should fire.
+ await new Promise((r) => setTimeout(r, 150));
+ assert.ok(spawnCount === spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
+ assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");
+ // After the window passes, the next event should be processed.
+ await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
+ assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
+ } finally {
+ await consumer.stop();
+ await rmRoot(root);
+ }
+ });
+
  // ---------------------------------------------------------------------------
  // Emergency stop
  // ---------------------------------------------------------------------------
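
For reference, a minimal usage sketch of the reliability knobs this release adds to startConsumer. The import specifier, the agentRoot value, and the stub spawnSession are illustrative assumptions; the option names, their defaults, and the getStats()/stop() calls come from the diff above.

// Hypothetical usage sketch: the module path below is an assumption; the
// options mirror what startConsumer reads in 1.9.1 (see diff above).
import { startConsumer } from "@adaptic/maestro/cadence-consumer"; // assumed path

const consumer = startConsumer({
  agentRoot: "/opt/agents/demo",         // assumed root, for illustration only
  pollMs: 1_000,
  backoffSchedule: [0, 30_000, 120_000], // matches the BACKOFF_SCHEDULE_MS default
  circuitThreshold: 3,                   // open the circuit after 3 straight failures
  circuitDurationMs: 30 * 60_000,        // hold the cadence back for 30 minutes
  maxAttempts: 2,                        // matches DEFAULT_MAX_ATTEMPTS
  // Tests inject a stub spawnSession; omit it in production to use the
  // real Claude sub-session spawner.
  spawnSession: async () => ({ ok: false, exit_code: 1, error: "stub", duration_ms: 1 }),
});

// skipped_circuit_open and skipped_backoff are the stats counters added in 1.9.1.
console.log(consumer.getStats());
await consumer.stop();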