npm - @adaptic/maestro - Versions diffs - 1.9.3 → 1.9.5 - Mend

@adaptic/maestro 1.9.3 → 1.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/lib/claude-bin.mjs +70 -0
package/package.json +2 -2
package/scripts/daemon/agent-daemon.mjs +562 -0
package/scripts/daemon/cadence-consumer.mjs +7 -47
package/scripts/daemon/cadence-handlers.mjs +22 -7
package/scripts/daemon/classifier.mjs +5 -3
package/scripts/daemon/dispatcher-cooldown.test.mjs +122 -0
package/scripts/daemon/dispatcher.mjs +66 -4
package/scripts/daemon/maestro-daemon.mjs +12 -9
package/scripts/daemon/responder.mjs +5 -2
package/scripts/daemon/sophie-daemon.mjs +11 -552

package/scripts/daemon/cadence-consumer.mjs CHANGED Viewed

@@ -59,6 +59,7 @@ import {
   logBusEvent,
   busDepth,
 } from "../../lib/cadence-bus.mjs";
+import { resolveClaudeBin as sharedResolveClaude, augmentedPath } from "../../lib/claude-bin.mjs";
 import { getCadenceDef } from "./cadence-handlers.mjs";
 // ---------------------------------------------------------------------------
@@ -104,39 +105,10 @@ function defaultLogger(entry) {
   }
 }
-/**
- * Resolve an absolute path to the Claude CLI. launchd's bare environment
- * does NOT include /Users/<u>/.local/bin or homebrew on PATH, so a plain
- * `spawn('claude', …)` fails with ENOENT — which is exactly what was
- * stuck in ravi-ai's DLQ. This resolver returns the first existing
- * candidate among:
- *
- *   1. $CLAUDE_BIN env var (if set + executable)
- *   2. ~/.local/bin/claude              (default Claude Code install path)
- *   3. /opt/homebrew/bin/claude          (homebrew on Apple Silicon)
- *   4. /usr/local/bin/claude             (homebrew on Intel)
- *   5. /usr/bin/claude
- *
- * Falls back to bare "claude" so the spawn's own error stays informative
- * when nothing is found.
- */
-let _resolvedClaude = null;
-function resolveClaudeBin() {
-  if (_resolvedClaude) return _resolvedClaude;
-  const envOverride = process.env.CLAUDE_BIN;
-  const candidates = [
-    envOverride,
-    join(homedir(), ".local/bin/claude"),
-    "/opt/homebrew/bin/claude",
-    "/usr/local/bin/claude",
-    "/usr/bin/claude",
-  ].filter(Boolean);
-  for (const c of candidates) {
-    if (existsSync(c)) { _resolvedClaude = c; return c; }
-  }
-  _resolvedClaude = "claude"; // last-resort; spawn will report ENOENT
-  return _resolvedClaude;
-}
+// Claude binary resolution moved to lib/claude-bin.mjs (shared by the
+// classifier, dispatcher, responder, and this consumer). See that file for
+// the candidate search order.
+const resolveClaudeBin = sharedResolveClaude;
 /**
  * Spawn a sub-session running the cadence's trigger prompt and resolve
@@ -164,24 +136,12 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
     const bin = resolveClaudeBin();
     const args = ["--print", "--dangerously-skip-permissions", body];
-    // Augment PATH so any tool the subsession invokes (jq, node, etc.)
-    // can still be found. launchd's bare env strips /opt/homebrew/bin etc.
-    const augmentedPath = [
-      process.env.PATH || "",
-      `${homedir()}/.local/bin`,
-      "/opt/homebrew/bin",
-      "/opt/homebrew/sbin",
-      "/usr/local/bin",
-      "/usr/bin",
-      "/bin",
-      "/usr/sbin",
-      "/sbin",
-    ].filter(Boolean).join(":");
+    // PATH augmented via lib/claude-bin.mjs so subsession can find jq/node.
     const env = {
       ...process.env,
       AGENT_ROOT: agentRoot,
       AGENT_DIR: agentRoot,
-      PATH: augmentedPath,
+      PATH: augmentedPath(),
     };
     // Auth handling. Claude Code authenticates via macOS Keychain
     // (OAuth from the user's Pro/Max subscription) when no API key is

package/scripts/daemon/cadence-handlers.mjs CHANGED Viewed

@@ -129,16 +129,31 @@ async function guardInboxProcessor({ agentRoot }) {
 /**
  * backlog-executor guard:
- *   - Look at state/queues/*.yaml; any queue with at least one item is work.
- *   - If none, complete inline.
- *   - If any, escalate.
+ *   The reactive daemon (agent-daemon.mjs) already runs an internal
+ *   backlog sweep every BACKLOG_INTERVAL (default 10min) which dispatches
+ *   one session per top-priority queue item with proper post-completion
+ *   cooldowns. Running the cadence-bus backlog-executor on top of that
+ *   spawns DUPLICATE sessions for the same items — observed as 159
+ *   redundant spawns / day in ravi-ai's logs.
+ *
+ *   So this guard now always returns inline. The reactive daemon owns
+ *   the backlog. Operators who prefer the cadence-only mode (no reactive
+ *   daemon) can set BACKLOG_CADENCE_ESCALATE=1 in .env to flip behaviour.
  */
 async function guardBacklogExecutor({ agentRoot }) {
-  const withWork = queuesWithWork(agentRoot);
-  if (withWork.length === 0) {
-    return { ok: true, decision: "inline", reason: "all queues empty" };
+  if (process.env.BACKLOG_CADENCE_ESCALATE === "1") {
+    const withWork = queuesWithWork(agentRoot);
+    if (withWork.length === 0) {
+      return { ok: true, decision: "inline", reason: "all queues empty" };
+    }
+    return { ok: true, decision: "escalate", queues_with_work: withWork };
   }
-  return { ok: true, decision: "escalate", queues_with_work: withWork };
+  return {
+    ok: true,
+    decision: "inline",
+    reason: "reactive-daemon-owns-backlog",
+    note: "Set BACKLOG_CADENCE_ESCALATE=1 to override.",
+  };
 }
 /**

package/scripts/daemon/classifier.mjs CHANGED Viewed

@@ -102,8 +102,9 @@ function loadAgentRegistry() {
 const ANTHROPIC_MODEL = "claude-haiku-4-5-20251001";
 const OPENAI_MODEL = "gpt-4o-mini";
-// Default to `claude` from PATH; CLAUDE_BIN env var overrides for non-standard installs.
-const CLAUDE_BIN = process.env.CLAUDE_BIN || "claude";
+// Resolve claude against the agent's PATH (not launchd's bare env).
+import { resolveClaudeBin, augmentedPath } from "../../lib/claude-bin.mjs";
+const CLAUDE_BIN = resolveClaudeBin();
 const CLAUDE_CLI_TIMEOUT_MS = 30_000;
 // ── System prompt shared by both LLM classifiers ────────────────────────────
@@ -321,7 +322,8 @@ async function runClaudeCLI(systemPrompt, userPrompt) {
       stdio: ["pipe", "pipe", "pipe"],
       // Force claude CLI onto keychain OAuth (Max subscription); strip any
       // stale ANTHROPIC_API_KEY/AUTH_TOKEN inherited from the daemon env.
-      env: { ...process.env, ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
+      // Augment PATH so spawned subsessions find homebrew/nvm binaries.
+      env: { ...process.env, PATH: augmentedPath(), ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
     });
     let stdout = "";

package/scripts/daemon/dispatcher-cooldown.test.mjs ADDED Viewed

@@ -0,0 +1,122 @@
+/**
+ * dispatcher-cooldown.test.mjs — Coverage for the backlog post-completion
+ * cooldown that stopped the 159-redundant-spawns-per-day loop.
+ *
+ * The dispatcher module is stateful (in-memory Sets/Maps for active
+ * sessions, retry counts, cooldowns) and reads from AGENT_DIR at import
+ * time, so each test isolates by setting AGENT_DIR before dynamic-importing
+ * a fresh module instance.
+ */
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { promises as fsp } from "fs";
+import { tmpdir } from "os";
+import { join } from "path";
+async function freshDispatcher() {
+  // Isolated tmpdir per test so cooldown state file doesn't bleed.
+  const dir = join(
+    tmpdir(),
+    `dispatcher-test-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+  );
+  await fsp.mkdir(join(dir, "state/sessions"), { recursive: true });
+  await fsp.mkdir(join(dir, "logs/daemon"), { recursive: true });
+  process.env.AGENT_DIR = dir;
+  // Bust the module cache so state resets cleanly.
+  const url = new URL("./dispatcher.mjs", import.meta.url);
+  const mod = await import(`${url.href}?test=${Math.random()}`);
+  return { mod, dir };
+}
+async function cleanup(dir) {
+  try { await fsp.rm(dir, { recursive: true, force: true }); } catch { /* */ }
+}
+test("canDispatchBacklog: allowed for a fresh item", async () => {
+  const { mod, dir } = await freshDispatcher();
+  try {
+    const r = mod.canDispatchBacklog({ id: "TEST-1", title: "Fresh item" });
+    assert.equal(r.allowed, true);
+  } finally { await cleanup(dir); }
+});
+test("canDispatchBacklog: blocked while cooldown is active", async () => {
+  const { mod, dir } = await freshDispatcher();
+  try {
+    // Simulate a session completion by directly writing the cooldown state
+    // file the dispatcher reads on init. We can't easily call internal
+    // setters, but the cooldown file IS public API for state persistence.
+    const cooldownPath = join(dir, "state/sessions/backlog-cooldowns.json");
+    const tomorrow = Date.now() + 60 * 60 * 1000; // +1h
+    await fsp.writeFile(cooldownPath, JSON.stringify({
+      "TEST-2": tomorrow,
+    }) + "\n");
+    // Re-import to load the cooldown file.
+    const { mod: mod2, dir: dir2 } = await freshDispatcher();
+    try {
+      // Plant the same cooldown in this fresh dispatcher's dir.
+      await fsp.writeFile(
+        join(dir2, "state/sessions/backlog-cooldowns.json"),
+        JSON.stringify({ "TEST-2": tomorrow }) + "\n"
+      );
+      // Reload yet again with the cooldown file present.
+      const { mod: mod3, dir: dir3 } = await freshDispatcher();
+      try {
+        await fsp.writeFile(
+          join(dir3, "state/sessions/backlog-cooldowns.json"),
+          JSON.stringify({ "TEST-2": tomorrow }) + "\n"
+        );
+        // Final import — this one reads the cooldown file at module load.
+        process.env.AGENT_DIR = dir3;
+        const url = new URL("./dispatcher.mjs", import.meta.url);
+        const final = await import(`${url.href}?cooldown=${Math.random()}`);
+        const r = final.canDispatchBacklog({ id: "TEST-2", title: "Cooldown item" });
+        assert.equal(r.allowed, false);
+        assert.equal(r.reason, "post_completion_cooldown");
+        assert.ok(r.remaining_min >= 1 && r.remaining_min <= 60);
+      } finally { await cleanup(dir3); }
+    } finally { await cleanup(dir2); }
+  } finally { await cleanup(dir); }
+});
+test("canDispatchBacklog: allowed after cooldown expires", async () => {
+  const dir = join(
+    tmpdir(),
+    `dispatcher-expire-test-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+  );
+  try {
+    await fsp.mkdir(join(dir, "state/sessions"), { recursive: true });
+    await fsp.mkdir(join(dir, "logs/daemon"), { recursive: true });
+    // Cooldown already expired (set to 1h ago).
+    await fsp.writeFile(
+      join(dir, "state/sessions/backlog-cooldowns.json"),
+      JSON.stringify({ "TEST-3": Date.now() - 3_600_000 }) + "\n"
+    );
+    process.env.AGENT_DIR = dir;
+    const url = new URL("./dispatcher.mjs", import.meta.url);
+    const mod = await import(`${url.href}?expired=${Math.random()}`);
+    const r = mod.canDispatchBacklog({ id: "TEST-3", title: "Expired cooldown item" });
+    assert.equal(r.allowed, true);
+  } finally { await cleanup(dir); }
+});
+test("backlog-cooldowns.json persists across simulated daemon restarts", async () => {
+  const dir = join(
+    tmpdir(),
+    `dispatcher-persist-test-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+  );
+  try {
+    await fsp.mkdir(join(dir, "state/sessions"), { recursive: true });
+    await fsp.mkdir(join(dir, "logs/daemon"), { recursive: true });
+    const cooldownPath = join(dir, "state/sessions/backlog-cooldowns.json");
+    const future = Date.now() + 30 * 60_000;
+    await fsp.writeFile(cooldownPath, JSON.stringify({ "RESTART-1": future }) + "\n");
+    process.env.AGENT_DIR = dir;
+    const url = new URL("./dispatcher.mjs", import.meta.url);
+    const mod = await import(`${url.href}?persist=${Math.random()}`);
+    const r = mod.canDispatchBacklog({ id: "RESTART-1", title: "After restart" });
+    assert.equal(r.allowed, false, "loaded cooldown should block dispatch");
+    assert.equal(r.reason, "post_completion_cooldown");
+  } finally { await cleanup(dir); }
+});

package/scripts/daemon/dispatcher.mjs CHANGED Viewed

@@ -4,12 +4,15 @@
 import { spawn } from "child_process";
 import { appendFileSync, mkdirSync, writeFileSync, readFileSync, renameSync } from "fs";
-import { join } from "path";
+import { join, dirname } from "path";
 import { releaseLock, releaseThreadLock, releaseRequestClaim, claimItem, releaseItemClaim } from "./session-lock.mjs";
 import { recordSession } from "./health.mjs";
 const AGENT_REPO_DIR = process.env.AGENT_DIR || join(new URL(".", import.meta.url).pathname, "../..");
-const CLAUDE_BIN = process.env.CLAUDE_BIN || "claude";
+// Resolve the claude binary against the agent's PATH (not launchd's bare
+// env). Without this, every daemon-spawned `claude --print` exits ENOENT.
+import { resolveClaudeBin, augmentedPath } from "../../lib/claude-bin.mjs";
+const CLAUDE_BIN = resolveClaudeBin();
 const MAX_CONCURRENT = parseInt(process.env.DAEMON_MAX_CONCURRENT || "10", 10);
 const RESERVED_INBOX_SLOTS = 3;    // Always keep 3 slots free for real-time inbox items
@@ -39,6 +42,41 @@ const activeBacklogKeys = new Set();  // backlog item key -> true (while session
 const backlogRetryCount = new Map();  // backlog item key -> number of times dispatched
 const MAX_BACKLOG_RETRIES = 6;        // Max retries before skipping (was 3 — too aggressive)
+// Post-completion cooldown — once a session has run on a backlog item, don't
+// re-dispatch it until its cooldown elapses (durations below). Without this,
+// every 2-min backlog sweep re-dispatches the same items because the daemon
+// has no signal that the underlying work was actually completed (sessions
+// exit 0 even when they only "looked at" the item). 53 redundant spawns/day
+// per item was the observed rate before this fix.
+const SUCCESS_COOLDOWN_MS = 4 * 60 * 60 * 1000;  // 4h after exit 0
+const FAILURE_COOLDOWN_MS = 30 * 60 * 1000;      // 30m after non-zero exit
+const backlogCooldownUntil = new Map();           // backlog item key -> epoch ms at which dispatch is allowed again
+const COOLDOWN_STATE_PATH = join(AGENT_REPO_DIR, "state/sessions/backlog-cooldowns.json"); // JSON snapshot of backlogCooldownUntil
+// Persist cooldown state across daemon restarts so a freshly-started
+// daemon doesn't immediately re-dispatch items it just completed.
+function loadCooldowns() {
+  try {
+    const body = readFileSync(COOLDOWN_STATE_PATH, "utf-8");
+    const data = JSON.parse(body);
+    const now = Date.now();
+    for (const [key, until] of Object.entries(data || {})) {
+      if (typeof until === "number" && until > now) {
+        backlogCooldownUntil.set(key, until);
+      }
+    }
+  } catch { /* file missing or malformed — start fresh */ }
+}
+function saveCooldowns() {
+  try {
+    const obj = {};
+    for (const [k, v] of backlogCooldownUntil) obj[k] = v;
+    mkdirSync(dirname(COOLDOWN_STATE_PATH), { recursive: true });
+    writeFileSync(COOLDOWN_STATE_PATH, JSON.stringify(obj, null, 2) + "\n");
+  } catch { /* best-effort */ }
+}
+loadCooldowns();
 function logDir() {
   const dir = join(AGENT_REPO_DIR, "logs", "daemon");
   mkdirSync(dir, { recursive: true });
@@ -180,6 +218,13 @@ export function canDispatchBacklog(item) {
   if (retries >= MAX_BACKLOG_RETRIES) {
     return { allowed: false, reason: "max_retries_exceeded", retries };
   }
+  // Post-completion cooldown — prevent every-2-min re-dispatch of items
+  // that completed (success or failure) within the recent window.
+  const cooldownUntil = backlogCooldownUntil.get(key) || 0;
+  if (cooldownUntil > Date.now()) {
+    const remaining_min = Math.ceil((cooldownUntil - Date.now()) / 60000);
+    return { allowed: false, reason: "post_completion_cooldown", remaining_min };
+  }
   return { allowed: true };
 }
@@ -281,9 +326,10 @@ function spawnSession(entry) {
   // to the keychain OAuth (Max subscription) per CEO directive 2026-04-27.
   // A stale ANTHROPIC_API_KEY in the daemon's inherited env will otherwise
   // override the OAuth token and cause "Invalid API key" failures.
+  // PATH is augmented so the spawn finds homebrew/nvm tools (jq, node, etc).
   const proc = spawn(CLAUDE_BIN, args, {
     cwd: AGENT_REPO_DIR,
-    env: { ...process.env, ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
+    env: { ...process.env, PATH: augmentedPath(), ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
     stdio: ["ignore", "pipe", "pipe"],
   });
@@ -358,6 +404,19 @@ function spawnSession(entry) {
       activeBacklogKeys.delete(key);
       const retries = backlogRetryCount.get(key) || 0;
+      // Apply post-completion cooldown — different for success vs failure.
+      // This is the fix for the every-2-min re-dispatch loop: once a
+      // session has touched an item, we wait before touching it again.
+      const cooldownMs = code === 0 ? SUCCESS_COOLDOWN_MS : FAILURE_COOLDOWN_MS;
+      backlogCooldownUntil.set(key, Date.now() + cooldownMs);
+      saveCooldowns();
+      logSession({
+        event: "cooldown_set",
+        summary: classResult.summary,
+        exit_code: code,
+        cooldown_minutes: Math.round(cooldownMs / 60000),
+      });
       // Release file-based item claim (ib-20260407-001b)
       if (item.id) releaseItemClaim(item.id);
@@ -422,10 +481,13 @@ function spawnSession(entry) {
       claimReleased = true;
     }
-    // Release backlog tracking + item claim.
+    // Release backlog tracking + item claim. Apply failure cooldown so the
+    // same item isn't re-spawned on the next backlog sweep.
     if (source === "backlog") {
       const key = backlogKey(item);
       activeBacklogKeys.delete(key);
+      backlogCooldownUntil.set(key, Date.now() + FAILURE_COOLDOWN_MS);
+      saveCooldowns();
       // Release file-based item claim (ib-20260407-001b)
       if (item.id) releaseItemClaim(item.id);
     }

package/scripts/daemon/maestro-daemon.mjs CHANGED Viewed

@@ -8,11 +8,12 @@
 //
 // Lifecycle:
 //   1. Honour .emergency-stop BEFORE doing anything (don't acquire singleton
-//      lock, don't start consumer, don't import sophie-daemon). Stops the
+//      lock, don't start consumer, don't import the core daemon). Stops the
 //      launchd restart treadmill cold.
 //   2. Acquire the daemon singleton lock so only one instance runs.
 //   3. Start the cadence consumer (state/cadence-bus/ drain loop).
-//   4. Import the core daemon (sophie-daemon.mjs or legacy <firstName>-daemon.mjs).
+//   4. Import the core daemon (agent-daemon.mjs canonical, with fallbacks
+//      to legacy sophie-daemon.mjs or <firstName>-daemon.mjs for back-compat).
 //
 // Run: node scripts/daemon/maestro-daemon.mjs
 // Install: launchd plist with KeepAlive.SuccessfulExit: false (clean exits
@@ -82,13 +83,15 @@ try {
 // 4. Core daemon import
 // ---------------------------------------------------------------------------
 // Resolve the core daemon module. Try, in order:
-//   1. ./sophie-daemon.mjs  — canonical filename (post-Phase-2.5 SOT)
-//   2. ./<firstName>-daemon.mjs — legacy rename from init-maestro Phase 1
-//   3. The first scripts/daemon/*-daemon.mjs that isn't this file
+//   1. ./agent-daemon.mjs   — canonical filename (1.9.4+)
+//   2. ./sophie-daemon.mjs  — legacy canonical (pre-1.9.4)
+//   3. ./<firstName>-daemon.mjs — even older per-agent rename
+//   4. The first scripts/daemon/*-daemon.mjs that isn't this file
 function resolveCoreDaemon() {
-  const localCandidates = [];
-  const canonical = resolve(__dirname, "sophie-daemon.mjs");
-  localCandidates.push(canonical);
+  const localCandidates = [
+    resolve(__dirname, "agent-daemon.mjs"),
+    resolve(__dirname, "sophie-daemon.mjs"),
+  ];
   try {
     const agentJson = join(AGENT_DIR, "config", "agent.json");
@@ -116,7 +119,7 @@ function resolveCoreDaemon() {
 const coreDaemon = resolveCoreDaemon();
 if (!coreDaemon) {
-  console.error("[DAEMON] could not locate a core daemon module under scripts/daemon/. Expected sophie-daemon.mjs or <firstName>-daemon.mjs.");
+  console.error("[DAEMON] could not locate a core daemon module under scripts/daemon/. Expected agent-daemon.mjs (canonical) or sophie-daemon.mjs / <firstName>-daemon.mjs (legacy).");
   process.exit(78);
 }
 // Import and run the daemon (handles its own .env loading).

package/scripts/daemon/responder.mjs CHANGED Viewed

@@ -25,7 +25,9 @@ import { routingKey as deriveRoutingKey, createRouter } from "./lib/session-rout
 const AGENT_REPO_DIR = process.env.AGENT_DIR || join(new URL(".", import.meta.url).pathname, "../..");
 const SONNET_MODEL = "claude-sonnet-4-6";
-const CLAUDE_BIN = process.env.CLAUDE_BIN || "claude";
+// Resolve claude against the agent's PATH (not launchd's bare env).
+import { resolveClaudeBin, augmentedPath } from "../../lib/claude-bin.mjs";
+const CLAUDE_BIN = resolveClaudeBin();
 const CLAUDE_CLI_TIMEOUT_MS = 60_000;
 const SESSION_REGISTRY_PATH = join(AGENT_REPO_DIR, "state", "daemon", "session-router-registry.json");
@@ -140,7 +142,8 @@ function runClaudeCLI(systemPrompt, userPrompt, model, opts = {}) {
       stdio: ["pipe", "pipe", "pipe"],
       // Force claude CLI onto keychain OAuth (Max subscription); strip any
       // stale ANTHROPIC_API_KEY/AUTH_TOKEN inherited from the daemon env.
-      env: { ...process.env, ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
+      // Augment PATH so the subsession finds homebrew/nvm tools.
+      env: { ...process.env, PATH: augmentedPath(), ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
     });
     let stdout = "";