agent-pool-mcp 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-pool-mcp",
3
- "version": "1.5.0",
3
+ "version": "1.7.0",
4
4
  "type": "module",
5
5
  "description": "MCP Server for multi-agent task delegation and orchestration via Gemini CLI",
6
6
  "main": "index.js",
@@ -11,10 +11,15 @@
11
11
  * @module agent-pool/scheduler/daemon
12
12
  */
13
13
 
14
- import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync } from 'node:fs';
14
+ import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync, readdirSync, renameSync } from 'node:fs';
15
15
  import { spawn } from 'node:child_process';
16
16
  import { join, dirname } from 'node:path';
17
17
  import { matchesCron } from './cron.js';
18
+ import { getGroup } from '../tools/groups.js';
19
+ import { getRunner } from '../runner/config.js';
20
+ import { buildSshSpawn } from '../runner/ssh.js';
21
+ import { killGroup } from '../runner/process-manager.js';
22
+ import { consumeSignals, deleteSignals } from './run-signals.js';
18
23
 
19
24
  const POLL_INTERVAL_MS = 30_000; // Check schedules every 30 seconds
20
25
  const PID_FILE = '.agents/scheduler.pid';
@@ -159,62 +164,233 @@ function executeSchedule(schedule) {
159
164
  console.error(`[scheduler] Started: ${schedule.id} → gemini pid ${child.pid}`);
160
165
  }
161
166
 
162
- // ─── Pipeline tick ──────────────────────────────────────────
163
-
164
- import { readdirSync } from 'node:fs';
167
+ // ─── Pipeline tick ──────────────────────────────────────────────────
165
168
 
166
169
  const PIPELINES_DIR = '.agents/pipelines';
167
170
  const RUNS_DIR = '.agents/runs';
168
171
 
169
172
  /**
170
- * Spawn a Gemini CLI agent for a pipeline step.
173
+ * In-memory pipeline state cache.
174
+ * Loaded from disk on startup, updated in-place during ticks.
175
+ * Written to disk on state transitions (write-through).
176
+ * @type {Map<string, object>}
177
+ */
178
+ const runCache = new Map();
179
+
180
+ /**
181
+ * Load all active runs from disk into the in-memory cache.
182
+ * Called once on daemon startup.
183
+ */
184
+ function loadRunCache() {
185
+ const dir = join(cwd, RUNS_DIR);
186
+ if (!existsSync(dir)) return;
187
+ for (const f of readdirSync(dir).filter(f => f.endsWith('.json') && !f.includes('.signal-'))) {
188
+ try {
189
+ const run = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
190
+ const runId = f.replace('.json', '');
191
+ runCache.set(runId, run);
192
+ } catch { /* skip corrupted */ }
193
+ }
194
+ console.error(`[pipeline] Loaded ${runCache.size} runs into memory cache`);
195
+ }
196
+
197
+ /**
198
+ * Persist a run to disk atomically (write-then-rename).
199
+ * Prevents corruption if daemon crashes mid-write.
200
+ * @param {string} runId
201
+ * @param {object} run
202
+ */
203
+ function persistRun(runId, run) {
204
+ const dir = join(cwd, RUNS_DIR);
205
+ mkdirSync(dir, { recursive: true });
206
+ const target = join(dir, `${runId}.json`);
207
+ const tmp = join(dir, `${runId}.json.tmp`);
208
+ writeFileSync(tmp, JSON.stringify(run, null, 2));
209
+ // Atomic rename (same filesystem) — prevents corruption on crash
210
+ try { renameSync(tmp, target); }
211
+ catch { writeFileSync(target, JSON.stringify(run, null, 2)); }
212
+ }
213
+
214
+ /**
215
+ * Apply consumed signal files to a run's in-memory state.
216
+ * @param {object} run - Run state object (mutated in place)
217
+ * @param {Array} signals - Consumed signal objects
218
+ * @param {object} pipeline - Pipeline definition
219
+ * @returns {boolean} true if any signal was applied
220
+ */
221
+ function applySignals(run, signals, pipeline) {
222
+ let modified = false;
223
+ for (const signal of signals) {
224
+ if (signal.type === 'STEP_COMPLETE') {
225
+ const step = run.steps[signal.stepName];
226
+ if (step && step.status === 'running') {
227
+ step.status = 'success';
228
+ step.signaled = true;
229
+ step.completedAt = new Date().toISOString();
230
+ if (signal.output) step.output = signal.output;
231
+ modified = true;
232
+ console.error(`[pipeline] Signal: step "${signal.stepName}" completed`);
233
+ }
234
+ } else if (signal.type === 'BOUNCE_BACK') {
235
+ const targetStep = run.steps[signal.stepName];
236
+ if (!targetStep) continue;
237
+
238
+ const stepDef = pipeline?.steps.find(s => s.name === signal.stepName);
239
+ const maxBounces = stepDef?.maxBounces ?? 2;
240
+
241
+ if (targetStep.bounces >= maxBounces) {
242
+ // Bounce limit reached
243
+ targetStep.status = 'failed';
244
+ targetStep.lastBounceReason = `Bounce limit (${maxBounces}) reached. Last: ${signal.reason}`;
245
+ run.status = 'failed';
246
+ run.completedAt = new Date().toISOString();
247
+ console.error(`[pipeline] Bounce limit reached for "${signal.stepName}"`);
248
+ } else {
249
+ // Reset target step
250
+ targetStep.status = 'bounce_pending';
251
+ targetStep.bounces = (targetStep.bounces || 0) + 1;
252
+ targetStep.lastBounceReason = signal.reason;
253
+
254
+ // Kill running processes for this step
255
+ const pidsToKill = [...(targetStep.pids || [])];
256
+ if (targetStep.pid && !pidsToKill.includes(targetStep.pid)) pidsToKill.push(targetStep.pid);
257
+ for (const pid of pidsToKill) killGroup(pid);
258
+
259
+ targetStep.pid = null;
260
+ targetStep.pids = [];
261
+ targetStep.exitCode = null;
262
+ targetStep.signaled = false;
263
+
264
+ // Reset calling step
265
+ if (signal.callingStepName && run.steps[signal.callingStepName]) {
266
+ run.steps[signal.callingStepName].status = 'waiting_bounce';
267
+ }
268
+ console.error(`[pipeline] Bounce: step "${signal.stepName}" reset (reason: ${signal.reason})`);
269
+ }
270
+ modified = true;
271
+ } else if (signal.type === 'CANCEL_RUN') {
272
+ // Cancel the entire run
273
+ for (const [name, step] of Object.entries(run.steps)) {
274
+ if (step.status === 'running') step.status = 'cancelled';
275
+ if (step.status === 'pending') step.status = 'skipped';
276
+ }
277
+ run.status = 'cancelled';
278
+ run.completedAt = new Date().toISOString();
279
+ console.error(`[pipeline] Signal: run cancelled`);
280
+ modified = true;
281
+ }
282
+ }
283
+ return modified;
284
+ }
285
+
286
+ /**
287
+ * Spawn Gemini CLI agent(s) for a pipeline step.
171
288
  * @param {object} stepDef - Step definition from pipeline
172
289
  * @param {object} run - Current run state
173
290
  * @param {string} runId
174
291
  * @param {string} [bounceReason] - If bouncing back, the reason
175
- * @returns {number} child PID
292
+ * @returns {number[]} Array of child PIDs
176
293
  */
177
294
  function spawnStep(stepDef, run, runId, bounceReason) {
178
- let prompt = stepDef.prompt;
179
- if (bounceReason) {
180
- prompt = `${stepDef.prompt}\n\n⚠️ BOUNCE BACK: предыдущая попытка была отклонена следующим шагом.\nПричина: ${bounceReason}\nДополни и улучши результат.`;
295
+ const count = stepDef.count || 1;
296
+ const pids = [];
297
+
298
+ // Resolve group
299
+ let groupConfig = {};
300
+ if (stepDef.group) {
301
+ groupConfig = getGroup(run.cwd || cwd, stepDef.group) || {};
181
302
  }
182
303
 
183
- // Inject pipeline context
184
- prompt = `[Pipeline: ${run.pipelineName}, Step: ${stepDef.name}, Run: ${runId}]\n\nTask:\n${prompt}\n\nWhen finished, call signal_step_complete with step_name "${stepDef.name}" and run_id "${runId}".`;
304
+ const skill = stepDef.skill || groupConfig.skill;
305
+ const policy = groupConfig.policy; // currently policy only from group
306
+ const runnerId = groupConfig.runner;
307
+ const runner = runnerId ? getRunner(runnerId) : { type: 'local' };
308
+ const isRemote = runner && runner.type === 'ssh';
185
309
 
186
- const args = [
187
- '-p', prompt,
188
- '--output-format', 'stream-json',
189
- '--approval-mode', stepDef.approvalMode || 'yolo',
190
- ];
310
+ for (let i = 0; i < count; i++) {
311
+ let prompt = stepDef.prompt;
312
+ if (bounceReason) {
313
+ prompt = `${stepDef.prompt}\n\n⚠️ BOUNCE BACK: предыдущая попытка была отклонена следующим шагом.\nПричина: ${bounceReason}\nДополни и улучши результат.`;
314
+ }
191
315
 
192
- const child = spawn('gemini', args, {
193
- cwd: run.cwd || cwd,
194
- env: { ...process.env, TERM: 'dumb', CI: '1' },
195
- stdio: ['pipe', 'pipe', 'pipe'],
196
- detached: true,
197
- });
316
+ if (count > 1) {
317
+ prompt = `[Agent ${i + 1}/${count}]\n\n${prompt}`;
318
+ }
198
319
 
199
- child.on('close', (code) => {
200
- // Update step exit code in run state
201
- try {
202
- const currentRun = JSON.parse(readFileSync(join(cwd, RUNS_DIR, `${runId}.json`), 'utf-8'));
203
- if (currentRun.steps[stepDef.name]) {
204
- currentRun.steps[stepDef.name].exitCode = code;
320
+ // Inject pipeline context
321
+ prompt = `[Pipeline: ${run.pipelineName}, Step: ${stepDef.name}, Run: ${runId}]\n\nTask:\n${prompt}\n\nWhen finished, call signal_step_complete with step_name "${stepDef.name}" and run_id "${runId}".`;
322
+
323
+ const args = [
324
+ '-p', prompt,
325
+ '--output-format', 'stream-json',
326
+ '--approval-mode', stepDef.approvalMode || 'yolo',
327
+ ];
328
+
329
+ if (skill) {
330
+ // Skills can be active via prompt injection, as we do for scheduled tasks
331
+ args[1] = `Activate skill "${skill}" first.\n\n${args[1]}`;
332
+ }
333
+ if (policy) {
334
+ args.push('--policy', policy);
335
+ }
336
+ if (groupConfig.include_dirs?.length > 0) {
337
+ for (const dir of groupConfig.include_dirs) {
338
+ args.push('--include-directories', dir);
205
339
  }
206
- writeFileSync(join(cwd, RUNS_DIR, `${runId}.json`), JSON.stringify(currentRun, null, 2));
207
- } catch { /* ignore */ }
208
- console.error(`[pipeline] Step "${stepDef.name}" exited (code: ${code}, run: ${runId})`);
209
- });
340
+ }
210
341
 
211
- child.stdin.end();
212
- child.unref();
342
+ let spawnCmd, spawnArgs, spawnOpts;
343
+ if (isRemote) {
344
+ const ssh = buildSshSpawn(runner, args, run.cwd || cwd);
345
+ spawnCmd = ssh.command;
346
+ spawnArgs = ssh.args;
347
+ spawnOpts = { stdio: ['pipe', 'pipe', 'pipe'], detached: true };
348
+ } else {
349
+ spawnCmd = 'gemini';
350
+ spawnArgs = args;
351
+ const currentDepth = parseInt(process.env.AGENT_POOL_DEPTH ?? '0');
352
+ spawnOpts = {
353
+ cwd: run.cwd || cwd,
354
+ env: {
355
+ ...process.env,
356
+ TERM: 'dumb',
357
+ CI: '1',
358
+ AGENT_POOL_DEPTH: String(currentDepth + 1)
359
+ },
360
+ stdio: ['pipe', 'pipe', 'pipe'],
361
+ detached: true,
362
+ };
363
+ if (count > 1) spawnOpts.env.AGENT_INDEX = String(i);
364
+ }
213
365
 
214
- console.error(`[pipeline] Started step "${stepDef.name}" pid ${child.pid} (run: ${runId})`);
215
- return child.pid;
366
+ const child = spawn(spawnCmd, spawnArgs, spawnOpts);
367
+
368
+ child.on('close', (code) => {
369
+ // Update step exit code in in-memory state directly (same process)
370
+ const currentRun = runCache.get(runId);
371
+ if (currentRun?.steps[stepDef.name]) {
372
+ if (code !== 0) {
373
+ currentRun.steps[stepDef.name].exitCode = code;
374
+ } else if (currentRun.steps[stepDef.name].exitCode === null) {
375
+ currentRun.steps[stepDef.name].exitCode = 0;
376
+ }
377
+ // Write-through to disk
378
+ persistRun(runId, currentRun);
379
+ }
380
+ console.error(`[pipeline] Step "${stepDef.name}" [pid ${child.pid}] exited (code: ${code}, run: ${runId})`);
381
+ });
382
+
383
+ child.stdin.end();
384
+ child.unref();
385
+
386
+ console.error(`[pipeline] Started step "${stepDef.name}" → pid ${child.pid} (run: ${runId})`);
387
+ pids.push(child.pid);
388
+ }
389
+
390
+ return pids;
216
391
  }
217
392
 
393
+
218
394
  /**
219
395
  * Check if a process is alive.
220
396
  * @param {number} pid
@@ -226,33 +402,69 @@ function isAlive(pid) {
226
402
  }
227
403
 
228
404
  /**
229
- * Process pipeline runs — check triggers, advance steps.
405
+ * Process pipeline runs — consume signals, check triggers, advance steps.
406
+ * Uses in-memory cache for state; persists to disk on changes.
230
407
  * @returns {boolean} true if any pipeline is actively running
231
408
  */
232
409
  function tickPipelines() {
410
+ // Pick up new runs added to disk since last tick (e.g., from runPipeline)
233
411
  const runsDir = join(cwd, RUNS_DIR);
234
- if (!existsSync(runsDir)) return false;
412
+ if (existsSync(runsDir)) {
413
+ for (const f of readdirSync(runsDir).filter(f => f.endsWith('.json') && !f.includes('.signal-') && !f.endsWith('.tmp'))) {
414
+ const runId = f.replace('.json', '');
415
+ if (!runCache.has(runId)) {
416
+ try {
417
+ const run = JSON.parse(readFileSync(join(runsDir, f), 'utf-8'));
418
+ runCache.set(runId, run);
419
+ console.error(`[pipeline] Picked up new run: ${runId}`);
420
+ } catch { /* skip corrupted */ }
421
+ }
422
+ }
423
+ }
235
424
 
236
425
  const pipelinesDir = join(cwd, PIPELINES_DIR);
237
426
  let hasActive = false;
238
427
 
239
- for (const file of readdirSync(runsDir).filter(f => f.endsWith('.json'))) {
240
- let run;
241
- try { run = JSON.parse(readFileSync(join(runsDir, file), 'utf-8')); }
242
- catch { continue; }
243
-
244
- if (run.status !== 'running') continue;
428
+ // Iterate over a copy of keys to allow modification of runCache during iteration
429
+ for (const runId of Array.from(runCache.keys())) {
430
+ const run = runCache.get(runId);
431
+
432
+ // Evict completed runs from cache (memory leak fix)
433
+ if (run.status !== 'running') {
434
+ // Clean up any orphaned/late signals for completed runs
435
+ const lateSignals = consumeSignals(cwd, runId);
436
+ if (lateSignals.length > 0) {
437
+ deleteSignals(cwd, lateSignals);
438
+ console.error(`[pipeline] Cleaned ${lateSignals.length} orphaned signal(s) for completed run ${runId}`);
439
+ }
440
+ runCache.delete(runId);
441
+ continue;
442
+ }
245
443
  hasActive = true;
246
444
 
247
445
  // Load pipeline definition
248
446
  let pipeline;
249
447
  try {
250
448
  pipeline = JSON.parse(readFileSync(join(pipelinesDir, `${run.pipeline}.json`), 'utf-8'));
251
- } catch { continue; }
449
+ } catch {
450
+ console.error(`[pipeline] Could not load pipeline definition for run ${runId}: ${run.pipeline}.json`);
451
+ continue;
452
+ }
252
453
 
253
- const runId = file.replace('.json', '');
454
+ // 1. Consume and apply signal files
455
+ const signals = consumeSignals(cwd, runId);
254
456
  let modified = false;
255
457
 
458
+ if (signals.length > 0) {
459
+ modified = applySignals(run, signals, pipeline);
460
+ if (modified) {
461
+ // Durability: persist state BEFORE deleting signals
462
+ persistRun(runId, run);
463
+ deleteSignals(cwd, signals);
464
+ }
465
+ }
466
+
467
+ // 2. Process each step
256
468
  for (const stepDef of pipeline.steps) {
257
469
  const step = run.steps[stepDef.name];
258
470
  if (!step) continue;
@@ -261,33 +473,67 @@ function tickPipelines() {
261
473
  if (step.status === 'bounce_pending') {
262
474
  step.status = 'running';
263
475
  step.startedAt = new Date().toISOString();
264
- step.pid = spawnStep(stepDef, run, runId, step.lastBounceReason);
476
+ const pids = spawnStep(stepDef, run, runId, step.lastBounceReason);
477
+ step.pids = pids;
478
+ if (pids.length > 0) step.pid = pids[0];
265
479
  modified = true;
266
480
  continue;
267
481
  }
268
482
 
269
483
  // ── Handle running steps: check if process died ──
270
- if (step.status === 'running' && step.pid) {
271
- if (!isAlive(step.pid)) {
272
- // Process is dead — did agent signal?
273
- if (!step.signaled) {
274
- // Auto-fallback: check exit code
275
- if (step.exitCode === 0 || step.exitCode === null) {
276
- // Treat as success (agent forgot to signal)
277
- step.status = 'success';
278
- step.completedAt = new Date().toISOString();
279
- console.error(`[pipeline] Step "${stepDef.name}" auto-completed (pid dead, exit: ${step.exitCode})`);
280
- } else {
281
- // Failed
282
- step.status = 'failed';
283
- step.completedAt = new Date().toISOString();
284
- console.error(`[pipeline] Step "${stepDef.name}" failed (exit: ${step.exitCode})`);
285
- if (pipeline.onError === 'stop') {
286
- run.status = 'failed';
287
- run.completedAt = new Date().toISOString();
288
- }
484
+ if (step.status === 'running') {
485
+ const pids = step.pids?.length > 0 ? step.pids : (step.pid ? [step.pid] : []);
486
+ if (pids.length === 0) continue;
487
+
488
+ let livingPids = 0;
489
+ for (const pid of pids) if (isAlive(pid)) livingPids++;
490
+
491
+ const isParallel = pids.length > 1;
492
+
493
+ if (isParallel) {
494
+ // Parallel semantics: rely entirely on exit codes
495
+ if (step.exitCode !== null && step.exitCode !== 0) {
496
+ // Fail fast: kill siblings
497
+ for (const pid of pids) if (isAlive(pid)) killGroup(pid);
498
+ step.status = 'failed';
499
+ step.completedAt = new Date().toISOString();
500
+ console.error(`[pipeline] Step "${stepDef.name}" parallel failed (exit: ${step.exitCode})`);
501
+ if (pipeline.onError === 'stop') {
502
+ run.status = 'failed';
503
+ run.completedAt = new Date().toISOString();
289
504
  }
290
505
  modified = true;
506
+ } else if (livingPids === 0) {
507
+ // All dead and no errors
508
+ step.status = 'success';
509
+ step.completedAt = new Date().toISOString();
510
+ console.error(`[pipeline] Step "${stepDef.name}" parallel completed successfully`);
511
+ modified = true;
512
+ }
513
+ } else {
514
+ // Sequential semantics (count 1)
515
+ const pid = pids[0];
516
+ if (!isAlive(pid)) {
517
+ // Process is dead — did agent signal?
518
+ if (!step.signaled) {
519
+ // Auto-fallback: check exit code
520
+ if (step.exitCode === 0 || step.exitCode === null) {
521
+ // Treat as success (agent forgot to signal)
522
+ step.status = 'success';
523
+ step.completedAt = new Date().toISOString();
524
+ console.error(`[pipeline] Step "${stepDef.name}" auto-completed (pid dead, exit: ${step.exitCode})`);
525
+ } else {
526
+ // Failed
527
+ step.status = 'failed';
528
+ step.completedAt = new Date().toISOString();
529
+ console.error(`[pipeline] Step "${stepDef.name}" failed (exit: ${step.exitCode})`);
530
+ if (pipeline.onError === 'stop') {
531
+ run.status = 'failed';
532
+ run.completedAt = new Date().toISOString();
533
+ }
534
+ }
535
+ modified = true;
536
+ }
291
537
  }
292
538
  }
293
539
  continue;
@@ -324,7 +570,9 @@ function tickPipelines() {
324
570
  if (shouldStart && run.status === 'running') {
325
571
  step.status = 'running';
326
572
  step.startedAt = new Date().toISOString();
327
- step.pid = spawnStep(stepDef, run, runId);
573
+ const pids = spawnStep(stepDef, run, runId);
574
+ step.pids = pids;
575
+ if (pids.length > 0) step.pid = pids[0];
328
576
  modified = true;
329
577
  }
330
578
  }
@@ -335,7 +583,9 @@ function tickPipelines() {
335
583
  if (depStepName && run.steps[depStepName]?.status === 'success') {
336
584
  step.status = 'running';
337
585
  step.startedAt = new Date().toISOString();
338
- step.pid = spawnStep(stepDef, run, runId);
586
+ const pids = spawnStep(stepDef, run, runId);
587
+ step.pids = pids;
588
+ if (pids.length > 0) step.pid = pids[0];
339
589
  modified = true;
340
590
  }
341
591
  }
@@ -354,7 +604,7 @@ function tickPipelines() {
354
604
  }
355
605
 
356
606
  if (modified) {
357
- writeFileSync(join(runsDir, file), JSON.stringify(run, null, 2));
607
+ persistRun(runId, run);
358
608
  }
359
609
  }
360
610
 
@@ -415,9 +665,10 @@ function tick() {
415
665
  setTimeout(tick, nextTickMs);
416
666
  }
417
667
 
418
- // ─── Startup ────────────────────────────────────────────────
668
+ // ─── Startup ────────────────────────────────────────────────────
419
669
 
420
670
  acquireLock();
671
+ loadRunCache();
421
672
 
422
673
  process.on('SIGINT', () => { releaseLock(); process.exit(0); });
423
674
  process.on('SIGTERM', () => { releaseLock(); process.exit(0); });
@@ -11,6 +11,8 @@ import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, unlink
11
11
  import { join, dirname } from 'node:path';
12
12
  import { randomUUID } from 'node:crypto';
13
13
  import { ensureDaemon } from './scheduler.js';
14
+ import { killGroup } from '../runner/process-manager.js';
15
+ import { writeSignal } from './run-signals.js';
14
16
 
15
17
  const PIPELINES_DIR = '.agents/pipelines';
16
18
  const RUNS_DIR = '.agents/runs';
@@ -69,6 +71,8 @@ export function createPipeline(cwd, { name, steps, onError }) {
69
71
  name: s.name,
70
72
  prompt: s.prompt,
71
73
  skill: s.skill || null,
74
+ group: s.group || null,
75
+ count: s.count ? parseInt(s.count, 10) : 1,
72
76
  approvalMode: s.approval_mode || 'yolo',
73
77
  timeout: s.timeout || 600,
74
78
  maxBounces: s.maxBounces ?? s.max_bounces ?? 2,
@@ -134,7 +138,8 @@ export function runPipeline(cwd, pipelineId) {
134
138
  for (const step of pipeline.steps) {
135
139
  steps[step.name] = {
136
140
  status: 'pending',
137
- pid: null,
141
+ pid: null, // Legacy / single pid
142
+ pids: [], // Array for parallel execution
138
143
  exitCode: null,
139
144
  signaled: false,
140
145
  bounces: 0,
@@ -198,7 +203,7 @@ export function listRuns(cwd, pipelineId) {
198
203
  const dir = join(cwd, RUNS_DIR);
199
204
  if (!existsSync(dir)) return [];
200
205
  return readdirSync(dir)
201
- .filter(f => f.endsWith('.json'))
206
+ .filter(f => f.endsWith('.json') && !f.includes('.signal-'))
202
207
  .map(f => {
203
208
  try { return JSON.parse(readFileSync(join(dir, f), 'utf-8')); }
204
209
  catch { return null; }
@@ -208,7 +213,8 @@ export function listRuns(cwd, pipelineId) {
208
213
  }
209
214
 
210
215
  /**
211
- * Cancel a pipeline run.
216
+ * Cancel a pipeline run. Writes a signal file for the daemon.
217
+ * Kills running processes immediately for responsiveness.
212
218
  * @param {string} cwd
213
219
  * @param {string} runId
214
220
  * @returns {boolean}
@@ -217,19 +223,22 @@ export function cancelRun(cwd, runId) {
217
223
  const run = getRun(cwd, runId);
218
224
  if (!run || run.status !== 'running') return false;
219
225
 
220
- // Kill any running step
226
+ // Kill running processes immediately (side-effect safe)
221
227
  for (const [name, step] of Object.entries(run.steps)) {
222
- if (step.status === 'running' && step.pid) {
223
- try { process.kill(step.pid, 'SIGTERM'); } catch { /* already dead */ }
224
- step.status = 'cancelled';
225
- }
226
- if (step.status === 'pending') {
227
- step.status = 'skipped';
228
+ if (step.status === 'running') {
229
+ const pidsToKill = [...(step.pids || [])];
230
+ if (step.pid && !pidsToKill.includes(step.pid)) pidsToKill.push(step.pid);
231
+ for (const pid of pidsToKill) {
232
+ killGroup(pid);
233
+ }
228
234
  }
229
235
  }
230
- run.status = 'cancelled';
231
- run.completedAt = new Date().toISOString();
232
- saveRun(cwd, runId, run);
236
+
237
+ // Write signal file — daemon will apply the state change
238
+ writeSignal(cwd, runId, {
239
+ type: 'CANCEL_RUN',
240
+ });
241
+
233
242
  return true;
234
243
  }
235
244
 
@@ -245,7 +254,7 @@ export function findActiveRunByStep(cwd, stepName) {
245
254
  const dir = join(cwd, RUNS_DIR);
246
255
  if (!existsSync(dir)) return null;
247
256
 
248
- for (const f of readdirSync(dir).filter(f => f.endsWith('.json'))) {
257
+ for (const f of readdirSync(dir).filter(f => f.endsWith('.json') && !f.includes('.signal-'))) {
249
258
  try {
250
259
  const run = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
251
260
  if (run.status === 'running' && run.steps[stepName]) {
@@ -258,42 +267,42 @@ export function findActiveRunByStep(cwd, stepName) {
258
267
 
259
268
  /**
260
269
  * Signal step completion. Called by agent via MCP tool.
270
+ * Writes a signal file instead of mutating run state directly.
271
+ * The daemon will consume this signal on its next tick.
261
272
  * @param {string} cwd
262
273
  * @param {string} stepName
263
274
  * @param {string} [output]
264
275
  * @param {string} [runId] - Specific run ID (recommended)
265
- * @returns {{ success: boolean, nextStep?: string }}
276
+ * @returns {{ success: boolean }}
266
277
  */
267
278
  export function signalStepComplete(cwd, stepName, output, runId) {
268
- let run, resolvedRunId;
279
+ let resolvedRunId = runId;
269
280
 
270
- if (runId) {
271
- // Direct lookup by run ID
272
- run = getRun(cwd, runId);
273
- resolvedRunId = runId;
274
- } else {
281
+ if (!resolvedRunId) {
275
282
  // Fallback: search by step name
276
283
  const found = findActiveRunByStep(cwd, stepName);
277
284
  if (!found) return { success: false };
278
- run = found.run;
279
285
  resolvedRunId = found.runId;
280
286
  }
281
287
 
288
+ // Verify run exists and is active
289
+ const run = getRun(cwd, resolvedRunId);
282
290
  if (!run || run.status !== 'running') return { success: false };
283
- const step = run.steps[stepName];
284
- if (!step || step.status !== 'running') return { success: false };
291
+ if (!run.steps[stepName] || run.steps[stepName].status !== 'running') return { success: false };
285
292
 
286
- step.status = 'success';
287
- step.signaled = true;
288
- step.completedAt = new Date().toISOString();
289
- if (output) step.output = output;
293
+ // Write signal file — daemon will apply it
294
+ writeSignal(cwd, resolvedRunId, {
295
+ type: 'STEP_COMPLETE',
296
+ stepName,
297
+ output: output || null,
298
+ });
290
299
 
291
- saveRun(cwd, resolvedRunId, run);
292
300
  return { success: true };
293
301
  }
294
302
 
295
303
  /**
296
304
  * Bounce back to a previous step. Called by agent via MCP tool.
305
+ * Writes a signal file instead of mutating run state directly.
297
306
  * @param {string} cwd
298
307
  * @param {string} targetStepName - Step to re-run
299
308
  * @param {string} reason - Why bouncing back
@@ -301,54 +310,53 @@ export function signalStepComplete(cwd, stepName, output, runId) {
301
310
  * @returns {{ success: boolean, bounceCount?: number, maxBounces?: number }}
302
311
  */
303
312
  export function bounceBack(cwd, targetStepName, reason, runId) {
304
- // Find active run where the caller is running
305
- const dir = join(cwd, RUNS_DIR);
306
- if (!existsSync(dir)) return { success: false };
313
+ // Find the active run containing this step
314
+ let resolvedRunId = runId;
315
+ let run;
307
316
 
308
- for (const f of readdirSync(dir).filter(f => f.endsWith('.json'))) {
309
- try {
310
- const run = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
311
- if (run.status !== 'running') continue;
312
-
313
- const targetStep = run.steps[targetStepName];
314
- if (!targetStep) continue;
315
-
316
- // Find the pipeline definition for maxBounces
317
- const pipeline = getPipeline(run.cwd || cwd, run.pipeline);
318
- const stepDef = pipeline?.steps.find(s => s.name === targetStepName);
319
- const maxBounces = stepDef?.maxBounces ?? 2;
320
-
321
- if (targetStep.bounces >= maxBounces) {
322
- // Bounce limit reached — fail pipeline
323
- targetStep.status = 'failed';
324
- targetStep.lastBounceReason = `Bounce limit (${maxBounces}) reached. Last: ${reason}`;
325
- run.status = 'failed';
326
- run.completedAt = new Date().toISOString();
327
- saveRun(cwd, f.replace('.json', ''), run);
328
- return { success: false, bounceCount: targetStep.bounces, maxBounces };
329
- }
317
+ if (resolvedRunId) {
318
+ run = getRun(cwd, resolvedRunId);
319
+ } else {
320
+ const dir = join(cwd, RUNS_DIR);
321
+ if (!existsSync(dir)) return { success: false };
322
+ for (const f of readdirSync(dir).filter(f => f.endsWith('.json') && !f.includes('.signal-'))) {
323
+ try {
324
+ const r = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
325
+ if (r.status === 'running' && r.steps[targetStepName]) {
326
+ run = r;
327
+ resolvedRunId = f.replace('.json', '');
328
+ break;
329
+ }
330
+ } catch { /* skip */ }
331
+ }
332
+ }
330
333
 
331
- // Reset target step to pending with bounce info
332
- targetStep.status = 'bounce_pending';
333
- targetStep.bounces += 1;
334
- targetStep.lastBounceReason = reason;
335
- targetStep.pid = null;
336
- targetStep.exitCode = null;
337
- targetStep.signaled = false;
338
-
339
- // Reset the calling step too
340
- const callingStepName = Object.keys(run.steps).find(name => {
341
- const s = run.steps[name];
342
- return s.status === 'running';
343
- });
344
- if (callingStepName) {
345
- run.steps[callingStepName].status = 'waiting_bounce';
346
- }
334
+ if (!run || run.status !== 'running') return { success: false };
347
335
 
348
- saveRun(cwd, f.replace('.json', ''), run);
349
- return { success: true, bounceCount: targetStep.bounces, maxBounces };
350
- } catch { /* skip */ }
336
+ const targetStep = run.steps[targetStepName];
337
+ if (!targetStep) return { success: false };
338
+
339
+ // Check bounce limit (read-only check — safe without lock)
340
+ const pipeline = getPipeline(run.cwd || cwd, run.pipeline);
341
+ const stepDef = pipeline?.steps.find(s => s.name === targetStepName);
342
+ const maxBounces = stepDef?.maxBounces ?? 2;
343
+
344
+ if (targetStep.bounces >= maxBounces) {
345
+ return { success: false, bounceCount: targetStep.bounces, maxBounces };
351
346
  }
352
347
 
353
- return { success: false };
348
+ // Find the calling step name (the step that's bouncing back)
349
+ const callingStepName = Object.keys(run.steps).find(name =>
350
+ run.steps[name].status === 'running' && name !== targetStepName,
351
+ );
352
+
353
+ // Write signal file — daemon will apply the state changes and kill processes
354
+ writeSignal(cwd, resolvedRunId, {
355
+ type: 'BOUNCE_BACK',
356
+ stepName: targetStepName,
357
+ callingStepName: callingStepName || null,
358
+ reason,
359
+ });
360
+
361
+ return { success: true, bounceCount: targetStep.bounces + 1, maxBounces };
354
362
  }
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Run signal files — atomic communication between MCP server and daemon.
3
+ *
4
+ * Instead of MCP tools writing directly to run JSON (race condition),
5
+ * they write small signal files that the daemon consumes on each tick.
6
+ *
7
+ * Signal types: STEP_COMPLETE, BOUNCE_BACK
8
+ *
9
+ * @module agent-pool/scheduler/run-signals
10
+ */
11
+
12
+ import { writeFileSync, readFileSync, readdirSync, unlinkSync, existsSync, mkdirSync } from 'node:fs';
13
+ import { join } from 'node:path';
14
+ import { randomUUID } from 'node:crypto';
15
+
16
+ const RUNS_DIR = '.agents/runs';
17
+
18
+ /**
19
+ * Write a signal file for a specific run.
20
+ * Signal files are atomic — no concurrent read-modify-write.
21
+ * @param {string} cwd
22
+ * @param {string} runId
23
+ * @param {object} signal - { type, stepName, output?, reason?, targetStep? }
24
+ */
25
+ export function writeSignal(cwd, runId, signal) {
26
+ const dir = join(cwd, RUNS_DIR);
27
+ mkdirSync(dir, { recursive: true });
28
+
29
+ const id = randomUUID().split('-')[0];
30
+ const fileName = `${runId}.signal-${id}.json`;
31
+ const payload = {
32
+ ...signal,
33
+ timestamp: new Date().toISOString(),
34
+ };
35
+
36
+ writeFileSync(join(dir, fileName), JSON.stringify(payload));
37
+ }
38
+
39
+ /**
40
+ * Consume all pending signal files for a run.
41
+ * Returns signals sorted by timestamp. Does NOT delete them —
42
+ * caller must call deleteSignals() after persisting state.
43
+ * @param {string} cwd
44
+ * @param {string} runId
45
+ * @returns {Array<{ type: string, stepName: string, fileName: string, [key: string]: any }>}
46
+ */
47
+ export function consumeSignals(cwd, runId) {
48
+ const dir = join(cwd, RUNS_DIR);
49
+ if (!existsSync(dir)) return [];
50
+
51
+ const prefix = `${runId}.signal-`;
52
+ const signalFiles = readdirSync(dir).filter(f => f.startsWith(prefix) && f.endsWith('.json'));
53
+
54
+ const signals = [];
55
+ for (const f of signalFiles) {
56
+ try {
57
+ const data = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
58
+ signals.push({ ...data, fileName: f });
59
+ } catch {
60
+ // Include corrupted files so they get cleaned up by deleteSignals
61
+ signals.push({ type: '_corrupted', fileName: f });
62
+ }
63
+ }
64
+
65
+ // Sort by timestamp for deterministic processing
66
+ signals.sort((a, b) => (a.timestamp || '').localeCompare(b.timestamp || ''));
67
+ return signals;
68
+ }
69
+
70
+ /**
71
+ * Delete signal files after state has been persisted to disk.
72
+ * @param {string} cwd
73
+ * @param {Array<{ fileName: string }>} signals
74
+ */
75
+ export function deleteSignals(cwd, signals) {
76
+ const dir = join(cwd, RUNS_DIR);
77
+ for (const s of signals) {
78
+ try { unlinkSync(join(dir, s.fileName)); }
79
+ catch { /* ignore */ }
80
+ }
81
+ }
package/src/server.js CHANGED
@@ -23,6 +23,7 @@ import { consultPeer } from './tools/consult.js';
23
23
  import { addSchedule, listSchedules, removeSchedule, getScheduledResults, getDaemonStatus } from './scheduler/scheduler.js';
24
24
  import { createPipeline, listPipelines, runPipeline, getRun, listRuns, cancelRun, signalStepComplete, bounceBack } from './scheduler/pipeline.js';
25
25
  import { createGroup, listGroups, getGroup } from './tools/groups.js';
26
+ import { sendMessage, getMessages } from './tools/messaging.js';
26
27
 
27
28
  import { TOOL_DEFINITIONS } from './tool-definitions.js';
28
29
 
@@ -112,7 +113,7 @@ export function createServer() {
112
113
  }
113
114
 
114
115
  const server = new Server(
115
- { name: 'agent-pool', version: '1.5.0' },
116
+ { name: 'agent-pool', version: '1.7.0' },
116
117
  { capabilities: { tools: {}, resources: {} } },
117
118
  );
118
119
 
@@ -208,6 +209,10 @@ export function createServer() {
208
209
  response = handleListGroups(args); break;
209
210
  case 'delegate_to_group':
210
211
  response = handleDelegateToGroup(args); break;
212
+ case 'send_message':
213
+ response = handleSendMessage(args); break;
214
+ case 'get_messages':
215
+ response = handleGetMessages(args); break;
211
216
  default:
212
217
  response = { content: [{ type: 'text', text: `Unknown tool: ${name}` }], isError: true };
213
218
  }
@@ -803,3 +808,58 @@ function handleDelegateToGroup(args) {
803
808
  }],
804
809
  };
805
810
  }
811
+
812
+ // ─── Messaging handlers ─────────────────────────────────────
813
+
814
/** Tool handler: forward a send_message call to the messaging module. */
function handleSendMessage(args) {
  const cwd = args.cwd ?? defaultCwd;
  const { channel, payload, from } = args;

  const result = sendMessage(cwd, { channel, payload, from });

  if (result.success) {
    return {
      content: [{ type: 'text', text: `📨 Message sent to channel \`${result.channel}\`.` }],
    };
  }

  return {
    content: [{ type: 'text', text: `❌ Failed to send message: ${result.error || 'unknown error'}` }],
    isError: true,
  };
}
833
+
834
/** Tool handler: read (and optionally clear) a messaging channel. */
function handleGetMessages(args) {
  const cwd = args.cwd ?? defaultCwd;
  const { channel, clear } = args;

  const result = getMessages(cwd, { channel, clear });

  if (result.error) {
    return {
      content: [{ type: 'text', text: `❌ ${result.error}` }],
      isError: true,
    };
  }

  if (result.count === 0) {
    return {
      content: [{ type: 'text', text: `📭 No messages on channel \`${channel}\`.` }],
    };
  }

  // One markdown section per message, payload pretty-printed as JSON.
  const formatted = result.messages
    .map((msg, idx) =>
      `**${idx + 1}.** [${msg.timestamp}] from \`${msg.from}\`:\n\`\`\`json\n${JSON.stringify(msg.payload, null, 2)}\n\`\`\``
    )
    .join('\n\n');

  const header = `📬 **${result.count}** message(s) on channel \`${channel}\`${clear ? ' (cleared)' : ''}`;

  return {
    content: [{ type: 'text', text: `${header}:\n\n${formatted}` }],
  };
}
865
+
@@ -408,5 +408,49 @@ export const TOOL_DEFINITIONS = [
408
408
  required: ['group', 'prompt'],
409
409
  },
410
410
  },
411
+ {
412
+ name: 'send_message',
413
+ description: [
414
+ 'Send a message to a channel for inter-agent communication.',
415
+ 'Use this to pass structured data between pipeline steps or between any agents.',
416
+ '',
417
+ 'Channel conventions:',
418
+ ' - {run_id} — broadcast to all steps in a pipeline run',
419
+ ' - {run_id}:{step_name} — targeted to a specific step',
420
+ ' - any string — ad-hoc channel for custom messaging',
421
+ '',
422
+ 'Messages are persisted to disk (survives restarts). Uses JSONL format for concurrent-write safety.',
423
+ ].join('\n'),
424
+ inputSchema: {
425
+ type: 'object',
426
+ properties: {
427
+ channel: { type: 'string', description: 'Target channel. Use run_id for broadcast, run_id:step_name for targeted.' },
428
+ payload: { description: 'Message payload (any JSON-serializable value).' },
429
+ from: { type: 'string', description: 'Sender identifier (e.g., step name or task description).' },
430
+ cwd: { type: 'string', description: 'Working directory. Defaults to current working directory.' },
431
+ },
432
+ required: ['channel', 'payload'],
433
+ },
434
+ },
435
+ {
436
+ name: 'get_messages',
437
+ description: [
438
+ 'Read messages from a channel. Returns all messages in chronological order.',
439
+ '',
440
+ 'Channel conventions:',
441
+ ' - {run_id} — read broadcast messages for a pipeline run',
442
+ ' - {run_id}:{step_name} — read messages targeted to a specific step',
443
+ '',
444
+ 'Use clear=true to consume messages (delete after reading).',
445
+ ].join('\n'),
446
+ inputSchema: {
447
+ type: 'object',
448
+ properties: {
449
+ channel: { type: 'string', description: 'Channel to read messages from.' },
450
+ clear: { type: 'boolean', description: 'If true, clear the channel after reading (consume mode). Default: false.' },
451
+ cwd: { type: 'string', description: 'Working directory. Defaults to current working directory.' },
452
+ },
453
+ required: ['channel'],
454
+ },
455
+ },
411
456
  ];
412
-
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Inter-agent messaging — file-based JSONL mailboxes.
3
+ *
4
+ * Provides send_message / get_messages tools for agents
5
+ * to pass structured data between pipeline steps or tasks.
6
+ *
7
+ * Uses JSONL format (one JSON object per line) with appendFileSync()
8
+ * to avoid read-modify-write race conditions on concurrent writes.
9
+ *
10
+ * Channel addressing:
11
+ * - {run_id} → broadcast to all steps in a pipeline run
12
+ * - {run_id}:{step} → targeted to a specific step
13
+ * - {custom_channel} → any string for ad-hoc messaging
14
+ *
15
+ * @module agent-pool/tools/messaging
16
+ */
17
+
18
+ import { appendFileSync, readFileSync, writeFileSync, existsSync, mkdirSync, renameSync, unlinkSync } from 'node:fs';
19
+ import { join, dirname } from 'node:path';
20
+
21
+ const MESSAGES_DIR = '.agents/messages';
22
+
23
/**
 * Sanitize channel name for use as a filename.
 * Every character outside [a-zA-Z0-9_:-] is replaced with '_',
 * which also neutralizes path separators and '..' segments.
 * @param {string} channel
 * @returns {string}
 */
function sanitizeChannel(channel) {
  const unsafe = /[^a-zA-Z0-9_:-]/g;
  return channel.replace(unsafe, '_');
}
31
+
32
/**
 * Send a message to a channel.
 * Uses appendFileSync (JSONL, one object per line) so concurrent senders
 * never race on a read-modify-write of the channel file.
 * @param {string} cwd
 * @param {object} opts
 * @param {string} opts.channel - Target channel (e.g., "run_id:step_name")
 * @param {*} opts.payload - Message payload (any JSON-serializable value)
 * @param {string} [opts.from] - Sender identifier
 * @returns {{ success: boolean, channel?: string, error?: string }}
 */
export function sendMessage(cwd, { channel, payload, from }) {
  if (!channel) return { success: false, error: 'channel is required' };

  const message = {
    timestamp: new Date().toISOString(),
    from: from || 'unknown',
    payload,
  };

  // Serialize BEFORE touching the filesystem: JSON.stringify throws on
  // circular references and BigInt values, and the original let that
  // exception escape instead of returning the { success:false, error }
  // shape the tool handler expects.
  let line;
  try {
    line = JSON.stringify(message) + '\n';
  } catch (err) {
    return { success: false, error: `payload is not JSON-serializable: ${err.message}` };
  }

  const dir = join(cwd, MESSAGES_DIR);
  mkdirSync(dir, { recursive: true });

  // JSONL: one JSON object per line, appended atomically
  appendFileSync(join(dir, `${sanitizeChannel(channel)}.jsonl`), line);

  return { success: true, channel };
}
60
+
61
/**
 * Get messages from a channel.
 * @param {string} cwd
 * @param {object} opts
 * @param {string} opts.channel - Channel to read from
 * @param {boolean} [opts.clear] - If true, clear the channel after reading
 * @returns {{ messages: Array<{ timestamp: string, from: string, payload: any }>, count: number, error?: string }}
 */
export function getMessages(cwd, { channel, clear }) {
  if (!channel) return { messages: [], count: 0, error: 'channel is required' };

  const filePath = join(cwd, MESSAGES_DIR, `${sanitizeChannel(channel)}.jsonl`);
  if (!existsSync(filePath)) return { messages: [], count: 0 };

  let content;
  if (clear) {
    // Atomic consume: rename first, then read. Messages appended after the
    // rename land in a NEW channel file, so nothing is lost to the race.
    const tmpPath = filePath + '.consuming';
    try {
      renameSync(filePath, tmpPath);
    } catch {
      // Lost the race: another consumer renamed/deleted the file first.
      return { messages: [], count: 0 };
    }
    try {
      content = readFileSync(tmpPath, 'utf-8').trim();
    } catch {
      content = '';
    }
    // Unlink is handled separately: the original wrapped rename+read+unlink
    // in ONE try/catch, so an unlink failure after a successful read silently
    // discarded the messages just read and left an orphaned .consuming file
    // that no later call would ever pick up.
    try { unlinkSync(tmpPath); } catch { /* ignore */ }
  } else {
    try {
      content = readFileSync(filePath, 'utf-8').trim();
    } catch {
      // File was deleted between the existsSync check and the read.
      return { messages: [], count: 0 };
    }
  }

  if (!content) return { messages: [], count: 0 };

  // Tolerate partial/corrupted lines (e.g. a write in progress): skip them.
  const messages = content
    .split('\n')
    .map((line) => {
      try { return JSON.parse(line); }
      catch { return null; }
    })
    .filter(Boolean);

  return { messages, count: messages.length };
}