agent-pool-mcp 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-pool-mcp",
3
- "version": "1.5.0",
3
+ "version": "1.7.0",
4
4
  "type": "module",
5
5
  "description": "MCP Server for multi-agent task delegation and orchestration via Gemini CLI",
6
6
  "main": "index.js",
@@ -11,10 +11,15 @@
11
11
  * @module agent-pool/scheduler/daemon
12
12
  */
13
13
 
14
- import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync } from 'node:fs';
14
+ import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync, readdirSync, renameSync } from 'node:fs';
15
15
  import { spawn } from 'node:child_process';
16
16
  import { join, dirname } from 'node:path';
17
17
  import { matchesCron } from './cron.js';
18
+ import { getGroup } from '../tools/groups.js';
19
+ import { getRunner } from '../runner/config.js';
20
+ import { buildSshSpawn } from '../runner/ssh.js';
21
+ import { killGroup } from '../runner/process-manager.js';
22
+ import { consumeSignals, deleteSignals } from './run-signals.js';
18
23
 
19
24
  const POLL_INTERVAL_MS = 30_000; // Check schedules every 30 seconds
20
25
  const PID_FILE = '.agents/scheduler.pid';
@@ -159,62 +164,233 @@ function executeSchedule(schedule) {
159
164
  console.error(`[scheduler] Started: ${schedule.id} → gemini pid ${child.pid}`);
160
165
  }
161
166
 
162
- // ─── Pipeline tick ──────────────────────────────────────────
163
-
164
- import { readdirSync } from 'node:fs';
167
+ // ─── Pipeline tick ──────────────────────────────────────────────────
165
168
 
166
169
  const PIPELINES_DIR = '.agents/pipelines';
167
170
  const RUNS_DIR = '.agents/runs';
168
171
 
169
172
  /**
170
- * Spawn a Gemini CLI agent for a pipeline step.
173
+ * In-memory pipeline state cache.
174
+ * Loaded from disk on startup, updated in-place during ticks.
175
+ * Written to disk on state transitions (write-through).
176
+ * @type {Map<string, object>}
177
+ */
178
+ const runCache = new Map();
179
+
180
+ /**
181
+ * Load all active runs from disk into the in-memory cache.
182
+ * Called once on daemon startup.
183
+ */
184
+ function loadRunCache() {
185
+ const dir = join(cwd, RUNS_DIR);
186
+ if (!existsSync(dir)) return;
187
+ for (const f of readdirSync(dir).filter(f => f.endsWith('.json') && !f.includes('.signal-'))) {
188
+ try {
189
+ const run = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
190
+ const runId = f.replace('.json', '');
191
+ runCache.set(runId, run);
192
+ } catch { /* skip corrupted */ }
193
+ }
194
+ console.error(`[pipeline] Loaded ${runCache.size} runs into memory cache`);
195
+ }
196
+
197
+ /**
198
+ * Persist a run to disk atomically (write-then-rename).
199
+ * Prevents corruption if daemon crashes mid-write.
200
+ * @param {string} runId
201
+ * @param {object} run
202
+ */
203
+ function persistRun(runId, run) {
204
+ const dir = join(cwd, RUNS_DIR);
205
+ mkdirSync(dir, { recursive: true });
206
+ const target = join(dir, `${runId}.json`);
207
+ const tmp = join(dir, `${runId}.json.tmp`);
208
+ writeFileSync(tmp, JSON.stringify(run, null, 2));
209
+ // Atomic rename (same filesystem) — prevents corruption on crash
210
+ try { renameSync(tmp, target); }
211
+ catch { writeFileSync(target, JSON.stringify(run, null, 2)); }
212
+ }
213
+
214
+ /**
215
+ * Apply consumed signal files to a run's in-memory state.
216
+ * @param {object} run - Run state object (mutated in place)
217
+ * @param {Array} signals - Consumed signal objects
218
+ * @param {object} pipeline - Pipeline definition
219
+ * @returns {boolean} true if any signal was applied
220
+ */
221
+ function applySignals(run, signals, pipeline) {
222
+ let modified = false;
223
+ for (const signal of signals) {
224
+ if (signal.type === 'STEP_COMPLETE') {
225
+ const step = run.steps[signal.stepName];
226
+ if (step && step.status === 'running') {
227
+ step.status = 'success';
228
+ step.signaled = true;
229
+ step.completedAt = new Date().toISOString();
230
+ if (signal.output) step.output = signal.output;
231
+ modified = true;
232
+ console.error(`[pipeline] Signal: step "${signal.stepName}" completed`);
233
+ }
234
+ } else if (signal.type === 'BOUNCE_BACK') {
235
+ const targetStep = run.steps[signal.stepName];
236
+ if (!targetStep) continue;
237
+
238
+ const stepDef = pipeline?.steps.find(s => s.name === signal.stepName);
239
+ const maxBounces = stepDef?.maxBounces ?? 2;
240
+
241
+ if (targetStep.bounces >= maxBounces) {
242
+ // Bounce limit reached
243
+ targetStep.status = 'failed';
244
+ targetStep.lastBounceReason = `Bounce limit (${maxBounces}) reached. Last: ${signal.reason}`;
245
+ run.status = 'failed';
246
+ run.completedAt = new Date().toISOString();
247
+ console.error(`[pipeline] Bounce limit reached for "${signal.stepName}"`);
248
+ } else {
249
+ // Reset target step
250
+ targetStep.status = 'bounce_pending';
251
+ targetStep.bounces = (targetStep.bounces || 0) + 1;
252
+ targetStep.lastBounceReason = signal.reason;
253
+
254
+ // Kill running processes for this step
255
+ const pidsToKill = [...(targetStep.pids || [])];
256
+ if (targetStep.pid && !pidsToKill.includes(targetStep.pid)) pidsToKill.push(targetStep.pid);
257
+ for (const pid of pidsToKill) killGroup(pid);
258
+
259
+ targetStep.pid = null;
260
+ targetStep.pids = [];
261
+ targetStep.exitCode = null;
262
+ targetStep.signaled = false;
263
+
264
+ // Reset calling step
265
+ if (signal.callingStepName && run.steps[signal.callingStepName]) {
266
+ run.steps[signal.callingStepName].status = 'waiting_bounce';
267
+ }
268
+ console.error(`[pipeline] Bounce: step "${signal.stepName}" reset (reason: ${signal.reason})`);
269
+ }
270
+ modified = true;
271
+ } else if (signal.type === 'CANCEL_RUN') {
272
+ // Cancel the entire run
273
+ for (const [name, step] of Object.entries(run.steps)) {
274
+ if (step.status === 'running') step.status = 'cancelled';
275
+ if (step.status === 'pending') step.status = 'skipped';
276
+ }
277
+ run.status = 'cancelled';
278
+ run.completedAt = new Date().toISOString();
279
+ console.error(`[pipeline] Signal: run cancelled`);
280
+ modified = true;
281
+ }
282
+ }
283
+ return modified;
284
+ }
285
+
286
+ /**
287
+ * Spawn Gemini CLI agent(s) for a pipeline step.
171
288
  * @param {object} stepDef - Step definition from pipeline
172
289
  * @param {object} run - Current run state
173
290
  * @param {string} runId
174
291
  * @param {string} [bounceReason] - If bouncing back, the reason
175
- * @returns {number} child PID
292
+ * @returns {number[]} Array of child PIDs
176
293
  */
177
294
  function spawnStep(stepDef, run, runId, bounceReason) {
178
- let prompt = stepDef.prompt;
179
- if (bounceReason) {
180
- prompt = `${stepDef.prompt}\n\n⚠️ BOUNCE BACK: предыдущая попытка была отклонена следующим шагом.\nПричина: ${bounceReason}\nДополни и улучши результат.`;
295
+ const count = stepDef.count || 1;
296
+ const pids = [];
297
+
298
+ // Resolve group
299
+ let groupConfig = {};
300
+ if (stepDef.group) {
301
+ groupConfig = getGroup(run.cwd || cwd, stepDef.group) || {};
181
302
  }
182
303
 
183
- // Inject pipeline context
184
- prompt = `[Pipeline: ${run.pipelineName}, Step: ${stepDef.name}, Run: ${runId}]\n\nTask:\n${prompt}\n\nWhen finished, call signal_step_complete with step_name "${stepDef.name}" and run_id "${runId}".`;
304
+ const skill = stepDef.skill || groupConfig.skill;
305
+ const policy = groupConfig.policy; // currently policy only from group
306
+ const runnerId = groupConfig.runner;
307
+ const runner = runnerId ? getRunner(runnerId) : { type: 'local' };
308
+ const isRemote = runner && runner.type === 'ssh';
185
309
 
186
- const args = [
187
- '-p', prompt,
188
- '--output-format', 'stream-json',
189
- '--approval-mode', stepDef.approvalMode || 'yolo',
190
- ];
310
+ for (let i = 0; i < count; i++) {
311
+ let prompt = stepDef.prompt;
312
+ if (bounceReason) {
313
+ prompt = `${stepDef.prompt}\n\n⚠️ BOUNCE BACK: предыдущая попытка была отклонена следующим шагом.\nПричина: ${bounceReason}\nДополни и улучши результат.`;
314
+ }
191
315
 
192
- const child = spawn('gemini', args, {
193
- cwd: run.cwd || cwd,
194
- env: { ...process.env, TERM: 'dumb', CI: '1' },
195
- stdio: ['pipe', 'pipe', 'pipe'],
196
- detached: true,
197
- });
316
+ if (count > 1) {
317
+ prompt = `[Agent ${i + 1}/${count}]\n\n${prompt}`;
318
+ }
198
319
 
199
- child.on('close', (code) => {
200
- // Update step exit code in run state
201
- try {
202
- const currentRun = JSON.parse(readFileSync(join(cwd, RUNS_DIR, `${runId}.json`), 'utf-8'));
203
- if (currentRun.steps[stepDef.name]) {
204
- currentRun.steps[stepDef.name].exitCode = code;
320
+ // Inject pipeline context
321
+ prompt = `[Pipeline: ${run.pipelineName}, Step: ${stepDef.name}, Run: ${runId}]\n\nTask:\n${prompt}\n\nWhen finished, call signal_step_complete with step_name "${stepDef.name}" and run_id "${runId}".`;
322
+
323
+ const args = [
324
+ '-p', prompt,
325
+ '--output-format', 'stream-json',
326
+ '--approval-mode', stepDef.approvalMode || 'yolo',
327
+ ];
328
+
329
+ if (skill) {
330
+ // Skills can be active via prompt injection, as we do for scheduled tasks
331
+ args[1] = `Activate skill "${skill}" first.\n\n${args[1]}`;
332
+ }
333
+ if (policy) {
334
+ args.push('--policy', policy);
335
+ }
336
+ if (groupConfig.include_dirs?.length > 0) {
337
+ for (const dir of groupConfig.include_dirs) {
338
+ args.push('--include-directories', dir);
205
339
  }
206
- writeFileSync(join(cwd, RUNS_DIR, `${runId}.json`), JSON.stringify(currentRun, null, 2));
207
- } catch { /* ignore */ }
208
- console.error(`[pipeline] Step "${stepDef.name}" exited (code: ${code}, run: ${runId})`);
209
- });
340
+ }
210
341
 
211
- child.stdin.end();
212
- child.unref();
342
+ let spawnCmd, spawnArgs, spawnOpts;
343
+ if (isRemote) {
344
+ const ssh = buildSshSpawn(runner, args, run.cwd || cwd);
345
+ spawnCmd = ssh.command;
346
+ spawnArgs = ssh.args;
347
+ spawnOpts = { stdio: ['pipe', 'pipe', 'pipe'], detached: true };
348
+ } else {
349
+ spawnCmd = 'gemini';
350
+ spawnArgs = args;
351
+ const currentDepth = parseInt(process.env.AGENT_POOL_DEPTH ?? '0');
352
+ spawnOpts = {
353
+ cwd: run.cwd || cwd,
354
+ env: {
355
+ ...process.env,
356
+ TERM: 'dumb',
357
+ CI: '1',
358
+ AGENT_POOL_DEPTH: String(currentDepth + 1)
359
+ },
360
+ stdio: ['pipe', 'pipe', 'pipe'],
361
+ detached: true,
362
+ };
363
+ if (count > 1) spawnOpts.env.AGENT_INDEX = String(i);
364
+ }
213
365
 
214
- console.error(`[pipeline] Started step "${stepDef.name}" pid ${child.pid} (run: ${runId})`);
215
- return child.pid;
366
+ const child = spawn(spawnCmd, spawnArgs, spawnOpts);
367
+
368
+ child.on('close', (code) => {
369
+ // Update step exit code in in-memory state directly (same process)
370
+ const currentRun = runCache.get(runId);
371
+ if (currentRun?.steps[stepDef.name]) {
372
+ if (code !== 0) {
373
+ currentRun.steps[stepDef.name].exitCode = code;
374
+ } else if (currentRun.steps[stepDef.name].exitCode === null) {
375
+ currentRun.steps[stepDef.name].exitCode = 0;
376
+ }
377
+ // Write-through to disk
378
+ persistRun(runId, currentRun);
379
+ }
380
+ console.error(`[pipeline] Step "${stepDef.name}" [pid ${child.pid}] exited (code: ${code}, run: ${runId})`);
381
+ });
382
+
383
+ child.stdin.end();
384
+ child.unref();
385
+
386
+ console.error(`[pipeline] Started step "${stepDef.name}" → pid ${child.pid} (run: ${runId})`);
387
+ pids.push(child.pid);
388
+ }
389
+
390
+ return pids;
216
391
  }
217
392
 
393
+
218
394
  /**
219
395
  * Check if a process is alive.
220
396
  * @param {number} pid
@@ -226,33 +402,69 @@ function isAlive(pid) {
226
402
  }
227
403
 
228
404
  /**
229
- * Process pipeline runs — check triggers, advance steps.
405
+ * Process pipeline runs — consume signals, check triggers, advance steps.
406
+ * Uses in-memory cache for state; persists to disk on changes.
230
407
  * @returns {boolean} true if any pipeline is actively running
231
408
  */
232
409
  function tickPipelines() {
410
+ // Pick up new runs added to disk since last tick (e.g., from runPipeline)
233
411
  const runsDir = join(cwd, RUNS_DIR);
234
- if (!existsSync(runsDir)) return false;
412
+ if (existsSync(runsDir)) {
413
+ for (const f of readdirSync(runsDir).filter(f => f.endsWith('.json') && !f.includes('.signal-') && !f.endsWith('.tmp'))) {
414
+ const runId = f.replace('.json', '');
415
+ if (!runCache.has(runId)) {
416
+ try {
417
+ const run = JSON.parse(readFileSync(join(runsDir, f), 'utf-8'));
418
+ runCache.set(runId, run);
419
+ console.error(`[pipeline] Picked up new run: ${runId}`);
420
+ } catch { /* skip corrupted */ }
421
+ }
422
+ }
423
+ }
235
424
 
236
425
  const pipelinesDir = join(cwd, PIPELINES_DIR);
237
426
  let hasActive = false;
238
427
 
239
- for (const file of readdirSync(runsDir).filter(f => f.endsWith('.json'))) {
240
- let run;
241
- try { run = JSON.parse(readFileSync(join(runsDir, file), 'utf-8')); }
242
- catch { continue; }
243
-
244
- if (run.status !== 'running') continue;
428
+ // Iterate over a copy of keys to allow modification of runCache during iteration
429
+ for (const runId of Array.from(runCache.keys())) {
430
+ const run = runCache.get(runId);
431
+
432
+ // Evict completed runs from cache (memory leak fix)
433
+ if (run.status !== 'running') {
434
+ // Clean up any orphaned/late signals for completed runs
435
+ const lateSignals = consumeSignals(cwd, runId);
436
+ if (lateSignals.length > 0) {
437
+ deleteSignals(cwd, lateSignals);
438
+ console.error(`[pipeline] Cleaned ${lateSignals.length} orphaned signal(s) for completed run ${runId}`);
439
+ }
440
+ runCache.delete(runId);
441
+ continue;
442
+ }
245
443
  hasActive = true;
246
444
 
247
445
  // Load pipeline definition
248
446
  let pipeline;
249
447
  try {
250
448
  pipeline = JSON.parse(readFileSync(join(pipelinesDir, `${run.pipeline}.json`), 'utf-8'));
251
- } catch { continue; }
449
+ } catch {
450
+ console.error(`[pipeline] Could not load pipeline definition for run ${runId}: ${run.pipeline}.json`);
451
+ continue;
452
+ }
252
453
 
253
- const runId = file.replace('.json', '');
454
+ // 1. Consume and apply signal files
455
+ const signals = consumeSignals(cwd, runId);
254
456
  let modified = false;
255
457
 
458
+ if (signals.length > 0) {
459
+ modified = applySignals(run, signals, pipeline);
460
+ if (modified) {
461
+ // Durability: persist state BEFORE deleting signals
462
+ persistRun(runId, run);
463
+ deleteSignals(cwd, signals);
464
+ }
465
+ }
466
+
467
+ // 2. Process each step
256
468
  for (const stepDef of pipeline.steps) {
257
469
  const step = run.steps[stepDef.name];
258
470
  if (!step) continue;
@@ -261,33 +473,67 @@ function tickPipelines() {
261
473
  if (step.status === 'bounce_pending') {
262
474
  step.status = 'running';
263
475
  step.startedAt = new Date().toISOString();
264
- step.pid = spawnStep(stepDef, run, runId, step.lastBounceReason);
476
+ const pids = spawnStep(stepDef, run, runId, step.lastBounceReason);
477
+ step.pids = pids;
478
+ if (pids.length > 0) step.pid = pids[0];
265
479
  modified = true;
266
480
  continue;
267
481
  }
268
482
 
269
483
  // ── Handle running steps: check if process died ──
270
- if (step.status === 'running' && step.pid) {
271
- if (!isAlive(step.pid)) {
272
- // Process is dead — did agent signal?
273
- if (!step.signaled) {
274
- // Auto-fallback: check exit code
275
- if (step.exitCode === 0 || step.exitCode === null) {
276
- // Treat as success (agent forgot to signal)
277
- step.status = 'success';
278
- step.completedAt = new Date().toISOString();
279
- console.error(`[pipeline] Step "${stepDef.name}" auto-completed (pid dead, exit: ${step.exitCode})`);
280
- } else {
281
- // Failed
282
- step.status = 'failed';
283
- step.completedAt = new Date().toISOString();
284
- console.error(`[pipeline] Step "${stepDef.name}" failed (exit: ${step.exitCode})`);
285
- if (pipeline.onError === 'stop') {
286
- run.status = 'failed';
287
- run.completedAt = new Date().toISOString();
288
- }
484
+ if (step.status === 'running') {
485
+ const pids = step.pids?.length > 0 ? step.pids : (step.pid ? [step.pid] : []);
486
+ if (pids.length === 0) continue;
487
+
488
+ let livingPids = 0;
489
+ for (const pid of pids) if (isAlive(pid)) livingPids++;
490
+
491
+ const isParallel = pids.length > 1;
492
+
493
+ if (isParallel) {
494
+ // Parallel semantics: rely entirely on exit codes
495
+ if (step.exitCode !== null && step.exitCode !== 0) {
496
+ // Fail fast: kill siblings
497
+ for (const pid of pids) if (isAlive(pid)) killGroup(pid);
498
+ step.status = 'failed';
499
+ step.completedAt = new Date().toISOString();
500
+ console.error(`[pipeline] Step "${stepDef.name}" parallel failed (exit: ${step.exitCode})`);
501
+ if (pipeline.onError === 'stop') {
502
+ run.status = 'failed';
503
+ run.completedAt = new Date().toISOString();
289
504
  }
290
505
  modified = true;
506
+ } else if (livingPids === 0) {
507
+ // All dead and no errors
508
+ step.status = 'success';
509
+ step.completedAt = new Date().toISOString();
510
+ console.error(`[pipeline] Step "${stepDef.name}" parallel completed successfully`);
511
+ modified = true;
512
+ }
513
+ } else {
514
+ // Sequential semantics (count 1)
515
+ const pid = pids[0];
516
+ if (!isAlive(pid)) {
517
+ // Process is dead — did agent signal?
518
+ if (!step.signaled) {
519
+ // Auto-fallback: check exit code
520
+ if (step.exitCode === 0 || step.exitCode === null) {
521
+ // Treat as success (agent forgot to signal)
522
+ step.status = 'success';
523
+ step.completedAt = new Date().toISOString();
524
+ console.error(`[pipeline] Step "${stepDef.name}" auto-completed (pid dead, exit: ${step.exitCode})`);
525
+ } else {
526
+ // Failed
527
+ step.status = 'failed';
528
+ step.completedAt = new Date().toISOString();
529
+ console.error(`[pipeline] Step "${stepDef.name}" failed (exit: ${step.exitCode})`);
530
+ if (pipeline.onError === 'stop') {
531
+ run.status = 'failed';
532
+ run.completedAt = new Date().toISOString();
533
+ }
534
+ }
535
+ modified = true;
536
+ }
291
537
  }
292
538
  }
293
539
  continue;
@@ -324,7 +570,9 @@ function tickPipelines() {
324
570
  if (shouldStart && run.status === 'running') {
325
571
  step.status = 'running';
326
572
  step.startedAt = new Date().toISOString();
327
- step.pid = spawnStep(stepDef, run, runId);
573
+ const pids = spawnStep(stepDef, run, runId);
574
+ step.pids = pids;
575
+ if (pids.length > 0) step.pid = pids[0];
328
576
  modified = true;
329
577
  }
330
578
  }
@@ -335,7 +583,9 @@ function tickPipelines() {
335
583
  if (depStepName && run.steps[depStepName]?.status === 'success') {
336
584
  step.status = 'running';
337
585
  step.startedAt = new Date().toISOString();
338
- step.pid = spawnStep(stepDef, run, runId);
586
+ const pids = spawnStep(stepDef, run, runId);
587
+ step.pids = pids;
588
+ if (pids.length > 0) step.pid = pids[0];
339
589
  modified = true;
340
590
  }
341
591
  }
@@ -354,7 +604,7 @@ function tickPipelines() {
354
604
  }
355
605
 
356
606
  if (modified) {
357
- writeFileSync(join(runsDir, file), JSON.stringify(run, null, 2));
607
+ persistRun(runId, run);
358
608
  }
359
609
  }
360
610
 
@@ -415,9 +665,10 @@ function tick() {
415
665
  setTimeout(tick, nextTickMs);
416
666
  }
417
667
 
418
- // ─── Startup ────────────────────────────────────────────────
668
+ // ─── Startup ────────────────────────────────────────────────────
419
669
 
420
670
  acquireLock();
671
+ loadRunCache();
421
672
 
422
673
  process.on('SIGINT', () => { releaseLock(); process.exit(0); });
423
674
  process.on('SIGTERM', () => { releaseLock(); process.exit(0); });
@@ -11,6 +11,8 @@ import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync, unlink
11
11
  import { join, dirname } from 'node:path';
12
12
  import { randomUUID } from 'node:crypto';
13
13
  import { ensureDaemon } from './scheduler.js';
14
+ import { killGroup } from '../runner/process-manager.js';
15
+ import { writeSignal } from './run-signals.js';
14
16
 
15
17
  const PIPELINES_DIR = '.agents/pipelines';
16
18
  const RUNS_DIR = '.agents/runs';
@@ -69,6 +71,8 @@ export function createPipeline(cwd, { name, steps, onError }) {
69
71
  name: s.name,
70
72
  prompt: s.prompt,
71
73
  skill: s.skill || null,
74
+ group: s.group || null,
75
+ count: s.count ? parseInt(s.count, 10) : 1,
72
76
  approvalMode: s.approval_mode || 'yolo',
73
77
  timeout: s.timeout || 600,
74
78
  maxBounces: s.maxBounces ?? s.max_bounces ?? 2,
@@ -134,7 +138,8 @@ export function runPipeline(cwd, pipelineId) {
134
138
  for (const step of pipeline.steps) {
135
139
  steps[step.name] = {
136
140
  status: 'pending',
137
- pid: null,
141
+ pid: null, // Legacy / single pid
142
+ pids: [], // Array for parallel execution
138
143
  exitCode: null,
139
144
  signaled: false,
140
145
  bounces: 0,
@@ -198,7 +203,7 @@ export function listRuns(cwd, pipelineId) {
198
203
  const dir = join(cwd, RUNS_DIR);
199
204
  if (!existsSync(dir)) return [];
200
205
  return readdirSync(dir)
201
- .filter(f => f.endsWith('.json'))
206
+ .filter(f => f.endsWith('.json') && !f.includes('.signal-'))
202
207
  .map(f => {
203
208
  try { return JSON.parse(readFileSync(join(dir, f), 'utf-8')); }
204
209
  catch { return null; }
@@ -208,7 +213,8 @@ export function listRuns(cwd, pipelineId) {
208
213
  }
209
214
 
210
215
  /**
211
- * Cancel a pipeline run.
216
+ * Cancel a pipeline run. Writes a signal file for the daemon.
217
+ * Kills running processes immediately for responsiveness.
212
218
  * @param {string} cwd
213
219
  * @param {string} runId
214
220
  * @returns {boolean}
@@ -217,19 +223,22 @@ export function cancelRun(cwd, runId) {
217
223
  const run = getRun(cwd, runId);
218
224
  if (!run || run.status !== 'running') return false;
219
225
 
220
- // Kill any running step
226
+ // Kill running processes immediately (side-effect safe)
221
227
  for (const [name, step] of Object.entries(run.steps)) {
222
- if (step.status === 'running' && step.pid) {
223
- try { process.kill(step.pid, 'SIGTERM'); } catch { /* already dead */ }
224
- step.status = 'cancelled';
225
- }
226
- if (step.status === 'pending') {
227
- step.status = 'skipped';
228
+ if (step.status === 'running') {
229
+ const pidsToKill = [...(step.pids || [])];
230
+ if (step.pid && !pidsToKill.includes(step.pid)) pidsToKill.push(step.pid);
231
+ for (const pid of pidsToKill) {
232
+ killGroup(pid);
233
+ }
228
234
  }
229
235
  }
230
- run.status = 'cancelled';
231
- run.completedAt = new Date().toISOString();
232
- saveRun(cwd, runId, run);
236
+
237
+ // Write signal file — daemon will apply the state change
238
+ writeSignal(cwd, runId, {
239
+ type: 'CANCEL_RUN',
240
+ });
241
+
233
242
  return true;
234
243
  }
235
244
 
@@ -245,7 +254,7 @@ export function findActiveRunByStep(cwd, stepName) {
245
254
  const dir = join(cwd, RUNS_DIR);
246
255
  if (!existsSync(dir)) return null;
247
256
 
248
- for (const f of readdirSync(dir).filter(f => f.endsWith('.json'))) {
257
+ for (const f of readdirSync(dir).filter(f => f.endsWith('.json') && !f.includes('.signal-'))) {
249
258
  try {
250
259
  const run = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
251
260
  if (run.status === 'running' && run.steps[stepName]) {
@@ -258,42 +267,42 @@ export function findActiveRunByStep(cwd, stepName) {
258
267
 
259
268
  /**
260
269
  * Signal step completion. Called by agent via MCP tool.
270
+ * Writes a signal file instead of mutating run state directly.
271
+ * The daemon will consume this signal on its next tick.
261
272
  * @param {string} cwd
262
273
  * @param {string} stepName
263
274
  * @param {string} [output]
264
275
  * @param {string} [runId] - Specific run ID (recommended)
265
- * @returns {{ success: boolean, nextStep?: string }}
276
+ * @returns {{ success: boolean }}
266
277
  */
267
278
  export function signalStepComplete(cwd, stepName, output, runId) {
268
- let run, resolvedRunId;
279
+ let resolvedRunId = runId;
269
280
 
270
- if (runId) {
271
- // Direct lookup by run ID
272
- run = getRun(cwd, runId);
273
- resolvedRunId = runId;
274
- } else {
281
+ if (!resolvedRunId) {
275
282
  // Fallback: search by step name
276
283
  const found = findActiveRunByStep(cwd, stepName);
277
284
  if (!found) return { success: false };
278
- run = found.run;
279
285
  resolvedRunId = found.runId;
280
286
  }
281
287
 
288
+ // Verify run exists and is active
289
+ const run = getRun(cwd, resolvedRunId);
282
290
  if (!run || run.status !== 'running') return { success: false };
283
- const step = run.steps[stepName];
284
- if (!step || step.status !== 'running') return { success: false };
291
+ if (!run.steps[stepName] || run.steps[stepName].status !== 'running') return { success: false };
285
292
 
286
- step.status = 'success';
287
- step.signaled = true;
288
- step.completedAt = new Date().toISOString();
289
- if (output) step.output = output;
293
+ // Write signal file — daemon will apply it
294
+ writeSignal(cwd, resolvedRunId, {
295
+ type: 'STEP_COMPLETE',
296
+ stepName,
297
+ output: output || null,
298
+ });
290
299
 
291
- saveRun(cwd, resolvedRunId, run);
292
300
  return { success: true };
293
301
  }
294
302
 
295
303
  /**
296
304
  * Bounce back to a previous step. Called by agent via MCP tool.
305
+ * Writes a signal file instead of mutating run state directly.
297
306
  * @param {string} cwd
298
307
  * @param {string} targetStepName - Step to re-run
299
308
  * @param {string} reason - Why bouncing back
@@ -301,54 +310,53 @@ export function signalStepComplete(cwd, stepName, output, runId) {
301
310
  * @returns {{ success: boolean, bounceCount?: number, maxBounces?: number }}
302
311
  */
303
312
  export function bounceBack(cwd, targetStepName, reason, runId) {
304
- // Find active run where the caller is running
305
- const dir = join(cwd, RUNS_DIR);
306
- if (!existsSync(dir)) return { success: false };
313
+ // Find the active run containing this step
314
+ let resolvedRunId = runId;
315
+ let run;
307
316
 
308
- for (const f of readdirSync(dir).filter(f => f.endsWith('.json'))) {
309
- try {
310
- const run = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
311
- if (run.status !== 'running') continue;
312
-
313
- const targetStep = run.steps[targetStepName];
314
- if (!targetStep) continue;
315
-
316
- // Find the pipeline definition for maxBounces
317
- const pipeline = getPipeline(run.cwd || cwd, run.pipeline);
318
- const stepDef = pipeline?.steps.find(s => s.name === targetStepName);
319
- const maxBounces = stepDef?.maxBounces ?? 2;
320
-
321
- if (targetStep.bounces >= maxBounces) {
322
- // Bounce limit reached — fail pipeline
323
- targetStep.status = 'failed';
324
- targetStep.lastBounceReason = `Bounce limit (${maxBounces}) reached. Last: ${reason}`;
325
- run.status = 'failed';
326
- run.completedAt = new Date().toISOString();
327
- saveRun(cwd, f.replace('.json', ''), run);
328
- return { success: false, bounceCount: targetStep.bounces, maxBounces };
329
- }
317
+ if (resolvedRunId) {
318
+ run = getRun(cwd, resolvedRunId);
319
+ } else {
320
+ const dir = join(cwd, RUNS_DIR);
321
+ if (!existsSync(dir)) return { success: false };
322
+ for (const f of readdirSync(dir).filter(f => f.endsWith('.json') && !f.includes('.signal-'))) {
323
+ try {
324
+ const r = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
325
+ if (r.status === 'running' && r.steps[targetStepName]) {
326
+ run = r;
327
+ resolvedRunId = f.replace('.json', '');
328
+ break;
329
+ }
330
+ } catch { /* skip */ }
331
+ }
332
+ }
330
333
 
331
- // Reset target step to pending with bounce info
332
- targetStep.status = 'bounce_pending';
333
- targetStep.bounces += 1;
334
- targetStep.lastBounceReason = reason;
335
- targetStep.pid = null;
336
- targetStep.exitCode = null;
337
- targetStep.signaled = false;
338
-
339
- // Reset the calling step too
340
- const callingStepName = Object.keys(run.steps).find(name => {
341
- const s = run.steps[name];
342
- return s.status === 'running';
343
- });
344
- if (callingStepName) {
345
- run.steps[callingStepName].status = 'waiting_bounce';
346
- }
334
+ if (!run || run.status !== 'running') return { success: false };
347
335
 
348
- saveRun(cwd, f.replace('.json', ''), run);
349
- return { success: true, bounceCount: targetStep.bounces, maxBounces };
350
- } catch { /* skip */ }
336
+ const targetStep = run.steps[targetStepName];
337
+ if (!targetStep) return { success: false };
338
+
339
+ // Check bounce limit (read-only check — safe without lock)
340
+ const pipeline = getPipeline(run.cwd || cwd, run.pipeline);
341
+ const stepDef = pipeline?.steps.find(s => s.name === targetStepName);
342
+ const maxBounces = stepDef?.maxBounces ?? 2;
343
+
344
+ if (targetStep.bounces >= maxBounces) {
345
+ return { success: false, bounceCount: targetStep.bounces, maxBounces };
351
346
  }
352
347
 
353
- return { success: false };
348
+ // Find the calling step name (the step that's bouncing back)
349
+ const callingStepName = Object.keys(run.steps).find(name =>
350
+ run.steps[name].status === 'running' && name !== targetStepName,
351
+ );
352
+
353
+ // Write signal file — daemon will apply the state changes and kill processes
354
+ writeSignal(cwd, resolvedRunId, {
355
+ type: 'BOUNCE_BACK',
356
+ stepName: targetStepName,
357
+ callingStepName: callingStepName || null,
358
+ reason,
359
+ });
360
+
361
+ return { success: true, bounceCount: targetStep.bounces + 1, maxBounces };
354
362
  }
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Run signal files — atomic communication between MCP server and daemon.
3
+ *
4
+ * Instead of MCP tools writing directly to run JSON (race condition),
5
+ * they write small signal files that the daemon consumes on each tick.
6
+ *
7
+ * Signal types: STEP_COMPLETE, BOUNCE_BACK
8
+ *
9
+ * @module agent-pool/scheduler/run-signals
10
+ */
11
+
12
+ import { writeFileSync, readFileSync, readdirSync, unlinkSync, existsSync, mkdirSync } from 'node:fs';
13
+ import { join } from 'node:path';
14
+ import { randomUUID } from 'node:crypto';
15
+
16
+ const RUNS_DIR = '.agents/runs';
17
+
18
+ /**
19
+ * Write a signal file for a specific run.
20
+ * Signal files are atomic — no concurrent read-modify-write.
21
+ * @param {string} cwd
22
+ * @param {string} runId
23
+ * @param {object} signal - { type, stepName, output?, reason?, targetStep? }
24
+ */
25
+ export function writeSignal(cwd, runId, signal) {
26
+ const dir = join(cwd, RUNS_DIR);
27
+ mkdirSync(dir, { recursive: true });
28
+
29
+ const id = randomUUID().split('-')[0];
30
+ const fileName = `${runId}.signal-${id}.json`;
31
+ const payload = {
32
+ ...signal,
33
+ timestamp: new Date().toISOString(),
34
+ };
35
+
36
+ writeFileSync(join(dir, fileName), JSON.stringify(payload));
37
+ }
38
+
39
+ /**
40
+ * Consume all pending signal files for a run.
41
+ * Returns signals sorted by timestamp. Does NOT delete them —
42
+ * caller must call deleteSignals() after persisting state.
43
+ * @param {string} cwd
44
+ * @param {string} runId
45
+ * @returns {Array<{ type: string, stepName: string, fileName: string, [key: string]: any }>}
46
+ */
47
+ export function consumeSignals(cwd, runId) {
48
+ const dir = join(cwd, RUNS_DIR);
49
+ if (!existsSync(dir)) return [];
50
+
51
+ const prefix = `${runId}.signal-`;
52
+ const signalFiles = readdirSync(dir).filter(f => f.startsWith(prefix) && f.endsWith('.json'));
53
+
54
+ const signals = [];
55
+ for (const f of signalFiles) {
56
+ try {
57
+ const data = JSON.parse(readFileSync(join(dir, f), 'utf-8'));
58
+ signals.push({ ...data, fileName: f });
59
+ } catch {
60
+ // Include corrupted files so they get cleaned up by deleteSignals
61
+ signals.push({ type: '_corrupted', fileName: f });
62
+ }
63
+ }
64
+
65
+ // Sort by timestamp for deterministic processing
66
+ signals.sort((a, b) => (a.timestamp || '').localeCompare(b.timestamp || ''));
67
+ return signals;
68
+ }
69
+
70
+ /**
71
+ * Delete signal files after state has been persisted to disk.
72
+ * @param {string} cwd
73
+ * @param {Array<{ fileName: string }>} signals
74
+ */
75
+ export function deleteSignals(cwd, signals) {
76
+ const dir = join(cwd, RUNS_DIR);
77
+ for (const s of signals) {
78
+ try { unlinkSync(join(dir, s.fileName)); }
79
+ catch { /* ignore */ }
80
+ }
81
+ }
package/src/server.js CHANGED
@@ -23,6 +23,7 @@ import { consultPeer } from './tools/consult.js';
23
23
  import { addSchedule, listSchedules, removeSchedule, getScheduledResults, getDaemonStatus } from './scheduler/scheduler.js';
24
24
  import { createPipeline, listPipelines, runPipeline, getRun, listRuns, cancelRun, signalStepComplete, bounceBack } from './scheduler/pipeline.js';
25
25
  import { createGroup, listGroups, getGroup } from './tools/groups.js';
26
+ import { sendMessage, getMessages } from './tools/messaging.js';
26
27
 
27
28
  import { TOOL_DEFINITIONS } from './tool-definitions.js';
28
29
 
@@ -112,7 +113,7 @@ export function createServer() {
112
113
  }
113
114
 
114
115
  const server = new Server(
115
- { name: 'agent-pool', version: '1.5.0' },
116
+ { name: 'agent-pool', version: '1.7.0' },
116
117
  { capabilities: { tools: {}, resources: {} } },
117
118
  );
118
119
 
@@ -208,6 +209,10 @@ export function createServer() {
208
209
  response = handleListGroups(args); break;
209
210
  case 'delegate_to_group':
210
211
  response = handleDelegateToGroup(args); break;
212
+ case 'send_message':
213
+ response = handleSendMessage(args); break;
214
+ case 'get_messages':
215
+ response = handleGetMessages(args); break;
211
216
  default:
212
217
  response = { content: [{ type: 'text', text: `Unknown tool: ${name}` }], isError: true };
213
218
  }
@@ -803,3 +808,58 @@ function handleDelegateToGroup(args) {
803
808
  }],
804
809
  };
805
810
  }
811
+
812
+ // ─── Messaging handlers ─────────────────────────────────────
813
+
814
/** Tool handler: forward a send_message call to the messaging module. */
function handleSendMessage(args) {
  const cwd = args.cwd ?? defaultCwd;
  const { channel, payload, from } = args;

  const result = sendMessage(cwd, { channel, payload, from });

  if (result.success) {
    return {
      content: [{ type: 'text', text: `📨 Message sent to channel \`${result.channel}\`.` }],
    };
  }

  return {
    content: [{ type: 'text', text: `❌ Failed to send message: ${result.error || 'unknown error'}` }],
    isError: true,
  };
}
833
+
834
/** Tool handler: read (and optionally clear) a messaging channel. */
function handleGetMessages(args) {
  const cwd = args.cwd ?? defaultCwd;
  const { channel, clear } = args;

  const result = getMessages(cwd, { channel, clear });

  if (result.error) {
    return {
      content: [{ type: 'text', text: `❌ ${result.error}` }],
      isError: true,
    };
  }

  if (result.count === 0) {
    return {
      content: [{ type: 'text', text: `📭 No messages on channel \`${channel}\`.` }],
    };
  }

  // One markdown section per message, payload pretty-printed as JSON.
  const formatted = result.messages
    .map((msg, idx) =>
      `**${idx + 1}.** [${msg.timestamp}] from \`${msg.from}\`:\n\`\`\`json\n${JSON.stringify(msg.payload, null, 2)}\n\`\`\``
    )
    .join('\n\n');

  const header = `📬 **${result.count}** message(s) on channel \`${channel}\`${clear ? ' (cleared)' : ''}`;

  return {
    content: [{ type: 'text', text: `${header}:\n\n${formatted}` }],
  };
}
865
+
@@ -408,5 +408,49 @@ export const TOOL_DEFINITIONS = [
408
408
  required: ['group', 'prompt'],
409
409
  },
410
410
  },
411
+ {
412
+ name: 'send_message',
413
+ description: [
414
+ 'Send a message to a channel for inter-agent communication.',
415
+ 'Use this to pass structured data between pipeline steps or between any agents.',
416
+ '',
417
+ 'Channel conventions:',
418
+ ' - {run_id} — broadcast to all steps in a pipeline run',
419
+ ' - {run_id}:{step_name} — targeted to a specific step',
420
+ ' - any string — ad-hoc channel for custom messaging',
421
+ '',
422
+ 'Messages are persisted to disk (survives restarts). Uses JSONL format for concurrent-write safety.',
423
+ ].join('\n'),
424
+ inputSchema: {
425
+ type: 'object',
426
+ properties: {
427
+ channel: { type: 'string', description: 'Target channel. Use run_id for broadcast, run_id:step_name for targeted.' },
428
+ payload: { description: 'Message payload (any JSON-serializable value).' },
429
+ from: { type: 'string', description: 'Sender identifier (e.g., step name or task description).' },
430
+ cwd: { type: 'string', description: 'Working directory. Defaults to current working directory.' },
431
+ },
432
+ required: ['channel', 'payload'],
433
+ },
434
+ },
435
+ {
436
+ name: 'get_messages',
437
+ description: [
438
+ 'Read messages from a channel. Returns all messages in chronological order.',
439
+ '',
440
+ 'Channel conventions:',
441
+ ' - {run_id} — read broadcast messages for a pipeline run',
442
+ ' - {run_id}:{step_name} — read messages targeted to a specific step',
443
+ '',
444
+ 'Use clear=true to consume messages (delete after reading).',
445
+ ].join('\n'),
446
+ inputSchema: {
447
+ type: 'object',
448
+ properties: {
449
+ channel: { type: 'string', description: 'Channel to read messages from.' },
450
+ clear: { type: 'boolean', description: 'If true, clear the channel after reading (consume mode). Default: false.' },
451
+ cwd: { type: 'string', description: 'Working directory. Defaults to current working directory.' },
452
+ },
453
+ required: ['channel'],
454
+ },
455
+ },
411
456
  ];
412
-
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Inter-agent messaging — file-based JSONL mailboxes.
3
+ *
4
+ * Provides send_message / get_messages tools for agents
5
+ * to pass structured data between pipeline steps or tasks.
6
+ *
7
+ * Uses JSONL format (one JSON object per line) with appendFileSync()
8
+ * to avoid read-modify-write race conditions on concurrent writes.
9
+ *
10
+ * Channel addressing:
11
+ * - {run_id} → broadcast to all steps in a pipeline run
12
+ * - {run_id}:{step} → targeted to a specific step
13
+ * - {custom_channel} → any string for ad-hoc messaging
14
+ *
15
+ * @module agent-pool/tools/messaging
16
+ */
17
+
18
+ import { appendFileSync, readFileSync, writeFileSync, existsSync, mkdirSync, renameSync, unlinkSync } from 'node:fs';
19
+ import { join, dirname } from 'node:path';
20
+
21
+ const MESSAGES_DIR = '.agents/messages';
22
+
23
/**
 * Sanitize channel name for use as a filename.
 * Every character outside [a-zA-Z0-9_:-] is replaced with '_',
 * which also neutralizes path separators and '..' segments.
 * @param {string} channel
 * @returns {string}
 */
function sanitizeChannel(channel) {
  const unsafe = /[^a-zA-Z0-9_:-]/g;
  return channel.replace(unsafe, '_');
}
31
+
32
/**
 * Send a message to a channel.
 * Uses appendFileSync (JSONL, one object per line) so concurrent senders
 * never race on a read-modify-write of the channel file.
 * @param {string} cwd
 * @param {object} opts
 * @param {string} opts.channel - Target channel (e.g., "run_id:step_name")
 * @param {*} opts.payload - Message payload (any JSON-serializable value)
 * @param {string} [opts.from] - Sender identifier
 * @returns {{ success: boolean, channel?: string, error?: string }}
 */
export function sendMessage(cwd, { channel, payload, from }) {
  if (!channel) return { success: false, error: 'channel is required' };

  const message = {
    timestamp: new Date().toISOString(),
    from: from || 'unknown',
    payload,
  };

  // Serialize BEFORE touching the filesystem: JSON.stringify throws on
  // circular references and BigInt values, and the original let that
  // exception escape instead of returning the { success:false, error }
  // shape the tool handler expects.
  let line;
  try {
    line = JSON.stringify(message) + '\n';
  } catch (err) {
    return { success: false, error: `payload is not JSON-serializable: ${err.message}` };
  }

  const dir = join(cwd, MESSAGES_DIR);
  mkdirSync(dir, { recursive: true });

  // JSONL: one JSON object per line, appended atomically
  appendFileSync(join(dir, `${sanitizeChannel(channel)}.jsonl`), line);

  return { success: true, channel };
}
60
+
61
/**
 * Get messages from a channel.
 * @param {string} cwd
 * @param {object} opts
 * @param {string} opts.channel - Channel to read from
 * @param {boolean} [opts.clear] - If true, clear the channel after reading
 * @returns {{ messages: Array<{ timestamp: string, from: string, payload: any }>, count: number, error?: string }}
 */
export function getMessages(cwd, { channel, clear }) {
  if (!channel) return { messages: [], count: 0, error: 'channel is required' };

  const filePath = join(cwd, MESSAGES_DIR, `${sanitizeChannel(channel)}.jsonl`);
  if (!existsSync(filePath)) return { messages: [], count: 0 };

  let content;
  if (clear) {
    // Atomic consume: rename first, then read. Messages appended after the
    // rename land in a NEW channel file, so nothing is lost to the race.
    const tmpPath = filePath + '.consuming';
    try {
      renameSync(filePath, tmpPath);
    } catch {
      // Lost the race: another consumer renamed/deleted the file first.
      return { messages: [], count: 0 };
    }
    try {
      content = readFileSync(tmpPath, 'utf-8').trim();
    } catch {
      content = '';
    }
    // Unlink is handled separately: the original wrapped rename+read+unlink
    // in ONE try/catch, so an unlink failure after a successful read silently
    // discarded the messages just read and left an orphaned .consuming file
    // that no later call would ever pick up.
    try { unlinkSync(tmpPath); } catch { /* ignore */ }
  } else {
    try {
      content = readFileSync(filePath, 'utf-8').trim();
    } catch {
      // File was deleted between the existsSync check and the read.
      return { messages: [], count: 0 };
    }
  }

  if (!content) return { messages: [], count: 0 };

  // Tolerate partial/corrupted lines (e.g. a write in progress): skip them.
  const messages = content
    .split('\n')
    .map((line) => {
      try { return JSON.parse(line); }
      catch { return null; }
    })
    .filter(Boolean);

  return { messages, count: messages.length };
}