@yemi33/minions 0.1.1966 → 0.1.1968

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -468,6 +468,40 @@ function renderPlaybook(type, vars) {
468
468
  } catch (e) { log('warn', `keep_processes hint render failed: ${e.message}`); }
469
469
  }
470
470
 
471
+ // P-1f9c3a45 — opt-in managed_spawn dispatch hint. Mirrors keep_processes:
472
+ // injected only when the dispatcher set vars.managed_spawn (truthy) from the
473
+ // work item's `meta.managed_spawn`. Tells the agent how to write the
474
+ // managed-spawn sidecar so the engine takes over spawn + healthcheck.
475
+ if (vars.managed_spawn) {
476
+ try {
477
+ const managedSpawn = require('./managed-spawn');
478
+ const hint = managedSpawn.buildManagedSpawnHint({
479
+ agentId: vars.agent_id,
480
+ workItemId: vars.item_id || vars.task_id,
481
+ ttlMinutes: vars.managed_spawn_ttl_minutes,
482
+ minionsDir: MINIONS_DIR,
483
+ });
484
+ if (hint) inertAppendices.push(hint);
485
+ } catch (e) { log('warn', `managed_spawn hint render failed: ${e.message}`); }
486
+ }
487
+
488
+ // P-1f9c3a45 — auto-inject live managed processes block, project-scoped.
489
+ // Unconditional: any healthy+alive spec whose owner_project matches
490
+ // vars.project_name is surfaced to the dispatched agent so downstream WIs
491
+ // discover services stood up by earlier WIs without human hand-off. Cap at
492
+ // ENGINE_DEFAULTS.managedSpawn.promptContextMaxBytes (default 2KB) — the
493
+ // helper falls back to a compact name+base_url list when over cap. Empty
494
+ // string is returned when nothing matches, so we short-circuit on falsy.
495
+ if (vars.project_name) {
496
+ try {
497
+ const managedSpawn = require('./managed-spawn');
498
+ const liveBlock = managedSpawn.buildLiveManagedProcessesBlock({
499
+ project: vars.project_name,
500
+ });
501
+ if (liveBlock) inertAppendices.push(liveBlock);
502
+ } catch (e) { log('warn', `managed-spawn live-processes inject failed: ${e.message}`); }
503
+ }
504
+
471
505
  // Inject KB guardrail
472
506
  content += `\n\n---\n\n## Knowledge Base Rules\n\n`;
473
507
  content += `**Never delete, move, or overwrite files in \`knowledge/\`.** The sweep (consolidation engine) is the only process that writes to \`knowledge/\`. If you think a KB file is wrong, note it in your learnings file — do not touch \`knowledge/\` directly.\n`;
@@ -132,6 +132,7 @@ function removeProject(target, options = {}) {
132
132
  drainedDispatches: 0, // includes active dispatches whose agent processes were killed
133
133
  cleanedWorktrees: 0,
134
134
  disabledSchedules: 0,
135
+ killedManagedProcesses: 0, // P-8a4d6f29 — managed-spawn cleanup
135
136
  archivedPlans: [],
136
137
  pipelineRefs: [],
137
138
  archivedTo: null,
@@ -173,6 +174,18 @@ function removeProject(target, options = {}) {
173
174
  );
174
175
  _requeueProjectlessCentralWorkItems(projectlessCentralItemIds);
175
176
 
177
+ // 2.5. Managed-spawn cleanup (P-8a4d6f29). Centralised in managed-spawn.js
178
+ // so no other module needs to know about engine/managed-processes.json.
179
+ // Kills + unlinks every spec owned by this project, including the .log
180
+ // and .log.1 rotated sibling. Best-effort: failures only warn.
181
+ try {
182
+ const managedSpawn = require('./managed-spawn');
183
+ const result = managedSpawn.removeManagedSpecsForProject(project.name);
184
+ summary.killedManagedProcesses = result.killed || 0;
185
+ } catch (e) {
186
+ summary.warnings.push('managed-spawn cleanup: ' + e.message);
187
+ }
188
+
176
189
  // 3. Clean up worktrees under this project's worktree root, honoring
177
190
  // config.engine.worktreeRoot (mirrors lifecycle.js cleanupPlanWorktrees).
178
191
  if (project.localPath) {
package/engine/shared.js CHANGED
@@ -650,6 +650,65 @@ function isPidAlive(pid) {
650
650
  catch { return false; }
651
651
  }
652
652
 
653
+ // P-8a4d6f29 — single helper for detached-process stdio capture with
654
+ // rotate-on-open. Used by bin/minions.js (engine + dashboard stdio logs) and
655
+ // engine/managed-spawn.js openManagedLog. Centralising replaces the previous
656
+ // _openStdioLog in bin/minions.js so rotation is uniform across every long-
657
+ // running log Minions writes.
658
+ //
659
+ // Behavior:
660
+ // - Ensures `dir` exists (mkdir -p).
661
+ // - If `<dir>/<name>` already exists and its size > `rotateBytes`, rename it
662
+ // to `<dir>/<name>.1` (overwriting any prior `.1`) before opening. Keeps
663
+ // exactly one rotated sibling. We rotate first so the freshly opened fd
664
+ // points at an empty file — preserving the original O_APPEND semantics.
665
+ // - Opens the (possibly fresh) file in append mode and returns
666
+ // `{ fd, logPath, rotated }`. `rotated` is `true` when the .1 rename ran.
667
+ // - `opts.fallback === 'ignore'` makes any I/O failure return
668
+ // `{ fd: 'ignore', logPath, rotated: false }` instead of throwing — used by
669
+ // bin/minions.js where a failed log open must not block the restart.
670
+ // - `rotateBytes` defaults to ENGINE_DEFAULTS.managedSpawn.logRotateBytes
671
+ // (10 MB). Pass `Infinity` to disable rotation entirely.
672
/**
 * Open `<dir>/<name>` in append mode for detached-process stdio capture,
 * rotating an oversized existing file to `<name>.1` first.
 *
 * @param {string} name - Log file name (joined onto `dir`). Must be non-empty.
 * @param {string} dir - Directory that holds the log; created if missing.
 * @param {object} [opts]
 * @param {number} [opts.rotateBytes] - Size threshold in bytes. A file strictly
 *   larger than this is renamed to `<name>.1` (replacing any prior `.1`)
 *   before opening. `Infinity`, `0`, or a negative number disables rotation.
 *   When omitted (or not a usable number) the threshold defaults to
 *   `ENGINE_DEFAULTS.managedSpawn.logRotateBytes`, else 10 MB.
 * @param {string} [opts.fallback] - When `'ignore'`, any I/O failure returns
 *   `{ fd: 'ignore', logPath, rotated: false }` instead of throwing.
 * @returns {{ fd: (number|string), logPath: string, rotated: boolean }}
 *   `rotated` is `true` only when the `.1` rename actually ran.
 * @throws {Error} When `name` or `dir` is not a non-empty string (thrown even
 *   with `fallback: 'ignore'`), or on I/O failure without a fallback.
 */
function openAppendLogFd(name, dir, opts) {
  const options = opts || {};
  if (typeof name !== 'string' || name.length === 0) {
    throw new Error('openAppendLogFd: name required');
  }
  if (typeof dir !== 'string' || dir.length === 0) {
    throw new Error('openAppendLogFd: dir required');
  }
  const fallback = options.fallback || null;
  // `Infinity` is the documented "never rotate" sentinel, so it must count as
  // an explicit value. Testing with Number.isFinite alone would discard it and
  // silently fall back to the default cap, rotating anyway.
  const hasExplicitCap = options.rotateBytes === Infinity || Number.isFinite(options.rotateBytes);
  const cap = hasExplicitCap
    ? options.rotateBytes
    : ((ENGINE_DEFAULTS.managedSpawn && ENGINE_DEFAULTS.managedSpawn.logRotateBytes) || 10 * 1024 * 1024);
  const logPath = path.join(dir, name);
  try {
    try {
      fs.mkdirSync(dir, { recursive: true });
    } catch (e) {
      if (e && e.code !== 'EEXIST') throw e;
    }
    let rotated = false;
    // Only a finite, positive cap rotates; Infinity / 0 / negatives skip it.
    if (Number.isFinite(cap) && cap > 0) {
      try {
        const st = fs.statSync(logPath);
        if (st && st.size > cap) {
          const rotatedPath = logPath + '.1';
          // Drop the previous rotated sibling so exactly one `.1` is kept.
          try {
            fs.unlinkSync(rotatedPath);
          } catch (e) {
            if (e && e.code !== 'ENOENT') throw e;
          }
          fs.renameSync(logPath, rotatedPath);
          rotated = true;
        }
      } catch (e) {
        // No existing file means there is nothing to rotate.
        if (e && e.code !== 'ENOENT') throw e;
      }
    }
    // Rotation happens before the open so the returned fd refers to the
    // fresh (or still-small) file rather than the renamed one.
    const fd = fs.openSync(logPath, 'a');
    return { fd: fd, logPath: logPath, rotated: rotated };
  } catch (e) {
    if (fallback === 'ignore') return { fd: 'ignore', logPath: logPath, rotated: false };
    throw e;
  }
}
711
+
653
712
  function withFileLock(lockPath, fn, {
654
713
  timeoutMs = 5000,
655
714
  retryDelayMs = 25,
@@ -1431,6 +1490,62 @@ const ENGINE_DEFAULTS = {
1431
1490
  // keep_processes use cases.
1432
1491
  requireGitWorkdir: true,
1433
1492
  },
1493
+ // P-7a3b1c92 / plan W-mp7k1r760003b5dd — managed-spawn primitive: agents
1494
+ // describe long-running services in agents/<id>/managed-spawn.json and the
1495
+ // engine owns the spawn + healthcheck lifecycle. These defaults gate the
1496
+ // validator (engine/managed-spawn.js) and later the engine-side spawn loop,
1497
+ // per-tick sweep, playbook hint, and dashboard endpoints. Mirrors the
1498
+ // `keepProcesses` block intentionally — symmetry with the keep-pids
1499
+ // primitive is the documented design convention.
1500
+ managedSpawn: {
1501
+ enabled: true, // global kill switch; opt-in per-WI via meta.managed_spawn
1502
+ maxSpecsPerFile: 5, // ≤5 specs per managed-spawn.json file
1503
+ maxNameLength: 64, // kebab-case spec name cap
1504
+ maxArgsCount: 64, // child-process arg-vector cap per spec
1505
+ maxEnvVars: 32, // env-object cap per spec
1506
+ maxAttrsBytes: 2048, // serialized `attrs` blob cap per spec
1507
+ maxTtlMinutes: 1440, // 24h hard cap on per-spec TTL
1508
+ defaultTtlMinutes: 240, // 4h default when spec.ttl_minutes omitted
1509
+ sweepEvery: 30, // ticks between TTL/dead-PID sweeps
1510
+ defaultHealthIntervalSec: 1, // healthcheck polling cadence pre-healthy
1511
+ healthBackoffSec: 30, // healthcheck liveness cadence post-healthy
1512
+ logRotateBytes: 10 * 1024 * 1024, // 10MB rotation threshold for managed-logs/<name>.log
1513
+ bootReconcileMaxMs: 2000, // boot-time reconcile timeout (don't block engine boot)
1514
+ promptContextMaxBytes: 2048, // cap on auto-injected `## Live managed processes` block
1515
+ requireGitWorkdir: true, // reject specs whose `cwd` isn't a real git worktree
1516
+ // Single global executable allowlist. Applies to both `spec.cmd` and any
1517
+ // `command` healthcheck's `cmd`. Keep narrow — adding a binary here lets
1518
+ // any agent's sidecar invoke it under engine ownership.
1519
+ executableAllowlist: [
1520
+ 'node', 'bun', 'npm', 'npx', 'pnpm', 'yarn',
1521
+ 'python', 'python3', 'pip', 'pip3',
1522
+ 'docker', 'podman',
1523
+ 'adb', 'emulator',
1524
+ 'gradle', 'gradlew', 'mvn',
1525
+ 'pwsh', 'powershell', 'bash', 'sh',
1526
+ 'curl', 'wget',
1527
+ 'git',
1528
+ ],
1529
+ // Env-key allowlist (exact match). Tight by default so a managed spec
1530
+ // can't leak credentials (AWS_*, AZURE_*, GH_TOKEN, etc.). Anything not
1531
+ // here must match one of the allowed prefixes below.
1532
+ envKeyAllowlist: [
1533
+ 'NODE_ENV', 'PORT', 'HOST', 'PATH',
1534
+ 'DEBUG', 'LOG_LEVEL',
1535
+ 'HOME', 'USERPROFILE', 'TMPDIR', 'TEMP', 'TMP',
1536
+ 'LANG', 'LC_ALL',
1537
+ 'JAVA_HOME', 'ANDROID_HOME', 'ANDROID_SDK_ROOT',
1538
+ ],
1539
+ // Env-key prefix allowlist. Standard ecosystem prefixes that frontends
1540
+ // and tooling depend on (Vite, Next.js, CRA, npm scripts). Extend with
1541
+ // caution; broad prefixes (`AWS_`, `AZURE_`) belong on a deny-list, not
1542
+ // an allow-list.
1543
+ envKeyAllowlistPrefixes: [
1544
+ 'VITE_', 'NEXT_', 'REACT_APP_', 'NUXT_', 'GATSBY_',
1545
+ 'npm_config_', 'NPM_CONFIG_',
1546
+ 'MINIONS_',
1547
+ ],
1548
+ },
1434
1549
  // Backward-compat: keep `engine.claude.*` field family deprecation tracker. Listed here so preflight
1435
1550
  // knows which subkeys to flag as deprecated. Do not consume `claude.*` in new code — use the runtime
1436
1551
  // adapter system (engine/runtimes/) and the resolveAgent*/resolveCc* helpers instead.
@@ -2081,6 +2196,8 @@ const FAILURE_CLASS = {
2081
2196
  WORKTREE_PREFLIGHT: 'worktree-preflight', // Pre-spawn worktree validation rejected (nested-in-project, drive-root collapse) — never retryable
2082
2197
  INVALID_KEEP_PROCESSES_WORKDIR: 'invalid-keep-processes-workdir', // W-mp6k7ywi000fa33c: keep-pids.json declared a cwd that is not a real git worktree (likely a selective copy of the repo) — never retryable; agent must rerun in a real worktree
2083
2198
  INVALID_KEEP_PROCESSES_SCHEMA: 'invalid-keep-processes-schema', // W-mp7i902u000l991f: keep-pids.json failed validation for a reason other than workdir (pids-missing, ttl-too-long, expires_at-missing, pids-too-many, port-invalid, etc.) — agent wrote the wrong shape; never retryable until they fix the file
2199
+ INVALID_MANAGED_SPAWN: 'invalid-managed-spawn', // P-7a3b1c92: agents/<id>/managed-spawn.json failed validator (bad schema, broken workdir, executable/env not on allowlist, healthcheck shape wrong). Engine refuses to spawn any spec — agent must fix file; never retryable as-is.
2200
+ MANAGED_SPAWN_HEALTHCHECK_FAILED: 'managed-spawn-healthcheck-failed', // P-7a3b1c92: at least one managed-spawn spec was spawned but failed its healthcheck within timeout_s. Engine killed the failing PIDs; siblings stay alive. Dispatch ERROR with the failing spec name + log tail surfaced in the inbox alert.
2084
2201
  UNKNOWN: 'unknown', // Unclassified failure
2085
2202
  };
2086
2203
  const ESCALATION_POLICY = {
@@ -4297,6 +4414,7 @@ module.exports = {
4297
4414
  _WIN_RESERVED_NAMES, // exported for testing
4298
4415
  LOCK_STALE_MS,
4299
4416
  isPidAlive,
4417
+ openAppendLogFd,
4300
4418
  flushLogs,
4301
4419
  redactSecrets,
4302
4420
  slugify,
package/engine.js CHANGED
@@ -2224,6 +2224,213 @@ async function spawnAgent(dispatchItem, config) {
2224
2224
  }
2225
2225
  }
2226
2226
 
2227
+ // P-2d5e8f04 — managed-spawn acceptance gate. Symmetric to the
2228
+ // keep-processes block above but for `agents/<id>/managed-spawn.json`:
2229
+ // engine owns the spawn + lifecycle of the long-running services the
2230
+ // agent described in its sidecar. This gate (a) rejects malformed
2231
+ // sidecars as a hard non-retryable failure with a dedicated failure
2232
+ // class + inbox alert, and (b) on success spawns each spec detached and
2233
+ // batch-records them in engine/managed-processes.json. Healthcheck loops
2234
+ // + dispatch ERROR-on-healthcheck-failure land in the follow-up item;
2235
+ // for now a spec that spawns successfully is recorded with
2236
+ // healthy:false, alive:true and the engine sweep / item-3 healthcheck
2237
+ // loop will drive its state from there.
2238
+ let managedSpawnAcceptanceFailure = null;
2239
+ let managedSpawnSpawned = []; // [{name, pid, started_at, log_path}]
2240
+ {
2241
+ const _wiMeta = dispatchItem.meta?.item?.meta || {};
2242
+ const _msEnabled = !!_wiMeta.managed_spawn
2243
+ || !!dispatchItem.meta?.managed_spawn;
2244
+ if (_msEnabled) {
2245
+ try {
2246
+ const managedSpawn = require('./engine/managed-spawn');
2247
+ const evalResult = managedSpawn.evaluateManagedSpawnAcceptance(agentId);
2248
+ if (evalResult.exists && !evalResult.accepted) {
2249
+ managedSpawnAcceptanceFailure = {
2250
+ reason: evalResult.reason,
2251
+ filePath: evalResult.filePath,
2252
+ isWorkdirRejection: !!evalResult.isWorkdirRejection,
2253
+ parsedRaw: evalResult.parsedRaw || null,
2254
+ };
2255
+ try { fs.unlinkSync(evalResult.filePath); } catch (_e) { /* gone or busy */ }
2256
+ log('warn', `managed-spawn acceptance: REJECTED ${agentId} (${id}) — ${evalResult.reason}; sidecar deleted`);
2257
+ try {
2258
+ const wiId = dispatchItem.meta?.item?.id || '';
2259
+ const canonicalHint = (() => {
2260
+ try {
2261
+ return managedSpawn.buildManagedSpawnHint({
2262
+ agentId,
2263
+ workItemId: wiId,
2264
+ minionsDir: shared.MINIONS_DIR,
2265
+ });
2266
+ } catch (_hintErr) { return ''; }
2267
+ })();
2268
+ let parsedSnippet = '';
2269
+ if (evalResult.parsedRaw) {
2270
+ try { parsedSnippet = JSON.stringify(evalResult.parsedRaw, null, 2); }
2271
+ catch (_jsonErr) { parsedSnippet = String(evalResult.parsedRaw); }
2272
+ if (parsedSnippet.length > 500) parsedSnippet = parsedSnippet.slice(0, 500) + '\n... (truncated)';
2273
+ }
2274
+ const alertBody = [
2275
+ `# managed_spawn setup REJECTED for ${agentId}`,
2276
+ '',
2277
+ `Your \`agents/${agentId}/managed-spawn.json\` failed validation: \`${evalResult.reason}\`.`,
2278
+ 'No services were spawned and the dispatch was marked ERROR (non-retryable).',
2279
+ '',
2280
+ wiId ? `Work item: ${wiId}` : '',
2281
+ `Agent: ${agentId}`,
2282
+ `Dispatch: ${id}`,
2283
+ '',
2284
+ parsedSnippet ? '## What you wrote\n\n```json\n' + parsedSnippet + '\n```\n' : '',
2285
+ '## Canonical shape',
2286
+ '',
2287
+ canonicalHint || '(see `engine/managed-spawn.js` `buildManagedSpawnHint` for the canonical shape.)',
2288
+ '',
2289
+ ].filter(Boolean).join('\n');
2290
+ writeInboxAlert(`managed-spawn-${agentId}`, alertBody);
2291
+ } catch (alertErr) {
2292
+ log('warn', `managed-spawn acceptance: failed to emit inbox alert for ${agentId}: ${alertErr.message}`);
2293
+ }
2294
+ } else if (evalResult.exists && evalResult.accepted && evalResult.record) {
2295
+ // Valid sidecar — spawn each spec detached and batch-record.
2296
+ // Per-spec failure here (e.g., binary missing on PATH despite
2297
+ // passing the allowlist) marks the whole gate failed so the
2298
+ // dispatch fails ERROR. Surviving siblings get killed for
2299
+ // consistency — the agent should not exit green with a partial
2300
+ // service set up.
2301
+ const ctx = {
2302
+ owner_agent: agentId,
2303
+ owner_wi: dispatchItem.meta?.item?.id || '',
2304
+ owner_project: project?.name || '',
2305
+ };
2306
+ const spawnedItems = [];
2307
+ let spawnFailureReason = null;
2308
+ for (const spec of evalResult.record.specs) {
2309
+ try {
2310
+ const runtime = managedSpawn.spawnManagedSpec(spec, ctx);
2311
+ spawnedItems.push({ spec, runtime });
2312
+ managedSpawnSpawned.push({ name: spec.name, pid: runtime.pid, started_at: runtime.started_at, log_path: runtime.log_path });
2313
+ } catch (specErr) {
2314
+ spawnFailureReason = `spawn failed for ${spec.name}: ${specErr.message}`;
2315
+ log('warn', `managed-spawn: ${spawnFailureReason}`);
2316
+ break;
2317
+ }
2318
+ }
2319
+ if (spawnFailureReason) {
2320
+ // Roll back: kill anything we just spawned, leave no dangling
2321
+ // state. This is consistent with the "all healthy or fail"
2322
+ // contract item 3 will enforce on healthcheck timeout.
2323
+ for (const item of spawnedItems) {
2324
+ try { shared.killByPidImmediate(item.runtime.pid); } catch (_e) {}
2325
+ }
2326
+ managedSpawnSpawned = [];
2327
+ managedSpawnAcceptanceFailure = {
2328
+ reason: spawnFailureReason,
2329
+ filePath: evalResult.filePath,
2330
+ isWorkdirRejection: false,
2331
+ parsedRaw: null,
2332
+ };
2333
+ try { fs.unlinkSync(evalResult.filePath); } catch (_e) {}
2334
+ } else {
2335
+ try {
2336
+ managedSpawn.recordManagedBatch(spawnedItems, ctx);
2337
+ } catch (recErr) {
2338
+ log('warn', `managed-spawn: state-file write failed for ${agentId}: ${recErr.message}`);
2339
+ }
2340
+ // The sidecar has been ingested into the state file; unlink
2341
+ // it so a future dispatch for this agent does not re-spawn
2342
+ // the same specs (the state file is the source of truth).
2343
+ try { fs.unlinkSync(evalResult.filePath); } catch (_e) {}
2344
+ log('info', `managed-spawn accepted: ${agentId} (${id}) spawned ${managedSpawnSpawned.length} spec(s)`);
2345
+ }
2346
+ }
2347
+ } catch (e) {
2348
+ log('warn', `managed-spawn acceptance check failed for ${agentId} (${id}): ${e.message}`);
2349
+ }
2350
+ }
2351
+ }
2352
+
2353
+ // P-9c1f47a6 — managed-spawn healthcheck gate (item 3). After all specs
2354
+ // are spawned + persisted, wait for each spec's first healthcheck to
2355
+ // pass (or fail) within its declared timeout_s. Any failure here forces
2356
+ // the dispatch to ERROR with FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED,
2357
+ // kills the failing spec's PID + removes its state entry, attaches a
2358
+ // log tail to the inbox alert, and leaves surviving siblings alone (a
2359
+ // partial-failure spec set is more useful than an all-or-nothing wipe
2360
+ // — the agent's later dispatch can inspect /api/managed-processes and
2361
+ // either restart the failed one or remove the survivors itself).
2362
+ let managedSpawnHealthcheckFailure = null;
2363
+ if (managedSpawnSpawned.length > 0) {
2364
+ try {
2365
+ const managedSpawn = require('./engine/managed-spawn');
2366
+ const items = managedSpawnSpawned;
2367
+ // Re-read the specs from the state file (recordManagedBatch normalised
2368
+ // them — healthcheck shape lives there now).
2369
+ const liveSpecs = managedSpawn.listManagedSpecs();
2370
+ const byName = new Map(liveSpecs.map(s => [s.name, s]));
2371
+ const results = await Promise.allSettled(items.map((spawned) => {
2372
+ const spec = byName.get(spawned.name);
2373
+ if (!spec || !spec.healthcheck) {
2374
+ return Promise.resolve({ healthy: false, error: 'no healthcheck recorded for ' + spawned.name, _name: spawned.name });
2375
+ }
2376
+ return managedSpawn.waitForFirstHealth(spec).then(r => Object.assign({ _name: spec.name, _pid: spawned.pid }, r));
2377
+ }));
2378
+ const failed = [];
2379
+ for (let i = 0; i < results.length; i++) {
2380
+ const r = results[i];
2381
+ if (r.status === 'rejected') {
2382
+ failed.push({ name: items[i].name, pid: items[i].pid, error: 'healthcheck threw: ' + (r.reason && r.reason.message ? r.reason.message : String(r.reason)) });
2383
+ continue;
2384
+ }
2385
+ if (!r.value.healthy) {
2386
+ failed.push({ name: r.value._name || items[i].name, pid: r.value._pid || items[i].pid, error: r.value.error || 'unhealthy' });
2387
+ }
2388
+ }
2389
+ if (failed.length > 0) {
2390
+ // Kill failing PIDs + drop their state entries. Surviving siblings
2391
+ // stay alive intentionally (see comment above).
2392
+ for (const f of failed) {
2393
+ try { managedSpawn.removeManagedSpec(f.name); }
2394
+ catch (e) { log('warn', `managed-spawn healthcheck: cleanup failed for ${f.name}: ${e.message}`); }
2395
+ }
2396
+ managedSpawnHealthcheckFailure = {
2397
+ failed: failed,
2398
+ survivedNames: items.filter(it => !failed.some(f => f.name === it.name)).map(it => it.name),
2399
+ };
2400
+ log('warn', `managed-spawn healthcheck: ${failed.length}/${items.length} spec(s) failed for ${agentId} (${id}); ` +
2401
+ failed.map(f => `${f.name}=${f.error}`).join('; '));
2402
+ try {
2403
+ const wiId = dispatchItem.meta?.item?.id || '';
2404
+ const logTails = failed.map(f => {
2405
+ const tail = managedSpawn.tailManagedLog(f.name, 50) || '(log empty or unreadable)';
2406
+ return '### ' + f.name + ' (pid ' + (f.pid || '?') + ')\n\nReason: `' + f.error + '`\n\n```\n' + tail.slice(-2000) + '\n```';
2407
+ }).join('\n\n');
2408
+ const alertBody = [
2409
+ `# managed_spawn healthcheck FAILED for ${agentId}`,
2410
+ '',
2411
+ `${failed.length} of ${items.length} spec(s) failed their first healthcheck within \`timeout_s\`. The failing PIDs were killed and their state entries removed; surviving siblings (${managedSpawnHealthcheckFailure.survivedNames.join(', ') || 'none'}) stay alive.`,
2412
+ '',
2413
+ wiId ? `Work item: ${wiId}` : '',
2414
+ `Agent: ${agentId}`,
2415
+ `Dispatch: ${id}`,
2416
+ '',
2417
+ '## Failure detail + log tails',
2418
+ '',
2419
+ logTails,
2420
+ '',
2421
+ ].filter(Boolean).join('\n');
2422
+ writeInboxAlert(`managed-spawn-healthcheck-${agentId}`, alertBody);
2423
+ } catch (alertErr) {
2424
+ log('warn', `managed-spawn healthcheck: failed to emit inbox alert for ${agentId}: ${alertErr.message}`);
2425
+ }
2426
+ } else {
2427
+ log('info', `managed-spawn healthcheck: ${items.length} spec(s) healthy for ${agentId} (${id})`);
2428
+ }
2429
+ } catch (e) {
2430
+ log('warn', `managed-spawn healthcheck check failed for ${agentId} (${id}): ${e.message}`);
2431
+ }
2432
+ }
2433
+
2227
2434
  // Move from active to completed in dispatch (single source of truth for agent status)
2228
2435
  // autoRecovered: agent failed after creating PRs — treat as success
2229
2436
  const hardContractFail = completionContractFailure?.severity === 'hard'
@@ -2239,7 +2446,15 @@ async function spawnAgent(dispatchItem, config) {
2239
2446
  // not silently treated as success even when exit code is 0. Both
2240
2447
  // workdir and schema rejections route here; the failure_class differs.
2241
2448
  const keepProcessesAcceptanceFail = !!keepProcessesAcceptanceFailure;
2242
- const effectiveResult = (hardContractFail || nonceFail || keepProcessesAcceptanceFail)
2449
+ // P-2d5e8f04 managed-spawn acceptance failure is also a hard failure
2450
+ // (same reasoning). Maps to FAILURE_CLASS.INVALID_MANAGED_SPAWN.
2451
+ const managedSpawnAcceptanceFail = !!managedSpawnAcceptanceFailure;
2452
+ // P-9c1f47a6 — managed-spawn healthcheck failure is also a hard failure:
2453
+ // the agent claims the service is set up but it never became healthy
2454
+ // within the declared timeout. Maps to
2455
+ // FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED.
2456
+ const managedSpawnHealthcheckFail = !!managedSpawnHealthcheckFailure;
2457
+ const effectiveResult = (hardContractFail || nonceFail || keepProcessesAcceptanceFail || managedSpawnAcceptanceFail || managedSpawnHealthcheckFail)
2243
2458
  ? DISPATCH_RESULT.ERROR
2244
2459
  : (((code === 0 && !agentReportedFailure) || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR);
2245
2460
  const finalCompletionReportPath = structuredCompletion?._path || dispatchItem.meta?.completionReportPath || shared.dispatchCompletionReportPath(id);
@@ -2252,21 +2467,30 @@ async function spawnAgent(dispatchItem, config) {
2252
2467
  ? FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR
2253
2468
  : FAILURE_CLASS.INVALID_KEEP_PROCESSES_SCHEMA)
2254
2469
  : null;
2255
- const completeOpts = keepProcessesAcceptanceFail
2256
- ? { ...completionOpts, failureClass: _kpFailureClass, agentRetryable: false }
2257
- : (nonceFail
2258
- ? { ...completionOpts, failureClass: nonceMismatch.failureClass, agentRetryable: false }
2259
- : (hardContractFail
2260
- ? { ...completionOpts, processWorkItemFailure: false }
2261
- : (effectiveResult === DISPATCH_RESULT.ERROR ? {
2262
- ...completionOpts,
2263
- ...(failureClass ? { failureClass } : {}),
2264
- ...(typeof retryableDecision === 'boolean' ? { agentRetryable: retryableDecision } : {}),
2265
- ...(structuredCompletion?.failure_class ? { failureClass: structuredCompletion.failure_class } : {}),
2266
- } : completionOpts)));
2470
+ const completeOpts = managedSpawnHealthcheckFail
2471
+ ? { ...completionOpts, failureClass: FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED, agentRetryable: false }
2472
+ : (managedSpawnAcceptanceFail
2473
+ ? { ...completionOpts, failureClass: FAILURE_CLASS.INVALID_MANAGED_SPAWN, agentRetryable: false }
2474
+ : (keepProcessesAcceptanceFail
2475
+ ? { ...completionOpts, failureClass: _kpFailureClass, agentRetryable: false }
2476
+ : (nonceFail
2477
+ ? { ...completionOpts, failureClass: nonceMismatch.failureClass, agentRetryable: false }
2478
+ : (hardContractFail
2479
+ ? { ...completionOpts, processWorkItemFailure: false }
2480
+ : (effectiveResult === DISPATCH_RESULT.ERROR ? {
2481
+ ...completionOpts,
2482
+ ...(failureClass ? { failureClass } : {}),
2483
+ ...(typeof retryableDecision === 'boolean' ? { agentRetryable: retryableDecision } : {}),
2484
+ ...(structuredCompletion?.failure_class ? { failureClass: structuredCompletion.failure_class } : {}),
2485
+ } : completionOpts)))));
2267
2486
  // Extract last 5 non-empty stderr lines as error context when exit code is non-zero
2268
2487
  let errorReason = '';
2269
- if (keepProcessesAcceptanceFail) {
2488
+ if (managedSpawnHealthcheckFail) {
2489
+ const failNames = managedSpawnHealthcheckFailure.failed.map(f => f.name).join(',');
2490
+ errorReason = `managed_spawn_healthcheck_failed: ${failNames} (${managedSpawnHealthcheckFailure.failed.length}/${managedSpawnSpawned.length})`.slice(0, 300);
2491
+ } else if (managedSpawnAcceptanceFail) {
2492
+ errorReason = `invalid_managed_spawn: ${managedSpawnAcceptanceFailure.reason}`.slice(0, 300);
2493
+ } else if (keepProcessesAcceptanceFail) {
2270
2494
  if (keepProcessesAcceptanceFailure.isWorkdirRejection) {
2271
2495
  errorReason = `invalid_keep_processes_workdir: ${keepProcessesAcceptanceFailure.reason} (cwd=${keepProcessesAcceptanceFailure.cwd || '<unknown>'})`.slice(0, 300);
2272
2496
  } else {
@@ -4015,6 +4239,14 @@ function renderProjectWorkItemPromptForAgent(item, workType, agentId, config, pr
4015
4239
  keep_processes_ttl_minutes: item.meta && Number.isFinite(Number(item.meta.keep_processes_ttl_minutes))
4016
4240
  ? Math.floor(Number(item.meta.keep_processes_ttl_minutes))
4017
4241
  : '',
4242
+ // P-1f9c3a45 — opt-in managed_spawn hint plumbed via item.meta. Same
4243
+ // default-off shape as keep_processes; truthy fires the agent-side
4244
+ // sidecar instructions in renderPlaybook. Live-processes auto-inject is
4245
+ // project-scoped and unconditional (not gated on this flag).
4246
+ managed_spawn: !!(item.meta && item.meta.managed_spawn),
4247
+ managed_spawn_ttl_minutes: item.meta && Number.isFinite(Number(item.meta.managed_spawn_ttl_minutes))
4248
+ ? Math.floor(Number(item.meta.managed_spawn_ttl_minutes))
4249
+ : '',
4018
4250
  };
4019
4251
  const cpResult = buildWorkItemDispatchVars(item, vars, config, {
4020
4252
  worktreePath: vars.worktree_path || root,
@@ -5425,6 +5657,24 @@ async function tickInner() {
5425
5657
  if (_isTickStale(myGeneration)) return;
5426
5658
  }
5427
5659
 
5660
+ // 2.53. managed-spawn TTL/dead-PID sweep + log rotation (P-8a4d6f29). Walks
5661
+ // engine/managed-processes.json, kills TTL-expired specs, drops dead-PID
5662
+ // rows, rotates managed-logs/<name>.log past ENGINE_DEFAULTS.managedSpawn
5663
+ // .logRotateBytes. Mirrors the keep-processes sweep cadence (sweepEvery=30)
5664
+ // so the engine never iterates per-spec on every tick. Healthcheck loops
5665
+ // remain per-spec / self-scheduled and are NOT driven from here.
5666
+ const managedSweepEvery = Math.max(1, ENGINE_DEFAULTS.managedSpawn?.sweepEvery || 30);
5667
+ if (ENGINE_DEFAULTS.managedSpawn?.enabled !== false && tickCount % managedSweepEvery === 0) {
5668
+ safe('sweepManagedSpawn', () => {
5669
+ const { sweepManagedSpawn } = require('./engine/managed-spawn');
5670
+ const stats = sweepManagedSpawn();
5671
+ if (stats.scanned > 0 && (stats.ttlExpired || stats.deadDropped || stats.rotatedLogs || stats.malformed)) {
5672
+ log('info', `managed-spawn sweep: scanned=${stats.scanned} ttl=${stats.ttlExpired} dead=${stats.deadDropped} killed=${stats.killedPids} rotated=${stats.rotatedLogs} malformed=${stats.malformed}`);
5673
+ }
5674
+ });
5675
+ if (_isTickStale(myGeneration)) return;
5676
+ }
5677
+
5428
5678
  // 2.55. Check persistent watches (3 tick-equivalents, default ~3 minutes)
5429
5679
  const watchPollIntervalMs = _pollIntervalMsFromTicks(3, tickIntervalMs);
5430
5680
  if (_shouldRunPeriodicPhase(now, lastWatchCheckAt, watchPollIntervalMs)) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1966",
3
+ "version": "0.1.1968",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"
@@ -20,6 +20,7 @@
20
20
  "test:e2e:report": "npx playwright show-report test/playwright/report",
21
21
  "test:e2e:video": "npx playwright test --video=on --headed",
22
22
  "test:all": "node test/run-parallel.js && node test/minions-tests.js && node test/integration/run.js",
23
+ "test:perf": "node test/perf/managed-spawn-load.test.js",
23
24
  "test:e2e:accept": "node test/playwright/accept-baseline.js",
24
25
  "test:e2e:accept-force": "node test/playwright/accept-baseline.js --force",
25
26
  "test:setup": "npx playwright install chromium"