@yemi33/minions 0.1.1965 → 0.1.1967
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/minions.js +6 -6
- package/dashboard/js/refresh.js +5 -0
- package/dashboard/js/render-managed.js +261 -0
- package/dashboard/js/render-other.js +5 -2
- package/dashboard/pages/engine.html +6 -0
- package/dashboard/styles.css +21 -4
- package/dashboard-build.js +1 -1
- package/dashboard.js +250 -1
- package/docs/README.md +10 -13
- package/docs/managed-spawn.md +259 -0
- package/docs/watches.md +47 -20
- package/engine/cli.js +39 -0
- package/engine/managed-spawn.js +1325 -0
- package/engine/playbook.js +34 -0
- package/engine/projects.js +13 -0
- package/engine/shared.js +118 -0
- package/engine.js +264 -14
- package/package.json +2 -1
package/engine/playbook.js
CHANGED
|
@@ -468,6 +468,40 @@ function renderPlaybook(type, vars) {
|
|
|
468
468
|
} catch (e) { log('warn', `keep_processes hint render failed: ${e.message}`); }
|
|
469
469
|
}
|
|
470
470
|
|
|
471
|
+
// P-1f9c3a45 — opt-in managed_spawn dispatch hint. Mirrors keep_processes:
|
|
472
|
+
// injected only when the dispatcher set vars.managed_spawn (truthy) from the
|
|
473
|
+
// work item's `meta.managed_spawn`. Tells the agent how to write the
|
|
474
|
+
// managed-spawn sidecar so the engine takes over spawn + healthcheck.
|
|
475
|
+
if (vars.managed_spawn) {
|
|
476
|
+
try {
|
|
477
|
+
const managedSpawn = require('./managed-spawn');
|
|
478
|
+
const hint = managedSpawn.buildManagedSpawnHint({
|
|
479
|
+
agentId: vars.agent_id,
|
|
480
|
+
workItemId: vars.item_id || vars.task_id,
|
|
481
|
+
ttlMinutes: vars.managed_spawn_ttl_minutes,
|
|
482
|
+
minionsDir: MINIONS_DIR,
|
|
483
|
+
});
|
|
484
|
+
if (hint) inertAppendices.push(hint);
|
|
485
|
+
} catch (e) { log('warn', `managed_spawn hint render failed: ${e.message}`); }
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
// P-1f9c3a45 — auto-inject live managed processes block, project-scoped.
|
|
489
|
+
// Unconditional: any healthy+alive spec whose owner_project matches
|
|
490
|
+
// vars.project_name is surfaced to the dispatched agent so downstream WIs
|
|
491
|
+
// discover services stood up by earlier WIs without human hand-off. Cap at
|
|
492
|
+
// ENGINE_DEFAULTS.managedSpawn.promptContextMaxBytes (default 2KB) — the
|
|
493
|
+
// helper falls back to a compact name+base_url list when over cap. Empty
|
|
494
|
+
// string is returned when nothing matches, so we short-circuit on falsy.
|
|
495
|
+
if (vars.project_name) {
|
|
496
|
+
try {
|
|
497
|
+
const managedSpawn = require('./managed-spawn');
|
|
498
|
+
const liveBlock = managedSpawn.buildLiveManagedProcessesBlock({
|
|
499
|
+
project: vars.project_name,
|
|
500
|
+
});
|
|
501
|
+
if (liveBlock) inertAppendices.push(liveBlock);
|
|
502
|
+
} catch (e) { log('warn', `managed-spawn live-processes inject failed: ${e.message}`); }
|
|
503
|
+
}
|
|
504
|
+
|
|
471
505
|
// Inject KB guardrail
|
|
472
506
|
content += `\n\n---\n\n## Knowledge Base Rules\n\n`;
|
|
473
507
|
content += `**Never delete, move, or overwrite files in \`knowledge/\`.** The sweep (consolidation engine) is the only process that writes to \`knowledge/\`. If you think a KB file is wrong, note it in your learnings file — do not touch \`knowledge/\` directly.\n`;
|
package/engine/projects.js
CHANGED
|
@@ -132,6 +132,7 @@ function removeProject(target, options = {}) {
|
|
|
132
132
|
drainedDispatches: 0, // includes active dispatches whose agent processes were killed
|
|
133
133
|
cleanedWorktrees: 0,
|
|
134
134
|
disabledSchedules: 0,
|
|
135
|
+
killedManagedProcesses: 0, // P-8a4d6f29 — managed-spawn cleanup
|
|
135
136
|
archivedPlans: [],
|
|
136
137
|
pipelineRefs: [],
|
|
137
138
|
archivedTo: null,
|
|
@@ -173,6 +174,18 @@ function removeProject(target, options = {}) {
|
|
|
173
174
|
);
|
|
174
175
|
_requeueProjectlessCentralWorkItems(projectlessCentralItemIds);
|
|
175
176
|
|
|
177
|
+
// 2.5. Managed-spawn cleanup (P-8a4d6f29). Centralised in managed-spawn.js
|
|
178
|
+
// so no other module needs to know about engine/managed-processes.json.
|
|
179
|
+
// Kills + unlinks every spec owned by this project, including the .log
|
|
180
|
+
// and .log.1 rotated sibling. Best-effort: failures only warn.
|
|
181
|
+
try {
|
|
182
|
+
const managedSpawn = require('./managed-spawn');
|
|
183
|
+
const result = managedSpawn.removeManagedSpecsForProject(project.name);
|
|
184
|
+
summary.killedManagedProcesses = result.killed || 0;
|
|
185
|
+
} catch (e) {
|
|
186
|
+
summary.warnings.push('managed-spawn cleanup: ' + e.message);
|
|
187
|
+
}
|
|
188
|
+
|
|
176
189
|
// 3. Clean up worktrees under this project's worktree root, honoring
|
|
177
190
|
// config.engine.worktreeRoot (mirrors lifecycle.js cleanupPlanWorktrees).
|
|
178
191
|
if (project.localPath) {
|
package/engine/shared.js
CHANGED
|
@@ -650,6 +650,65 @@ function isPidAlive(pid) {
|
|
|
650
650
|
catch { return false; }
|
|
651
651
|
}
|
|
652
652
|
|
|
653
|
+
// P-8a4d6f29 — single helper for detached-process stdio capture with
// rotate-on-open. Used by bin/minions.js (engine + dashboard stdio logs) and
// engine/managed-spawn.js openManagedLog. Centralising replaces the previous
// _openStdioLog in bin/minions.js so rotation is uniform across every long-
// running log Minions writes.
//
// Behavior:
//   - Ensures `dir` exists (mkdir -p).
//   - If `<dir>/<name>` already exists and its size > `rotateBytes`, rename it
//     to `<dir>/<name>.1` (overwriting any prior `.1`) before opening. Keeps
//     exactly one rotated sibling. We rotate first so the freshly opened fd
//     points at an empty file — preserving the original O_APPEND semantics.
//   - Opens the (possibly fresh) file in append mode and returns
//     `{ fd, logPath, rotated }`. `rotated` is `true` when the .1 rename ran.
//   - `opts.fallback === 'ignore'` makes any I/O failure return
//     `{ fd: 'ignore', logPath, rotated: false }` instead of throwing — used by
//     bin/minions.js where a failed log open must not block the restart.
//   - `rotateBytes` defaults to ENGINE_DEFAULTS.managedSpawn.logRotateBytes
//     (10 MB). Pass `Infinity` (or any value <= 0) to disable rotation
//     entirely. Any other non-finite / non-numeric value (undefined, NaN,
//     strings, -Infinity) falls back to the default.
//
// Params:
//   name {string}  log file name (non-empty), joined onto `dir`.
//   dir  {string}  directory that holds the log (non-empty).
//   opts {object=} optional `{ rotateBytes?: number, fallback?: 'ignore' }`.
// Returns: `{ fd: number | 'ignore', logPath: string, rotated: boolean }`.
// Throws:  Error when `name` / `dir` are missing (always, even with the
//          'ignore' fallback); rethrows fs errors unless
//          `opts.fallback === 'ignore'`.
function openAppendLogFd(name, dir, opts) {
  const options = opts || {};
  if (typeof name !== 'string' || name.length === 0) {
    throw new Error('openAppendLogFd: name required');
  }
  if (typeof dir !== 'string' || dir.length === 0) {
    throw new Error('openAppendLogFd: dir required');
  }
  const fallback = options.fallback || null;

  // An explicit `Infinity` must survive as the cap: Number.isFinite(Infinity)
  // is false, so testing isFinite alone would silently swap in the default
  // and rotation could never be disabled the documented way.
  const requestedCap = options.rotateBytes;
  const cap = (Number.isFinite(requestedCap) || requestedCap === Infinity)
    ? requestedCap
    : ((ENGINE_DEFAULTS.managedSpawn && ENGINE_DEFAULTS.managedSpawn.logRotateBytes) || 10 * 1024 * 1024);

  const logPath = path.join(dir, name);
  try {
    try {
      fs.mkdirSync(dir, { recursive: true });
    } catch (e) {
      if (e && e.code !== 'EEXIST') throw e;
    }

    let rotated = false;
    // Only a finite, positive cap rotates; Infinity / 0 / negatives skip it.
    if (Number.isFinite(cap) && cap > 0) {
      try {
        const st = fs.statSync(logPath);
        if (st && st.size > cap) {
          const rotatedPath = logPath + '.1';
          // Drop any previous `.1` first so exactly one rotated sibling is
          // kept and the rename cannot fail on an existing target.
          try {
            fs.unlinkSync(rotatedPath);
          } catch (e) {
            if (e && e.code !== 'ENOENT') throw e;
          }
          fs.renameSync(logPath, rotatedPath);
          rotated = true;
        }
      } catch (e) {
        // No existing file → nothing to rotate. Anything else is a real error.
        if (e && e.code !== 'ENOENT') throw e;
      }
    }

    const fd = fs.openSync(logPath, 'a');
    return { fd: fd, logPath: logPath, rotated: rotated };
  } catch (e) {
    // Best-effort mode: the caller passes 'ignore' straight through as the
    // child's stdio value instead of failing the spawn.
    if (fallback === 'ignore') return { fd: 'ignore', logPath: logPath, rotated: false };
    throw e;
  }
}
|
|
711
|
+
|
|
653
712
|
function withFileLock(lockPath, fn, {
|
|
654
713
|
timeoutMs = 5000,
|
|
655
714
|
retryDelayMs = 25,
|
|
@@ -1431,6 +1490,62 @@ const ENGINE_DEFAULTS = {
|
|
|
1431
1490
|
// keep_processes use cases.
|
|
1432
1491
|
requireGitWorkdir: true,
|
|
1433
1492
|
},
|
|
1493
|
+
// P-7a3b1c92 / plan W-mp7k1r760003b5dd — managed-spawn primitive: agents
|
|
1494
|
+
// describe long-running services in agents/<id>/managed-spawn.json and the
|
|
1495
|
+
// engine owns the spawn + healthcheck lifecycle. These defaults gate the
|
|
1496
|
+
// validator (engine/managed-spawn.js) and later the engine-side spawn loop,
|
|
1497
|
+
// per-tick sweep, playbook hint, and dashboard endpoints. Mirrors the
|
|
1498
|
+
// `keepProcesses` block intentionally — symmetry with the keep-pids
|
|
1499
|
+
// primitive is the documented design convention.
|
|
1500
|
+
managedSpawn: {
|
|
1501
|
+
enabled: true, // global kill switch; opt-in per-WI via meta.managed_spawn
|
|
1502
|
+
maxSpecsPerFile: 5, // ≤5 specs per managed-spawn.json file
|
|
1503
|
+
maxNameLength: 64, // kebab-case spec name cap
|
|
1504
|
+
maxArgsCount: 64, // child-process arg-vector cap per spec
|
|
1505
|
+
maxEnvVars: 32, // env-object cap per spec
|
|
1506
|
+
maxAttrsBytes: 2048, // serialized `attrs` blob cap per spec
|
|
1507
|
+
maxTtlMinutes: 1440, // 24h hard cap on per-spec TTL
|
|
1508
|
+
defaultTtlMinutes: 240, // 4h default when spec.ttl_minutes omitted
|
|
1509
|
+
sweepEvery: 30, // ticks between TTL/dead-PID sweeps
|
|
1510
|
+
defaultHealthIntervalSec: 1, // healthcheck polling cadence pre-healthy
|
|
1511
|
+
healthBackoffSec: 30, // healthcheck liveness cadence post-healthy
|
|
1512
|
+
logRotateBytes: 10 * 1024 * 1024, // 10MB rotation threshold for managed-logs/<name>.log
|
|
1513
|
+
bootReconcileMaxMs: 2000, // boot-time reconcile timeout (don't block engine boot)
|
|
1514
|
+
promptContextMaxBytes: 2048, // cap on auto-injected `## Live managed processes` block
|
|
1515
|
+
requireGitWorkdir: true, // reject specs whose `cwd` isn't a real git worktree
|
|
1516
|
+
// Single global executable allowlist. Applies to both `spec.cmd` and any
|
|
1517
|
+
// `command` healthcheck's `cmd`. Keep narrow — adding a binary here lets
|
|
1518
|
+
// any agent's sidecar invoke it under engine ownership.
|
|
1519
|
+
executableAllowlist: [
|
|
1520
|
+
'node', 'bun', 'npm', 'npx', 'pnpm', 'yarn',
|
|
1521
|
+
'python', 'python3', 'pip', 'pip3',
|
|
1522
|
+
'docker', 'podman',
|
|
1523
|
+
'adb', 'emulator',
|
|
1524
|
+
'gradle', 'gradlew', 'mvn',
|
|
1525
|
+
'pwsh', 'powershell', 'bash', 'sh',
|
|
1526
|
+
'curl', 'wget',
|
|
1527
|
+
'git',
|
|
1528
|
+
],
|
|
1529
|
+
// Env-key allowlist (exact match). Tight by default so a managed spec
|
|
1530
|
+
// can't leak credentials (AWS_*, AZURE_*, GH_TOKEN, etc.). Anything not
|
|
1531
|
+
// here must match one of the allowed prefixes below.
|
|
1532
|
+
envKeyAllowlist: [
|
|
1533
|
+
'NODE_ENV', 'PORT', 'HOST', 'PATH',
|
|
1534
|
+
'DEBUG', 'LOG_LEVEL',
|
|
1535
|
+
'HOME', 'USERPROFILE', 'TMPDIR', 'TEMP', 'TMP',
|
|
1536
|
+
'LANG', 'LC_ALL',
|
|
1537
|
+
'JAVA_HOME', 'ANDROID_HOME', 'ANDROID_SDK_ROOT',
|
|
1538
|
+
],
|
|
1539
|
+
// Env-key prefix allowlist. Standard ecosystem prefixes that frontends
|
|
1540
|
+
// and tooling depend on (Vite, Next.js, CRA, npm scripts). Extend with
|
|
1541
|
+
// caution; broad prefixes (`AWS_`, `AZURE_`) belong on a deny-list, not
|
|
1542
|
+
// an allow-list.
|
|
1543
|
+
envKeyAllowlistPrefixes: [
|
|
1544
|
+
'VITE_', 'NEXT_', 'REACT_APP_', 'NUXT_', 'GATSBY_',
|
|
1545
|
+
'npm_config_', 'NPM_CONFIG_',
|
|
1546
|
+
'MINIONS_',
|
|
1547
|
+
],
|
|
1548
|
+
},
|
|
1434
1549
|
// Backward-compat: keep `engine.claude.*` field family deprecation tracker. Listed here so preflight
|
|
1435
1550
|
// knows which subkeys to flag as deprecated. Do not consume `claude.*` in new code — use the runtime
|
|
1436
1551
|
// adapter system (engine/runtimes/) and the resolveAgent*/resolveCc* helpers instead.
|
|
@@ -2081,6 +2196,8 @@ const FAILURE_CLASS = {
|
|
|
2081
2196
|
WORKTREE_PREFLIGHT: 'worktree-preflight', // Pre-spawn worktree validation rejected (nested-in-project, drive-root collapse) — never retryable
|
|
2082
2197
|
INVALID_KEEP_PROCESSES_WORKDIR: 'invalid-keep-processes-workdir', // W-mp6k7ywi000fa33c: keep-pids.json declared a cwd that is not a real git worktree (likely a selective copy of the repo) — never retryable; agent must rerun in a real worktree
|
|
2083
2198
|
INVALID_KEEP_PROCESSES_SCHEMA: 'invalid-keep-processes-schema', // W-mp7i902u000l991f: keep-pids.json failed validation for a reason other than workdir (pids-missing, ttl-too-long, expires_at-missing, pids-too-many, port-invalid, etc.) — agent wrote the wrong shape; never retryable until they fix the file
|
|
2199
|
+
INVALID_MANAGED_SPAWN: 'invalid-managed-spawn', // P-7a3b1c92: agents/<id>/managed-spawn.json failed validator (bad schema, broken workdir, executable/env not on allowlist, healthcheck shape wrong). Engine refuses to spawn any spec — agent must fix file; never retryable as-is.
|
|
2200
|
+
MANAGED_SPAWN_HEALTHCHECK_FAILED: 'managed-spawn-healthcheck-failed', // P-7a3b1c92: at least one managed-spawn spec was spawned but failed its healthcheck within timeout_s. Engine killed the failing PIDs; siblings stay alive. Dispatch ERROR with the failing spec name + log tail surfaced in the inbox alert.
|
|
2084
2201
|
UNKNOWN: 'unknown', // Unclassified failure
|
|
2085
2202
|
};
|
|
2086
2203
|
const ESCALATION_POLICY = {
|
|
@@ -4297,6 +4414,7 @@ module.exports = {
|
|
|
4297
4414
|
_WIN_RESERVED_NAMES, // exported for testing
|
|
4298
4415
|
LOCK_STALE_MS,
|
|
4299
4416
|
isPidAlive,
|
|
4417
|
+
openAppendLogFd,
|
|
4300
4418
|
flushLogs,
|
|
4301
4419
|
redactSecrets,
|
|
4302
4420
|
slugify,
|
package/engine.js
CHANGED
|
@@ -2224,6 +2224,213 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
2224
2224
|
}
|
|
2225
2225
|
}
|
|
2226
2226
|
|
|
2227
|
+
// P-2d5e8f04 — managed-spawn acceptance gate. Symmetric to the
|
|
2228
|
+
// keep-processes block above but for `agents/<id>/managed-spawn.json`:
|
|
2229
|
+
// engine owns the spawn + lifecycle of the long-running services the
|
|
2230
|
+
// agent described in its sidecar. This gate (a) rejects malformed
|
|
2231
|
+
// sidecars as a hard non-retryable failure with a dedicated failure
|
|
2232
|
+
// class + inbox alert, and (b) on success spawns each spec detached and
|
|
2233
|
+
// batch-records them in engine/managed-processes.json. The first-healthcheck
|
|
2234
|
+
// wait + dispatch ERROR-on-healthcheck-failure live in the P-9c1f47a6 gate
|
|
2235
|
+
// further down; a spec that spawns successfully is recorded here with
|
|
2236
|
+
// healthy:false, alive:true and the engine sweep / healthcheck
|
|
2237
|
+
// loop will drive its state from there.
|
|
2238
|
+
let managedSpawnAcceptanceFailure = null;
|
|
2239
|
+
let managedSpawnSpawned = []; // [{name, pid, started_at, log_path}]
|
|
2240
|
+
{
|
|
2241
|
+
const _wiMeta = dispatchItem.meta?.item?.meta || {};
|
|
2242
|
+
const _msEnabled = !!_wiMeta.managed_spawn
|
|
2243
|
+
|| !!dispatchItem.meta?.managed_spawn;
|
|
2244
|
+
if (_msEnabled) {
|
|
2245
|
+
try {
|
|
2246
|
+
const managedSpawn = require('./engine/managed-spawn');
|
|
2247
|
+
const evalResult = managedSpawn.evaluateManagedSpawnAcceptance(agentId);
|
|
2248
|
+
if (evalResult.exists && !evalResult.accepted) {
|
|
2249
|
+
managedSpawnAcceptanceFailure = {
|
|
2250
|
+
reason: evalResult.reason,
|
|
2251
|
+
filePath: evalResult.filePath,
|
|
2252
|
+
isWorkdirRejection: !!evalResult.isWorkdirRejection,
|
|
2253
|
+
parsedRaw: evalResult.parsedRaw || null,
|
|
2254
|
+
};
|
|
2255
|
+
try { fs.unlinkSync(evalResult.filePath); } catch (_e) { /* gone or busy */ }
|
|
2256
|
+
log('warn', `managed-spawn acceptance: REJECTED ${agentId} (${id}) — ${evalResult.reason}; sidecar deleted`);
|
|
2257
|
+
try {
|
|
2258
|
+
const wiId = dispatchItem.meta?.item?.id || '';
|
|
2259
|
+
const canonicalHint = (() => {
|
|
2260
|
+
try {
|
|
2261
|
+
return managedSpawn.buildManagedSpawnHint({
|
|
2262
|
+
agentId,
|
|
2263
|
+
workItemId: wiId,
|
|
2264
|
+
minionsDir: shared.MINIONS_DIR,
|
|
2265
|
+
});
|
|
2266
|
+
} catch (_hintErr) { return ''; }
|
|
2267
|
+
})();
|
|
2268
|
+
let parsedSnippet = '';
|
|
2269
|
+
if (evalResult.parsedRaw) {
|
|
2270
|
+
try { parsedSnippet = JSON.stringify(evalResult.parsedRaw, null, 2); }
|
|
2271
|
+
catch (_jsonErr) { parsedSnippet = String(evalResult.parsedRaw); }
|
|
2272
|
+
if (parsedSnippet.length > 500) parsedSnippet = parsedSnippet.slice(0, 500) + '\n... (truncated)';
|
|
2273
|
+
}
|
|
2274
|
+
const alertBody = [
|
|
2275
|
+
`# managed_spawn setup REJECTED for ${agentId}`,
|
|
2276
|
+
'',
|
|
2277
|
+
`Your \`agents/${agentId}/managed-spawn.json\` failed validation: \`${evalResult.reason}\`.`,
|
|
2278
|
+
'No services were spawned and the dispatch was marked ERROR (non-retryable).',
|
|
2279
|
+
'',
|
|
2280
|
+
wiId ? `Work item: ${wiId}` : '',
|
|
2281
|
+
`Agent: ${agentId}`,
|
|
2282
|
+
`Dispatch: ${id}`,
|
|
2283
|
+
'',
|
|
2284
|
+
parsedSnippet ? '## What you wrote\n\n```json\n' + parsedSnippet + '\n```\n' : '',
|
|
2285
|
+
'## Canonical shape',
|
|
2286
|
+
'',
|
|
2287
|
+
canonicalHint || '(see `engine/managed-spawn.js` `buildManagedSpawnHint` for the canonical shape.)',
|
|
2288
|
+
'',
|
|
2289
|
+
].filter(Boolean).join('\n');
|
|
2290
|
+
writeInboxAlert(`managed-spawn-${agentId}`, alertBody);
|
|
2291
|
+
} catch (alertErr) {
|
|
2292
|
+
log('warn', `managed-spawn acceptance: failed to emit inbox alert for ${agentId}: ${alertErr.message}`);
|
|
2293
|
+
}
|
|
2294
|
+
} else if (evalResult.exists && evalResult.accepted && evalResult.record) {
|
|
2295
|
+
// Valid sidecar — spawn each spec detached and batch-record.
|
|
2296
|
+
// Per-spec failure here (e.g., binary missing on PATH despite
|
|
2297
|
+
// passing the allowlist) marks the whole gate failed so the
|
|
2298
|
+
// dispatch fails ERROR. Surviving siblings get killed for
|
|
2299
|
+
// consistency — the agent should not exit green with a partial
|
|
2300
|
+
// service set up.
|
|
2301
|
+
const ctx = {
|
|
2302
|
+
owner_agent: agentId,
|
|
2303
|
+
owner_wi: dispatchItem.meta?.item?.id || '',
|
|
2304
|
+
owner_project: project?.name || '',
|
|
2305
|
+
};
|
|
2306
|
+
const spawnedItems = [];
|
|
2307
|
+
let spawnFailureReason = null;
|
|
2308
|
+
for (const spec of evalResult.record.specs) {
|
|
2309
|
+
try {
|
|
2310
|
+
const runtime = managedSpawn.spawnManagedSpec(spec, ctx);
|
|
2311
|
+
spawnedItems.push({ spec, runtime });
|
|
2312
|
+
managedSpawnSpawned.push({ name: spec.name, pid: runtime.pid, started_at: runtime.started_at, log_path: runtime.log_path });
|
|
2313
|
+
} catch (specErr) {
|
|
2314
|
+
spawnFailureReason = `spawn failed for ${spec.name}: ${specErr.message}`;
|
|
2315
|
+
log('warn', `managed-spawn: ${spawnFailureReason}`);
|
|
2316
|
+
break;
|
|
2317
|
+
}
|
|
2318
|
+
}
|
|
2319
|
+
if (spawnFailureReason) {
|
|
2320
|
+
// Roll back: kill anything we just spawned, leave no dangling
|
|
2321
|
+
// state. This is consistent with the "all healthy or fail"
|
|
2322
|
+
// contract item 3 will enforce on healthcheck timeout.
|
|
2323
|
+
for (const item of spawnedItems) {
|
|
2324
|
+
try { shared.killByPidImmediate(item.runtime.pid); } catch (_e) {}
|
|
2325
|
+
}
|
|
2326
|
+
managedSpawnSpawned = [];
|
|
2327
|
+
managedSpawnAcceptanceFailure = {
|
|
2328
|
+
reason: spawnFailureReason,
|
|
2329
|
+
filePath: evalResult.filePath,
|
|
2330
|
+
isWorkdirRejection: false,
|
|
2331
|
+
parsedRaw: null,
|
|
2332
|
+
};
|
|
2333
|
+
try { fs.unlinkSync(evalResult.filePath); } catch (_e) {}
|
|
2334
|
+
} else {
|
|
2335
|
+
try {
|
|
2336
|
+
managedSpawn.recordManagedBatch(spawnedItems, ctx);
|
|
2337
|
+
} catch (recErr) {
|
|
2338
|
+
log('warn', `managed-spawn: state-file write failed for ${agentId}: ${recErr.message}`);
|
|
2339
|
+
}
|
|
2340
|
+
// The sidecar has been ingested into the state file; unlink
|
|
2341
|
+
// it so a future dispatch for this agent does not re-spawn
|
|
2342
|
+
// the same specs (the state file is the source of truth).
|
|
2343
|
+
try { fs.unlinkSync(evalResult.filePath); } catch (_e) {}
|
|
2344
|
+
log('info', `managed-spawn accepted: ${agentId} (${id}) spawned ${managedSpawnSpawned.length} spec(s)`);
|
|
2345
|
+
}
|
|
2346
|
+
}
|
|
2347
|
+
} catch (e) {
|
|
2348
|
+
log('warn', `managed-spawn acceptance check failed for ${agentId} (${id}): ${e.message}`);
|
|
2349
|
+
}
|
|
2350
|
+
}
|
|
2351
|
+
}
|
|
2352
|
+
|
|
2353
|
+
// P-9c1f47a6 — managed-spawn healthcheck gate (item 3). After all specs
|
|
2354
|
+
// are spawned + persisted, wait for each spec's first healthcheck to
|
|
2355
|
+
// pass (or fail) within its declared timeout_s. Any failure here forces
|
|
2356
|
+
// the dispatch to ERROR with FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED,
|
|
2357
|
+
// kills the failing spec's PID + removes its state entry, attaches a
|
|
2358
|
+
// log tail to the inbox alert, and leaves surviving siblings alone (a
|
|
2359
|
+
// partial-failure spec set is more useful than an all-or-nothing wipe
|
|
2360
|
+
// — the agent's later dispatch can inspect /api/managed-processes and
|
|
2361
|
+
// either restart the failed one or remove the survivors itself).
|
|
2362
|
+
let managedSpawnHealthcheckFailure = null;
|
|
2363
|
+
if (managedSpawnSpawned.length > 0) {
|
|
2364
|
+
try {
|
|
2365
|
+
const managedSpawn = require('./engine/managed-spawn');
|
|
2366
|
+
const items = managedSpawnSpawned;
|
|
2367
|
+
// Re-read the specs from the state file (recordManagedBatch normalised
|
|
2368
|
+
// them — healthcheck shape lives there now).
|
|
2369
|
+
const liveSpecs = managedSpawn.listManagedSpecs();
|
|
2370
|
+
const byName = new Map(liveSpecs.map(s => [s.name, s]));
|
|
2371
|
+
const results = await Promise.allSettled(items.map((spawned) => {
|
|
2372
|
+
const spec = byName.get(spawned.name);
|
|
2373
|
+
if (!spec || !spec.healthcheck) {
|
|
2374
|
+
return Promise.resolve({ healthy: false, error: 'no healthcheck recorded for ' + spawned.name, _name: spawned.name });
|
|
2375
|
+
}
|
|
2376
|
+
return managedSpawn.waitForFirstHealth(spec).then(r => Object.assign({ _name: spec.name, _pid: spawned.pid }, r));
|
|
2377
|
+
}));
|
|
2378
|
+
const failed = [];
|
|
2379
|
+
for (let i = 0; i < results.length; i++) {
|
|
2380
|
+
const r = results[i];
|
|
2381
|
+
if (r.status === 'rejected') {
|
|
2382
|
+
failed.push({ name: items[i].name, pid: items[i].pid, error: 'healthcheck threw: ' + (r.reason && r.reason.message ? r.reason.message : String(r.reason)) });
|
|
2383
|
+
continue;
|
|
2384
|
+
}
|
|
2385
|
+
if (!r.value.healthy) {
|
|
2386
|
+
failed.push({ name: r.value._name || items[i].name, pid: r.value._pid || items[i].pid, error: r.value.error || 'unhealthy' });
|
|
2387
|
+
}
|
|
2388
|
+
}
|
|
2389
|
+
if (failed.length > 0) {
|
|
2390
|
+
// Kill failing PIDs + drop their state entries. Surviving siblings
|
|
2391
|
+
// stay alive intentionally (see comment above).
|
|
2392
|
+
for (const f of failed) {
|
|
2393
|
+
try { managedSpawn.removeManagedSpec(f.name); }
|
|
2394
|
+
catch (e) { log('warn', `managed-spawn healthcheck: cleanup failed for ${f.name}: ${e.message}`); }
|
|
2395
|
+
}
|
|
2396
|
+
managedSpawnHealthcheckFailure = {
|
|
2397
|
+
failed: failed,
|
|
2398
|
+
survivedNames: items.filter(it => !failed.some(f => f.name === it.name)).map(it => it.name),
|
|
2399
|
+
};
|
|
2400
|
+
log('warn', `managed-spawn healthcheck: ${failed.length}/${items.length} spec(s) failed for ${agentId} (${id}); ` +
|
|
2401
|
+
failed.map(f => `${f.name}=${f.error}`).join('; '));
|
|
2402
|
+
try {
|
|
2403
|
+
const wiId = dispatchItem.meta?.item?.id || '';
|
|
2404
|
+
const logTails = failed.map(f => {
|
|
2405
|
+
const tail = managedSpawn.tailManagedLog(f.name, 50) || '(log empty or unreadable)';
|
|
2406
|
+
return '### ' + f.name + ' (pid ' + (f.pid || '?') + ')\n\nReason: `' + f.error + '`\n\n```\n' + tail.slice(-2000) + '\n```';
|
|
2407
|
+
}).join('\n\n');
|
|
2408
|
+
const alertBody = [
|
|
2409
|
+
`# managed_spawn healthcheck FAILED for ${agentId}`,
|
|
2410
|
+
'',
|
|
2411
|
+
`${failed.length} of ${items.length} spec(s) failed their first healthcheck within \`timeout_s\`. The failing PIDs were killed and their state entries removed; surviving siblings (${managedSpawnHealthcheckFailure.survivedNames.join(', ') || 'none'}) stay alive.`,
|
|
2412
|
+
'',
|
|
2413
|
+
wiId ? `Work item: ${wiId}` : '',
|
|
2414
|
+
`Agent: ${agentId}`,
|
|
2415
|
+
`Dispatch: ${id}`,
|
|
2416
|
+
'',
|
|
2417
|
+
'## Failure detail + log tails',
|
|
2418
|
+
'',
|
|
2419
|
+
logTails,
|
|
2420
|
+
'',
|
|
2421
|
+
].filter(Boolean).join('\n');
|
|
2422
|
+
writeInboxAlert(`managed-spawn-healthcheck-${agentId}`, alertBody);
|
|
2423
|
+
} catch (alertErr) {
|
|
2424
|
+
log('warn', `managed-spawn healthcheck: failed to emit inbox alert for ${agentId}: ${alertErr.message}`);
|
|
2425
|
+
}
|
|
2426
|
+
} else {
|
|
2427
|
+
log('info', `managed-spawn healthcheck: ${items.length} spec(s) healthy for ${agentId} (${id})`);
|
|
2428
|
+
}
|
|
2429
|
+
} catch (e) {
|
|
2430
|
+
log('warn', `managed-spawn healthcheck check failed for ${agentId} (${id}): ${e.message}`);
|
|
2431
|
+
}
|
|
2432
|
+
}
|
|
2433
|
+
|
|
2227
2434
|
// Move from active to completed in dispatch (single source of truth for agent status)
|
|
2228
2435
|
// autoRecovered: agent failed after creating PRs — treat as success
|
|
2229
2436
|
const hardContractFail = completionContractFailure?.severity === 'hard'
|
|
@@ -2239,7 +2446,15 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
2239
2446
|
// not silently treated as success even when exit code is 0. Both
|
|
2240
2447
|
// workdir and schema rejections route here; the failure_class differs.
|
|
2241
2448
|
const keepProcessesAcceptanceFail = !!keepProcessesAcceptanceFailure;
|
|
2242
|
-
|
|
2449
|
+
// P-2d5e8f04 — managed-spawn acceptance failure is also a hard failure
|
|
2450
|
+
// (same reasoning). Maps to FAILURE_CLASS.INVALID_MANAGED_SPAWN.
|
|
2451
|
+
const managedSpawnAcceptanceFail = !!managedSpawnAcceptanceFailure;
|
|
2452
|
+
// P-9c1f47a6 — managed-spawn healthcheck failure is also a hard failure:
|
|
2453
|
+
// the agent claims the service is set up but it never became healthy
|
|
2454
|
+
// within the declared timeout. Maps to
|
|
2455
|
+
// FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED.
|
|
2456
|
+
const managedSpawnHealthcheckFail = !!managedSpawnHealthcheckFailure;
|
|
2457
|
+
const effectiveResult = (hardContractFail || nonceFail || keepProcessesAcceptanceFail || managedSpawnAcceptanceFail || managedSpawnHealthcheckFail)
|
|
2243
2458
|
? DISPATCH_RESULT.ERROR
|
|
2244
2459
|
: (((code === 0 && !agentReportedFailure) || autoRecovered) ? DISPATCH_RESULT.SUCCESS : DISPATCH_RESULT.ERROR);
|
|
2245
2460
|
const finalCompletionReportPath = structuredCompletion?._path || dispatchItem.meta?.completionReportPath || shared.dispatchCompletionReportPath(id);
|
|
@@ -2252,21 +2467,30 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
2252
2467
|
? FAILURE_CLASS.INVALID_KEEP_PROCESSES_WORKDIR
|
|
2253
2468
|
: FAILURE_CLASS.INVALID_KEEP_PROCESSES_SCHEMA)
|
|
2254
2469
|
: null;
|
|
2255
|
-
const completeOpts =
|
|
2256
|
-
? { ...completionOpts, failureClass:
|
|
2257
|
-
: (
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
|
|
2266
|
-
|
|
2470
|
+
const completeOpts = managedSpawnHealthcheckFail
|
|
2471
|
+
? { ...completionOpts, failureClass: FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED, agentRetryable: false }
|
|
2472
|
+
: (managedSpawnAcceptanceFail
|
|
2473
|
+
? { ...completionOpts, failureClass: FAILURE_CLASS.INVALID_MANAGED_SPAWN, agentRetryable: false }
|
|
2474
|
+
: (keepProcessesAcceptanceFail
|
|
2475
|
+
? { ...completionOpts, failureClass: _kpFailureClass, agentRetryable: false }
|
|
2476
|
+
: (nonceFail
|
|
2477
|
+
? { ...completionOpts, failureClass: nonceMismatch.failureClass, agentRetryable: false }
|
|
2478
|
+
: (hardContractFail
|
|
2479
|
+
? { ...completionOpts, processWorkItemFailure: false }
|
|
2480
|
+
: (effectiveResult === DISPATCH_RESULT.ERROR ? {
|
|
2481
|
+
...completionOpts,
|
|
2482
|
+
...(failureClass ? { failureClass } : {}),
|
|
2483
|
+
...(typeof retryableDecision === 'boolean' ? { agentRetryable: retryableDecision } : {}),
|
|
2484
|
+
...(structuredCompletion?.failure_class ? { failureClass: structuredCompletion.failure_class } : {}),
|
|
2485
|
+
} : completionOpts)))));
|
|
2267
2486
|
// Extract last 5 non-empty stderr lines as error context when exit code is non-zero
|
|
2268
2487
|
let errorReason = '';
|
|
2269
|
-
if (
|
|
2488
|
+
if (managedSpawnHealthcheckFail) {
|
|
2489
|
+
const failNames = managedSpawnHealthcheckFailure.failed.map(f => f.name).join(',');
|
|
2490
|
+
errorReason = `managed_spawn_healthcheck_failed: ${failNames} (${managedSpawnHealthcheckFailure.failed.length}/${managedSpawnSpawned.length})`.slice(0, 300);
|
|
2491
|
+
} else if (managedSpawnAcceptanceFail) {
|
|
2492
|
+
errorReason = `invalid_managed_spawn: ${managedSpawnAcceptanceFailure.reason}`.slice(0, 300);
|
|
2493
|
+
} else if (keepProcessesAcceptanceFail) {
|
|
2270
2494
|
if (keepProcessesAcceptanceFailure.isWorkdirRejection) {
|
|
2271
2495
|
errorReason = `invalid_keep_processes_workdir: ${keepProcessesAcceptanceFailure.reason} (cwd=${keepProcessesAcceptanceFailure.cwd || '<unknown>'})`.slice(0, 300);
|
|
2272
2496
|
} else {
|
|
@@ -4015,6 +4239,14 @@ function renderProjectWorkItemPromptForAgent(item, workType, agentId, config, pr
|
|
|
4015
4239
|
keep_processes_ttl_minutes: item.meta && Number.isFinite(Number(item.meta.keep_processes_ttl_minutes))
|
|
4016
4240
|
? Math.floor(Number(item.meta.keep_processes_ttl_minutes))
|
|
4017
4241
|
: '',
|
|
4242
|
+
// P-1f9c3a45 — opt-in managed_spawn hint plumbed via item.meta. Same
|
|
4243
|
+
// default-off shape as keep_processes; truthy fires the agent-side
|
|
4244
|
+
// sidecar instructions in renderPlaybook. Live-processes auto-inject is
|
|
4245
|
+
// project-scoped and unconditional (not gated on this flag).
|
|
4246
|
+
managed_spawn: !!(item.meta && item.meta.managed_spawn),
|
|
4247
|
+
managed_spawn_ttl_minutes: item.meta && Number.isFinite(Number(item.meta.managed_spawn_ttl_minutes))
|
|
4248
|
+
? Math.floor(Number(item.meta.managed_spawn_ttl_minutes))
|
|
4249
|
+
: '',
|
|
4018
4250
|
};
|
|
4019
4251
|
const cpResult = buildWorkItemDispatchVars(item, vars, config, {
|
|
4020
4252
|
worktreePath: vars.worktree_path || root,
|
|
@@ -5425,6 +5657,24 @@ async function tickInner() {
|
|
|
5425
5657
|
if (_isTickStale(myGeneration)) return;
|
|
5426
5658
|
}
|
|
5427
5659
|
|
|
5660
|
+
// 2.53. managed-spawn TTL/dead-PID sweep + log rotation (P-8a4d6f29). Walks
|
|
5661
|
+
// engine/managed-processes.json, kills TTL-expired specs, drops dead-PID
|
|
5662
|
+
// rows, rotates managed-logs/<name>.log past ENGINE_DEFAULTS.managedSpawn
|
|
5663
|
+
// .logRotateBytes. Mirrors the keep-processes sweep cadence (sweepEvery=30)
|
|
5664
|
+
// so the engine never iterates per-spec on every tick. Healthcheck loops
|
|
5665
|
+
// remain per-spec / self-scheduled and are NOT driven from here.
|
|
5666
|
+
const managedSweepEvery = Math.max(1, ENGINE_DEFAULTS.managedSpawn?.sweepEvery || 30);
|
|
5667
|
+
if (ENGINE_DEFAULTS.managedSpawn?.enabled !== false && tickCount % managedSweepEvery === 0) {
|
|
5668
|
+
safe('sweepManagedSpawn', () => {
|
|
5669
|
+
const { sweepManagedSpawn } = require('./engine/managed-spawn');
|
|
5670
|
+
const stats = sweepManagedSpawn();
|
|
5671
|
+
if (stats.scanned > 0 && (stats.ttlExpired || stats.deadDropped || stats.rotatedLogs || stats.malformed)) {
|
|
5672
|
+
log('info', `managed-spawn sweep: scanned=${stats.scanned} ttl=${stats.ttlExpired} dead=${stats.deadDropped} killed=${stats.killedPids} rotated=${stats.rotatedLogs} malformed=${stats.malformed}`);
|
|
5673
|
+
}
|
|
5674
|
+
});
|
|
5675
|
+
if (_isTickStale(myGeneration)) return;
|
|
5676
|
+
}
|
|
5677
|
+
|
|
5428
5678
|
// 2.55. Check persistent watches (3 tick-equivalents, default ~3 minutes)
|
|
5429
5679
|
const watchPollIntervalMs = _pollIntervalMsFromTicks(3, tickIntervalMs);
|
|
5430
5680
|
if (_shouldRunPeriodicPhase(now, lastWatchCheckAt, watchPollIntervalMs)) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.1967",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
"test:e2e:report": "npx playwright show-report test/playwright/report",
|
|
21
21
|
"test:e2e:video": "npx playwright test --video=on --headed",
|
|
22
22
|
"test:all": "node test/run-parallel.js && node test/minions-tests.js && node test/integration/run.js",
|
|
23
|
+
"test:perf": "node test/perf/managed-spawn-load.test.js",
|
|
23
24
|
"test:e2e:accept": "node test/playwright/accept-baseline.js",
|
|
24
25
|
"test:e2e:accept-force": "node test/playwright/accept-baseline.js --force",
|
|
25
26
|
"test:setup": "npx playwright install chromium"
|