substrate-ai 0.2.13 → 0.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/cli/index.js +287 -99
  2. package/package.json +1 -1
package/dist/cli/index.js CHANGED
@@ -2750,6 +2750,140 @@ function defaultSupervisorDeps() {
2750
2750
  }
2751
2751
  };
2752
2752
  }
2753
+ /** Build the supervisor:poll event payload. */
2754
+ function buildPollEvent(health, projectRoot, tokenSnapshot, extraFields) {
2755
+ const proc = health.process ?? {
2756
+ orchestrator_pid: null,
2757
+ child_pids: [],
2758
+ zombies: []
2759
+ };
2760
+ return {
2761
+ type: "supervisor:poll",
2762
+ run_id: health.run_id,
2763
+ verdict: health.verdict,
2764
+ staleness_seconds: health.staleness_seconds,
2765
+ stories: {
2766
+ active: health.stories.active,
2767
+ completed: health.stories.completed,
2768
+ escalated: health.stories.escalated
2769
+ },
2770
+ story_details: health.stories.details,
2771
+ tokens: tokenSnapshot,
2772
+ process: {
2773
+ orchestrator_pid: proc.orchestrator_pid,
2774
+ child_count: proc.child_pids.length,
2775
+ zombie_count: proc.zombies.length
2776
+ },
2777
+ ...extraFields
2778
+ };
2779
+ }
2780
+ /** Extract succeeded / failed / escalated story keys from health details. */
2781
+ function buildTerminalSummary(storyDetails) {
2782
+ const succeeded = [];
2783
+ const failed = [];
2784
+ const escalated = [];
2785
+ for (const [k, s] of Object.entries(storyDetails)) if (s.phase === "COMPLETE") succeeded.push(k);
2786
+ else if (s.phase === "ESCALATED") escalated.push(k);
2787
+ else if (s.phase !== "PENDING") failed.push(k);
2788
+ return {
2789
+ succeeded,
2790
+ failed,
2791
+ escalated
2792
+ };
2793
+ }
2794
+ /**
2795
+ * Handle stall recovery for a single project: kill stalled processes, restart pipeline.
2796
+ *
2797
+ * Returns null if no stall detected (staleness below threshold).
2798
+ * Returns updated state + maxRestartsExceeded flag otherwise.
2799
+ */
2800
+ async function handleStallRecovery(health, state, config, deps, io) {
2801
+ const { stallThreshold, maxRestarts, pack, outputFormat } = config;
2802
+ const { killPid, resumePipeline, sleep, incrementRestarts, getAllDescendants } = deps;
2803
+ const { emitEvent, log } = io;
2804
+ const { projectRoot } = state;
2805
+ if (health.staleness_seconds < stallThreshold) return null;
2806
+ const directPids = [...health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid] : [], ...health.process.child_pids];
2807
+ const descendantPids = getAllDescendants(directPids);
2808
+ const directPidSet = new Set(directPids);
2809
+ const pids = [...directPids, ...descendantPids.filter((p) => !directPidSet.has(p))];
2810
+ emitEvent({
2811
+ type: "supervisor:kill",
2812
+ run_id: health.run_id,
2813
+ reason: "stall",
2814
+ staleness_seconds: health.staleness_seconds,
2815
+ pids
2816
+ });
2817
+ log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
2818
+ for (const pid of pids) try {
2819
+ killPid(pid, "SIGTERM");
2820
+ } catch {}
2821
+ await sleep(5e3);
2822
+ for (const pid of pids) try {
2823
+ killPid(pid, "SIGKILL");
2824
+ } catch {}
2825
+ if (pids.length > 0) {
2826
+ let allDead = false;
2827
+ for (let attempt = 0; attempt < 5; attempt++) {
2828
+ await sleep(1e3);
2829
+ allDead = pids.every((pid) => {
2830
+ try {
2831
+ process.kill(pid, 0);
2832
+ return false;
2833
+ } catch {
2834
+ return true;
2835
+ }
2836
+ });
2837
+ if (allDead) break;
2838
+ }
2839
+ if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
2840
+ }
2841
+ if (state.restartCount >= maxRestarts) {
2842
+ emitEvent({
2843
+ type: "supervisor:abort",
2844
+ run_id: health.run_id,
2845
+ reason: "max_restarts_exceeded",
2846
+ attempts: state.restartCount
2847
+ });
2848
+ log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
2849
+ return {
2850
+ state,
2851
+ maxRestartsExceeded: true
2852
+ };
2853
+ }
2854
+ const newRestartCount = state.restartCount + 1;
2855
+ if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
2856
+ emitEvent({
2857
+ type: "supervisor:restart",
2858
+ run_id: health.run_id,
2859
+ attempt: newRestartCount
2860
+ });
2861
+ log(`Supervisor: Restarting pipeline (attempt ${newRestartCount}/${maxRestarts})`);
2862
+ try {
2863
+ await resumePipeline({
2864
+ runId: health.run_id ?? void 0,
2865
+ outputFormat,
2866
+ projectRoot,
2867
+ concurrency: 3,
2868
+ pack
2869
+ });
2870
+ } catch (err) {
2871
+ const message = err instanceof Error ? err.message : String(err);
2872
+ log(`Supervisor: Resume error: ${message}`);
2873
+ emitEvent({
2874
+ type: "supervisor:error",
2875
+ reason: "resume_failed",
2876
+ message
2877
+ });
2878
+ }
2879
+ return {
2880
+ state: {
2881
+ ...state,
2882
+ restartCount: newRestartCount
2883
+ },
2884
+ maxRestartsExceeded: false
2885
+ };
2886
+ }
2753
2887
  /**
2754
2888
  * Run the pipeline supervisor — a long-running watchdog that polls pipeline health
2755
2889
  * and automatically kills and restarts stalled pipelines.
@@ -2763,11 +2897,16 @@ function defaultSupervisorDeps() {
2763
2897
  */
2764
2898
  async function runSupervisorAction(options, deps = {}) {
2765
2899
  const { pollInterval, stallThreshold, maxRestarts, outputFormat, projectRoot, runId, pack, experiment, maxExperiments } = options;
2766
- const { getHealth, killPid, resumePipeline, sleep, incrementRestarts, runAnalysis, getTokenSnapshot, getAllDescendants } = {
2900
+ const resolvedDeps = {
2767
2901
  ...defaultSupervisorDeps(),
2768
2902
  ...deps
2769
2903
  };
2770
- let restartCount = 0;
2904
+ const { getHealth, sleep, runAnalysis, getTokenSnapshot } = resolvedDeps;
2905
+ let state = {
2906
+ projectRoot,
2907
+ runId,
2908
+ restartCount: 0
2909
+ };
2771
2910
  const startTime = Date.now();
2772
2911
  function emitEvent(event) {
2773
2912
  if (outputFormat === "json") {
@@ -2793,46 +2932,20 @@ async function runSupervisorAction(options, deps = {}) {
2793
2932
  output: 0,
2794
2933
  cost_usd: 0
2795
2934
  };
2796
- const proc = health.process ?? {
2797
- orchestrator_pid: null,
2798
- child_pids: [],
2799
- zombies: []
2800
- };
2801
- emitEvent({
2802
- type: "supervisor:poll",
2803
- run_id: health.run_id,
2804
- verdict: health.verdict,
2805
- staleness_seconds: health.staleness_seconds,
2806
- stories: {
2807
- active: health.stories.active,
2808
- completed: health.stories.completed,
2809
- escalated: health.stories.escalated
2810
- },
2811
- story_details: health.stories.details,
2812
- tokens: tokenSnapshot,
2813
- process: {
2814
- orchestrator_pid: proc.orchestrator_pid,
2815
- child_count: proc.child_pids.length,
2816
- zombie_count: proc.zombies.length
2817
- }
2818
- });
2935
+ emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot));
2819
2936
  }
2820
2937
  log(`[${ts}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | stories: active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
2821
2938
  if (health.verdict === "NO_PIPELINE_RUNNING") {
2822
2939
  const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
2823
- const succeeded = Object.entries(health.stories.details).filter(([, s]) => s.phase === "COMPLETE").map(([k]) => k);
2824
- const failed = Object.entries(health.stories.details).filter(([, s]) => s.phase !== "COMPLETE" && s.phase !== "PENDING" && s.phase !== "ESCALATED").map(([k]) => k);
2825
- const escalated = Object.entries(health.stories.details).filter(([, s]) => s.phase === "ESCALATED").map(([k]) => k);
2940
+ const summary = buildTerminalSummary(health.stories.details);
2826
2941
  emitEvent({
2827
2942
  type: "supervisor:summary",
2828
2943
  run_id: health.run_id,
2829
2944
  elapsed_seconds: elapsedSeconds,
2830
- succeeded,
2831
- failed,
2832
- escalated,
2833
- restarts: restartCount
2945
+ ...summary,
2946
+ restarts: state.restartCount
2834
2947
  });
2835
- log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${succeeded.length} | failed: ${failed.length} | restarts: ${restartCount}`);
2948
+ log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${summary.succeeded.length} | failed: ${summary.failed.length} | restarts: ${state.restartCount}`);
2836
2949
  if (health.run_id !== null && runAnalysis !== void 0) {
2837
2950
  log(`[supervisor] Running post-run analysis for ${health.run_id}...`);
2838
2951
  try {
@@ -2958,87 +3071,162 @@ async function runSupervisorAction(options, deps = {}) {
2958
3071
  });
2959
3072
  }
2960
3073
  }
2961
- return failed.length > 0 || escalated.length > 0 ? 1 : 0;
3074
+ return summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0;
2962
3075
  }
2963
- if (health.staleness_seconds >= stallThreshold) {
2964
- const directPids = [...health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid] : [], ...health.process.child_pids];
2965
- const descendantPids = getAllDescendants(directPids);
2966
- const directPidSet = new Set(directPids);
2967
- const pids = [...directPids, ...descendantPids.filter((p) => !directPidSet.has(p))];
2968
- emitEvent({
2969
- type: "supervisor:kill",
2970
- run_id: health.run_id,
2971
- reason: "stall",
2972
- staleness_seconds: health.staleness_seconds,
2973
- pids
2974
- });
2975
- log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
2976
- for (const pid of pids) try {
2977
- killPid(pid, "SIGTERM");
2978
- } catch {}
2979
- await sleep(5e3);
2980
- for (const pid of pids) try {
2981
- killPid(pid, "SIGKILL");
2982
- } catch {}
2983
- if (pids.length > 0) {
2984
- let allDead = false;
2985
- for (let attempt = 0; attempt < 5; attempt++) {
2986
- await sleep(1e3);
2987
- allDead = pids.every((pid) => {
2988
- try {
2989
- process.kill(pid, 0);
2990
- return false;
2991
- } catch {
2992
- return true;
2993
- }
2994
- });
2995
- if (allDead) break;
2996
- }
2997
- if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
3076
+ const stallResult = await handleStallRecovery(health, state, {
3077
+ stallThreshold,
3078
+ maxRestarts,
3079
+ pack,
3080
+ outputFormat
3081
+ }, resolvedDeps, {
3082
+ emitEvent,
3083
+ log
3084
+ });
3085
+ if (stallResult !== null) {
3086
+ if (stallResult.maxRestartsExceeded) return 2;
3087
+ state = stallResult.state;
3088
+ }
3089
+ await sleep(pollInterval * 1e3);
3090
+ }
3091
+ }
3092
+ /**
3093
+ * Run the supervisor across multiple projects simultaneously.
3094
+ * Polls each project sequentially within each cycle, tagging events with `project`.
3095
+ *
3096
+ * Exit codes:
3097
+ * 0 — all projects completed without failures
3098
+ * 1 — at least one project completed with failures or escalations
3099
+ * 2 — at least one project hit max restarts
3100
+ */
3101
+ async function runMultiProjectSupervisor(options, deps = {}) {
3102
+ const { projects, pollInterval, stallThreshold, maxRestarts, outputFormat, pack } = options;
3103
+ const resolvedDeps = {
3104
+ ...defaultSupervisorDeps(),
3105
+ ...deps
3106
+ };
3107
+ const { getHealth, sleep, getTokenSnapshot } = resolvedDeps;
3108
+ if (projects.length === 0) {
3109
+ process.stderr.write("Error: --projects requires at least one project path\n");
3110
+ return 1;
3111
+ }
3112
+ const states = new Map(projects.map((p) => [p, {
3113
+ projectRoot: p,
3114
+ restartCount: 0
3115
+ }]));
3116
+ const doneProjects = new Set();
3117
+ const projectExitCodes = new Map();
3118
+ const startTime = Date.now();
3119
+ function emitEvent(event) {
3120
+ if (outputFormat === "json") {
3121
+ const stamped = {
3122
+ ...event,
3123
+ ts: new Date().toISOString()
3124
+ };
3125
+ process.stdout.write(JSON.stringify(stamped) + "\n");
3126
+ }
3127
+ }
3128
+ function log(message) {
3129
+ if (outputFormat === "human") process.stdout.write(message + "\n");
3130
+ }
3131
+ while (true) {
3132
+ for (const projectRoot of projects) {
3133
+ if (doneProjects.has(projectRoot)) continue;
3134
+ let health;
3135
+ try {
3136
+ health = await getHealth({ projectRoot });
3137
+ } catch {
3138
+ log(`[supervisor] ${projectRoot}: health check failed — marking as done`);
3139
+ emitEvent({
3140
+ type: "supervisor:error",
3141
+ project: projectRoot,
3142
+ reason: "health_check_failed"
3143
+ });
3144
+ doneProjects.add(projectRoot);
3145
+ projectExitCodes.set(projectRoot, 1);
3146
+ continue;
2998
3147
  }
2999
- if (restartCount >= maxRestarts) {
3148
+ const state = states.get(projectRoot);
3149
+ if (outputFormat === "json") {
3150
+ const tokenSnapshot = health.run_id !== null ? getTokenSnapshot(health.run_id, projectRoot) : {
3151
+ input: 0,
3152
+ output: 0,
3153
+ cost_usd: 0
3154
+ };
3155
+ emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot, { project: projectRoot }));
3156
+ }
3157
+ log(`[${projectRoot}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
3158
+ if (health.verdict === "NO_PIPELINE_RUNNING") {
3159
+ const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
3160
+ const summary = buildTerminalSummary(health.stories.details);
3000
3161
  emitEvent({
3001
- type: "supervisor:abort",
3162
+ type: "supervisor:summary",
3163
+ project: projectRoot,
3002
3164
  run_id: health.run_id,
3003
- reason: "max_restarts_exceeded",
3004
- attempts: restartCount
3165
+ elapsed_seconds: elapsedSeconds,
3166
+ ...summary,
3167
+ restarts: state.restartCount
3005
3168
  });
3006
- log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
3007
- return 2;
3169
+ log(`[${projectRoot}] Terminal. succeeded=${summary.succeeded.length} failed=${summary.failed.length} restarts=${state.restartCount}`);
3170
+ doneProjects.add(projectRoot);
3171
+ projectExitCodes.set(projectRoot, summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0);
3172
+ continue;
3008
3173
  }
3009
- restartCount++;
3010
- if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
3174
+ const stallResult = await handleStallRecovery(health, state, {
3175
+ stallThreshold,
3176
+ maxRestarts,
3177
+ pack,
3178
+ outputFormat
3179
+ }, resolvedDeps, {
3180
+ emitEvent: (evt) => emitEvent({
3181
+ ...evt,
3182
+ project: projectRoot
3183
+ }),
3184
+ log: (msg) => log(`[${projectRoot}] ${msg}`)
3185
+ });
3186
+ if (stallResult !== null) if (stallResult.maxRestartsExceeded) {
3187
+ doneProjects.add(projectRoot);
3188
+ projectExitCodes.set(projectRoot, 2);
3189
+ } else states.set(projectRoot, stallResult.state);
3190
+ }
3191
+ if (doneProjects.size >= projects.length) {
3192
+ const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
3011
3193
  emitEvent({
3012
- type: "supervisor:restart",
3013
- run_id: health.run_id,
3014
- attempt: restartCount
3194
+ type: "supervisor:done",
3195
+ elapsed_seconds: elapsedSeconds,
3196
+ project_results: Object.fromEntries(projectExitCodes)
3015
3197
  });
3016
- log(`Supervisor: Restarting pipeline (attempt ${restartCount}/${maxRestarts})`);
3017
- try {
3018
- await resumePipeline({
3019
- runId: health.run_id ?? void 0,
3020
- outputFormat,
3021
- projectRoot,
3022
- concurrency: 3,
3023
- pack
3024
- });
3025
- } catch (err) {
3026
- const message = err instanceof Error ? err.message : String(err);
3027
- log(`Supervisor: Resume error: ${message}`);
3028
- if (outputFormat === "json") emitEvent({
3029
- type: "supervisor:error",
3030
- reason: "resume_failed",
3031
- message
3032
- });
3033
- }
3198
+ log(`\nAll projects reached terminal state. Elapsed: ${elapsedSeconds}s`);
3199
+ const exitCodes = [...projectExitCodes.values()];
3200
+ if (exitCodes.includes(2)) return 2;
3201
+ if (exitCodes.includes(1)) return 1;
3202
+ return 0;
3034
3203
  }
3035
3204
  await sleep(pollInterval * 1e3);
3036
3205
  }
3037
3206
  }
3038
3207
  function registerSupervisorCommand(program, _version = "0.0.0", projectRoot = process.cwd()) {
3039
- program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
3208
+ program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--projects <paths>", "Comma-separated project root directories to monitor (multi-project mode)").option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
3040
3209
  const outputFormat = opts.outputFormat === "json" ? "json" : "human";
3041
3210
  if (opts.stallThreshold < 120) console.warn(`Warning: --stall-threshold ${opts.stallThreshold}s is below 120s. Agent steps typically take 45-90s. This may cause false stall detections and wasted restarts.`);
3211
+ if (opts.projects) {
3212
+ if (opts.runId) {
3213
+ console.error("Error: --run-id cannot be used with --projects (ambiguous)");
3214
+ process.exitCode = 1;
3215
+ return;
3216
+ }
3217
+ if (opts.experiment) console.warn("Warning: --experiment is not supported in multi-project mode — ignored.");
3218
+ const projects = opts.projects.split(",").map((p) => resolve(p.trim()));
3219
+ const exitCode$1 = await runMultiProjectSupervisor({
3220
+ projects,
3221
+ pollInterval: opts.pollInterval,
3222
+ stallThreshold: opts.stallThreshold,
3223
+ maxRestarts: opts.maxRestarts,
3224
+ outputFormat,
3225
+ pack: opts.pack
3226
+ });
3227
+ process.exitCode = exitCode$1;
3228
+ return;
3229
+ }
3042
3230
  const exitCode = await runSupervisorAction({
3043
3231
  pollInterval: opts.pollInterval,
3044
3232
  stallThreshold: opts.stallThreshold,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "substrate-ai",
3
- "version": "0.2.13",
3
+ "version": "0.2.14",
4
4
  "description": "Substrate — multi-agent orchestration daemon for AI coding agents",
5
5
  "type": "module",
6
6
  "license": "MIT",