substrate-ai 0.2.11 → 0.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import { createLogger, deepMask } from "../logger-C6n1g8uP.js";
3
3
  import { AdapterRegistry, createEventBus } from "../event-bus-J-bw-pkp.js";
4
4
  import { CURRENT_CONFIG_FORMAT_VERSION, CURRENT_TASK_GRAPH_VERSION, PartialSubstrateConfigSchema, SUPPORTED_CONFIG_FORMAT_VERSIONS, SubstrateConfigSchema, defaultConfigMigrator } from "../version-manager-impl-BpVx2DkY.js";
5
- import { DatabaseWrapper, SUBSTRATE_OWNED_SETTINGS_KEYS, VALID_PHASES, buildPipelineStatusOutput, createContextCompiler, createDispatcher, createImplementationOrchestrator, createPackLoader, createPhaseOrchestrator, createStopAfterGate, findPackageRoot, formatOutput, formatPhaseCompletionSummary, formatPipelineStatusHuman, formatPipelineSummary, formatTokenTelemetry, getSubstrateDefaultSettings, parseDbTimestampAsUtc, registerRunCommand, resolveBmadMethodSrcPath, resolveBmadMethodVersion, resolveMainRepoRoot, runAnalysisPhase, runMigrations, runPlanningPhase, runSolutioningPhase, validateStopAfterFromConflict } from "../run-L-R_XYNT.js";
5
+ import { DatabaseWrapper, SUBSTRATE_OWNED_SETTINGS_KEYS, VALID_PHASES, buildPipelineStatusOutput, createContextCompiler, createDispatcher, createImplementationOrchestrator, createPackLoader, createPhaseOrchestrator, createStopAfterGate, findPackageRoot, formatOutput, formatPhaseCompletionSummary, formatPipelineStatusHuman, formatPipelineSummary, formatTokenTelemetry, getSubstrateDefaultSettings, parseDbTimestampAsUtc, registerRunCommand, resolveBmadMethodSrcPath, resolveBmadMethodVersion, resolveMainRepoRoot, runAnalysisPhase, runMigrations, runPlanningPhase, runSolutioningPhase, validateStopAfterFromConflict } from "../run-CoP8UQU3.js";
6
6
  import { ConfigError, ConfigIncompatibleFormatError } from "../errors-BPqtzQ4U.js";
7
7
  import { addTokenUsage, createDecision, getDecisionsByPhaseForRun, getLatestRun, getPipelineRunById, getTokenUsageSummary, listRequirements, updatePipelineRun } from "../decisions-DNYByk0U.js";
8
8
  import { aggregateTokenUsageForRun, compareRunMetrics, getBaselineRunMetrics, getRunMetrics, getStoryMetricsForRun, incrementRunRestarts, listRunMetrics, tagRunAsBaseline } from "../metrics-BSg8VIHd.js";
@@ -2414,16 +2414,24 @@ const DEFAULT_STALL_THRESHOLD_SECONDS = 600;
2414
2414
  * - `node dist/cli/index.js run` (npm run substrate:dev)
2415
2415
  * - `npx substrate run`
2416
2416
  * - any node process whose command contains `run` with `--events` or `--stories`
2417
+ *
2418
+ * When `projectRoot` is provided, additionally checks that the command line
2419
+ * contains that path (via `--project-root` flag or as part of the binary/CWD path).
2420
+ * This ensures multi-project environments match the correct orchestrator.
2417
2421
  */
2418
- function isOrchestratorProcessLine(line) {
2422
+ function isOrchestratorProcessLine(line, projectRoot) {
2419
2423
  if (line.includes("grep")) return false;
2420
- if (line.includes("substrate run")) return true;
2421
- if (line.includes("substrate-ai run")) return true;
2422
- if (line.includes("index.js run")) return true;
2423
- if (line.includes("node") && /\srun(\s|$)/.test(line) && (line.includes("--events") || line.includes("--stories"))) return true;
2424
- return false;
2425
- }
2426
- function inspectProcessTree(execFileSyncOverride) {
2424
+ let isOrchestrator = false;
2425
+ if (line.includes("substrate run")) isOrchestrator = true;
2426
+ else if (line.includes("substrate-ai run")) isOrchestrator = true;
2427
+ else if (line.includes("index.js run")) isOrchestrator = true;
2428
+ else if (line.includes("node") && /\srun(\s|$)/.test(line) && (line.includes("--events") || line.includes("--stories"))) isOrchestrator = true;
2429
+ if (!isOrchestrator) return false;
2430
+ if (projectRoot !== void 0) return line.includes(projectRoot);
2431
+ return true;
2432
+ }
2433
+ function inspectProcessTree(opts) {
2434
+ const { projectRoot, execFileSync: execFileSyncOverride } = opts ?? {};
2427
2435
  const result = {
2428
2436
  orchestrator_pid: null,
2429
2437
  child_pids: [],
@@ -2443,7 +2451,7 @@ function inspectProcessTree(execFileSyncOverride) {
2443
2451
  });
2444
2452
  }
2445
2453
  const lines = psOutput.split("\n");
2446
- for (const line of lines) if (isOrchestratorProcessLine(line)) {
2454
+ for (const line of lines) if (isOrchestratorProcessLine(line, projectRoot)) {
2447
2455
  const match = line.trim().match(/^(\d+)/);
2448
2456
  if (match) {
2449
2457
  result.orchestrator_pid = parseInt(match[1], 10);
@@ -2466,6 +2474,58 @@ function inspectProcessTree(execFileSyncOverride) {
2466
2474
  return result;
2467
2475
  }
2468
2476
  /**
2477
+ * Collect all descendant PIDs of the given root PIDs by walking the process
2478
+ * tree recursively. This ensures that grandchildren of the orchestrator
2479
+ * (e.g. node subprocesses spawned by `claude -p`) are also killed during
2480
+ * stall recovery, leaving no orphan processes.
2481
+ *
2482
+ * Returns only the descendants — the root PIDs themselves are NOT included.
2483
+ */
2484
+ function getAllDescendantPids(rootPids, execFileSyncOverride) {
2485
+ if (rootPids.length === 0) return [];
2486
+ try {
2487
+ let psOutput;
2488
+ if (execFileSyncOverride !== void 0) psOutput = execFileSyncOverride("ps", ["-eo", "pid,ppid"], {
2489
+ encoding: "utf-8",
2490
+ timeout: 5e3
2491
+ });
2492
+ else {
2493
+ const { execFileSync } = __require("node:child_process");
2494
+ psOutput = execFileSync("ps", ["-eo", "pid,ppid"], {
2495
+ encoding: "utf-8",
2496
+ timeout: 5e3
2497
+ });
2498
+ }
2499
+ const childrenOf = new Map();
2500
+ for (const line of psOutput.split("\n")) {
2501
+ const parts = line.trim().split(/\s+/);
2502
+ if (parts.length >= 2) {
2503
+ const pid = parseInt(parts[0], 10);
2504
+ const ppid = parseInt(parts[1], 10);
2505
+ if (!isNaN(pid) && !isNaN(ppid) && pid > 0) {
2506
+ if (!childrenOf.has(ppid)) childrenOf.set(ppid, []);
2507
+ childrenOf.get(ppid).push(pid);
2508
+ }
2509
+ }
2510
+ }
2511
+ const descendants = [];
2512
+ const seen = new Set(rootPids);
2513
+ const queue = [...rootPids];
2514
+ while (queue.length > 0) {
2515
+ const current = queue.shift();
2516
+ const children = childrenOf.get(current) ?? [];
2517
+ for (const child of children) if (!seen.has(child)) {
2518
+ seen.add(child);
2519
+ descendants.push(child);
2520
+ queue.push(child);
2521
+ }
2522
+ }
2523
+ return descendants;
2524
+ } catch {
2525
+ return [];
2526
+ }
2527
+ }
2528
+ /**
2469
2529
  * Fetch pipeline health data as a structured object without any stdout side-effects.
2470
2530
  * Used by runSupervisorAction to poll health without formatting overhead.
2471
2531
  *
@@ -2524,10 +2584,11 @@ async function getAutoHealthData(options) {
2524
2584
  }
2525
2585
  }
2526
2586
  } catch {}
2527
- const processInfo = inspectProcessTree();
2587
+ const processInfo = inspectProcessTree({ projectRoot });
2528
2588
  let verdict = "NO_PIPELINE_RUNNING";
2529
2589
  if (run.status === "running") if (processInfo.orchestrator_pid === null && active === 0 && completed > 0) verdict = "NO_PIPELINE_RUNNING";
2530
2590
  else if (processInfo.zombies.length > 0) verdict = "STALLED";
2591
+ else if (processInfo.orchestrator_pid !== null && processInfo.child_pids.length > 0 && stalenessSeconds > DEFAULT_STALL_THRESHOLD_SECONDS) verdict = "HEALTHY";
2531
2592
  else if (stalenessSeconds > DEFAULT_STALL_THRESHOLD_SECONDS) verdict = "STALLED";
2532
2593
  else if (processInfo.orchestrator_pid !== null && processInfo.child_pids.length === 0 && active > 0) verdict = "STALLED";
2533
2594
  else verdict = "HEALTHY";
@@ -2660,6 +2721,7 @@ function defaultSupervisorDeps() {
2660
2721
  };
2661
2722
  }
2662
2723
  },
2724
+ getAllDescendants: (rootPids) => getAllDescendantPids(rootPids),
2663
2725
  runAnalysis: async (runId, projectRoot) => {
2664
2726
  const dbPath = join(projectRoot, ".substrate", "substrate.db");
2665
2727
  if (!existsSync(dbPath)) return;
@@ -2688,6 +2750,140 @@ function defaultSupervisorDeps() {
2688
2750
  }
2689
2751
  };
2690
2752
  }
2753
+ /** Build the supervisor:poll event payload. */
2754
+ function buildPollEvent(health, projectRoot, tokenSnapshot, extraFields) {
2755
+ const proc = health.process ?? {
2756
+ orchestrator_pid: null,
2757
+ child_pids: [],
2758
+ zombies: []
2759
+ };
2760
+ return {
2761
+ type: "supervisor:poll",
2762
+ run_id: health.run_id,
2763
+ verdict: health.verdict,
2764
+ staleness_seconds: health.staleness_seconds,
2765
+ stories: {
2766
+ active: health.stories.active,
2767
+ completed: health.stories.completed,
2768
+ escalated: health.stories.escalated
2769
+ },
2770
+ story_details: health.stories.details,
2771
+ tokens: tokenSnapshot,
2772
+ process: {
2773
+ orchestrator_pid: proc.orchestrator_pid,
2774
+ child_count: proc.child_pids.length,
2775
+ zombie_count: proc.zombies.length
2776
+ },
2777
+ ...extraFields
2778
+ };
2779
+ }
2780
+ /** Extract succeeded / failed / escalated story keys from health details. */
2781
+ function buildTerminalSummary(storyDetails) {
2782
+ const succeeded = [];
2783
+ const failed = [];
2784
+ const escalated = [];
2785
+ for (const [k, s] of Object.entries(storyDetails)) if (s.phase === "COMPLETE") succeeded.push(k);
2786
+ else if (s.phase === "ESCALATED") escalated.push(k);
2787
+ else if (s.phase !== "PENDING") failed.push(k);
2788
+ return {
2789
+ succeeded,
2790
+ failed,
2791
+ escalated
2792
+ };
2793
+ }
2794
+ /**
2795
+ * Handle stall recovery for a single project: kill stalled processes, restart pipeline.
2796
+ *
2797
+ * Returns null if no stall detected (staleness below threshold).
2798
+ * Returns updated state + maxRestartsExceeded flag otherwise.
2799
+ */
2800
+ async function handleStallRecovery(health, state, config, deps, io) {
2801
+ const { stallThreshold, maxRestarts, pack, outputFormat } = config;
2802
+ const { killPid, resumePipeline, sleep, incrementRestarts, getAllDescendants } = deps;
2803
+ const { emitEvent, log } = io;
2804
+ const { projectRoot } = state;
2805
+ if (health.staleness_seconds < stallThreshold) return null;
2806
+ const directPids = [...health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid] : [], ...health.process.child_pids];
2807
+ const descendantPids = getAllDescendants(directPids);
2808
+ const directPidSet = new Set(directPids);
2809
+ const pids = [...directPids, ...descendantPids.filter((p) => !directPidSet.has(p))];
2810
+ emitEvent({
2811
+ type: "supervisor:kill",
2812
+ run_id: health.run_id,
2813
+ reason: "stall",
2814
+ staleness_seconds: health.staleness_seconds,
2815
+ pids
2816
+ });
2817
+ log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
2818
+ for (const pid of pids) try {
2819
+ killPid(pid, "SIGTERM");
2820
+ } catch {}
2821
+ await sleep(5e3);
2822
+ for (const pid of pids) try {
2823
+ killPid(pid, "SIGKILL");
2824
+ } catch {}
2825
+ if (pids.length > 0) {
2826
+ let allDead = false;
2827
+ for (let attempt = 0; attempt < 5; attempt++) {
2828
+ await sleep(1e3);
2829
+ allDead = pids.every((pid) => {
2830
+ try {
2831
+ process.kill(pid, 0);
2832
+ return false;
2833
+ } catch {
2834
+ return true;
2835
+ }
2836
+ });
2837
+ if (allDead) break;
2838
+ }
2839
+ if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
2840
+ }
2841
+ if (state.restartCount >= maxRestarts) {
2842
+ emitEvent({
2843
+ type: "supervisor:abort",
2844
+ run_id: health.run_id,
2845
+ reason: "max_restarts_exceeded",
2846
+ attempts: state.restartCount
2847
+ });
2848
+ log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
2849
+ return {
2850
+ state,
2851
+ maxRestartsExceeded: true
2852
+ };
2853
+ }
2854
+ const newRestartCount = state.restartCount + 1;
2855
+ if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
2856
+ emitEvent({
2857
+ type: "supervisor:restart",
2858
+ run_id: health.run_id,
2859
+ attempt: newRestartCount
2860
+ });
2861
+ log(`Supervisor: Restarting pipeline (attempt ${newRestartCount}/${maxRestarts})`);
2862
+ try {
2863
+ await resumePipeline({
2864
+ runId: health.run_id ?? void 0,
2865
+ outputFormat,
2866
+ projectRoot,
2867
+ concurrency: 3,
2868
+ pack
2869
+ });
2870
+ } catch (err) {
2871
+ const message = err instanceof Error ? err.message : String(err);
2872
+ log(`Supervisor: Resume error: ${message}`);
2873
+ emitEvent({
2874
+ type: "supervisor:error",
2875
+ reason: "resume_failed",
2876
+ message
2877
+ });
2878
+ }
2879
+ return {
2880
+ state: {
2881
+ ...state,
2882
+ restartCount: newRestartCount
2883
+ },
2884
+ maxRestartsExceeded: false
2885
+ };
2886
+ }
2691
2887
  /**
2692
2888
  * Run the pipeline supervisor — a long-running watchdog that polls pipeline health
2693
2889
  * and automatically kills and restarts stalled pipelines.
@@ -2701,11 +2897,16 @@ function defaultSupervisorDeps() {
2701
2897
  */
2702
2898
  async function runSupervisorAction(options, deps = {}) {
2703
2899
  const { pollInterval, stallThreshold, maxRestarts, outputFormat, projectRoot, runId, pack, experiment, maxExperiments } = options;
2704
- const { getHealth, killPid, resumePipeline, sleep, incrementRestarts, runAnalysis, getTokenSnapshot } = {
2900
+ const resolvedDeps = {
2705
2901
  ...defaultSupervisorDeps(),
2706
2902
  ...deps
2707
2903
  };
2708
- let restartCount = 0;
2904
+ const { getHealth, sleep, runAnalysis, getTokenSnapshot } = resolvedDeps;
2905
+ let state = {
2906
+ projectRoot,
2907
+ runId,
2908
+ restartCount: 0
2909
+ };
2709
2910
  const startTime = Date.now();
2710
2911
  function emitEvent(event) {
2711
2912
  if (outputFormat === "json") {
@@ -2731,46 +2932,20 @@ async function runSupervisorAction(options, deps = {}) {
2731
2932
  output: 0,
2732
2933
  cost_usd: 0
2733
2934
  };
2734
- const proc = health.process ?? {
2735
- orchestrator_pid: null,
2736
- child_pids: [],
2737
- zombies: []
2738
- };
2739
- emitEvent({
2740
- type: "supervisor:poll",
2741
- run_id: health.run_id,
2742
- verdict: health.verdict,
2743
- staleness_seconds: health.staleness_seconds,
2744
- stories: {
2745
- active: health.stories.active,
2746
- completed: health.stories.completed,
2747
- escalated: health.stories.escalated
2748
- },
2749
- story_details: health.stories.details,
2750
- tokens: tokenSnapshot,
2751
- process: {
2752
- orchestrator_pid: proc.orchestrator_pid,
2753
- child_count: proc.child_pids.length,
2754
- zombie_count: proc.zombies.length
2755
- }
2756
- });
2935
+ emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot));
2757
2936
  }
2758
2937
  log(`[${ts}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | stories: active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
2759
2938
  if (health.verdict === "NO_PIPELINE_RUNNING") {
2760
2939
  const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
2761
- const succeeded = Object.entries(health.stories.details).filter(([, s]) => s.phase === "COMPLETE").map(([k]) => k);
2762
- const failed = Object.entries(health.stories.details).filter(([, s]) => s.phase !== "COMPLETE" && s.phase !== "PENDING" && s.phase !== "ESCALATED").map(([k]) => k);
2763
- const escalated = Object.entries(health.stories.details).filter(([, s]) => s.phase === "ESCALATED").map(([k]) => k);
2940
+ const summary = buildTerminalSummary(health.stories.details);
2764
2941
  emitEvent({
2765
2942
  type: "supervisor:summary",
2766
2943
  run_id: health.run_id,
2767
2944
  elapsed_seconds: elapsedSeconds,
2768
- succeeded,
2769
- failed,
2770
- escalated,
2771
- restarts: restartCount
2945
+ ...summary,
2946
+ restarts: state.restartCount
2772
2947
  });
2773
- log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${succeeded.length} | failed: ${failed.length} | restarts: ${restartCount}`);
2948
+ log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${summary.succeeded.length} | failed: ${summary.failed.length} | restarts: ${state.restartCount}`);
2774
2949
  if (health.run_id !== null && runAnalysis !== void 0) {
2775
2950
  log(`[supervisor] Running post-run analysis for ${health.run_id}...`);
2776
2951
  try {
@@ -2833,7 +3008,7 @@ async function runSupervisorAction(options, deps = {}) {
2833
3008
  const expDb = expDbWrapper.db;
2834
3009
  const { runRunAction: runPipeline } = await import(
2835
3010
  /* @vite-ignore */
2836
- "../run-C8aOWnKG.js"
3011
+ "../run-B9IglY4m.js"
2837
3012
  );
2838
3013
  const runStoryFn = async (opts) => {
2839
3014
  const exitCode = await runPipeline({
@@ -2896,84 +3071,162 @@ async function runSupervisorAction(options, deps = {}) {
2896
3071
  });
2897
3072
  }
2898
3073
  }
2899
- return failed.length > 0 || escalated.length > 0 ? 1 : 0;
3074
+ return summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0;
2900
3075
  }
2901
- if (health.staleness_seconds >= stallThreshold) {
2902
- const pids = [...health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid] : [], ...health.process.child_pids];
2903
- emitEvent({
2904
- type: "supervisor:kill",
2905
- run_id: health.run_id,
2906
- reason: "stall",
2907
- staleness_seconds: health.staleness_seconds,
2908
- pids
2909
- });
2910
- log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
2911
- for (const pid of pids) try {
2912
- killPid(pid, "SIGTERM");
2913
- } catch {}
2914
- await sleep(5e3);
2915
- for (const pid of pids) try {
2916
- killPid(pid, "SIGKILL");
2917
- } catch {}
2918
- if (pids.length > 0) {
2919
- let allDead = false;
2920
- for (let attempt = 0; attempt < 5; attempt++) {
2921
- await sleep(1e3);
2922
- allDead = pids.every((pid) => {
2923
- try {
2924
- process.kill(pid, 0);
2925
- return false;
2926
- } catch {
2927
- return true;
2928
- }
2929
- });
2930
- if (allDead) break;
2931
- }
2932
- if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
3076
+ const stallResult = await handleStallRecovery(health, state, {
3077
+ stallThreshold,
3078
+ maxRestarts,
3079
+ pack,
3080
+ outputFormat
3081
+ }, resolvedDeps, {
3082
+ emitEvent,
3083
+ log
3084
+ });
3085
+ if (stallResult !== null) {
3086
+ if (stallResult.maxRestartsExceeded) return 2;
3087
+ state = stallResult.state;
3088
+ }
3089
+ await sleep(pollInterval * 1e3);
3090
+ }
3091
+ }
3092
+ /**
3093
+ * Run the supervisor across multiple projects simultaneously.
3094
+ * Polls each project sequentially within each cycle, tagging events with `project`.
3095
+ *
3096
+ * Exit codes:
3097
+ * 0 — all projects completed without failures
3098
+ * 1 — at least one project completed with failures or escalations
3099
+ * 2 — at least one project hit max restarts
3100
+ */
3101
+ async function runMultiProjectSupervisor(options, deps = {}) {
3102
+ const { projects, pollInterval, stallThreshold, maxRestarts, outputFormat, pack } = options;
3103
+ const resolvedDeps = {
3104
+ ...defaultSupervisorDeps(),
3105
+ ...deps
3106
+ };
3107
+ const { getHealth, sleep, getTokenSnapshot } = resolvedDeps;
3108
+ if (projects.length === 0) {
3109
+ process.stderr.write("Error: --projects requires at least one project path\n");
3110
+ return 1;
3111
+ }
3112
+ const states = new Map(projects.map((p) => [p, {
3113
+ projectRoot: p,
3114
+ restartCount: 0
3115
+ }]));
3116
+ const doneProjects = new Set();
3117
+ const projectExitCodes = new Map();
3118
+ const startTime = Date.now();
3119
+ function emitEvent(event) {
3120
+ if (outputFormat === "json") {
3121
+ const stamped = {
3122
+ ...event,
3123
+ ts: new Date().toISOString()
3124
+ };
3125
+ process.stdout.write(JSON.stringify(stamped) + "\n");
3126
+ }
3127
+ }
3128
+ function log(message) {
3129
+ if (outputFormat === "human") process.stdout.write(message + "\n");
3130
+ }
3131
+ while (true) {
3132
+ for (const projectRoot of projects) {
3133
+ if (doneProjects.has(projectRoot)) continue;
3134
+ let health;
3135
+ try {
3136
+ health = await getHealth({ projectRoot });
3137
+ } catch {
3138
+ log(`[supervisor] ${projectRoot}: health check failed — marking as done`);
3139
+ emitEvent({
3140
+ type: "supervisor:error",
3141
+ project: projectRoot,
3142
+ reason: "health_check_failed"
3143
+ });
3144
+ doneProjects.add(projectRoot);
3145
+ projectExitCodes.set(projectRoot, 1);
3146
+ continue;
3147
+ }
3148
+ const state = states.get(projectRoot);
3149
+ if (outputFormat === "json") {
3150
+ const tokenSnapshot = health.run_id !== null ? getTokenSnapshot(health.run_id, projectRoot) : {
3151
+ input: 0,
3152
+ output: 0,
3153
+ cost_usd: 0
3154
+ };
3155
+ emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot, { project: projectRoot }));
2933
3156
  }
2934
- if (restartCount >= maxRestarts) {
3157
+ log(`[${projectRoot}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
3158
+ if (health.verdict === "NO_PIPELINE_RUNNING") {
3159
+ const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
3160
+ const summary = buildTerminalSummary(health.stories.details);
2935
3161
  emitEvent({
2936
- type: "supervisor:abort",
3162
+ type: "supervisor:summary",
3163
+ project: projectRoot,
2937
3164
  run_id: health.run_id,
2938
- reason: "max_restarts_exceeded",
2939
- attempts: restartCount
3165
+ elapsed_seconds: elapsedSeconds,
3166
+ ...summary,
3167
+ restarts: state.restartCount
2940
3168
  });
2941
- log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
2942
- return 2;
3169
+ log(`[${projectRoot}] Terminal. succeeded=${summary.succeeded.length} failed=${summary.failed.length} restarts=${state.restartCount}`);
3170
+ doneProjects.add(projectRoot);
3171
+ projectExitCodes.set(projectRoot, summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0);
3172
+ continue;
2943
3173
  }
2944
- restartCount++;
2945
- if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
3174
+ const stallResult = await handleStallRecovery(health, state, {
3175
+ stallThreshold,
3176
+ maxRestarts,
3177
+ pack,
3178
+ outputFormat
3179
+ }, resolvedDeps, {
3180
+ emitEvent: (evt) => emitEvent({
3181
+ ...evt,
3182
+ project: projectRoot
3183
+ }),
3184
+ log: (msg) => log(`[${projectRoot}] ${msg}`)
3185
+ });
3186
+ if (stallResult !== null) if (stallResult.maxRestartsExceeded) {
3187
+ doneProjects.add(projectRoot);
3188
+ projectExitCodes.set(projectRoot, 2);
3189
+ } else states.set(projectRoot, stallResult.state);
3190
+ }
3191
+ if (doneProjects.size >= projects.length) {
3192
+ const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
2946
3193
  emitEvent({
2947
- type: "supervisor:restart",
2948
- run_id: health.run_id,
2949
- attempt: restartCount
3194
+ type: "supervisor:done",
3195
+ elapsed_seconds: elapsedSeconds,
3196
+ project_results: Object.fromEntries(projectExitCodes)
2950
3197
  });
2951
- log(`Supervisor: Restarting pipeline (attempt ${restartCount}/${maxRestarts})`);
2952
- try {
2953
- await resumePipeline({
2954
- runId: health.run_id ?? void 0,
2955
- outputFormat,
2956
- projectRoot,
2957
- concurrency: 3,
2958
- pack
2959
- });
2960
- } catch (err) {
2961
- const message = err instanceof Error ? err.message : String(err);
2962
- log(`Supervisor: Resume error: ${message}`);
2963
- if (outputFormat === "json") emitEvent({
2964
- type: "supervisor:error",
2965
- reason: "resume_failed",
2966
- message
2967
- });
2968
- }
3198
+ log(`\nAll projects reached terminal state. Elapsed: ${elapsedSeconds}s`);
3199
+ const exitCodes = [...projectExitCodes.values()];
3200
+ if (exitCodes.includes(2)) return 2;
3201
+ if (exitCodes.includes(1)) return 1;
3202
+ return 0;
2969
3203
  }
2970
3204
  await sleep(pollInterval * 1e3);
2971
3205
  }
2972
3206
  }
2973
3207
  function registerSupervisorCommand(program, _version = "0.0.0", projectRoot = process.cwd()) {
2974
- program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
3208
+ program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--projects <paths>", "Comma-separated project root directories to monitor (multi-project mode)").option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
2975
3209
  const outputFormat = opts.outputFormat === "json" ? "json" : "human";
2976
3210
  if (opts.stallThreshold < 120) console.warn(`Warning: --stall-threshold ${opts.stallThreshold}s is below 120s. Agent steps typically take 45-90s. This may cause false stall detections and wasted restarts.`);
3211
+ if (opts.projects) {
3212
+ if (opts.runId) {
3213
+ console.error("Error: --run-id cannot be used with --projects (ambiguous)");
3214
+ process.exitCode = 1;
3215
+ return;
3216
+ }
3217
+ if (opts.experiment) console.warn("Warning: --experiment is not supported in multi-project mode — ignored.");
3218
+ const projects = opts.projects.split(",").map((p) => resolve(p.trim()));
3219
+ const exitCode$1 = await runMultiProjectSupervisor({
3220
+ projects,
3221
+ pollInterval: opts.pollInterval,
3222
+ stallThreshold: opts.stallThreshold,
3223
+ maxRestarts: opts.maxRestarts,
3224
+ outputFormat,
3225
+ pack: opts.pack
3226
+ });
3227
+ process.exitCode = exitCode$1;
3228
+ return;
3229
+ }
2977
3230
  const exitCode = await runSupervisorAction({
2978
3231
  pollInterval: opts.pollInterval,
2979
3232
  stallThreshold: opts.stallThreshold,
package/dist/index.d.ts CHANGED
@@ -1025,6 +1025,8 @@ interface OrchestratorEvents {
1025
1025
  storyKey: string;
1026
1026
  phase: string;
1027
1027
  elapsedMs: number;
1028
+ /** PID of the stalled child process, or null if not tracked */
1029
+ childPid: number | null;
1028
1030
  };
1029
1031
  /** Readiness check has completed — emitted for all verdicts (READY, NEEDS_WORK, NOT_READY) */
1030
1032
  'solutioning:readiness-check': {
@@ -1,6 +1,6 @@
1
1
  import "./logger-C6n1g8uP.js";
2
2
  import "./event-bus-J-bw-pkp.js";
3
- import { registerRunCommand, runRunAction } from "./run-L-R_XYNT.js";
3
+ import { registerRunCommand, runRunAction } from "./run-CoP8UQU3.js";
4
4
  import "./decisions-DNYByk0U.js";
5
5
  import "./metrics-BSg8VIHd.js";
6
6
 
@@ -1202,6 +1202,15 @@ function buildPipelineStatusOutput(run, tokenSummary, decisionsCount, storiesCou
1202
1202
  totalOutput += row.total_output_tokens;
1203
1203
  totalCost += row.total_cost_usd;
1204
1204
  }
1205
+ let activeDispatches = 0;
1206
+ try {
1207
+ if (run.token_usage_json) {
1208
+ const state = JSON.parse(run.token_usage_json);
1209
+ if (state.stories) {
1210
+ for (const s of Object.values(state.stories)) if (s.phase !== "PENDING" && s.phase !== "COMPLETE" && s.phase !== "ESCALATED") activeDispatches++;
1211
+ }
1212
+ }
1213
+ } catch {}
1205
1214
  return {
1206
1215
  run_id: run.id,
1207
1216
  current_phase: currentPhase,
@@ -1214,7 +1223,9 @@ function buildPipelineStatusOutput(run, tokenSummary, decisionsCount, storiesCou
1214
1223
  decisions_count: decisionsCount,
1215
1224
  stories_count: storiesCount,
1216
1225
  last_activity: run.updated_at,
1217
- staleness_seconds: Math.round((Date.now() - parseDbTimestampAsUtc(run.updated_at).getTime()) / 1e3)
1226
+ staleness_seconds: Math.round((Date.now() - parseDbTimestampAsUtc(run.updated_at).getTime()) / 1e3),
1227
+ last_event_ts: run.updated_at,
1228
+ active_dispatches: activeDispatches
1218
1229
  };
1219
1230
  }
1220
1231
  /**
@@ -5359,6 +5370,7 @@ function createImplementationOrchestrator(deps) {
5359
5370
  let _heartbeatTimer = null;
5360
5371
  const HEARTBEAT_INTERVAL_MS = 3e4;
5361
5372
  const WATCHDOG_TIMEOUT_MS = 6e5;
5373
+ const _stalledStories = new Set();
5362
5374
  const _phaseStartMs = new Map();
5363
5375
  const _phaseEndMs = new Map();
5364
5376
  const _storyDispatches = new Map();
@@ -5454,6 +5466,7 @@ function createImplementationOrchestrator(deps) {
5454
5466
  }
5455
5467
  function recordProgress() {
5456
5468
  _lastProgressTs = Date.now();
5469
+ _stalledStories.clear();
5457
5470
  }
5458
5471
  function startHeartbeat() {
5459
5472
  if (_heartbeatTimer !== null) return;
@@ -5465,7 +5478,8 @@ function createImplementationOrchestrator(deps) {
5465
5478
  for (const s of _stories.values()) if (s.phase === "COMPLETE" || s.phase === "ESCALATED") completed++;
5466
5479
  else if (s.phase === "PENDING") queued++;
5467
5480
  else active++;
5468
- eventBus.emit("orchestrator:heartbeat", {
5481
+ const timeSinceProgress = Date.now() - _lastProgressTs;
5482
+ if (timeSinceProgress >= HEARTBEAT_INTERVAL_MS) eventBus.emit("orchestrator:heartbeat", {
5469
5483
  runId: config.pipelineRunId ?? "",
5470
5484
  activeDispatches: active,
5471
5485
  completedDispatches: completed,
@@ -5474,6 +5488,8 @@ function createImplementationOrchestrator(deps) {
5474
5488
  const elapsed = Date.now() - _lastProgressTs;
5475
5489
  if (elapsed >= WATCHDOG_TIMEOUT_MS) {
5476
5490
  for (const [key, s] of _stories) if (s.phase !== "PENDING" && s.phase !== "COMPLETE" && s.phase !== "ESCALATED") {
5491
+ if (_stalledStories.has(key)) continue;
5492
+ _stalledStories.add(key);
5477
5493
  logger$16.warn({
5478
5494
  storyKey: key,
5479
5495
  phase: s.phase,
@@ -5483,7 +5499,8 @@ function createImplementationOrchestrator(deps) {
5483
5499
  runId: config.pipelineRunId ?? "",
5484
5500
  storyKey: key,
5485
5501
  phase: s.phase,
5486
- elapsedMs: elapsed
5502
+ elapsedMs: elapsed,
5503
+ childPid: null
5487
5504
  });
5488
5505
  }
5489
5506
  }
@@ -6244,7 +6261,7 @@ function createImplementationOrchestrator(deps) {
6244
6261
  });
6245
6262
  persistState();
6246
6263
  recordProgress();
6247
- startHeartbeat();
6264
+ if (config.enableHeartbeat) startHeartbeat();
6248
6265
  if (projectRoot !== void 0) {
6249
6266
  const seedResult = seedMethodologyContext(db, projectRoot);
6250
6267
  if (seedResult.decisionsCreated > 0) logger$16.info({
@@ -10697,7 +10714,8 @@ async function runRunAction(options) {
10697
10714
  run_id: payload.runId,
10698
10715
  story_key: payload.storyKey,
10699
10716
  phase: payload.phase,
10700
- elapsed_ms: payload.elapsedMs
10717
+ elapsed_ms: payload.elapsedMs,
10718
+ child_pid: payload.childPid
10701
10719
  });
10702
10720
  });
10703
10721
  }
@@ -10710,7 +10728,8 @@ async function runRunAction(options) {
10710
10728
  config: {
10711
10729
  maxConcurrency: concurrency,
10712
10730
  maxReviewCycles: 2,
10713
- pipelineRunId: pipelineRun.id
10731
+ pipelineRunId: pipelineRun.id,
10732
+ enableHeartbeat: eventsFlag === true
10714
10733
  },
10715
10734
  projectRoot
10716
10735
  });
@@ -11153,4 +11172,4 @@ function registerRunCommand(program, _version = "0.0.0", projectRoot = process.c
11153
11172
 
11154
11173
  //#endregion
11155
11174
  export { DatabaseWrapper, SUBSTRATE_OWNED_SETTINGS_KEYS, VALID_PHASES, buildPipelineStatusOutput, createContextCompiler, createDispatcher, createImplementationOrchestrator, createPackLoader, createPhaseOrchestrator, createStopAfterGate, findPackageRoot, formatOutput, formatPhaseCompletionSummary, formatPipelineStatusHuman, formatPipelineSummary, formatTokenTelemetry, getSubstrateDefaultSettings, parseDbTimestampAsUtc, registerRunCommand, resolveBmadMethodSrcPath, resolveBmadMethodVersion, resolveMainRepoRoot, runAnalysisPhase, runMigrations, runPlanningPhase, runRunAction, runSolutioningPhase, validateStopAfterFromConflict };
11156
- //# sourceMappingURL=run-L-R_XYNT.js.map
11175
+ //# sourceMappingURL=run-CoP8UQU3.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "substrate-ai",
3
- "version": "0.2.11",
3
+ "version": "0.2.14",
4
4
  "description": "Substrate — multi-agent orchestration daemon for AI coding agents",
5
5
  "type": "module",
6
6
  "license": "MIT",