substrate-ai 0.2.13 → 0.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/cli/index.js +287 -99
  2. package/package.json +1 -1
package/dist/cli/index.js CHANGED
@@ -2750,6 +2750,140 @@ function defaultSupervisorDeps() {
2750
2750
  }
2751
2751
  };
2752
2752
  }
2753
+ /** Build the supervisor:poll event payload. */
2754
+ function buildPollEvent(health, projectRoot, tokenSnapshot, extraFields) {
2755
+ const proc = health.process ?? {
2756
+ orchestrator_pid: null,
2757
+ child_pids: [],
2758
+ zombies: []
2759
+ };
2760
+ return {
2761
+ type: "supervisor:poll",
2762
+ run_id: health.run_id,
2763
+ verdict: health.verdict,
2764
+ staleness_seconds: health.staleness_seconds,
2765
+ stories: {
2766
+ active: health.stories.active,
2767
+ completed: health.stories.completed,
2768
+ escalated: health.stories.escalated
2769
+ },
2770
+ story_details: health.stories.details,
2771
+ tokens: tokenSnapshot,
2772
+ process: {
2773
+ orchestrator_pid: proc.orchestrator_pid,
2774
+ child_count: proc.child_pids.length,
2775
+ zombie_count: proc.zombies.length
2776
+ },
2777
+ ...extraFields
2778
+ };
2779
+ }
2780
+ /** Extract succeeded / failed / escalated story keys from health details. */
2781
+ function buildTerminalSummary(storyDetails) {
2782
+ const succeeded = [];
2783
+ const failed = [];
2784
+ const escalated = [];
2785
+ for (const [k, s] of Object.entries(storyDetails)) if (s.phase === "COMPLETE") succeeded.push(k);
2786
+ else if (s.phase === "ESCALATED") escalated.push(k);
2787
+ else if (s.phase !== "PENDING") failed.push(k);
2788
+ return {
2789
+ succeeded,
2790
+ failed,
2791
+ escalated
2792
+ };
2793
+ }
2794
+ /**
2795
+ * Handle stall recovery for a single project: kill stalled processes, restart pipeline.
2796
+ *
2797
+ * Returns null if no stall detected (staleness below threshold).
2798
+ * Returns updated state + maxRestartsExceeded flag otherwise.
2799
+ */
2800
+ async function handleStallRecovery(health, state, config, deps, io) {
2801
+ const { stallThreshold, maxRestarts, pack, outputFormat } = config;
2802
+ const { killPid, resumePipeline, sleep, incrementRestarts, getAllDescendants } = deps;
2803
+ const { emitEvent, log } = io;
2804
+ const { projectRoot } = state;
2805
+ if (health.staleness_seconds < stallThreshold) return null;
2806
+ const directPids = [...health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid] : [], ...health.process.child_pids];
2807
+ const descendantPids = getAllDescendants(directPids);
2808
+ const directPidSet = new Set(directPids);
2809
+ const pids = [...directPids, ...descendantPids.filter((p) => !directPidSet.has(p))];
2810
+ emitEvent({
2811
+ type: "supervisor:kill",
2812
+ run_id: health.run_id,
2813
+ reason: "stall",
2814
+ staleness_seconds: health.staleness_seconds,
2815
+ pids
2816
+ });
2817
+ log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
2818
+ for (const pid of pids) try {
2819
+ killPid(pid, "SIGTERM");
2820
+ } catch {}
2821
+ await sleep(5e3);
2822
+ for (const pid of pids) try {
2823
+ killPid(pid, "SIGKILL");
2824
+ } catch {}
2825
+ if (pids.length > 0) {
2826
+ let allDead = false;
2827
+ for (let attempt = 0; attempt < 5; attempt++) {
2828
+ await sleep(1e3);
2829
+ allDead = pids.every((pid) => {
2830
+ try {
2831
+ process.kill(pid, 0);
2832
+ return false;
2833
+ } catch {
2834
+ return true;
2835
+ }
2836
+ });
2837
+ if (allDead) break;
2838
+ }
2839
+ if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
2840
+ }
2841
+ if (state.restartCount >= maxRestarts) {
2842
+ emitEvent({
2843
+ type: "supervisor:abort",
2844
+ run_id: health.run_id,
2845
+ reason: "max_restarts_exceeded",
2846
+ attempts: state.restartCount
2847
+ });
2848
+ log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
2849
+ return {
2850
+ state,
2851
+ maxRestartsExceeded: true
2852
+ };
2853
+ }
2854
+ const newRestartCount = state.restartCount + 1;
2855
+ if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
2856
+ emitEvent({
2857
+ type: "supervisor:restart",
2858
+ run_id: health.run_id,
2859
+ attempt: newRestartCount
2860
+ });
2861
+ log(`Supervisor: Restarting pipeline (attempt ${newRestartCount}/${maxRestarts})`);
2862
+ try {
2863
+ await resumePipeline({
2864
+ runId: health.run_id ?? void 0,
2865
+ outputFormat,
2866
+ projectRoot,
2867
+ concurrency: 3,
2868
+ pack
2869
+ });
2870
+ } catch (err) {
2871
+ const message = err instanceof Error ? err.message : String(err);
2872
+ log(`Supervisor: Resume error: ${message}`);
2873
+ emitEvent({
2874
+ type: "supervisor:error",
2875
+ reason: "resume_failed",
2876
+ message
2877
+ });
2878
+ }
2879
+ return {
2880
+ state: {
2881
+ ...state,
2882
+ restartCount: newRestartCount
2883
+ },
2884
+ maxRestartsExceeded: false
2885
+ };
2886
+ }
2753
2887
  /**
2754
2888
  * Run the pipeline supervisor — a long-running watchdog that polls pipeline health
2755
2889
  * and automatically kills and restarts stalled pipelines.
@@ -2763,11 +2897,16 @@ function defaultSupervisorDeps() {
2763
2897
  */
2764
2898
  async function runSupervisorAction(options, deps = {}) {
2765
2899
  const { pollInterval, stallThreshold, maxRestarts, outputFormat, projectRoot, runId, pack, experiment, maxExperiments } = options;
2766
- const { getHealth, killPid, resumePipeline, sleep, incrementRestarts, runAnalysis, getTokenSnapshot, getAllDescendants } = {
2900
+ const resolvedDeps = {
2767
2901
  ...defaultSupervisorDeps(),
2768
2902
  ...deps
2769
2903
  };
2770
- let restartCount = 0;
2904
+ const { getHealth, sleep, runAnalysis, getTokenSnapshot } = resolvedDeps;
2905
+ let state = {
2906
+ projectRoot,
2907
+ runId,
2908
+ restartCount: 0
2909
+ };
2771
2910
  const startTime = Date.now();
2772
2911
  function emitEvent(event) {
2773
2912
  if (outputFormat === "json") {
@@ -2793,46 +2932,20 @@ async function runSupervisorAction(options, deps = {}) {
2793
2932
  output: 0,
2794
2933
  cost_usd: 0
2795
2934
  };
2796
- const proc = health.process ?? {
2797
- orchestrator_pid: null,
2798
- child_pids: [],
2799
- zombies: []
2800
- };
2801
- emitEvent({
2802
- type: "supervisor:poll",
2803
- run_id: health.run_id,
2804
- verdict: health.verdict,
2805
- staleness_seconds: health.staleness_seconds,
2806
- stories: {
2807
- active: health.stories.active,
2808
- completed: health.stories.completed,
2809
- escalated: health.stories.escalated
2810
- },
2811
- story_details: health.stories.details,
2812
- tokens: tokenSnapshot,
2813
- process: {
2814
- orchestrator_pid: proc.orchestrator_pid,
2815
- child_count: proc.child_pids.length,
2816
- zombie_count: proc.zombies.length
2817
- }
2818
- });
2935
+ emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot));
2819
2936
  }
2820
2937
  log(`[${ts}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | stories: active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
2821
2938
  if (health.verdict === "NO_PIPELINE_RUNNING") {
2822
2939
  const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
2823
- const succeeded = Object.entries(health.stories.details).filter(([, s]) => s.phase === "COMPLETE").map(([k]) => k);
2824
- const failed = Object.entries(health.stories.details).filter(([, s]) => s.phase !== "COMPLETE" && s.phase !== "PENDING" && s.phase !== "ESCALATED").map(([k]) => k);
2825
- const escalated = Object.entries(health.stories.details).filter(([, s]) => s.phase === "ESCALATED").map(([k]) => k);
2940
+ const summary = buildTerminalSummary(health.stories.details);
2826
2941
  emitEvent({
2827
2942
  type: "supervisor:summary",
2828
2943
  run_id: health.run_id,
2829
2944
  elapsed_seconds: elapsedSeconds,
2830
- succeeded,
2831
- failed,
2832
- escalated,
2833
- restarts: restartCount
2945
+ ...summary,
2946
+ restarts: state.restartCount
2834
2947
  });
2835
- log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${succeeded.length} | failed: ${failed.length} | restarts: ${restartCount}`);
2948
+ log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${summary.succeeded.length} | failed: ${summary.failed.length} | restarts: ${state.restartCount}`);
2836
2949
  if (health.run_id !== null && runAnalysis !== void 0) {
2837
2950
  log(`[supervisor] Running post-run analysis for ${health.run_id}...`);
2838
2951
  try {
@@ -2958,87 +3071,162 @@ async function runSupervisorAction(options, deps = {}) {
2958
3071
  });
2959
3072
  }
2960
3073
  }
2961
- return failed.length > 0 || escalated.length > 0 ? 1 : 0;
3074
+ return summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0;
2962
3075
  }
2963
- if (health.staleness_seconds >= stallThreshold) {
2964
- const directPids = [...health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid] : [], ...health.process.child_pids];
2965
- const descendantPids = getAllDescendants(directPids);
2966
- const directPidSet = new Set(directPids);
2967
- const pids = [...directPids, ...descendantPids.filter((p) => !directPidSet.has(p))];
2968
- emitEvent({
2969
- type: "supervisor:kill",
2970
- run_id: health.run_id,
2971
- reason: "stall",
2972
- staleness_seconds: health.staleness_seconds,
2973
- pids
2974
- });
2975
- log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
2976
- for (const pid of pids) try {
2977
- killPid(pid, "SIGTERM");
2978
- } catch {}
2979
- await sleep(5e3);
2980
- for (const pid of pids) try {
2981
- killPid(pid, "SIGKILL");
2982
- } catch {}
2983
- if (pids.length > 0) {
2984
- let allDead = false;
2985
- for (let attempt = 0; attempt < 5; attempt++) {
2986
- await sleep(1e3);
2987
- allDead = pids.every((pid) => {
2988
- try {
2989
- process.kill(pid, 0);
2990
- return false;
2991
- } catch {
2992
- return true;
2993
- }
2994
- });
2995
- if (allDead) break;
2996
- }
2997
- if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
3076
+ const stallResult = await handleStallRecovery(health, state, {
3077
+ stallThreshold,
3078
+ maxRestarts,
3079
+ pack,
3080
+ outputFormat
3081
+ }, resolvedDeps, {
3082
+ emitEvent,
3083
+ log
3084
+ });
3085
+ if (stallResult !== null) {
3086
+ if (stallResult.maxRestartsExceeded) return 2;
3087
+ state = stallResult.state;
3088
+ }
3089
+ await sleep(pollInterval * 1e3);
3090
+ }
3091
+ }
3092
+ /**
3093
+ * Run the supervisor across multiple projects simultaneously.
3094
+ * Polls each project sequentially within each cycle, tagging events with `project`.
3095
+ *
3096
+ * Exit codes:
3097
+ * 0 — all projects completed without failures
3098
+ * 1 — at least one project completed with failures or escalations
3099
+ * 2 — at least one project hit max restarts
3100
+ */
3101
+ async function runMultiProjectSupervisor(options, deps = {}) {
3102
+ const { projects, pollInterval, stallThreshold, maxRestarts, outputFormat, pack } = options;
3103
+ const resolvedDeps = {
3104
+ ...defaultSupervisorDeps(),
3105
+ ...deps
3106
+ };
3107
+ const { getHealth, sleep, getTokenSnapshot } = resolvedDeps;
3108
+ if (projects.length === 0) {
3109
+ process.stderr.write("Error: --projects requires at least one project path\n");
3110
+ return 1;
3111
+ }
3112
+ const states = new Map(projects.map((p) => [p, {
3113
+ projectRoot: p,
3114
+ restartCount: 0
3115
+ }]));
3116
+ const doneProjects = new Set();
3117
+ const projectExitCodes = new Map();
3118
+ const startTime = Date.now();
3119
+ function emitEvent(event) {
3120
+ if (outputFormat === "json") {
3121
+ const stamped = {
3122
+ ...event,
3123
+ ts: new Date().toISOString()
3124
+ };
3125
+ process.stdout.write(JSON.stringify(stamped) + "\n");
3126
+ }
3127
+ }
3128
+ function log(message) {
3129
+ if (outputFormat === "human") process.stdout.write(message + "\n");
3130
+ }
3131
+ while (true) {
3132
+ for (const projectRoot of projects) {
3133
+ if (doneProjects.has(projectRoot)) continue;
3134
+ let health;
3135
+ try {
3136
+ health = await getHealth({ projectRoot });
3137
+ } catch {
3138
+ log(`[supervisor] ${projectRoot}: health check failed — marking as done`);
3139
+ emitEvent({
3140
+ type: "supervisor:error",
3141
+ project: projectRoot,
3142
+ reason: "health_check_failed"
3143
+ });
3144
+ doneProjects.add(projectRoot);
3145
+ projectExitCodes.set(projectRoot, 1);
3146
+ continue;
2998
3147
  }
2999
- if (restartCount >= maxRestarts) {
3148
+ const state = states.get(projectRoot);
3149
+ if (outputFormat === "json") {
3150
+ const tokenSnapshot = health.run_id !== null ? getTokenSnapshot(health.run_id, projectRoot) : {
3151
+ input: 0,
3152
+ output: 0,
3153
+ cost_usd: 0
3154
+ };
3155
+ emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot, { project: projectRoot }));
3156
+ }
3157
+ log(`[${projectRoot}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
3158
+ if (health.verdict === "NO_PIPELINE_RUNNING") {
3159
+ const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
3160
+ const summary = buildTerminalSummary(health.stories.details);
3000
3161
  emitEvent({
3001
- type: "supervisor:abort",
3162
+ type: "supervisor:summary",
3163
+ project: projectRoot,
3002
3164
  run_id: health.run_id,
3003
- reason: "max_restarts_exceeded",
3004
- attempts: restartCount
3165
+ elapsed_seconds: elapsedSeconds,
3166
+ ...summary,
3167
+ restarts: state.restartCount
3005
3168
  });
3006
- log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
3007
- return 2;
3169
+ log(`[${projectRoot}] Terminal. succeeded=${summary.succeeded.length} failed=${summary.failed.length} restarts=${state.restartCount}`);
3170
+ doneProjects.add(projectRoot);
3171
+ projectExitCodes.set(projectRoot, summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0);
3172
+ continue;
3008
3173
  }
3009
- restartCount++;
3010
- if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
3174
+ const stallResult = await handleStallRecovery(health, state, {
3175
+ stallThreshold,
3176
+ maxRestarts,
3177
+ pack,
3178
+ outputFormat
3179
+ }, resolvedDeps, {
3180
+ emitEvent: (evt) => emitEvent({
3181
+ ...evt,
3182
+ project: projectRoot
3183
+ }),
3184
+ log: (msg) => log(`[${projectRoot}] ${msg}`)
3185
+ });
3186
+ if (stallResult !== null) if (stallResult.maxRestartsExceeded) {
3187
+ doneProjects.add(projectRoot);
3188
+ projectExitCodes.set(projectRoot, 2);
3189
+ } else states.set(projectRoot, stallResult.state);
3190
+ }
3191
+ if (doneProjects.size >= projects.length) {
3192
+ const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
3011
3193
  emitEvent({
3012
- type: "supervisor:restart",
3013
- run_id: health.run_id,
3014
- attempt: restartCount
3194
+ type: "supervisor:done",
3195
+ elapsed_seconds: elapsedSeconds,
3196
+ project_results: Object.fromEntries(projectExitCodes)
3015
3197
  });
3016
- log(`Supervisor: Restarting pipeline (attempt ${restartCount}/${maxRestarts})`);
3017
- try {
3018
- await resumePipeline({
3019
- runId: health.run_id ?? void 0,
3020
- outputFormat,
3021
- projectRoot,
3022
- concurrency: 3,
3023
- pack
3024
- });
3025
- } catch (err) {
3026
- const message = err instanceof Error ? err.message : String(err);
3027
- log(`Supervisor: Resume error: ${message}`);
3028
- if (outputFormat === "json") emitEvent({
3029
- type: "supervisor:error",
3030
- reason: "resume_failed",
3031
- message
3032
- });
3033
- }
3198
+ log(`\nAll projects reached terminal state. Elapsed: ${elapsedSeconds}s`);
3199
+ const exitCodes = [...projectExitCodes.values()];
3200
+ if (exitCodes.includes(2)) return 2;
3201
+ if (exitCodes.includes(1)) return 1;
3202
+ return 0;
3034
3203
  }
3035
3204
  await sleep(pollInterval * 1e3);
3036
3205
  }
3037
3206
  }
3038
3207
  function registerSupervisorCommand(program, _version = "0.0.0", projectRoot = process.cwd()) {
3039
- program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
3208
+ program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--projects <paths>", "Comma-separated project root directories to monitor (multi-project mode)").option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
3040
3209
  const outputFormat = opts.outputFormat === "json" ? "json" : "human";
3041
3210
  if (opts.stallThreshold < 120) console.warn(`Warning: --stall-threshold ${opts.stallThreshold}s is below 120s. Agent steps typically take 45-90s. This may cause false stall detections and wasted restarts.`);
3211
+ if (opts.projects) {
3212
+ if (opts.runId) {
3213
+ console.error("Error: --run-id cannot be used with --projects (ambiguous)");
3214
+ process.exitCode = 1;
3215
+ return;
3216
+ }
3217
+ if (opts.experiment) console.warn("Warning: --experiment is not supported in multi-project mode — ignored.");
3218
+ const projects = opts.projects.split(",").map((p) => resolve(p.trim()));
3219
+ const exitCode$1 = await runMultiProjectSupervisor({
3220
+ projects,
3221
+ pollInterval: opts.pollInterval,
3222
+ stallThreshold: opts.stallThreshold,
3223
+ maxRestarts: opts.maxRestarts,
3224
+ outputFormat,
3225
+ pack: opts.pack
3226
+ });
3227
+ process.exitCode = exitCode$1;
3228
+ return;
3229
+ }
3042
3230
  const exitCode = await runSupervisorAction({
3043
3231
  pollInterval: opts.pollInterval,
3044
3232
  stallThreshold: opts.stallThreshold,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "substrate-ai",
3
- "version": "0.2.13",
3
+ "version": "0.2.14",
4
4
  "description": "Substrate — multi-agent orchestration daemon for AI coding agents",
5
5
  "type": "module",
6
6
  "license": "MIT",