substrate-ai 0.2.13 → 0.2.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +294 -105
- package/package.json +1 -1
package/dist/cli/index.js
CHANGED
|
@@ -2675,11 +2675,11 @@ function defaultSupervisorDeps() {
|
|
|
2675
2675
|
sleep: (ms) => new Promise((resolve$2) => setTimeout(resolve$2, ms)),
|
|
2676
2676
|
incrementRestarts: (() => {
|
|
2677
2677
|
let cachedDbWrapper = null;
|
|
2678
|
-
return (runId, projectRoot) => {
|
|
2678
|
+
return async (runId, projectRoot) => {
|
|
2679
2679
|
try {
|
|
2680
2680
|
if (cachedDbWrapper === null) {
|
|
2681
|
-
const
|
|
2682
|
-
const dbPath = join(
|
|
2681
|
+
const dbRoot = await resolveMainRepoRoot(projectRoot);
|
|
2682
|
+
const dbPath = join(dbRoot, ".substrate", "substrate.db");
|
|
2683
2683
|
cachedDbWrapper = new DatabaseWrapper(dbPath);
|
|
2684
2684
|
}
|
|
2685
2685
|
incrementRunRestarts(cachedDbWrapper.getDb(), runId);
|
|
@@ -2691,9 +2691,10 @@ function defaultSupervisorDeps() {
|
|
|
2691
2691
|
}
|
|
2692
2692
|
};
|
|
2693
2693
|
})(),
|
|
2694
|
-
getTokenSnapshot: (runId, projectRoot) => {
|
|
2694
|
+
getTokenSnapshot: async (runId, projectRoot) => {
|
|
2695
2695
|
try {
|
|
2696
|
-
const
|
|
2696
|
+
const dbRoot = await resolveMainRepoRoot(projectRoot);
|
|
2697
|
+
const dbPath = join(dbRoot, ".substrate", "substrate.db");
|
|
2697
2698
|
if (!existsSync(dbPath)) return {
|
|
2698
2699
|
input: 0,
|
|
2699
2700
|
output: 0,
|
|
@@ -2750,6 +2751,140 @@ function defaultSupervisorDeps() {
|
|
|
2750
2751
|
}
|
|
2751
2752
|
};
|
|
2752
2753
|
}
|
|
2754
|
+
/**
 * Build the `supervisor:poll` event payload from a health snapshot.
 *
 * @param {object} health - Pipeline health snapshot (run id, verdict, staleness, stories, process info).
 * @param {string} projectRoot - Project root; not read in this function — kept for call-site symmetry (TODO confirm).
 * @param {object} tokenSnapshot - Token usage totals ({ input, output, cost_usd }).
 * @param {object} [extraFields] - Optional extra properties merged into the event (e.g. `project` in multi-project mode).
 * @returns {object} The `supervisor:poll` event object.
 */
function buildPollEvent(health, projectRoot, tokenSnapshot, extraFields) {
	// Fall back to an empty process description when health carries none.
	const processInfo = health.process ?? {
		orchestrator_pid: null,
		child_pids: [],
		zombies: []
	};
	const storyCounts = {
		active: health.stories.active,
		completed: health.stories.completed,
		escalated: health.stories.escalated
	};
	return {
		type: "supervisor:poll",
		run_id: health.run_id,
		verdict: health.verdict,
		staleness_seconds: health.staleness_seconds,
		stories: storyCounts,
		story_details: health.stories.details,
		tokens: tokenSnapshot,
		process: {
			orchestrator_pid: processInfo.orchestrator_pid,
			child_count: processInfo.child_pids.length,
			zombie_count: processInfo.zombies.length
		},
		...extraFields
	};
}
|
|
2781
|
+
/**
 * Extract succeeded / failed / escalated story keys from health details.
 *
 * Buckets: phase "COMPLETE" → succeeded, "ESCALATED" → escalated,
 * any other non-"PENDING" phase → failed. "PENDING" stories are ignored.
 *
 * @param {Record<string, {phase: string}>} storyDetails - Story key → detail map.
 * @returns {{succeeded: string[], failed: string[], escalated: string[]}}
 */
function buildTerminalSummary(storyDetails) {
	const entries = Object.entries(storyDetails);
	const succeeded = entries.filter(([, detail]) => detail.phase === "COMPLETE").map(([key]) => key);
	const escalated = entries.filter(([, detail]) => detail.phase === "ESCALATED").map(([key]) => key);
	const failed = entries
		.filter(([, detail]) => detail.phase !== "COMPLETE" && detail.phase !== "ESCALATED" && detail.phase !== "PENDING")
		.map(([key]) => key);
	return {
		succeeded,
		failed,
		escalated
	};
}
|
|
2795
|
+
/**
 * Handle stall recovery for a single project: kill stalled processes, restart pipeline.
 *
 * Returns null if no stall detected (staleness below threshold).
 * Returns updated state + maxRestartsExceeded flag otherwise.
 *
 * @param {object} health - Health snapshot (run_id, staleness_seconds, process PIDs).
 * @param {object} state - Per-project supervisor state ({ projectRoot, restartCount, ... }).
 * @param {object} config - { stallThreshold, maxRestarts, pack, outputFormat }.
 * @param {object} deps - Injected side-effect functions ({ killPid, resumePipeline, sleep, incrementRestarts, getAllDescendants }).
 * @param {object} io - { emitEvent, log } output sinks.
 * @returns {Promise<null | {state: object, maxRestartsExceeded: boolean}>}
 */
async function handleStallRecovery(health, state, config, deps, io) {
	const { stallThreshold, maxRestarts, pack, outputFormat } = config;
	const { killPid, resumePipeline, sleep, incrementRestarts, getAllDescendants } = deps;
	const { emitEvent, log } = io;
	const { projectRoot } = state;
	// Not stalled yet — nothing to recover from.
	if (health.staleness_seconds < stallThreshold) return null;
	// Direct PIDs: the orchestrator (when known) plus its tracked children;
	// then append transitive descendants, skipping any already listed directly.
	const rootPids = health.process.orchestrator_pid !== null ? [health.process.orchestrator_pid, ...health.process.child_pids] : [...health.process.child_pids];
	const rootPidSet = new Set(rootPids);
	const targetPids = [...rootPids, ...getAllDescendants(rootPids).filter((pid) => !rootPidSet.has(pid))];
	emitEvent({
		type: "supervisor:kill",
		run_id: health.run_id,
		reason: "stall",
		staleness_seconds: health.staleness_seconds,
		pids: targetPids
	});
	log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${targetPids.join(", ") || "none"}`);
	// Graceful shutdown first (SIGTERM), then force (SIGKILL) after a 5s grace period.
	const signalAll = (signal) => {
		for (const pid of targetPids) {
			try {
				killPid(pid, signal);
			} catch {
				// The process may already be gone; ignore.
			}
		}
	};
	signalAll("SIGTERM");
	await sleep(5e3);
	signalAll("SIGKILL");
	if (targetPids.length > 0) {
		// Poll up to 5 times, 1s apart, for every target to disappear.
		// `process.kill(pid, 0)` throws once the PID no longer exists.
		const stillAlive = (pid) => {
			try {
				process.kill(pid, 0);
				return true;
			} catch {
				return false;
			}
		};
		let confirmedDead = false;
		for (let attempt = 0; attempt < 5 && !confirmedDead; attempt++) {
			await sleep(1e3);
			confirmedDead = targetPids.every((pid) => !stillAlive(pid));
		}
		if (!confirmedDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
	}
	if (state.restartCount >= maxRestarts) {
		// Restart budget exhausted — tell the caller to abort this project.
		emitEvent({
			type: "supervisor:abort",
			run_id: health.run_id,
			reason: "max_restarts_exceeded",
			attempts: state.restartCount
		});
		log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
		return {
			state,
			maxRestartsExceeded: true
		};
	}
	const nextAttempt = state.restartCount + 1;
	// Persist the restart count when we know which run this is.
	if (health.run_id !== null) await incrementRestarts(health.run_id, projectRoot);
	emitEvent({
		type: "supervisor:restart",
		run_id: health.run_id,
		attempt: nextAttempt
	});
	log(`Supervisor: Restarting pipeline (attempt ${nextAttempt}/${maxRestarts})`);
	try {
		await resumePipeline({
			runId: health.run_id ?? void 0,
			outputFormat,
			projectRoot,
			concurrency: 3,
			pack
		});
	} catch (err) {
		// A failed resume is reported but does not abort the supervisor loop.
		const message = err instanceof Error ? err.message : String(err);
		log(`Supervisor: Resume error: ${message}`);
		emitEvent({
			type: "supervisor:error",
			reason: "resume_failed",
			message
		});
	}
	return {
		state: {
			...state,
			restartCount: nextAttempt
		},
		maxRestartsExceeded: false
	};
}
|
|
2753
2888
|
/**
|
|
2754
2889
|
* Run the pipeline supervisor — a long-running watchdog that polls pipeline health
|
|
2755
2890
|
* and automatically kills and restarts stalled pipelines.
|
|
@@ -2763,11 +2898,16 @@ function defaultSupervisorDeps() {
|
|
|
2763
2898
|
*/
|
|
2764
2899
|
async function runSupervisorAction(options, deps = {}) {
|
|
2765
2900
|
const { pollInterval, stallThreshold, maxRestarts, outputFormat, projectRoot, runId, pack, experiment, maxExperiments } = options;
|
|
2766
|
-
const
|
|
2901
|
+
const resolvedDeps = {
|
|
2767
2902
|
...defaultSupervisorDeps(),
|
|
2768
2903
|
...deps
|
|
2769
2904
|
};
|
|
2770
|
-
|
|
2905
|
+
const { getHealth, sleep, runAnalysis, getTokenSnapshot } = resolvedDeps;
|
|
2906
|
+
let state = {
|
|
2907
|
+
projectRoot,
|
|
2908
|
+
runId,
|
|
2909
|
+
restartCount: 0
|
|
2910
|
+
};
|
|
2771
2911
|
const startTime = Date.now();
|
|
2772
2912
|
function emitEvent(event) {
|
|
2773
2913
|
if (outputFormat === "json") {
|
|
@@ -2788,51 +2928,25 @@ async function runSupervisorAction(options, deps = {}) {
|
|
|
2788
2928
|
});
|
|
2789
2929
|
const ts = new Date().toISOString();
|
|
2790
2930
|
if (outputFormat === "json") {
|
|
2791
|
-
const tokenSnapshot = health.run_id !== null ? getTokenSnapshot(health.run_id, projectRoot) : {
|
|
2931
|
+
const tokenSnapshot = health.run_id !== null ? await getTokenSnapshot(health.run_id, projectRoot) : {
|
|
2792
2932
|
input: 0,
|
|
2793
2933
|
output: 0,
|
|
2794
2934
|
cost_usd: 0
|
|
2795
2935
|
};
|
|
2796
|
-
|
|
2797
|
-
orchestrator_pid: null,
|
|
2798
|
-
child_pids: [],
|
|
2799
|
-
zombies: []
|
|
2800
|
-
};
|
|
2801
|
-
emitEvent({
|
|
2802
|
-
type: "supervisor:poll",
|
|
2803
|
-
run_id: health.run_id,
|
|
2804
|
-
verdict: health.verdict,
|
|
2805
|
-
staleness_seconds: health.staleness_seconds,
|
|
2806
|
-
stories: {
|
|
2807
|
-
active: health.stories.active,
|
|
2808
|
-
completed: health.stories.completed,
|
|
2809
|
-
escalated: health.stories.escalated
|
|
2810
|
-
},
|
|
2811
|
-
story_details: health.stories.details,
|
|
2812
|
-
tokens: tokenSnapshot,
|
|
2813
|
-
process: {
|
|
2814
|
-
orchestrator_pid: proc.orchestrator_pid,
|
|
2815
|
-
child_count: proc.child_pids.length,
|
|
2816
|
-
zombie_count: proc.zombies.length
|
|
2817
|
-
}
|
|
2818
|
-
});
|
|
2936
|
+
emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot));
|
|
2819
2937
|
}
|
|
2820
2938
|
log(`[${ts}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | stories: active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
|
|
2821
2939
|
if (health.verdict === "NO_PIPELINE_RUNNING") {
|
|
2822
2940
|
const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
|
|
2823
|
-
const
|
|
2824
|
-
const failed = Object.entries(health.stories.details).filter(([, s]) => s.phase !== "COMPLETE" && s.phase !== "PENDING" && s.phase !== "ESCALATED").map(([k]) => k);
|
|
2825
|
-
const escalated = Object.entries(health.stories.details).filter(([, s]) => s.phase === "ESCALATED").map(([k]) => k);
|
|
2941
|
+
const summary = buildTerminalSummary(health.stories.details);
|
|
2826
2942
|
emitEvent({
|
|
2827
2943
|
type: "supervisor:summary",
|
|
2828
2944
|
run_id: health.run_id,
|
|
2829
2945
|
elapsed_seconds: elapsedSeconds,
|
|
2830
|
-
|
|
2831
|
-
|
|
2832
|
-
escalated,
|
|
2833
|
-
restarts: restartCount
|
|
2946
|
+
...summary,
|
|
2947
|
+
restarts: state.restartCount
|
|
2834
2948
|
});
|
|
2835
|
-
log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${succeeded.length} | failed: ${failed.length} | restarts: ${restartCount}`);
|
|
2949
|
+
log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${summary.succeeded.length} | failed: ${summary.failed.length} | restarts: ${state.restartCount}`);
|
|
2836
2950
|
if (health.run_id !== null && runAnalysis !== void 0) {
|
|
2837
2951
|
log(`[supervisor] Running post-run analysis for ${health.run_id}...`);
|
|
2838
2952
|
try {
|
|
@@ -2958,87 +3072,162 @@ async function runSupervisorAction(options, deps = {}) {
|
|
|
2958
3072
|
});
|
|
2959
3073
|
}
|
|
2960
3074
|
}
|
|
2961
|
-
return failed.length > 0 || escalated.length > 0 ? 1 : 0;
|
|
3075
|
+
return summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0;
|
|
2962
3076
|
}
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
3077
|
+
const stallResult = await handleStallRecovery(health, state, {
|
|
3078
|
+
stallThreshold,
|
|
3079
|
+
maxRestarts,
|
|
3080
|
+
pack,
|
|
3081
|
+
outputFormat
|
|
3082
|
+
}, resolvedDeps, {
|
|
3083
|
+
emitEvent,
|
|
3084
|
+
log
|
|
3085
|
+
});
|
|
3086
|
+
if (stallResult !== null) {
|
|
3087
|
+
if (stallResult.maxRestartsExceeded) return 2;
|
|
3088
|
+
state = stallResult.state;
|
|
3089
|
+
}
|
|
3090
|
+
await sleep(pollInterval * 1e3);
|
|
3091
|
+
}
|
|
3092
|
+
}
|
|
3093
|
+
/**
 * Run the supervisor across multiple projects simultaneously.
 * Polls each project sequentially within each cycle, tagging events with `project`.
 *
 * Exit codes:
 * 0 — all projects completed without failures
 * 1 — at least one project completed with failures or escalations
 * 2 — at least one project hit max restarts
 */
async function runMultiProjectSupervisor(options, deps = {}) {
	const { projects, pollInterval, stallThreshold, maxRestarts, outputFormat, pack } = options;
	// Caller-supplied deps override the defaults (dependency injection for tests).
	const resolvedDeps = {
		...defaultSupervisorDeps(),
		...deps
	};
	const { getHealth, sleep, getTokenSnapshot } = resolvedDeps;
	if (projects.length === 0) {
		process.stderr.write("Error: --projects requires at least one project path\n");
		return 1;
	}
	// Per-project supervisor state, keyed by project root.
	const states = new Map(projects.map((p) => [p, {
		projectRoot: p,
		restartCount: 0
	}]));
	const doneProjects = new Set();
	const projectExitCodes = new Map();
	const startTime = Date.now();
	// JSON mode: machine-readable events on stdout, stamped with an ISO timestamp.
	function emitEvent(event) {
		if (outputFormat === "json") {
			const stamped = {
				...event,
				ts: new Date().toISOString()
			};
			process.stdout.write(JSON.stringify(stamped) + "\n");
		}
	}
	// Human mode: plain log lines on stdout.
	function log(message) {
		if (outputFormat === "human") process.stdout.write(message + "\n");
	}
	while (true) {
		for (const projectRoot of projects) {
			if (doneProjects.has(projectRoot)) continue;
			let health;
			try {
				health = await getHealth({ projectRoot });
			} catch {
				// An unreadable project counts as finished-with-failure so the
				// supervisor does not spin on it forever.
				log(`[supervisor] ${projectRoot}: health check failed — marking as done`);
				emitEvent({
					type: "supervisor:error",
					project: projectRoot,
					reason: "health_check_failed"
				});
				doneProjects.add(projectRoot);
				projectExitCodes.set(projectRoot, 1);
				continue;
			}
			const state = states.get(projectRoot);
			if (outputFormat === "json") {
				const tokenSnapshot = health.run_id !== null ? await getTokenSnapshot(health.run_id, projectRoot) : {
					input: 0,
					output: 0,
					cost_usd: 0
				};
				emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot, { project: projectRoot }));
			}
			log(`[${projectRoot}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
			if (health.verdict === "NO_PIPELINE_RUNNING") {
				// Terminal state for this project: record its exit code, stop polling it.
				const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
				const summary = buildTerminalSummary(health.stories.details);
				emitEvent({
					type: "supervisor:summary",
					project: projectRoot,
					run_id: health.run_id,
					elapsed_seconds: elapsedSeconds,
					...summary,
					restarts: state.restartCount
				});
				log(`[${projectRoot}] Terminal. succeeded=${summary.succeeded.length} failed=${summary.failed.length} restarts=${state.restartCount}`);
				doneProjects.add(projectRoot);
				projectExitCodes.set(projectRoot, summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0);
				continue;
			}
			// Stall handling: events/logs are tagged with the project they belong to.
			const stallResult = await handleStallRecovery(health, state, {
				stallThreshold,
				maxRestarts,
				pack,
				outputFormat
			}, resolvedDeps, {
				emitEvent: (evt) => emitEvent({
					...evt,
					project: projectRoot
				}),
				log: (msg) => log(`[${projectRoot}] ${msg}`)
			});
			if (stallResult !== null) {
				if (stallResult.maxRestartsExceeded) {
					doneProjects.add(projectRoot);
					projectExitCodes.set(projectRoot, 2);
				} else {
					states.set(projectRoot, stallResult.state);
				}
			}
		}
		if (doneProjects.size >= projects.length) {
			// Every project reached a terminal state; summarize and return the
			// worst per-project exit code (2 beats 1 beats 0).
			const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
			emitEvent({
				type: "supervisor:done",
				elapsed_seconds: elapsedSeconds,
				project_results: Object.fromEntries(projectExitCodes)
			});
			log(`\nAll projects reached terminal state. Elapsed: ${elapsedSeconds}s`);
			const exitCodes = [...projectExitCodes.values()];
			if (exitCodes.includes(2)) return 2;
			if (exitCodes.includes(1)) return 1;
			return 0;
		}
		await sleep(pollInterval * 1e3);
	}
}
|
|
3038
3208
|
function registerSupervisorCommand(program, _version = "0.0.0", projectRoot = process.cwd()) {
|
|
3039
|
-
program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
|
|
3209
|
+
program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--projects <paths>", "Comma-separated project root directories to monitor (multi-project mode)").option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
|
|
3040
3210
|
const outputFormat = opts.outputFormat === "json" ? "json" : "human";
|
|
3041
3211
|
if (opts.stallThreshold < 120) console.warn(`Warning: --stall-threshold ${opts.stallThreshold}s is below 120s. Agent steps typically take 45-90s. This may cause false stall detections and wasted restarts.`);
|
|
3212
|
+
if (opts.projects) {
|
|
3213
|
+
if (opts.runId) {
|
|
3214
|
+
console.error("Error: --run-id cannot be used with --projects (ambiguous)");
|
|
3215
|
+
process.exitCode = 1;
|
|
3216
|
+
return;
|
|
3217
|
+
}
|
|
3218
|
+
if (opts.experiment) console.warn("Warning: --experiment is not supported in multi-project mode — ignored.");
|
|
3219
|
+
const projects = opts.projects.split(",").map((p) => resolve(p.trim()));
|
|
3220
|
+
const exitCode$1 = await runMultiProjectSupervisor({
|
|
3221
|
+
projects,
|
|
3222
|
+
pollInterval: opts.pollInterval,
|
|
3223
|
+
stallThreshold: opts.stallThreshold,
|
|
3224
|
+
maxRestarts: opts.maxRestarts,
|
|
3225
|
+
outputFormat,
|
|
3226
|
+
pack: opts.pack
|
|
3227
|
+
});
|
|
3228
|
+
process.exitCode = exitCode$1;
|
|
3229
|
+
return;
|
|
3230
|
+
}
|
|
3042
3231
|
const exitCode = await runSupervisorAction({
|
|
3043
3232
|
pollInterval: opts.pollInterval,
|
|
3044
3233
|
stallThreshold: opts.stallThreshold,
|