substrate-ai 0.2.11 → 0.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { createLogger, deepMask } from "../logger-C6n1g8uP.js";
|
|
3
3
|
import { AdapterRegistry, createEventBus } from "../event-bus-J-bw-pkp.js";
|
|
4
4
|
import { CURRENT_CONFIG_FORMAT_VERSION, CURRENT_TASK_GRAPH_VERSION, PartialSubstrateConfigSchema, SUPPORTED_CONFIG_FORMAT_VERSIONS, SubstrateConfigSchema, defaultConfigMigrator } from "../version-manager-impl-BpVx2DkY.js";
|
|
5
|
-
import { DatabaseWrapper, SUBSTRATE_OWNED_SETTINGS_KEYS, VALID_PHASES, buildPipelineStatusOutput, createContextCompiler, createDispatcher, createImplementationOrchestrator, createPackLoader, createPhaseOrchestrator, createStopAfterGate, findPackageRoot, formatOutput, formatPhaseCompletionSummary, formatPipelineStatusHuman, formatPipelineSummary, formatTokenTelemetry, getSubstrateDefaultSettings, parseDbTimestampAsUtc, registerRunCommand, resolveBmadMethodSrcPath, resolveBmadMethodVersion, resolveMainRepoRoot, runAnalysisPhase, runMigrations, runPlanningPhase, runSolutioningPhase, validateStopAfterFromConflict } from "../run-
|
|
5
|
+
import { DatabaseWrapper, SUBSTRATE_OWNED_SETTINGS_KEYS, VALID_PHASES, buildPipelineStatusOutput, createContextCompiler, createDispatcher, createImplementationOrchestrator, createPackLoader, createPhaseOrchestrator, createStopAfterGate, findPackageRoot, formatOutput, formatPhaseCompletionSummary, formatPipelineStatusHuman, formatPipelineSummary, formatTokenTelemetry, getSubstrateDefaultSettings, parseDbTimestampAsUtc, registerRunCommand, resolveBmadMethodSrcPath, resolveBmadMethodVersion, resolveMainRepoRoot, runAnalysisPhase, runMigrations, runPlanningPhase, runSolutioningPhase, validateStopAfterFromConflict } from "../run-CoP8UQU3.js";
|
|
6
6
|
import { ConfigError, ConfigIncompatibleFormatError } from "../errors-BPqtzQ4U.js";
|
|
7
7
|
import { addTokenUsage, createDecision, getDecisionsByPhaseForRun, getLatestRun, getPipelineRunById, getTokenUsageSummary, listRequirements, updatePipelineRun } from "../decisions-DNYByk0U.js";
|
|
8
8
|
import { aggregateTokenUsageForRun, compareRunMetrics, getBaselineRunMetrics, getRunMetrics, getStoryMetricsForRun, incrementRunRestarts, listRunMetrics, tagRunAsBaseline } from "../metrics-BSg8VIHd.js";
|
|
@@ -2414,16 +2414,24 @@ const DEFAULT_STALL_THRESHOLD_SECONDS = 600;
|
|
|
2414
2414
|
* - `node dist/cli/index.js run` (npm run substrate:dev)
|
|
2415
2415
|
* - `npx substrate run`
|
|
2416
2416
|
* - any node process whose command contains `run` with `--events` or `--stories`
|
|
2417
|
+
*
|
|
2418
|
+
* When `projectRoot` is provided, additionally checks that the command line
|
|
2419
|
+
* contains that path (via `--project-root` flag or as part of the binary/CWD path).
|
|
2420
|
+
* This ensures multi-project environments match the correct orchestrator.
|
|
2417
2421
|
*/
|
|
2418
|
-
function isOrchestratorProcessLine(line) {
|
|
2422
|
+
/**
 * Decide whether one line of process-listing output describes a substrate
 * orchestrator process.
 *
 * @param {string} line - A single line of `ps`-style output.
 * @param {string} [projectRoot] - When provided, the line must additionally
 *   mention this path as a whole path (or path prefix). A sibling directory
 *   that merely shares the prefix, e.g. `/repo/app2` for `/repo/app`, does
 *   not count.
 * @returns {boolean} `true` when the line looks like an orchestrator (for the
 *   given project, if one was supplied).
 */
function isOrchestratorProcessLine(line, projectRoot) {
	// Any line containing "grep" is rejected outright (e.g. a grep that is
	// itself searching for the orchestrator command).
	if (line.includes("grep")) return false;
	const isOrchestrator =
		line.includes("substrate run") ||
		line.includes("substrate-ai run") ||
		line.includes("index.js run") ||
		(line.includes("node") && /\srun(\s|$)/.test(line) && (line.includes("--events") || line.includes("--stories")));
	if (!isOrchestrator) return false;
	if (projectRoot === void 0) return true;
	// Ignore trailing separators so "/repo/app/" and "/repo/app" behave alike.
	const root = projectRoot.replace(/[\\\/]+$/, "");
	// "" and "/" give no usable boundary; fall back to the plain substring test.
	if (root === "") return line.includes(projectRoot);
	// Accept an occurrence only when the path ends there: end of line,
	// whitespace, a path separator, or a closing quote.
	for (let at = line.indexOf(root); at !== -1; at = line.indexOf(root, at + 1)) {
		const next = line[at + root.length];
		if (next === void 0 || /[\s\\\/"']/.test(next)) return true;
	}
	return false;
}
|
|
2433
|
+
function inspectProcessTree(opts) {
|
|
2434
|
+
const { projectRoot, execFileSync: execFileSyncOverride } = opts ?? {};
|
|
2427
2435
|
const result = {
|
|
2428
2436
|
orchestrator_pid: null,
|
|
2429
2437
|
child_pids: [],
|
|
@@ -2443,7 +2451,7 @@ function inspectProcessTree(execFileSyncOverride) {
|
|
|
2443
2451
|
});
|
|
2444
2452
|
}
|
|
2445
2453
|
const lines = psOutput.split("\n");
|
|
2446
|
-
for (const line of lines) if (isOrchestratorProcessLine(line)) {
|
|
2454
|
+
for (const line of lines) if (isOrchestratorProcessLine(line, projectRoot)) {
|
|
2447
2455
|
const match = line.trim().match(/^(\d+)/);
|
|
2448
2456
|
if (match) {
|
|
2449
2457
|
result.orchestrator_pid = parseInt(match[1], 10);
|
|
@@ -2466,6 +2474,58 @@ function inspectProcessTree(execFileSyncOverride) {
|
|
|
2466
2474
|
return result;
|
|
2467
2475
|
}
|
|
2468
2476
|
/**
 * Gather every descendant PID (children, grandchildren, ...) of the given
 * root PIDs from a `ps -eo pid,ppid` snapshot, so that stall recovery can
 * also terminate processes spawned indirectly by the orchestrator.
 *
 * The root PIDs themselves are never part of the result. Any failure while
 * obtaining or parsing the process list yields an empty array.
 *
 * @param {number[]} rootPids - PIDs whose descendants should be collected.
 * @param {Function} [execFileSyncOverride] - Replacement for
 *   `child_process.execFileSync`; called as `(file, args, options)` and
 *   expected to return the ps output as a string.
 * @returns {number[]} Descendant PIDs in breadth-first discovery order.
 */
function getAllDescendantPids(rootPids, execFileSyncOverride) {
	if (rootPids.length === 0) return [];
	try {
		const psArgs = ["-eo", "pid,ppid"];
		const psOptions = {
			encoding: "utf-8",
			timeout: 5e3
		};
		let listing;
		if (execFileSyncOverride !== void 0) {
			listing = execFileSyncOverride("ps", psArgs, psOptions);
		} else {
			const { execFileSync } = __require("node:child_process");
			listing = execFileSync("ps", psArgs, psOptions);
		}
		// Index the snapshot as parent PID -> list of child PIDs.
		const parentToChildren = new Map();
		for (const row of listing.split("\n")) {
			const columns = row.trim().split(/\s+/);
			if (columns.length < 2) continue;
			const pid = parseInt(columns[0], 10);
			const ppid = parseInt(columns[1], 10);
			// Non-numeric rows (such as the header) and non-positive PIDs are ignored.
			if (isNaN(pid) || isNaN(ppid) || pid <= 0) continue;
			const siblings = parentToChildren.get(ppid);
			if (siblings === void 0) parentToChildren.set(ppid, [pid]);
			else siblings.push(pid);
		}
		// Breadth-first walk; `frontier` only grows and `cursor` marks the next
		// PID to expand, so the visiting order is first-in, first-out.
		const visited = new Set(rootPids);
		const frontier = [...rootPids];
		const found = [];
		for (let cursor = 0; cursor < frontier.length; cursor++) {
			const kids = parentToChildren.get(frontier[cursor]) ?? [];
			for (const kid of kids) {
				if (visited.has(kid)) continue;
				visited.add(kid);
				found.push(kid);
				frontier.push(kid);
			}
		}
		return found;
	} catch {
		return [];
	}
}
|
|
2528
|
+
/**
|
|
2469
2529
|
* Fetch pipeline health data as a structured object without any stdout side-effects.
|
|
2470
2530
|
* Used by runSupervisorAction to poll health without formatting overhead.
|
|
2471
2531
|
*
|
|
@@ -2524,10 +2584,11 @@ async function getAutoHealthData(options) {
|
|
|
2524
2584
|
}
|
|
2525
2585
|
}
|
|
2526
2586
|
} catch {}
|
|
2527
|
-
const processInfo = inspectProcessTree();
|
|
2587
|
+
const processInfo = inspectProcessTree({ projectRoot });
|
|
2528
2588
|
let verdict = "NO_PIPELINE_RUNNING";
|
|
2529
2589
|
if (run.status === "running") if (processInfo.orchestrator_pid === null && active === 0 && completed > 0) verdict = "NO_PIPELINE_RUNNING";
|
|
2530
2590
|
else if (processInfo.zombies.length > 0) verdict = "STALLED";
|
|
2591
|
+
else if (processInfo.orchestrator_pid !== null && processInfo.child_pids.length > 0 && stalenessSeconds > DEFAULT_STALL_THRESHOLD_SECONDS) verdict = "HEALTHY";
|
|
2531
2592
|
else if (stalenessSeconds > DEFAULT_STALL_THRESHOLD_SECONDS) verdict = "STALLED";
|
|
2532
2593
|
else if (processInfo.orchestrator_pid !== null && processInfo.child_pids.length === 0 && active > 0) verdict = "STALLED";
|
|
2533
2594
|
else verdict = "HEALTHY";
|
|
@@ -2660,6 +2721,7 @@ function defaultSupervisorDeps() {
|
|
|
2660
2721
|
};
|
|
2661
2722
|
}
|
|
2662
2723
|
},
|
|
2724
|
+
getAllDescendants: (rootPids) => getAllDescendantPids(rootPids),
|
|
2663
2725
|
runAnalysis: async (runId, projectRoot) => {
|
|
2664
2726
|
const dbPath = join(projectRoot, ".substrate", "substrate.db");
|
|
2665
2727
|
if (!existsSync(dbPath)) return;
|
|
@@ -2688,6 +2750,140 @@ function defaultSupervisorDeps() {
|
|
|
2688
2750
|
}
|
|
2689
2751
|
};
|
|
2690
2752
|
}
|
|
2753
|
+
/**
 * Assemble the `supervisor:poll` event payload from a health snapshot.
 *
 * @param {object} health - Health data; `run_id`, `verdict`,
 *   `staleness_seconds`, `stories` and (optionally) `process` are read.
 * @param {string} projectRoot - Not read by this function.
 * @param {object} tokenSnapshot - Placed in the event as `tokens` unchanged.
 * @param {object} [extraFields] - Extra properties spread last, so they
 *   override generated fields of the same name.
 * @returns {object} The event object.
 */
function buildPollEvent(health, projectRoot, tokenSnapshot, extraFields) {
	// A snapshot without process info is reported as "no orchestrator, no children".
	const processInfo = health.process ?? {
		orchestrator_pid: null,
		child_pids: [],
		zombies: []
	};
	const { stories } = health;
	const event = {
		type: "supervisor:poll",
		run_id: health.run_id,
		verdict: health.verdict,
		staleness_seconds: health.staleness_seconds,
		stories: {
			active: stories.active,
			completed: stories.completed,
			escalated: stories.escalated
		},
		story_details: stories.details,
		tokens: tokenSnapshot,
		process: {
			orchestrator_pid: processInfo.orchestrator_pid,
			child_count: processInfo.child_pids.length,
			zombie_count: processInfo.zombies.length
		}
	};
	return {
		...event,
		...extraFields
	};
}
|
|
2780
|
+
/**
 * Sort story keys into terminal outcome buckets based on each story's `phase`.
 *
 * - `COMPLETE`  -> `succeeded`
 * - `ESCALATED` -> `escalated`
 * - `PENDING`   -> not reported in any bucket
 * - any other phase -> `failed`
 *
 * @param {Object<string, {phase: string}>} storyDetails - Story key to details.
 * @returns {{succeeded: string[], failed: string[], escalated: string[]}}
 */
function buildTerminalSummary(storyDetails) {
	const summary = {
		succeeded: [],
		failed: [],
		escalated: []
	};
	for (const [storyKey, details] of Object.entries(storyDetails)) {
		switch (details.phase) {
			case "COMPLETE":
				summary.succeeded.push(storyKey);
				break;
			case "ESCALATED":
				summary.escalated.push(storyKey);
				break;
			case "PENDING":
				break;
			default:
				summary.failed.push(storyKey);
		}
	}
	return summary;
}
|
|
2794
|
+
/**
 * Handle stall recovery for a single project: kill the stalled processes and
 * restart the pipeline.
 *
 * Steps, once `health.staleness_seconds` has reached `config.stallThreshold`:
 *   1. Collect the orchestrator PID, its child PIDs and all their descendants.
 *   2. Emit `supervisor:kill`, send SIGTERM, wait 5s, send SIGKILL, then probe
 *      for up to five 1s rounds until none of the PIDs is alive.
 *   3. If `state.restartCount` has already reached `config.maxRestarts`, emit
 *      `supervisor:abort` and stop.
 *   4. Otherwise record the restart, emit `supervisor:restart` and resume the
 *      pipeline; a failing resume is reported as `supervisor:error` but still
 *      counts as a restart attempt.
 *
 * @param {object} health - Health snapshot (`run_id`, `staleness_seconds`,
 *   optional `process` with `orchestrator_pid` / `child_pids`).
 * @param {{projectRoot: string, restartCount: number}} state - Per-project state.
 * @param {{stallThreshold: number, maxRestarts: number, pack: string, outputFormat: string}} config
 * @param {object} deps - `killPid`, `resumePipeline`, `sleep`,
 *   `incrementRestarts` and `getAllDescendants`.
 * @param {{emitEvent: Function, log: Function}} io - Output sinks.
 * @returns {Promise<null | {state: object, maxRestartsExceeded: boolean}>}
 *   `null` when staleness is below the threshold; otherwise the (possibly
 *   updated) state and whether the restart budget was exhausted.
 */
async function handleStallRecovery(health, state, config, deps, io) {
	const { stallThreshold, maxRestarts, pack, outputFormat } = config;
	const { killPid, resumePipeline, sleep, incrementRestarts, getAllDescendants } = deps;
	const { emitEvent, log } = io;
	const { projectRoot } = state;
	if (health.staleness_seconds < stallThreshold) return null;
	// A health snapshot may lack process info; treat that as "nothing to kill"
	// instead of throwing, matching the fallback used for poll events.
	const proc = health.process ?? {
		orchestrator_pid: null,
		child_pids: [],
		zombies: []
	};
	const directPids = [...(proc.orchestrator_pid != null ? [proc.orchestrator_pid] : []), ...proc.child_pids];
	const descendantPids = getAllDescendants(directPids);
	// Direct PIDs first, then descendants; the Set drops repeats so no process
	// is signalled or reported twice.
	const pids = [...new Set([...directPids, ...descendantPids])];
	emitEvent({
		type: "supervisor:kill",
		run_id: health.run_id,
		reason: "stall",
		staleness_seconds: health.staleness_seconds,
		pids
	});
	log(`Supervisor: Stall confirmed (${health.staleness_seconds}s ≥ ${stallThreshold}s threshold). Killing PIDs: ${pids.join(", ") || "none"}`);
	// Signal delivery is best-effort: a PID may already be gone.
	for (const pid of pids) try {
		killPid(pid, "SIGTERM");
	} catch {}
	await sleep(5e3);
	for (const pid of pids) try {
		killPid(pid, "SIGKILL");
	} catch {}
	if (pids.length > 0) {
		// Signal 0 only probes for existence. EPERM means the process exists
		// but cannot be signalled by us, so it still counts as alive.
		const isAlive = (pid) => {
			try {
				process.kill(pid, 0);
				return true;
			} catch (err) {
				return err?.code === "EPERM";
			}
		};
		let allDead = false;
		for (let attempt = 0; attempt < 5; attempt++) {
			await sleep(1e3);
			allDead = !pids.some(isAlive);
			if (allDead) break;
		}
		if (!allDead) log(`Supervisor: Warning: Some PIDs may still be alive after SIGKILL`);
	}
	if (state.restartCount >= maxRestarts) {
		emitEvent({
			type: "supervisor:abort",
			run_id: health.run_id,
			reason: "max_restarts_exceeded",
			attempts: state.restartCount
		});
		log(`Supervisor: Max restarts (${maxRestarts}) exceeded. Aborting.`);
		return {
			state,
			maxRestartsExceeded: true
		};
	}
	const newRestartCount = state.restartCount + 1;
	if (health.run_id !== null) incrementRestarts(health.run_id, projectRoot);
	emitEvent({
		type: "supervisor:restart",
		run_id: health.run_id,
		attempt: newRestartCount
	});
	log(`Supervisor: Restarting pipeline (attempt ${newRestartCount}/${maxRestarts})`);
	try {
		await resumePipeline({
			runId: health.run_id ?? void 0,
			outputFormat,
			projectRoot,
			concurrency: 3,
			pack
		});
	} catch (err) {
		// A failed resume is reported but does not abort the supervisor; the
		// attempt still counts against the restart budget.
		const message = err instanceof Error ? err.message : String(err);
		log(`Supervisor: Resume error: ${message}`);
		emitEvent({
			type: "supervisor:error",
			reason: "resume_failed",
			message
		});
	}
	return {
		state: {
			...state,
			restartCount: newRestartCount
		},
		maxRestartsExceeded: false
	};
}
|
|
2691
2887
|
/**
|
|
2692
2888
|
* Run the pipeline supervisor — a long-running watchdog that polls pipeline health
|
|
2693
2889
|
* and automatically kills and restarts stalled pipelines.
|
|
@@ -2701,11 +2897,16 @@ function defaultSupervisorDeps() {
|
|
|
2701
2897
|
*/
|
|
2702
2898
|
async function runSupervisorAction(options, deps = {}) {
|
|
2703
2899
|
const { pollInterval, stallThreshold, maxRestarts, outputFormat, projectRoot, runId, pack, experiment, maxExperiments } = options;
|
|
2704
|
-
const
|
|
2900
|
+
const resolvedDeps = {
|
|
2705
2901
|
...defaultSupervisorDeps(),
|
|
2706
2902
|
...deps
|
|
2707
2903
|
};
|
|
2708
|
-
|
|
2904
|
+
const { getHealth, sleep, runAnalysis, getTokenSnapshot } = resolvedDeps;
|
|
2905
|
+
let state = {
|
|
2906
|
+
projectRoot,
|
|
2907
|
+
runId,
|
|
2908
|
+
restartCount: 0
|
|
2909
|
+
};
|
|
2709
2910
|
const startTime = Date.now();
|
|
2710
2911
|
function emitEvent(event) {
|
|
2711
2912
|
if (outputFormat === "json") {
|
|
@@ -2731,46 +2932,20 @@ async function runSupervisorAction(options, deps = {}) {
|
|
|
2731
2932
|
output: 0,
|
|
2732
2933
|
cost_usd: 0
|
|
2733
2934
|
};
|
|
2734
|
-
|
|
2735
|
-
orchestrator_pid: null,
|
|
2736
|
-
child_pids: [],
|
|
2737
|
-
zombies: []
|
|
2738
|
-
};
|
|
2739
|
-
emitEvent({
|
|
2740
|
-
type: "supervisor:poll",
|
|
2741
|
-
run_id: health.run_id,
|
|
2742
|
-
verdict: health.verdict,
|
|
2743
|
-
staleness_seconds: health.staleness_seconds,
|
|
2744
|
-
stories: {
|
|
2745
|
-
active: health.stories.active,
|
|
2746
|
-
completed: health.stories.completed,
|
|
2747
|
-
escalated: health.stories.escalated
|
|
2748
|
-
},
|
|
2749
|
-
story_details: health.stories.details,
|
|
2750
|
-
tokens: tokenSnapshot,
|
|
2751
|
-
process: {
|
|
2752
|
-
orchestrator_pid: proc.orchestrator_pid,
|
|
2753
|
-
child_count: proc.child_pids.length,
|
|
2754
|
-
zombie_count: proc.zombies.length
|
|
2755
|
-
}
|
|
2756
|
-
});
|
|
2935
|
+
emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot));
|
|
2757
2936
|
}
|
|
2758
2937
|
log(`[${ts}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | stories: active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
|
|
2759
2938
|
if (health.verdict === "NO_PIPELINE_RUNNING") {
|
|
2760
2939
|
const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
|
|
2761
|
-
const
|
|
2762
|
-
const failed = Object.entries(health.stories.details).filter(([, s]) => s.phase !== "COMPLETE" && s.phase !== "PENDING" && s.phase !== "ESCALATED").map(([k]) => k);
|
|
2763
|
-
const escalated = Object.entries(health.stories.details).filter(([, s]) => s.phase === "ESCALATED").map(([k]) => k);
|
|
2940
|
+
const summary = buildTerminalSummary(health.stories.details);
|
|
2764
2941
|
emitEvent({
|
|
2765
2942
|
type: "supervisor:summary",
|
|
2766
2943
|
run_id: health.run_id,
|
|
2767
2944
|
elapsed_seconds: elapsedSeconds,
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
escalated,
|
|
2771
|
-
restarts: restartCount
|
|
2945
|
+
...summary,
|
|
2946
|
+
restarts: state.restartCount
|
|
2772
2947
|
});
|
|
2773
|
-
log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${succeeded.length} | failed: ${failed.length} | restarts: ${restartCount}`);
|
|
2948
|
+
log(`\nPipeline reached terminal state. Elapsed: ${elapsedSeconds}s | succeeded: ${summary.succeeded.length} | failed: ${summary.failed.length} | restarts: ${state.restartCount}`);
|
|
2774
2949
|
if (health.run_id !== null && runAnalysis !== void 0) {
|
|
2775
2950
|
log(`[supervisor] Running post-run analysis for ${health.run_id}...`);
|
|
2776
2951
|
try {
|
|
@@ -2833,7 +3008,7 @@ async function runSupervisorAction(options, deps = {}) {
|
|
|
2833
3008
|
const expDb = expDbWrapper.db;
|
|
2834
3009
|
const { runRunAction: runPipeline } = await import(
|
|
2835
3010
|
/* @vite-ignore */
|
|
2836
|
-
"../run-
|
|
3011
|
+
"../run-B9IglY4m.js"
|
|
2837
3012
|
);
|
|
2838
3013
|
const runStoryFn = async (opts) => {
|
|
2839
3014
|
const exitCode = await runPipeline({
|
|
@@ -2896,84 +3071,162 @@ async function runSupervisorAction(options, deps = {}) {
|
|
|
2896
3071
|
});
|
|
2897
3072
|
}
|
|
2898
3073
|
}
|
|
2899
|
-
return failed.length > 0 || escalated.length > 0 ? 1 : 0;
|
|
3074
|
+
return summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0;
|
|
2900
3075
|
}
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
|
|
2910
|
-
|
|
2911
|
-
|
|
2912
|
-
|
|
2913
|
-
|
|
2914
|
-
|
|
2915
|
-
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
|
|
3076
|
+
const stallResult = await handleStallRecovery(health, state, {
|
|
3077
|
+
stallThreshold,
|
|
3078
|
+
maxRestarts,
|
|
3079
|
+
pack,
|
|
3080
|
+
outputFormat
|
|
3081
|
+
}, resolvedDeps, {
|
|
3082
|
+
emitEvent,
|
|
3083
|
+
log
|
|
3084
|
+
});
|
|
3085
|
+
if (stallResult !== null) {
|
|
3086
|
+
if (stallResult.maxRestartsExceeded) return 2;
|
|
3087
|
+
state = stallResult.state;
|
|
3088
|
+
}
|
|
3089
|
+
await sleep(pollInterval * 1e3);
|
|
3090
|
+
}
|
|
3091
|
+
}
|
|
3092
|
+
/**
 * Run the supervisor across multiple projects simultaneously.
 * Polls each project sequentially within each cycle, tagging events with `project`.
 *
 * Duplicate entries in `options.projects` are monitored once: completion is
 * tracked per distinct project path, so counting duplicates would keep the
 * loop from ever terminating.
 *
 * A project is finished when its health verdict is `NO_PIPELINE_RUNNING`,
 * when its health check throws, or when stall recovery exhausts the restart
 * budget.
 *
 * Exit codes:
 *   0 — all projects completed without failures
 *   1 — at least one project completed with failures or escalations
 *       (also: a health check failed, or no projects were given)
 *   2 — at least one project hit max restarts
 *
 * @param {object} options - `projects`, `pollInterval` (seconds),
 *   `stallThreshold`, `maxRestarts`, `outputFormat` and `pack`.
 * @param {object} [deps] - Overrides merged over `defaultSupervisorDeps()`.
 * @returns {Promise<number>} The aggregated exit code.
 */
async function runMultiProjectSupervisor(options, deps = {}) {
	const { projects, pollInterval, stallThreshold, maxRestarts, outputFormat, pack } = options;
	const resolvedDeps = {
		...defaultSupervisorDeps(),
		...deps
	};
	const { getHealth, sleep, getTokenSnapshot } = resolvedDeps;
	if (projects.length === 0) {
		process.stderr.write("Error: --projects requires at least one project path\n");
		return 1;
	}
	// Completion is tracked in a Set keyed by path, so iterate distinct paths only.
	const uniqueProjects = [...new Set(projects)];
	const states = new Map(uniqueProjects.map((p) => [p, {
		projectRoot: p,
		restartCount: 0
	}]));
	const doneProjects = new Set();
	const projectExitCodes = new Map();
	const startTime = Date.now();
	// Events are only written in JSON mode, one timestamped object per line.
	function emitEvent(event) {
		if (outputFormat === "json") {
			const stamped = {
				...event,
				ts: new Date().toISOString()
			};
			process.stdout.write(JSON.stringify(stamped) + "\n");
		}
	}
	// Free-form messages are only written in human mode.
	function log(message) {
		if (outputFormat === "human") process.stdout.write(message + "\n");
	}
	while (true) {
		for (const projectRoot of uniqueProjects) {
			if (doneProjects.has(projectRoot)) continue;
			let health;
			try {
				health = await getHealth({ projectRoot });
			} catch (err) {
				// A project whose health cannot be read is given up on (exit code 1)
				// rather than retried on the next cycle.
				const message = err instanceof Error ? err.message : String(err);
				log(`[supervisor] ${projectRoot}: health check failed — marking as done`);
				emitEvent({
					type: "supervisor:error",
					project: projectRoot,
					reason: "health_check_failed",
					message
				});
				doneProjects.add(projectRoot);
				projectExitCodes.set(projectRoot, 1);
				continue;
			}
			const state = states.get(projectRoot);
			if (outputFormat === "json") {
				const tokenSnapshot = health.run_id !== null ? getTokenSnapshot(health.run_id, projectRoot) : {
					input: 0,
					output: 0,
					cost_usd: 0
				};
				emitEvent(buildPollEvent(health, projectRoot, tokenSnapshot, { project: projectRoot }));
			}
			log(`[${projectRoot}] Health: ${health.verdict} | staleness=${health.staleness_seconds}s | active=${health.stories.active} completed=${health.stories.completed} escalated=${health.stories.escalated}`);
			if (health.verdict === "NO_PIPELINE_RUNNING") {
				const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
				const summary = buildTerminalSummary(health.stories.details);
				emitEvent({
					type: "supervisor:summary",
					project: projectRoot,
					run_id: health.run_id,
					elapsed_seconds: elapsedSeconds,
					...summary,
					restarts: state.restartCount
				});
				log(`[${projectRoot}] Terminal. succeeded=${summary.succeeded.length} failed=${summary.failed.length} restarts=${state.restartCount}`);
				doneProjects.add(projectRoot);
				projectExitCodes.set(projectRoot, summary.failed.length > 0 || summary.escalated.length > 0 ? 1 : 0);
				continue;
			}
			const stallResult = await handleStallRecovery(health, state, {
				stallThreshold,
				maxRestarts,
				pack,
				outputFormat
			}, resolvedDeps, {
				// Tag everything coming out of stall recovery with the project.
				emitEvent: (evt) => emitEvent({
					...evt,
					project: projectRoot
				}),
				log: (msg) => log(`[${projectRoot}] ${msg}`)
			});
			if (stallResult !== null) {
				if (stallResult.maxRestartsExceeded) {
					doneProjects.add(projectRoot);
					projectExitCodes.set(projectRoot, 2);
				} else states.set(projectRoot, stallResult.state);
			}
		}
		if (doneProjects.size >= uniqueProjects.length) {
			const elapsedSeconds = Math.round((Date.now() - startTime) / 1e3);
			emitEvent({
				type: "supervisor:done",
				elapsed_seconds: elapsedSeconds,
				project_results: Object.fromEntries(projectExitCodes)
			});
			log(`\nAll projects reached terminal state. Elapsed: ${elapsedSeconds}s`);
			// The most severe per-project code wins.
			const exitCodes = [...projectExitCodes.values()];
			if (exitCodes.includes(2)) return 2;
			if (exitCodes.includes(1)) return 1;
			return 0;
		}
		await sleep(pollInterval * 1e3);
	}
}
|
|
2973
3207
|
function registerSupervisorCommand(program, _version = "0.0.0", projectRoot = process.cwd()) {
|
|
2974
|
-
program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
|
|
3208
|
+
program.command("supervisor").description("Monitor a pipeline run and automatically recover from stalls").option("--poll-interval <seconds>", "Health poll interval in seconds", (v) => parseInt(v, 10), 60).option("--stall-threshold <seconds>", "Staleness in seconds before killing a stalled pipeline", (v) => parseInt(v, 10), 600).option("--max-restarts <n>", "Maximum automatic restarts before aborting", (v) => parseInt(v, 10), 3).option("--run-id <id>", "Pipeline run ID to monitor (defaults to latest)").option("--pack <name>", "Methodology pack name", "bmad").option("--project-root <path>", "Project root directory", projectRoot).option("--projects <paths>", "Comma-separated project root directories to monitor (multi-project mode)").option("--output-format <format>", "Output format: human (default) or json", "human").option("--experiment", "After post-run analysis, enter experiment mode: create branches, apply modifications, run single-story experiments, and report verdicts (Story 17-4)", false).option("--max-experiments <n>", "Maximum number of experiments to run per analysis cycle (default: 2, Story 17-4 AC6)", (v) => parseInt(v, 10), 2).action(async (opts) => {
|
|
2975
3209
|
const outputFormat = opts.outputFormat === "json" ? "json" : "human";
|
|
2976
3210
|
if (opts.stallThreshold < 120) console.warn(`Warning: --stall-threshold ${opts.stallThreshold}s is below 120s. Agent steps typically take 45-90s. This may cause false stall detections and wasted restarts.`);
|
|
3211
|
+
if (opts.projects) {
|
|
3212
|
+
if (opts.runId) {
|
|
3213
|
+
console.error("Error: --run-id cannot be used with --projects (ambiguous)");
|
|
3214
|
+
process.exitCode = 1;
|
|
3215
|
+
return;
|
|
3216
|
+
}
|
|
3217
|
+
if (opts.experiment) console.warn("Warning: --experiment is not supported in multi-project mode — ignored.");
|
|
3218
|
+
const projects = opts.projects.split(",").map((p) => resolve(p.trim()));
|
|
3219
|
+
const exitCode$1 = await runMultiProjectSupervisor({
|
|
3220
|
+
projects,
|
|
3221
|
+
pollInterval: opts.pollInterval,
|
|
3222
|
+
stallThreshold: opts.stallThreshold,
|
|
3223
|
+
maxRestarts: opts.maxRestarts,
|
|
3224
|
+
outputFormat,
|
|
3225
|
+
pack: opts.pack
|
|
3226
|
+
});
|
|
3227
|
+
process.exitCode = exitCode$1;
|
|
3228
|
+
return;
|
|
3229
|
+
}
|
|
2977
3230
|
const exitCode = await runSupervisorAction({
|
|
2978
3231
|
pollInterval: opts.pollInterval,
|
|
2979
3232
|
stallThreshold: opts.stallThreshold,
|
package/dist/index.d.ts
CHANGED
|
@@ -1025,6 +1025,8 @@ interface OrchestratorEvents {
|
|
|
1025
1025
|
storyKey: string;
|
|
1026
1026
|
phase: string;
|
|
1027
1027
|
elapsedMs: number;
|
|
1028
|
+
/** PID of the stalled child process, or null if not tracked */
|
|
1029
|
+
childPid: number | null;
|
|
1028
1030
|
};
|
|
1029
1031
|
/** Readiness check has completed — emitted for all verdicts (READY, NEEDS_WORK, NOT_READY) */
|
|
1030
1032
|
'solutioning:readiness-check': {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import "./logger-C6n1g8uP.js";
|
|
2
2
|
import "./event-bus-J-bw-pkp.js";
|
|
3
|
-
import { registerRunCommand, runRunAction } from "./run-
|
|
3
|
+
import { registerRunCommand, runRunAction } from "./run-CoP8UQU3.js";
|
|
4
4
|
import "./decisions-DNYByk0U.js";
|
|
5
5
|
import "./metrics-BSg8VIHd.js";
|
|
6
6
|
|
|
@@ -1202,6 +1202,15 @@ function buildPipelineStatusOutput(run, tokenSummary, decisionsCount, storiesCou
|
|
|
1202
1202
|
totalOutput += row.total_output_tokens;
|
|
1203
1203
|
totalCost += row.total_cost_usd;
|
|
1204
1204
|
}
|
|
1205
|
+
let activeDispatches = 0;
|
|
1206
|
+
try {
|
|
1207
|
+
if (run.token_usage_json) {
|
|
1208
|
+
const state = JSON.parse(run.token_usage_json);
|
|
1209
|
+
if (state.stories) {
|
|
1210
|
+
for (const s of Object.values(state.stories)) if (s.phase !== "PENDING" && s.phase !== "COMPLETE" && s.phase !== "ESCALATED") activeDispatches++;
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
} catch {}
|
|
1205
1214
|
return {
|
|
1206
1215
|
run_id: run.id,
|
|
1207
1216
|
current_phase: currentPhase,
|
|
@@ -1214,7 +1223,9 @@ function buildPipelineStatusOutput(run, tokenSummary, decisionsCount, storiesCou
|
|
|
1214
1223
|
decisions_count: decisionsCount,
|
|
1215
1224
|
stories_count: storiesCount,
|
|
1216
1225
|
last_activity: run.updated_at,
|
|
1217
|
-
staleness_seconds: Math.round((Date.now() - parseDbTimestampAsUtc(run.updated_at).getTime()) / 1e3)
|
|
1226
|
+
staleness_seconds: Math.round((Date.now() - parseDbTimestampAsUtc(run.updated_at).getTime()) / 1e3),
|
|
1227
|
+
last_event_ts: run.updated_at,
|
|
1228
|
+
active_dispatches: activeDispatches
|
|
1218
1229
|
};
|
|
1219
1230
|
}
|
|
1220
1231
|
/**
|
|
@@ -5359,6 +5370,7 @@ function createImplementationOrchestrator(deps) {
|
|
|
5359
5370
|
let _heartbeatTimer = null;
|
|
5360
5371
|
const HEARTBEAT_INTERVAL_MS = 3e4;
|
|
5361
5372
|
const WATCHDOG_TIMEOUT_MS = 6e5;
|
|
5373
|
+
const _stalledStories = new Set();
|
|
5362
5374
|
const _phaseStartMs = new Map();
|
|
5363
5375
|
const _phaseEndMs = new Map();
|
|
5364
5376
|
const _storyDispatches = new Map();
|
|
@@ -5454,6 +5466,7 @@ function createImplementationOrchestrator(deps) {
|
|
|
5454
5466
|
}
|
|
5455
5467
|
function recordProgress() {
|
|
5456
5468
|
_lastProgressTs = Date.now();
|
|
5469
|
+
_stalledStories.clear();
|
|
5457
5470
|
}
|
|
5458
5471
|
function startHeartbeat() {
|
|
5459
5472
|
if (_heartbeatTimer !== null) return;
|
|
@@ -5465,7 +5478,8 @@ function createImplementationOrchestrator(deps) {
|
|
|
5465
5478
|
for (const s of _stories.values()) if (s.phase === "COMPLETE" || s.phase === "ESCALATED") completed++;
|
|
5466
5479
|
else if (s.phase === "PENDING") queued++;
|
|
5467
5480
|
else active++;
|
|
5468
|
-
|
|
5481
|
+
const timeSinceProgress = Date.now() - _lastProgressTs;
|
|
5482
|
+
if (timeSinceProgress >= HEARTBEAT_INTERVAL_MS) eventBus.emit("orchestrator:heartbeat", {
|
|
5469
5483
|
runId: config.pipelineRunId ?? "",
|
|
5470
5484
|
activeDispatches: active,
|
|
5471
5485
|
completedDispatches: completed,
|
|
@@ -5474,6 +5488,8 @@ function createImplementationOrchestrator(deps) {
|
|
|
5474
5488
|
const elapsed = Date.now() - _lastProgressTs;
|
|
5475
5489
|
if (elapsed >= WATCHDOG_TIMEOUT_MS) {
|
|
5476
5490
|
for (const [key, s] of _stories) if (s.phase !== "PENDING" && s.phase !== "COMPLETE" && s.phase !== "ESCALATED") {
|
|
5491
|
+
if (_stalledStories.has(key)) continue;
|
|
5492
|
+
_stalledStories.add(key);
|
|
5477
5493
|
logger$16.warn({
|
|
5478
5494
|
storyKey: key,
|
|
5479
5495
|
phase: s.phase,
|
|
@@ -5483,7 +5499,8 @@ function createImplementationOrchestrator(deps) {
|
|
|
5483
5499
|
runId: config.pipelineRunId ?? "",
|
|
5484
5500
|
storyKey: key,
|
|
5485
5501
|
phase: s.phase,
|
|
5486
|
-
elapsedMs: elapsed
|
|
5502
|
+
elapsedMs: elapsed,
|
|
5503
|
+
childPid: null
|
|
5487
5504
|
});
|
|
5488
5505
|
}
|
|
5489
5506
|
}
|
|
@@ -6244,7 +6261,7 @@ function createImplementationOrchestrator(deps) {
|
|
|
6244
6261
|
});
|
|
6245
6262
|
persistState();
|
|
6246
6263
|
recordProgress();
|
|
6247
|
-
startHeartbeat();
|
|
6264
|
+
if (config.enableHeartbeat) startHeartbeat();
|
|
6248
6265
|
if (projectRoot !== void 0) {
|
|
6249
6266
|
const seedResult = seedMethodologyContext(db, projectRoot);
|
|
6250
6267
|
if (seedResult.decisionsCreated > 0) logger$16.info({
|
|
@@ -10697,7 +10714,8 @@ async function runRunAction(options) {
|
|
|
10697
10714
|
run_id: payload.runId,
|
|
10698
10715
|
story_key: payload.storyKey,
|
|
10699
10716
|
phase: payload.phase,
|
|
10700
|
-
elapsed_ms: payload.elapsedMs
|
|
10717
|
+
elapsed_ms: payload.elapsedMs,
|
|
10718
|
+
child_pid: payload.childPid
|
|
10701
10719
|
});
|
|
10702
10720
|
});
|
|
10703
10721
|
}
|
|
@@ -10710,7 +10728,8 @@ async function runRunAction(options) {
|
|
|
10710
10728
|
config: {
|
|
10711
10729
|
maxConcurrency: concurrency,
|
|
10712
10730
|
maxReviewCycles: 2,
|
|
10713
|
-
pipelineRunId: pipelineRun.id
|
|
10731
|
+
pipelineRunId: pipelineRun.id,
|
|
10732
|
+
enableHeartbeat: eventsFlag === true
|
|
10714
10733
|
},
|
|
10715
10734
|
projectRoot
|
|
10716
10735
|
});
|
|
@@ -11153,4 +11172,4 @@ function registerRunCommand(program, _version = "0.0.0", projectRoot = process.c
|
|
|
11153
11172
|
|
|
11154
11173
|
//#endregion
|
|
11155
11174
|
export { DatabaseWrapper, SUBSTRATE_OWNED_SETTINGS_KEYS, VALID_PHASES, buildPipelineStatusOutput, createContextCompiler, createDispatcher, createImplementationOrchestrator, createPackLoader, createPhaseOrchestrator, createStopAfterGate, findPackageRoot, formatOutput, formatPhaseCompletionSummary, formatPipelineStatusHuman, formatPipelineSummary, formatTokenTelemetry, getSubstrateDefaultSettings, parseDbTimestampAsUtc, registerRunCommand, resolveBmadMethodSrcPath, resolveBmadMethodVersion, resolveMainRepoRoot, runAnalysisPhase, runMigrations, runPlanningPhase, runRunAction, runSolutioningPhase, validateStopAfterFromConflict };
|
|
11156
|
-
//# sourceMappingURL=run-
|
|
11175
|
+
//# sourceMappingURL=run-CoP8UQU3.js.map
|