substrate-ai 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapter-registry-DXLMTmfD.js +0 -0
- package/dist/adapter-registry-neBZrkr3.js +4 -0
- package/dist/cli/index.js +5594 -5951
- package/dist/decisions-C0pz9Clx.js +0 -0
- package/dist/{decisions-BDLp3tJB.js → decisions-DQZW0h9X.js} +2 -1
- package/dist/dist-eNB_v7Iy.js +10205 -0
- package/dist/errors-BvyMlvCX.js +74 -0
- package/dist/experimenter-Dos3NsCg.js +3 -0
- package/dist/health-BvYILeQQ.js +6 -0
- package/dist/{health-C-VRJruD.js → health-CiDi90gC.js} +57 -1850
- package/dist/{helpers-CpMs8VZX.js → helpers-DTp3VJ2-.js} +31 -121
- package/dist/index.d.ts +709 -266
- package/dist/index.js +5 -3
- package/dist/{logger-D2fS2ccL.js → logger-KeHncl-f.js} +2 -42
- package/dist/routing-CcBOCuC9.js +0 -0
- package/dist/{routing-CD8bIci_.js → routing-HaYsjEIS.js} +2 -2
- package/dist/{run-ClxNDHbr.js → run-CAUhTR7Y.js} +594 -4249
- package/dist/run-DPZOQOvB.js +9 -0
- package/dist/{upgrade-B1S61VXJ.js → upgrade-DFGrqjGI.js} +3 -3
- package/dist/{upgrade-BK0HrKA6.js → upgrade-DYdYuuJK.js} +3 -3
- package/dist/version-manager-impl-BmOWu8ml.js +0 -0
- package/dist/version-manager-impl-CKv6I1S0.js +4 -0
- package/package.json +5 -2
- package/dist/adapter-registry-D2zdMwVu.js +0 -840
- package/dist/adapter-registry-WAyFydN5.js +0 -4
- package/dist/config-migrator-CtGelIsG.js +0 -250
- package/dist/decisions-DhAA2HG2.js +0 -397
- package/dist/experimenter-D_N_7ZF3.js +0 -503
- package/dist/git-utils-DxPx6erV.js +0 -365
- package/dist/health-DMbNP9bw.js +0 -5
- package/dist/operational-BdcdmDqS.js +0 -374
- package/dist/routing-BVrxrM6v.js +0 -832
- package/dist/run-MAQ3Wuju.js +0 -10
- package/dist/version-manager-impl-BIxOe7gZ.js +0 -372
- package/dist/version-manager-impl-RrWs-CI6.js +0 -4
|
@@ -1,503 +0,0 @@
|
|
|
1
|
-
import "./logger-D2fS2ccL.js";
|
|
2
|
-
import { createDecision } from "./decisions-DhAA2HG2.js";
|
|
3
|
-
import { EXPERIMENT_RESULT, getRunMetrics, getStoryMetricsForRun } from "./operational-BdcdmDqS.js";
|
|
4
|
-
import { spawnGit } from "./git-utils-DxPx6erV.js";
|
|
5
|
-
import { spawn } from "node:child_process";
|
|
6
|
-
import { join } from "node:path";
|
|
7
|
-
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
8
|
-
|
|
9
|
-
//#region src/modules/supervisor/experimenter.ts
|
|
10
|
-
/**
 * Default spawn implementation used when no `spawn` dep is injected.
 *
 * Runs `cmd` with `args`, ignoring stdin and capturing stdout and stderr.
 * The returned promise never rejects: if the process cannot be started, the
 * failure is reported as `{ stdout: "", stderr: <error message>, code: 1 }`.
 *
 * @param {string} cmd - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{ cwd?: string, env?: object }} [opts] - Optional working directory
 *   and environment; the environment defaults to `process.env`.
 * @returns {Promise<{ stdout: string, stderr: string, code: number }>}
 *   Trimmed output and the exit code (1 when the process closed without one).
 */
function spawnCommand(cmd, args, opts) {
  return new Promise((settle) => {
    const proc = spawn(cmd, args, {
      cwd: opts?.cwd,
      env: opts?.env ?? process.env,
      stdio: ["ignore", "pipe", "pipe"]
    });
    // Decode at the stream level: converting each Buffer chunk on its own
    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
    proc.stdout?.setEncoding("utf8");
    proc.stderr?.setEncoding("utf8");
    let stdout = "";
    let stderr = "";
    // A failed spawn emits "error" and may later emit "close" as well; only
    // the first outcome is reported.
    let settled = false;
    const finish = (outcome) => {
      if (settled) return;
      settled = true;
      settle(outcome);
    };
    proc.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    proc.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    proc.on("close", (code) => {
      finish({
        stdout: stdout.trim(),
        stderr: stderr.trim(),
        code: code ?? 1
      });
    });
    proc.on("error", (err) => {
      finish({
        stdout: "",
        stderr: err.message,
        code: 1
      });
    });
  });
}
|
|
48
|
-
/**
 * Turn a free-form experiment description into a slug usable in branch and
 * directory names.
 *
 * The text is lowercased, every character outside `[a-z0-9-]` becomes a
 * hyphen, runs of hyphens collapse to one, the result is cut to `maxLength`
 * characters, and a single leading/trailing hyphen is stripped afterwards
 * (so truncation can never leave a dangling hyphen).
 *
 * @param {string} shortDesc - Description to sanitize.
 * @param {number} maxLength - Maximum slug length before hyphen stripping.
 * @returns {string} The sanitized slug (may be empty).
 */
function slugifyShortDesc(shortDesc, maxLength) {
  return shortDesc
    .toLowerCase()
    .replace(/[^a-z0-9-]/g, "-")
    .replace(/-+/g, "-")
    .slice(0, maxLength)
    .replace(/^-|-$/g, "");
}
/**
 * Build a git branch name for an experiment.
 * Format: supervisor/experiment/<run-id-prefix>-<short-desc>
 *
 * The run-id is truncated to 8 characters. The short_desc is sanitized and
 * truncated to 30 characters.
 *
 * @param {string} runId - Run id; only its first 8 characters are used.
 * @param {string} shortDesc - Human-readable experiment description.
 * @returns {string} The branch name.
 */
function buildBranchName(runId, shortDesc) {
  const safe = slugifyShortDesc(shortDesc, 30);
  const runIdShort = runId.slice(0, 8);
  return `supervisor/experiment/${runIdShort}-${safe}`;
}
/**
 * Build a worktree directory path for an experiment.
 * Format: <projectRoot>/.claude/worktrees/experiment-<run-id-prefix>-<short-desc-truncated>
 *
 * The run-id is truncated to 8 characters. The short_desc is sanitized and
 * truncated to 20 characters for the directory name.
 *
 * @param {string} projectRoot - Directory under which `.claude/worktrees` lives.
 * @param {string} baselineRunId - Run id; only its first 8 characters are used.
 * @param {string} shortDesc - Human-readable experiment description.
 * @returns {string} The worktree directory path.
 */
function buildWorktreePath(projectRoot, baselineRunId, shortDesc) {
  const safe = slugifyShortDesc(shortDesc, 20);
  const idShort = baselineRunId.slice(0, 8);
  return join(projectRoot, ".claude", "worktrees", `experiment-${idShort}-${safe}`);
}
|
|
72
|
-
/**
 * Lookup table: phase name → filename of the prompt template for that phase.
 * Phases that are not listed here are handled by the code that reads the table.
 */
const PHASE_TO_PROMPT_FILE = {
  "create-story": "create-story.md",
  "dev-story": "dev-story.md",
  "code-review": "code-review.md",
  // The "fix" phase is the only one whose file name differs from the phase name.
  fix: "fix-story.md",
};
|
|
81
|
-
/**
 * Build the modification directive to append to a prompt file.
 * The directive is a HTML comment (preceded by a newline) that instructs the
 * agent to apply the recommended optimization strategy for this experiment.
 *
 * These are template-based strategies per recommendation type.
 * Future iterations could use an LLM to generate novel modifications.
 *
 * @param {object} rec - Recommendation; reads `type`, `story_key`, `phase`
 *   and, depending on the type, `delta_pct` or `review_cycles` (rendered as
 *   "?" when missing).
 * @returns {string} The directive text to append.
 * @throws {Error} When `rec.type` is not one of the supported types. Without
 *   this, `undefined` would be returned and string concatenation at the call
 *   site would append the literal text "undefined" to the prompt file.
 */
function buildModificationDirective(rec) {
  switch (rec.type) {
    case "token_regression":
      return `\n<!-- supervisor-experiment: token_regression fix — compress context injection, limit summaries to key points (story: ${rec.story_key}, phase: ${rec.phase}, delta: +${rec.delta_pct ?? "?"}%) -->`;
    case "review_cycles":
      return `\n<!-- supervisor-experiment: review_cycles fix — accept passing implementations with minor style issues, reduce strictness for non-critical checks (story: ${rec.story_key}, cycles: ${rec.review_cycles ?? "?"}) -->`;
    case "timing_bottleneck":
      return `\n<!-- supervisor-experiment: timing_bottleneck fix — reduce max_turns by 20% for this phase to enforce time-boxing (story: ${rec.story_key}, phase: ${rec.phase}) -->`;
    default:
      throw new Error(`Unsupported recommendation type: ${String(rec.type)}`);
  }
}
|
|
99
|
-
/**
 * Locate the prompt template that belongs to a recommendation's phase.
 *
 * Phases listed in PHASE_TO_PROMPT_FILE use the mapped filename; any other
 * phase falls back to `<phase>.md`. The result is
 * `<projectRoot>/packs/<pack>/prompts/<filename>`.
 *
 * @param {object} rec - Recommendation; only `rec.phase` is read.
 * @param {string} projectRoot - Root directory containing `packs/`.
 * @param {string} pack - Pack directory name.
 * @returns {string} Path to the prompt file.
 */
function resolvePromptFile(rec, projectRoot, pack) {
  const mapped = PHASE_TO_PROMPT_FILE[rec.phase];
  let promptName;
  if (mapped === null || mapped === undefined) {
    promptName = `${rec.phase}.md`;
  } else {
    promptName = mapped;
  }
  return join(projectRoot, "packs", pack, "prompts", promptName);
}
|
|
106
|
-
/** Percentage increase above which a non-target metric counts as regressed. */
const REGRESSION_THRESHOLD_PCT = 20;
/**
 * Determine the experiment verdict based on metric deltas and the recommendation type.
 *
 * Rules:
 * - IMPROVED: target metric improved (negative delta) AND no other metric regressed by >20%
 * - MIXED: target metric improved BUT at least one other metric regressed by >20%
 * - REGRESSED: target metric did not improve (or could not be measured)
 *
 * @param {object} rec - Recommendation; only `rec.type` is read.
 * @param {object} deltas - Percentage deltas (`tokens_pct`, `review_cycles_pct`,
 *   `wall_clock_pct`), each a number or null.
 * @returns {"IMPROVED"|"MIXED"|"REGRESSED"} The verdict.
 */
function determineVerdict(rec, deltas) {
  if (!isTargetMetricImproved(rec, deltas)) return "REGRESSED";
  return hasNonTargetRegression(rec, deltas) ? "MIXED" : "IMPROVED";
}
/**
 * Whether the metric targeted by the recommendation type has a measurable,
 * strictly negative delta. Unknown recommendation types never count as improved.
 */
function isTargetMetricImproved(rec, deltas) {
  switch (rec.type) {
    case "token_regression": return deltas.tokens_pct !== null && deltas.tokens_pct < 0;
    case "review_cycles": return deltas.review_cycles_pct !== null && deltas.review_cycles_pct < 0;
    case "timing_bottleneck": return deltas.wall_clock_pct !== null && deltas.wall_clock_pct < 0;
    default: return false;
  }
}
/**
 * Whether any metric other than the recommendation's own target grew by more
 * than REGRESSION_THRESHOLD_PCT percent. Unmeasured (null) deltas are ignored.
 *
 * Tokens, review cycles and wall clock are each checked unless they are the
 * target. Wall clock was previously never checked, so a large slowdown could
 * still be reported as IMPROVED despite the rule documented on determineVerdict.
 * The cost delta is not consulted here.
 */
function hasNonTargetRegression(rec, deltas) {
  const regressed = (value) => value !== null && value > REGRESSION_THRESHOLD_PCT;
  if (rec.type !== "token_regression" && regressed(deltas.tokens_pct)) return true;
  if (rec.type !== "review_cycles" && regressed(deltas.review_cycles_pct)) return true;
  if (rec.type !== "timing_bottleneck" && regressed(deltas.wall_clock_pct)) return true;
  return false;
}
|
|
134
|
-
/**
 * Compare baseline and experiment run metrics as percentage changes.
 *
 * Each delta is `(experiment - baseline) / baseline * 100`, rounded to one
 * decimal place; a negative value means the experiment used less. A delta is
 * null when its baseline value is 0. Missing (nullish) metric fields are
 * treated as 0. Tokens are the sum of input and output tokens.
 *
 * @param {object} baselineMetrics - Metrics of the baseline run.
 * @param {object} experimentMetrics - Metrics of the experiment run.
 * @returns {{ tokens_pct: number|null, cost_pct: number|null, review_cycles_pct: number|null, wall_clock_pct: number|null }}
 */
function computeDeltas(baselineMetrics, experimentMetrics) {
  function percentChange(before, after) {
    if (before === 0) return null;
    return Math.round((after - before) / before * 100 * 10) / 10;
  }
  const totalTokens = (metrics) => (metrics.total_input_tokens ?? 0) + (metrics.total_output_tokens ?? 0);
  const fieldChange = (field) => percentChange(baselineMetrics[field] ?? 0, experimentMetrics[field] ?? 0);

  const tokensPct = percentChange(totalTokens(baselineMetrics), totalTokens(experimentMetrics));
  const costPct = fieldChange("total_cost_usd");
  const reviewCyclesPct = fieldChange("total_review_cycles");
  const wallClockPct = fieldChange("wall_clock_seconds");
  return {
    tokens_pct: tokensPct,
    cost_pct: costPct,
    review_cycles_pct: reviewCyclesPct,
    wall_clock_pct: wallClockPct
  };
}
|
|
149
|
-
/**
 * Render a percentage delta for display.
 *
 * @param {number|null} pct - Delta to format.
 * @returns {string} "N/A" for null; otherwise the number followed by "%",
 *   with a leading "+" for strictly positive values (e.g. "+15%", "-20%", "0%").
 */
function fmtPct(pct) {
  if (pct === null) {
    return "N/A";
  }
  let sign = "";
  if (pct > 0) {
    sign = "+";
  }
  return `${sign}${pct}%`;
}
|
|
156
|
-
/**
 * Render the markdown body of the GitHub PR for an experiment: the verdict,
 * the recommendation details, a table of metric deltas, and the raw run and
 * branch identifiers.
 * Used by createPR() when the verdict is IMPROVED or MIXED (AC5).
 *
 * @param {object} result - Experiment result; reads `recommendation`,
 *   `verdict`, `deltas`, `baselineRunId`, `experimentRunId` and `branchName`.
 * @returns {string} Newline-joined markdown.
 */
function buildPRBody(result) {
  const rec = result.recommendation;
  const deltas = result.deltas;
  // One table row per metric, in display order.
  const metricRows = [
    ["Tokens", deltas.tokens_pct],
    ["Cost", deltas.cost_pct],
    ["Review Cycles", deltas.review_cycles_pct],
    ["Wall Clock", deltas.wall_clock_pct]
  ].map(([label, value]) => `| ${label} | ${fmtPct(value)} |`);
  const header = [
    "## Experiment Results",
    "",
    `**Verdict**: \`${result.verdict}\``,
    ""
  ];
  const recommendationSection = [
    "### Recommendation",
    `- **Type**: ${rec.type}`,
    `- **Story**: ${rec.story_key}`,
    `- **Phase**: ${rec.phase}`,
    `- **Description**: ${rec.description}`,
    ""
  ];
  const metricsSection = [
    "### Metrics Comparison",
    "",
    "| Metric | Delta |",
    "|--------|-------|",
    ...metricRows,
    ""
  ];
  const rawDataSection = [
    "### Raw Data",
    `- Baseline Run: \`${result.baselineRunId}\``,
    `- Experiment Run: \`${result.experimentRunId ?? "N/A"}\``,
    `- Branch: \`${result.branchName}\``
  ];
  return [
    ...header,
    ...recommendationSection,
    ...metricsSection,
    ...rawDataSection
  ].join("\n");
}
|
|
188
|
-
/**
 * Render one experiment result as a markdown section for the audit log.
 *
 * The section lists the hypothesis, the applied modification, the verdict and
 * metric deltas, an optional error line, an optional PR link, and ends with a
 * `---` separator followed by a trailing newline.
 *
 * @param {object} result - Experiment result; reads `recommendation`,
 *   `verdict`, `deltas`, `error` and `prLink`.
 * @param {string} [timestamp] - Timestamp shown in the heading; defaults to
 *   the current time in ISO format.
 * @returns {string} The markdown entry.
 */
function buildAuditLogEntry(result, timestamp = new Date().toISOString()) {
  const rec = result.recommendation;
  const deltas = result.deltas;
  // Optional parts contribute zero lines when their value is falsy.
  const errorLines = result.error ? [`- Error: ${result.error}`] : [];
  const prLines = result.prLink ? [`**PR**: ${result.prLink}`, ""] : [];
  return [
    `## Experiment: ${rec.short_desc} (${timestamp})`,
    "",
    `**Hypothesis**: ${rec.description}`,
    "",
    `**Modification**: Applied \`${rec.type}\` strategy to \`${rec.phase}\` prompt`,
    "",
    "**Results**:",
    `- Verdict: ${result.verdict}`,
    `- Tokens delta: ${fmtPct(deltas.tokens_pct)}`,
    `- Cost delta: ${fmtPct(deltas.cost_pct)}`,
    `- Review cycles delta: ${fmtPct(deltas.review_cycles_pct)}`,
    `- Wall clock delta: ${fmtPct(deltas.wall_clock_pct)}`,
    ...errorLines,
    "",
    ...prLines,
    "---",
    ""
  ].join("\n");
}
|
|
214
|
-
/**
 * Create an Experimenter with the given configuration and injectable dependencies.
 *
 * The Experimenter implements the AC2/AC3/AC4 state machine:
 *   SELECTING → BRANCHING → MODIFYING → RUNNING → COMPARING → REPORTING
 *
 * @param {object} config - Reads `projectRoot`, `pack`, `maxExperiments` and
 *   `tokenBudgetMultiplier`.
 * @param {object} [deps] - Optional overrides for `git`, `spawn`, `runStory`,
 *   `getRunMetrics`, `getStoryMetrics`, `readFile`, `writeFile`, `mkdir` and
 *   `log`. `runStory` has no working default: the fallback throws.
 * @returns {{ runExperiments: Function }} Object exposing
 *   `runExperiments(db, recommendations, baselineRunId)`.
 */
function createExperimenter(config, deps) {
  const resolvedDeps = {
    git: deps?.git ?? spawnGit,
    spawn: deps?.spawn ?? spawnCommand,
    runStory: deps?.runStory ?? (async () => {
      throw new Error("runStory dependency not provided");
    }),
    getRunMetrics: deps?.getRunMetrics ?? getRunMetrics,
    getStoryMetrics: deps?.getStoryMetrics ?? getStoryMetricsForRun,
    readFile: deps?.readFile ?? ((p) => readFile(p, "utf-8")),
    writeFile: deps?.writeFile ?? ((p, c) => writeFile(p, c, "utf-8")),
    mkdir: deps?.mkdir ?? ((p, o) => mkdir(p, o).then(() => void 0)),
    log: deps?.log ?? ((msg) => process.stdout.write(msg + "\n"))
  };
  const { git, spawn: sp, runStory, getRunMetrics: getRun, getStoryMetrics: getStory, readFile: rf, writeFile: wf, mkdir: md, log } = resolvedDeps;

  /** Name of the branch currently checked out in the project root. */
  async function getCurrentBranch() {
    const result = await git(["rev-parse", "--abbrev-ref", "HEAD"], { cwd: config.projectRoot });
    return result.stdout.trim();
  }

  /**
   * Create an isolated git worktree for an experiment.
   * Uses `git worktree add <path> -b <branch>` so the main working tree is never affected.
   * Throws when git exits with a non-zero code.
   */
  async function createWorktree(worktreePath, branchName) {
    const result = await git(["worktree", "add", worktreePath, "-b", branchName], { cwd: config.projectRoot });
    if (result.code !== 0) throw new Error(`Failed to create worktree ${worktreePath}: ${result.stderr}`);
  }

  /**
   * Remove an experiment worktree after the experiment completes.
   * Called from the finally block whenever the worktree was created (regardless of verdict).
   * Uses --force to handle cases where the worktree directory may be dirty.
   * A non-zero git exit code is only logged as a warning.
   */
  async function removeWorktree(worktreePath) {
    const result = await git(["worktree", "remove", worktreePath, "--force"], { cwd: config.projectRoot });
    if (result.code !== 0) log(`[experimenter] Warning: could not remove worktree ${worktreePath}: ${result.stderr}`);
    else log(`[experimenter] Removed experiment worktree: ${worktreePath}`);
  }

  /**
   * Stage and commit the modified prompt file inside the worktree.
   * Throws when either git step exits with a non-zero code.
   */
  async function commitModification(rec, filePath, cwd) {
    const addResult = await git(["add", filePath], { cwd });
    // Fail on the staging step itself, so the reported error names the real
    // cause instead of whatever the following commit complains about.
    if (addResult.code !== 0) throw new Error(`Failed to stage modification: ${addResult.stderr}`);
    const message = `supervisor-experiment: ${rec.type} fix for ${rec.story_key}/${rec.phase}\n\nRecommendation: ${rec.description}`;
    const result = await git(["commit", "-m", message], { cwd });
    if (result.code !== 0) throw new Error(`Failed to commit modification: ${result.stderr}`);
  }

  /** Force-delete an experiment branch; a non-zero git exit code is only logged. */
  async function deleteBranch(branchName) {
    const result = await git(["branch", "-D", branchName], { cwd: config.projectRoot });
    if (result.code !== 0) log(`[experimenter] Warning: could not delete branch ${branchName}: ${result.stderr}`);
    else log(`[experimenter] Deleted REGRESSED experiment branch: ${branchName}`);
  }

  /**
   * Open a GitHub PR for the experiment via `gh pr create`.
   * Returns the PR URL (the command's stdout), or null when the command exits
   * non-zero (e.g. gh is not installed or not authenticated), in which case a
   * warning is logged instead of throwing.
   */
  async function createPR(result) {
    const { recommendation: rec, branchName, verdict } = result;
    const title = `[supervisor] ${rec.description}`;
    const body = buildPRBody(result);
    const ghResult = await sp("gh", [
      "pr",
      "create",
      "--title",
      title,
      "--body",
      body,
      "--label",
      "supervisor",
      "--label",
      "automated-experiment",
      "--head",
      branchName
    ], { cwd: config.projectRoot });
    if (ghResult.code !== 0) {
      log(`[experimenter] Warning: gh pr create failed (verdict: ${verdict}): ${ghResult.stderr}`);
      log(`[experimenter] Is the gh CLI installed and authenticated?`);
      return null;
    }
    const prUrl = ghResult.stdout.trim();
    log(`[experimenter] PR created: ${prUrl}`);
    return prUrl;
  }

  /**
   * Append an experiment result to the audit log file.
   * Path: _bmad-output/supervisor-reports/<baselineRunId>-experiments.md
   * Append-only: reads existing content and appends the new entry. When the
   * file cannot be read, a fresh header is used as the existing content.
   * Never throws: any failure is logged as a warning.
   */
  async function appendExperimentLog(result) {
    const { baselineRunId } = result;
    const reportDir = join(config.projectRoot, "_bmad-output", "supervisor-reports");
    const logPath = join(reportDir, `${baselineRunId}-experiments.md`);
    try {
      await md(reportDir, { recursive: true });
      let existing = "";
      try {
        existing = await rf(logPath);
      } catch {
        existing = `# Supervisor Experiment Log\n\nRun ID: \`${baselineRunId}\`\n\n`;
      }
      const entry = buildAuditLogEntry(result);
      await wf(logPath, existing + entry);
      log(`[experimenter] Audit log updated: ${logPath}`);
    } catch (err) {
      log(`[experimenter] Warning: could not write audit log: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  /**
   * Check if the experiment run exceeded the token budget cap (AC6).
   * Budget cap = baseline story tokens × tokenBudgetMultiplier.
   *
   * Returns false (budget exceeded) if the experiment used more than the cap.
   * Returns true (within budget) if the cap is satisfied or metrics are unavailable
   * (story missing in either run, zero baseline tokens, or metrics lookup failed).
   */
  async function isWithinTokenBudget(db, storyKey, baselineRunId, experimentRunId) {
    try {
      const baselineStories = await getStory(db, baselineRunId);
      const experimentStories = await getStory(db, experimentRunId);
      const baselineStory = baselineStories.find((m) => m.story_key === storyKey);
      const experimentStory = experimentStories.find((m) => m.story_key === storyKey);
      if (!baselineStory || !experimentStory) return true;
      const baselineTokens = (baselineStory.input_tokens ?? 0) + (baselineStory.output_tokens ?? 0);
      const experimentTokens = (experimentStory.input_tokens ?? 0) + (experimentStory.output_tokens ?? 0);
      if (baselineTokens === 0) return true;
      const cap = baselineTokens * config.tokenBudgetMultiplier;
      const withinBudget = experimentTokens <= cap;
      if (!withinBudget) log(`[experimenter] Token budget exceeded: experiment used ${experimentTokens} tokens, cap is ${cap} (${config.tokenBudgetMultiplier}x baseline of ${baselineTokens})`);
      return withinBudget;
    } catch {
      return true;
    }
  }

  /**
   * Run a single experiment end to end and return its result record.
   *
   * Steps: create a worktree and branch, append the modification directive to
   * the phase prompt and commit it, run the story in the worktree, compare
   * metrics against the baseline, then report. Any error in those steps is
   * caught and yields a REGRESSED verdict with the message stored in `error`.
   * The worktree is always removed. REGRESSED branches are deleted, while
   * IMPROVED/MIXED results get a PR. The result is appended to the audit log
   * and recorded as a decision (best-effort).
   */
  async function runOneExperiment(db, rec, baselineRunId) {
    const branchName = buildBranchName(baselineRunId, rec.short_desc);
    const worktreePath = buildWorktreePath(config.projectRoot, baselineRunId, rec.short_desc);
    let experimentRunId = null;
    // Tracks progress only so a caught error can name the phase it happened in.
    let currentPhase = "SELECTING";
    let verdict = "REGRESSED";
    let deltas = {
      tokens_pct: null,
      cost_pct: null,
      review_cycles_pct: null,
      wall_clock_pct: null
    };
    let caughtError;
    let worktreeCreated = false;
    try {
      currentPhase = "BRANCHING";
      log(`[experimenter] Creating worktree: ${worktreePath} on branch ${branchName}`);
      await createWorktree(worktreePath, branchName);
      worktreeCreated = true;

      currentPhase = "MODIFYING";
      // The prompt is resolved inside the worktree, not the main checkout.
      const promptFile = resolvePromptFile(rec, worktreePath, config.pack);
      const directive = buildModificationDirective(rec);
      const originalContent = await rf(promptFile);
      await wf(promptFile, originalContent + directive);
      await commitModification(rec, promptFile, worktreePath);
      log(`[experimenter] Applied modification to ${promptFile}`);

      currentPhase = "RUNNING";
      log(`[experimenter] Running single-story experiment for ${rec.story_key}`);
      const { runId, exitCode } = await runStory({
        stories: rec.story_key,
        projectRoot: worktreePath,
        pack: config.pack
      });
      experimentRunId = runId;
      log(`[experimenter] Experiment run completed: ${runId} (exit: ${exitCode})`);

      currentPhase = "COMPARING";
      const baselineMetrics = await getRun(db, baselineRunId);
      const experimentMetrics = await getRun(db, runId);
      if (!baselineMetrics || !experimentMetrics) {
        log(`[experimenter] Warning: metrics unavailable for comparison`);
        verdict = "REGRESSED";
        caughtError = "Could not retrieve metrics for comparison";
      } else {
        deltas = computeDeltas(baselineMetrics, experimentMetrics);
        const withinBudget = await isWithinTokenBudget(db, rec.story_key, baselineRunId, runId);
        if (!withinBudget) {
          verdict = "REGRESSED";
          caughtError = `Token budget cap exceeded (${config.tokenBudgetMultiplier}x baseline)`;
          log(`[experimenter] Aborting experiment: token budget exceeded`);
        } else verdict = determineVerdict(rec, deltas);
      }

      currentPhase = "REPORTING";
      log(`[experimenter] Verdict for ${rec.story_key}/${rec.type}: ${verdict}`);
      log(`[experimenter] Deltas: tokens=${deltas.tokens_pct}% cost=${deltas.cost_pct}% cycles=${deltas.review_cycles_pct}% clock=${deltas.wall_clock_pct}%`);
    } catch (err) {
      caughtError = err instanceof Error ? err.message : String(err);
      verdict = "REGRESSED";
      log(`[experimenter] Error in phase ${currentPhase}: ${caughtError}`);
    } finally {
      if (worktreeCreated) {
        try {
          await removeWorktree(worktreePath);
        } catch {
          log(`[experimenter] Warning: could not remove worktree ${worktreePath}`);
        }
      }
    }

    // Fields shared by the PR body and the final result record.
    const baseResult = {
      recommendation: rec,
      branchName,
      baselineRunId,
      experimentRunId,
      verdict,
      deltas,
      currentPhase: "REPORTING",
      ...caughtError !== void 0 ? { error: caughtError } : {}
    };
    let prLink = null;
    if (verdict === "REGRESSED" && worktreeCreated) await deleteBranch(branchName);
    else if (verdict === "IMPROVED" || verdict === "MIXED") prLink = await createPR(baseResult);
    const finalResult = { ...baseResult, prLink };
    await appendExperimentLog(finalResult);

    try {
      // Pick the recommendation's own "before" value and the matching delta.
      let targetMetricValue;
      let targetPct;
      if (rec.type === "token_regression") {
        targetMetricValue = rec.tokens_actual ?? 0;
        targetPct = deltas.tokens_pct;
      } else if (rec.type === "review_cycles") {
        targetMetricValue = rec.review_cycles ?? 0;
        targetPct = deltas.review_cycles_pct;
      } else {
        targetMetricValue = rec.timing_seconds ?? 0;
        targetPct = deltas.wall_clock_pct;
      }
      // "after" is derived by applying the percentage delta to "before";
      // without a measured delta it is reported as unchanged.
      const afterValue = targetPct !== null ? Math.round(targetMetricValue * (1 + targetPct / 100)) : targetMetricValue;
      await createDecision(db, {
        pipeline_run_id: baselineRunId,
        phase: "supervisor",
        category: EXPERIMENT_RESULT,
        key: `experiment:${baselineRunId}:${Date.now()}`,
        value: JSON.stringify({
          target_metric: rec.type,
          before: targetMetricValue,
          after: afterValue,
          verdict,
          branch_name: verdict === "IMPROVED" || verdict === "MIXED" ? branchName : null
        }),
        rationale: `Experiment for ${rec.story_key}/${rec.phase}: ${rec.description}. Verdict: ${verdict}.`
      });
    } catch (err) {
      // Recording the decision stays best-effort, but the failure is no
      // longer invisible.
      log(`[experimenter] Warning: could not record experiment decision: ${err instanceof Error ? err.message : String(err)}`);
    }
    return finalResult;
  }

  return {
    /**
     * Run experiments sequentially for the first `config.maxExperiments`
     * recommendations and return their results in order. Returns an empty
     * array when there are no recommendations.
     */
    async runExperiments(db, recommendations, baselineRunId) {
      if (recommendations.length === 0) return [];
      const currentBranch = await getCurrentBranch();
      const results = [];
      const limit = Math.min(recommendations.length, config.maxExperiments);
      log(`[experimenter] Starting experiment cycle: ${limit} of ${recommendations.length} recommendations`);
      log(`[experimenter] Current branch: ${currentBranch} (worktrees will not affect this)`);
      for (let i = 0; i < limit; i++) {
        const rec = recommendations[i];
        log(`[experimenter] Experiment ${i + 1}/${limit}: ${rec.type} for ${rec.story_key}/${rec.phase}`);
        const result = await runOneExperiment(db, rec, baselineRunId);
        results.push(result);
      }
      log(`[experimenter] Experiment cycle complete: ${results.filter((r) => r.verdict === "IMPROVED").length} improved, ${results.filter((r) => r.verdict === "MIXED").length} mixed, ${results.filter((r) => r.verdict === "REGRESSED").length} regressed`);
      return results;
    }
  };
}
|
|
500
|
-
|
|
501
|
-
//#endregion
|
|
502
|
-
export { createExperimenter };
|
|
503
|
-
//# sourceMappingURL=experimenter-D_N_7ZF3.js.map
|