@tritard/waterbrother 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/agent.js +2 -1
- package/src/cli.js +73 -22
- package/src/experiment.js +220 -57
package/package.json
CHANGED
package/src/agent.js
CHANGED
|
@@ -74,7 +74,8 @@ When you use tools:
|
|
|
74
74
|
- avoid hype such as "premium", "luxurious", "studio-grade", or "improved!"
|
|
75
75
|
- Explain what you changed and why.
|
|
76
76
|
- Never claim you ran commands you did not run.
|
|
77
|
-
- If a tool fails, show the failure and recover
|
|
77
|
+
- If a tool fails, show the failure and recover.
|
|
78
|
+
- You are a coding tool for real software engineering work. If a request is clearly a joke, hypothetical, non-technical, or not related to actual software development, respond conversationally WITHOUT using any tools. Do not create files, write scripts, or make edits for non-engineering requests. Examples of things you should NOT build: personality generators, dating advice scripts, joke apps, horoscope generators, or any request that is clearly not serious engineering work.`;
|
|
78
79
|
|
|
79
80
|
const COMPACTION_SYSTEM_PROMPT = `You summarize coding assistant transcripts for context compaction.
|
|
80
81
|
Output concise markdown with these sections:
|
package/src/cli.js
CHANGED
|
@@ -34,7 +34,7 @@ import { createPanelRenderer, buildPanelState } from "./panel.js";
|
|
|
34
34
|
import { deriveTaskNameFromPrompt, nextActionsForState, routeNaturalInput } from "./router.js";
|
|
35
35
|
import { compressEpisode, saveEpisode, loadRecentEpisodes, findRelevantEpisodes, buildEpisodicMemoryBlock, buildReminderBlock } from "./episodic.js";
|
|
36
36
|
import { formatPlanForDisplay } from "./planner.js";
|
|
37
|
-
import { parseCharterFromGoal, runExperimentLoop, formatExperimentSummary } from "./experiment.js";
|
|
37
|
+
import { parseCharterFromGoal, runExperimentLoop, formatExperimentSummary, gitReturnToBranch } from "./experiment.js";
|
|
38
38
|
|
|
39
39
|
const execFileAsync = promisify(execFile);
|
|
40
40
|
const PACKAGE_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
|
|
@@ -6144,7 +6144,7 @@ async function promptLoop(agent, session, context) {
|
|
|
6144
6144
|
// Build charter
|
|
6145
6145
|
const charter = parseCharterFromGoal(goalArg);
|
|
6146
6146
|
|
|
6147
|
-
// Ask for metric command
|
|
6147
|
+
// Ask for metric command
|
|
6148
6148
|
if (!charter.metric.command) {
|
|
6149
6149
|
try {
|
|
6150
6150
|
const metricCmd = await promptLine("metric command (e.g. npm test, python bench.py): ", { input: process.stdin, output: process.stdout });
|
|
@@ -6159,49 +6159,69 @@ async function promptLoop(agent, session, context) {
|
|
|
6159
6159
|
}
|
|
6160
6160
|
}
|
|
6161
6161
|
|
|
6162
|
-
// Ask for attempt
|
|
6162
|
+
// Ask for attempt budget (0 = infinite, runs until Ctrl+C or time limit)
|
|
6163
6163
|
try {
|
|
6164
|
-
const attemptsStr = await promptLine(
|
|
6164
|
+
const attemptsStr = await promptLine("max attempts (0 = run until interrupted) [0]: ", { input: process.stdin, output: process.stdout });
|
|
6165
6165
|
const parsed = parseInt(attemptsStr.trim(), 10);
|
|
6166
|
-
if (parsed > 0) charter.budget.maxAttempts =
|
|
6166
|
+
if (parsed > 0) charter.budget.maxAttempts = parsed;
|
|
6167
6167
|
} catch {}
|
|
6168
6168
|
|
|
6169
|
+
// Ask for time budget
|
|
6170
|
+
try {
|
|
6171
|
+
const timeStr = await promptLine(`time limit in minutes [${charter.budget.maxMinutes}]: `, { input: process.stdin, output: process.stdout });
|
|
6172
|
+
const parsed = parseInt(timeStr.trim(), 10);
|
|
6173
|
+
if (parsed > 0) charter.budget.maxMinutes = parsed;
|
|
6174
|
+
} catch {}
|
|
6175
|
+
|
|
6176
|
+
const isInfinite = !charter.budget.maxAttempts || charter.budget.maxAttempts <= 0;
|
|
6169
6177
|
console.log(`────────────────────────────────────────────────────────────`);
|
|
6170
6178
|
console.log(`experiment: ${charter.goal}`);
|
|
6171
6179
|
console.log(`metric: ${charter.metric.command} (${charter.metric.direction} is better)`);
|
|
6172
|
-
console.log(`budget: ${charter.budget.maxAttempts} attempts, ${charter.budget.maxMinutes}m max`);
|
|
6173
|
-
|
|
6174
|
-
for (const c of charter.constraints) console.log(` constraint: ${c}`);
|
|
6175
|
-
}
|
|
6180
|
+
console.log(`budget: ${isInfinite ? "∞ attempts" : `${charter.budget.maxAttempts} attempts`}, ${charter.budget.maxMinutes}m max`);
|
|
6181
|
+
console.log(`simplicity bias: ${charter.simplicityBias ? "on" : "off"}`);
|
|
6176
6182
|
console.log(`────────────────────────────────────────────────────────────`);
|
|
6177
|
-
console.log("measuring baseline...");
|
|
6183
|
+
console.log("creating experiment branch and measuring baseline...");
|
|
6178
6184
|
|
|
6179
6185
|
const spinner = createProgressSpinner("running experiment...");
|
|
6186
|
+
let interrupted = false;
|
|
6187
|
+
|
|
6188
|
+
const abortController = typeof AbortController === "function" ? new AbortController() : null;
|
|
6189
|
+
const detachInterrupt = createInterruptListener(() => {
|
|
6190
|
+
interrupted = true;
|
|
6191
|
+
spinner.setLabel("stopping after current attempt...");
|
|
6192
|
+
}, { enableEsc: process.stdin.isTTY, shouldIgnoreEsc: () => approvalPromptActive });
|
|
6180
6193
|
|
|
6181
6194
|
try {
|
|
6182
6195
|
const results = await runExperimentLoop({
|
|
6183
6196
|
charter,
|
|
6184
6197
|
cwd: context.cwd,
|
|
6198
|
+
tag: goalArg.toLowerCase().replace(/[^a-z0-9]+/g, "-").slice(0, 30),
|
|
6185
6199
|
handlers: {
|
|
6186
6200
|
onBaseline(value) {
|
|
6187
6201
|
spinner.stop();
|
|
6188
6202
|
console.log(`baseline: ${value}`);
|
|
6189
6203
|
console.log(`────────────────────────────────────────────────────────────`);
|
|
6190
6204
|
},
|
|
6191
|
-
|
|
6192
|
-
|
|
6205
|
+
onInterrupted() {
|
|
6206
|
+
return interrupted;
|
|
6207
|
+
},
|
|
6208
|
+
async planChange({ goal, constraints, metric, previousAttempts, simplicityBias }) {
|
|
6193
6209
|
const model = context.runtime.plannerModel || agent.getModel();
|
|
6194
6210
|
const previousSummary = previousAttempts.map((a) =>
|
|
6195
|
-
`attempt ${a.number}: ${a.hypothesis} → ${a.
|
|
6211
|
+
`attempt ${a.number}: ${a.hypothesis} → ${a.status} (${a.value})`
|
|
6196
6212
|
).join("\n");
|
|
6197
6213
|
|
|
6214
|
+
const simplicityNote = simplicityBias
|
|
6215
|
+
? "\n\nSimplicity criterion: prefer simpler changes. A small improvement from deleting code is better than a large improvement from adding complexity. If improvement is ~0 but code is simpler, that's a win."
|
|
6216
|
+
: "";
|
|
6217
|
+
|
|
6198
6218
|
const prompt = [
|
|
6199
6219
|
`Goal: ${goal}`,
|
|
6200
6220
|
`Metric: ${metric.command} (${metric.direction} is better, current best: ${metric.currentBest})`,
|
|
6201
6221
|
constraints.length > 0 ? `Constraints: ${constraints.join("; ")}` : "",
|
|
6202
6222
|
previousSummary ? `Previous attempts:\n${previousSummary}` : "",
|
|
6203
|
-
"Propose ONE specific code change
|
|
6204
|
-
|
|
6223
|
+
"Propose ONE specific code change. Be concrete: which file, which function, what change.",
|
|
6224
|
+
`Respond with JSON: { "hypothesis": "one-line summary", "prompt": "detailed executor instructions" }${simplicityNote}`
|
|
6205
6225
|
].filter(Boolean).join("\n\n");
|
|
6206
6226
|
|
|
6207
6227
|
try {
|
|
@@ -6211,32 +6231,47 @@ async function promptLoop(agent, session, context) {
|
|
|
6211
6231
|
baseUrl: context.runtime.baseUrl,
|
|
6212
6232
|
model,
|
|
6213
6233
|
messages: [
|
|
6214
|
-
{ role: "system", content: "You are an
|
|
6234
|
+
{ role: "system", content: "You are an autonomous researcher optimizing code. Each attempt must try something different from previous attempts. Learn from kept vs discarded results. Respond with JSON only." },
|
|
6215
6235
|
{ role: "user", content: prompt }
|
|
6216
6236
|
],
|
|
6217
|
-
temperature: 0.
|
|
6237
|
+
temperature: 0.6
|
|
6218
6238
|
});
|
|
6219
|
-
return completion.json || { hypothesis:
|
|
6239
|
+
return completion.json || { hypothesis: "attempt", prompt: goal };
|
|
6220
6240
|
} catch {
|
|
6221
|
-
return { hypothesis:
|
|
6241
|
+
return { hypothesis: "attempt", prompt: goal };
|
|
6222
6242
|
}
|
|
6223
6243
|
},
|
|
6224
6244
|
async executeChange({ prompt }) {
|
|
6225
6245
|
spinner.setLabel("implementing change...");
|
|
6226
|
-
// Use the executor model to make the change
|
|
6227
6246
|
await agent.runTurn(prompt, {
|
|
6228
6247
|
onAssistantDelta() {},
|
|
6229
6248
|
onToolStart() { spinner.setLabel("editing..."); },
|
|
6230
|
-
onToolEnd() { spinner.setLabel("implementing
|
|
6249
|
+
onToolEnd() { spinner.setLabel("implementing..."); }
|
|
6231
6250
|
});
|
|
6232
6251
|
await agent.toolRuntime.completeTurn({});
|
|
6233
6252
|
},
|
|
6253
|
+
async fixCrash({ error, prompt }) {
|
|
6254
|
+
// Try to fix the crash — give the agent the error and ask it to fix
|
|
6255
|
+
spinner.setLabel("fixing crash...");
|
|
6256
|
+
try {
|
|
6257
|
+
await agent.runTurn(
|
|
6258
|
+
`The previous change crashed with this error:\n${error}\n\nFix the issue. The original goal was: ${prompt}`,
|
|
6259
|
+
{ onAssistantDelta() {}, onToolStart() {}, onToolEnd() {} }
|
|
6260
|
+
);
|
|
6261
|
+
await agent.toolRuntime.completeTurn({});
|
|
6262
|
+
return true;
|
|
6263
|
+
} catch {
|
|
6264
|
+
return false;
|
|
6265
|
+
}
|
|
6266
|
+
},
|
|
6234
6267
|
onAttemptStart(number, hypothesis) {
|
|
6235
6268
|
spinner.setLabel(`attempt ${number}: ${hypothesis}`);
|
|
6236
6269
|
console.log(`\n ${number}. trying: ${hypothesis}`);
|
|
6237
6270
|
},
|
|
6238
6271
|
onAttemptEnd(attempt) {
|
|
6239
|
-
const icon = attempt.
|
|
6272
|
+
const icon = attempt.status === "keep" ? green("✓ kept")
|
|
6273
|
+
: attempt.status === "crash" ? yellow("💥 crash")
|
|
6274
|
+
: red("✗ reverted");
|
|
6240
6275
|
const value = attempt.value !== null ? ` → ${attempt.value}` : " → failed";
|
|
6241
6276
|
console.log(` ${icon}${value}`);
|
|
6242
6277
|
},
|
|
@@ -6246,8 +6281,24 @@ async function promptLoop(agent, session, context) {
|
|
|
6246
6281
|
}
|
|
6247
6282
|
});
|
|
6248
6283
|
|
|
6284
|
+
detachInterrupt();
|
|
6249
6285
|
console.log("\n" + formatExperimentSummary(results));
|
|
6286
|
+
|
|
6287
|
+
// Offer to return to original branch or stay
|
|
6288
|
+
if (results.originalBranch && results.branch) {
|
|
6289
|
+
console.log(`\nexperiment branch: ${results.branch}`);
|
|
6290
|
+
console.log(`original branch: ${results.originalBranch}`);
|
|
6291
|
+
try {
|
|
6292
|
+
const stay = await promptYesNo("Stay on experiment branch?", { input: process.stdin, output: process.stdout });
|
|
6293
|
+
if (!stay) {
|
|
6294
|
+
const { gitReturnToBranch } = await import("./experiment.js");
|
|
6295
|
+
await gitReturnToBranch({ cwd: context.cwd, branch: results.originalBranch });
|
|
6296
|
+
console.log(`returned to ${results.originalBranch}`);
|
|
6297
|
+
}
|
|
6298
|
+
} catch {}
|
|
6299
|
+
}
|
|
6250
6300
|
} catch (error) {
|
|
6301
|
+
detachInterrupt();
|
|
6251
6302
|
spinner.stop();
|
|
6252
6303
|
console.log(`experiment failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
6253
6304
|
}
|
package/src/experiment.js
CHANGED
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
import { execFile } from "node:child_process";
|
|
2
|
+
import fs from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
2
4
|
import { promisify } from "node:util";
|
|
3
5
|
|
|
4
6
|
const execFileAsync = promisify(execFile);
|
|
5
7
|
|
|
8
|
+
const MAX_CRASH_RETRIES = 2;
|
|
9
|
+
const RESULTS_FILENAME = "results.tsv";
|
|
10
|
+
const TSV_HEADER = "commit\tvalue\tstatus\tdescription";
|
|
11
|
+
|
|
6
12
|
/**
|
|
7
|
-
* Charter: the experiment's rules.
|
|
13
|
+
* Charter: the experiment's rules (inspired by Karpathy's program.md).
|
|
8
14
|
* {
|
|
9
15
|
* goal: "speed up auth middleware",
|
|
10
16
|
* metric: {
|
|
@@ -13,15 +19,15 @@ const execFileAsync = promisify(execFile);
|
|
|
13
19
|
* direction: "lower" // "lower" | "higher"
|
|
14
20
|
* },
|
|
15
21
|
* budget: {
|
|
16
|
-
* maxAttempts:
|
|
17
|
-
* maxMinutes:
|
|
22
|
+
* maxAttempts: 0, // 0 = infinite (run until interrupted)
|
|
23
|
+
* maxMinutes: 60
|
|
18
24
|
* },
|
|
19
|
-
* constraints: ["do not change the public API", "keep all existing tests passing"]
|
|
25
|
+
* constraints: ["do not change the public API", "keep all existing tests passing"],
|
|
26
|
+
* simplicityBias: true // prefer simpler changes over complex ones
|
|
20
27
|
* }
|
|
21
28
|
*/
|
|
22
29
|
|
|
23
30
|
export function parseCharterFromGoal(goalText) {
|
|
24
|
-
// Extract metric hints from natural language
|
|
25
31
|
const lower = goalText.toLowerCase();
|
|
26
32
|
let direction = "lower";
|
|
27
33
|
let extract = "duration";
|
|
@@ -43,8 +49,9 @@ export function parseCharterFromGoal(goalText) {
|
|
|
43
49
|
return {
|
|
44
50
|
goal: goalText,
|
|
45
51
|
metric: { command: "", extract, direction },
|
|
46
|
-
budget: { maxAttempts:
|
|
47
|
-
constraints: []
|
|
52
|
+
budget: { maxAttempts: 0, maxMinutes: 60 },
|
|
53
|
+
constraints: [],
|
|
54
|
+
simplicityBias: true
|
|
48
55
|
};
|
|
49
56
|
}
|
|
50
57
|
|
|
@@ -52,20 +59,18 @@ export async function runMetric({ command, extract, cwd }) {
|
|
|
52
59
|
const startTime = Date.now();
|
|
53
60
|
try {
|
|
54
61
|
const isWin = process.platform === "win32";
|
|
55
|
-
const execOpts = { cwd, env: process.env, maxBuffer: 8 * 1024 * 1024, timeout:
|
|
56
|
-
let stdout, stderr
|
|
62
|
+
const execOpts = { cwd, env: process.env, maxBuffer: 8 * 1024 * 1024, timeout: 600000 };
|
|
63
|
+
let stdout, stderr;
|
|
57
64
|
|
|
58
65
|
if (isWin) {
|
|
59
66
|
const result = await execFileAsync(command, [], { ...execOpts, shell: true });
|
|
60
67
|
stdout = String(result.stdout || "");
|
|
61
68
|
stderr = String(result.stderr || "");
|
|
62
|
-
exitCode = 0;
|
|
63
69
|
} else {
|
|
64
70
|
const parts = command.split(/\s+/);
|
|
65
71
|
const result = await execFileAsync(parts[0], parts.slice(1), execOpts);
|
|
66
72
|
stdout = String(result.stdout || "");
|
|
67
73
|
stderr = String(result.stderr || "");
|
|
68
|
-
exitCode = 0;
|
|
69
74
|
}
|
|
70
75
|
|
|
71
76
|
const elapsed = Date.now() - startTime;
|
|
@@ -74,7 +79,7 @@ export async function runMetric({ command, extract, cwd }) {
|
|
|
74
79
|
return { ok: true, value: elapsed, raw: stdout, elapsed };
|
|
75
80
|
}
|
|
76
81
|
if (extract === "exit_code") {
|
|
77
|
-
return { ok: true, value:
|
|
82
|
+
return { ok: true, value: 0, raw: stdout, elapsed };
|
|
78
83
|
}
|
|
79
84
|
if (extract === "stdout_number") {
|
|
80
85
|
const match = (stdout + stderr).match(/(\d+\.?\d*)/);
|
|
@@ -90,18 +95,36 @@ export async function runMetric({ command, extract, cwd }) {
|
|
|
90
95
|
return {
|
|
91
96
|
ok: false,
|
|
92
97
|
value: null,
|
|
93
|
-
raw: error.stderr || error.message ||
|
|
98
|
+
raw: String(error.stderr || error.message || error).slice(0, 2000),
|
|
94
99
|
elapsed,
|
|
95
100
|
exitCode: error.code || 1
|
|
96
101
|
};
|
|
97
102
|
}
|
|
98
103
|
}
|
|
99
104
|
|
|
105
|
+
// --- Git operations ---
|
|
106
|
+
|
|
107
|
+
export async function gitCreateBranch({ cwd, tag }) {
|
|
108
|
+
const branch = `experiment/${tag}`;
|
|
109
|
+
try {
|
|
110
|
+
await execFileAsync("git", ["checkout", "-b", branch], { cwd });
|
|
111
|
+
return { ok: true, branch };
|
|
112
|
+
} catch {
|
|
113
|
+
// Branch might already exist
|
|
114
|
+
try {
|
|
115
|
+
await execFileAsync("git", ["checkout", branch], { cwd });
|
|
116
|
+
return { ok: true, branch };
|
|
117
|
+
} catch {
|
|
118
|
+
return { ok: false, branch };
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
100
123
|
export async function gitCheckpoint({ cwd, label }) {
|
|
101
124
|
try {
|
|
102
125
|
await execFileAsync("git", ["add", "-A"], { cwd });
|
|
103
126
|
await execFileAsync("git", ["commit", "-m", `[experiment] ${label}`, "--allow-empty"], { cwd });
|
|
104
|
-
const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], { cwd });
|
|
127
|
+
const { stdout } = await execFileAsync("git", ["rev-parse", "--short", "HEAD"], { cwd });
|
|
105
128
|
return { ok: true, sha: stdout.trim() };
|
|
106
129
|
} catch {
|
|
107
130
|
return { ok: false, sha: null };
|
|
@@ -117,6 +140,69 @@ export async function gitRevert({ cwd, sha }) {
|
|
|
117
140
|
}
|
|
118
141
|
}
|
|
119
142
|
|
|
143
|
+
export async function gitReturnToBranch({ cwd, branch }) {
|
|
144
|
+
try {
|
|
145
|
+
await execFileAsync("git", ["checkout", branch], { cwd });
|
|
146
|
+
return { ok: true };
|
|
147
|
+
} catch {
|
|
148
|
+
return { ok: false };
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
async function gitGetCurrentBranch({ cwd }) {
|
|
153
|
+
try {
|
|
154
|
+
const { stdout } = await execFileAsync("git", ["rev-parse", "--abbrev-ref", "HEAD"], { cwd });
|
|
155
|
+
return stdout.trim();
|
|
156
|
+
} catch {
|
|
157
|
+
return null;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// --- Results TSV (persistent scoreboard, untracked by git) ---
|
|
162
|
+
|
|
163
|
+
async function ensureResultsTsv(cwd) {
|
|
164
|
+
const tsvPath = path.join(cwd, RESULTS_FILENAME);
|
|
165
|
+
try {
|
|
166
|
+
await fs.access(tsvPath);
|
|
167
|
+
} catch {
|
|
168
|
+
await fs.writeFile(tsvPath, `${TSV_HEADER}\n`, "utf8");
|
|
169
|
+
}
|
|
170
|
+
// Add to .gitignore if not already there
|
|
171
|
+
try {
|
|
172
|
+
const gitignorePath = path.join(cwd, ".gitignore");
|
|
173
|
+
let gitignore = "";
|
|
174
|
+
try { gitignore = await fs.readFile(gitignorePath, "utf8"); } catch {}
|
|
175
|
+
if (!gitignore.includes(RESULTS_FILENAME)) {
|
|
176
|
+
await fs.writeFile(gitignorePath, `${gitignore.trimEnd()}\n${RESULTS_FILENAME}\n`, "utf8");
|
|
177
|
+
}
|
|
178
|
+
} catch {}
|
|
179
|
+
return tsvPath;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
export async function appendResult({ cwd, commit, value, status, description }) {
|
|
183
|
+
const tsvPath = await ensureResultsTsv(cwd);
|
|
184
|
+
const valueStr = value !== null && value !== undefined ? String(value) : "0";
|
|
185
|
+
const desc = String(description || "").replace(/\t/g, " ").replace(/\n/g, " ").slice(0, 200);
|
|
186
|
+
const line = `${commit || "-------"}\t${valueStr}\t${status}\t${desc}\n`;
|
|
187
|
+
await fs.appendFile(tsvPath, line, "utf8");
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
export async function readResults(cwd) {
|
|
191
|
+
const tsvPath = path.join(cwd, RESULTS_FILENAME);
|
|
192
|
+
try {
|
|
193
|
+
const raw = await fs.readFile(tsvPath, "utf8");
|
|
194
|
+
const lines = raw.trim().split("\n").slice(1); // skip header
|
|
195
|
+
return lines.map((line) => {
|
|
196
|
+
const [commit, value, status, description] = line.split("\t");
|
|
197
|
+
return { commit, value: parseFloat(value) || 0, status, description: description || "" };
|
|
198
|
+
});
|
|
199
|
+
} catch {
|
|
200
|
+
return [];
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// --- Core logic ---
|
|
205
|
+
|
|
120
206
|
export function isBetter(newValue, oldValue, direction) {
|
|
121
207
|
if (newValue === null || oldValue === null) return false;
|
|
122
208
|
if (direction === "lower") return newValue < oldValue;
|
|
@@ -125,20 +211,23 @@ export function isBetter(newValue, oldValue, direction) {
|
|
|
125
211
|
}
|
|
126
212
|
|
|
127
213
|
export function formatAttemptResult(attempt) {
|
|
128
|
-
const
|
|
214
|
+
const icon = attempt.status === "keep" ? "✓" : attempt.status === "crash" ? "💥" : "✗";
|
|
215
|
+
const label = attempt.status === "keep" ? "kept" : attempt.status === "crash" ? "crash" : "reverted";
|
|
129
216
|
const delta = attempt.baseline !== null && attempt.value !== null
|
|
130
217
|
? ` (${attempt.value > attempt.baseline ? "+" : ""}${(attempt.value - attempt.baseline).toFixed(2)})`
|
|
131
218
|
: "";
|
|
132
|
-
|
|
219
|
+
const valueStr = attempt.value !== null ? String(attempt.value) : "failed";
|
|
220
|
+
return ` ${attempt.number}. ${icon} ${label} ${valueStr}${delta} — ${attempt.hypothesis}`;
|
|
133
221
|
}
|
|
134
222
|
|
|
135
223
|
export function formatExperimentSummary(results) {
|
|
136
224
|
const lines = [];
|
|
137
|
-
const { charter, baseline, attempts, bestValue, bestAttempt, totalElapsed } = results;
|
|
225
|
+
const { charter, baseline, attempts, bestValue, bestAttempt, totalElapsed, branch } = results;
|
|
138
226
|
|
|
139
227
|
lines.push(`────────────────────────────────────────────────────────────`);
|
|
140
228
|
lines.push(`experiment: ${charter.goal}`);
|
|
141
229
|
lines.push(`metric: ${charter.metric.command} (${charter.metric.direction} is better)`);
|
|
230
|
+
if (branch) lines.push(`branch: ${branch}`);
|
|
142
231
|
lines.push(`baseline: ${baseline}`);
|
|
143
232
|
lines.push(`────────────────────────────────────────────────────────────`);
|
|
144
233
|
|
|
@@ -157,97 +246,169 @@ export function formatExperimentSummary(results) {
|
|
|
157
246
|
}
|
|
158
247
|
const mins = (totalElapsed / 60000).toFixed(1);
|
|
159
248
|
lines.push(`${attempts.length} attempts in ${mins}m`);
|
|
249
|
+
lines.push(`results logged to ${RESULTS_FILENAME}`);
|
|
160
250
|
lines.push(`────────────────────────────────────────────────────────────`);
|
|
161
251
|
|
|
162
252
|
return lines.join("\n");
|
|
163
253
|
}
|
|
164
254
|
|
|
165
255
|
/**
|
|
166
|
-
* Run the experiment loop.
|
|
256
|
+
* Run the experiment loop (autoresearch-style).
|
|
257
|
+
*
|
|
258
|
+
* Key differences from v1:
|
|
259
|
+
* - Runs on a dedicated branch (experiment/<tag>)
|
|
260
|
+
* - Logs every attempt to results.tsv (persistent, git-ignored)
|
|
261
|
+
* - maxAttempts=0 means infinite (run until interrupted or time budget)
|
|
262
|
+
* - Crash recovery: retries up to MAX_CRASH_RETRIES before giving up on an idea
|
|
263
|
+
* - Simplicity bias: planner is told to prefer simpler changes
|
|
264
|
+
* - Output redirected: metric output goes to log, not context
|
|
167
265
|
*
|
|
168
266
|
* handlers: {
|
|
169
|
-
* onBaseline(value)
|
|
170
|
-
* onAttemptStart(number, hypothesis)
|
|
171
|
-
* onAttemptEnd(attempt)
|
|
172
|
-
* onDone(results)
|
|
173
|
-
*
|
|
267
|
+
* onBaseline(value)
|
|
268
|
+
* onAttemptStart(number, hypothesis)
|
|
269
|
+
* onAttemptEnd(attempt)
|
|
270
|
+
* onDone(results)
|
|
271
|
+
* onInterrupted() → boolean (check if user pressed Ctrl+C)
|
|
272
|
+
* planChange({ goal, constraints, metric, previousAttempts, cwd, simplicityBias }) → { hypothesis, prompt }
|
|
174
273
|
* executeChange({ prompt, cwd }) → void
|
|
274
|
+
* fixCrash({ error, prompt, cwd }) → boolean (true if fixed, false to give up)
|
|
175
275
|
* }
|
|
176
276
|
*/
|
|
177
|
-
export async function runExperimentLoop({ charter, cwd, handlers = {} }) {
|
|
277
|
+
export async function runExperimentLoop({ charter, cwd, tag, handlers = {} }) {
|
|
178
278
|
const startTime = Date.now();
|
|
179
279
|
const { metric, budget } = charter;
|
|
180
280
|
const attempts = [];
|
|
281
|
+
const isInfinite = !budget.maxAttempts || budget.maxAttempts <= 0;
|
|
282
|
+
|
|
283
|
+
// Create dedicated branch
|
|
284
|
+
const originalBranch = await gitGetCurrentBranch({ cwd });
|
|
285
|
+
const branchTag = tag || `exp-${Date.now().toString(36)}`;
|
|
286
|
+
const { branch } = await gitCreateBranch({ cwd, tag: branchTag });
|
|
287
|
+
|
|
288
|
+
// Ensure results.tsv exists
|
|
289
|
+
await ensureResultsTsv(cwd);
|
|
181
290
|
|
|
182
291
|
// Measure baseline
|
|
183
292
|
const baselineResult = await runMetric({ command: metric.command, extract: metric.extract, cwd });
|
|
184
293
|
if (!baselineResult.ok) {
|
|
294
|
+
// Return to original branch on failure
|
|
295
|
+
if (originalBranch) await gitReturnToBranch({ cwd, branch: originalBranch });
|
|
185
296
|
throw new Error(`Baseline metric failed: ${baselineResult.raw}`);
|
|
186
297
|
}
|
|
187
298
|
const baseline = baselineResult.value;
|
|
188
299
|
if (handlers.onBaseline) handlers.onBaseline(baseline);
|
|
189
300
|
|
|
190
|
-
// Checkpoint baseline
|
|
301
|
+
// Checkpoint baseline
|
|
191
302
|
const baselineCheckpoint = await gitCheckpoint({ cwd, label: `baseline (${baseline})` });
|
|
303
|
+
await appendResult({ cwd, commit: baselineCheckpoint.sha, value: baseline, status: "keep", description: "baseline" });
|
|
304
|
+
|
|
192
305
|
let currentBest = baseline;
|
|
193
306
|
let bestAttempt = null;
|
|
194
307
|
let lastGoodSha = baselineCheckpoint.sha;
|
|
308
|
+
let attemptNum = 0;
|
|
195
309
|
|
|
196
|
-
|
|
310
|
+
while (true) {
|
|
311
|
+
attemptNum++;
|
|
312
|
+
|
|
313
|
+
// Check budget
|
|
197
314
|
const elapsed = (Date.now() - startTime) / 60000;
|
|
198
315
|
if (elapsed >= budget.maxMinutes) break;
|
|
316
|
+
if (!isInfinite && attemptNum > budget.maxAttempts) break;
|
|
317
|
+
|
|
318
|
+
// Check for interruption
|
|
319
|
+
if (handlers.onInterrupted && handlers.onInterrupted()) break;
|
|
199
320
|
|
|
200
|
-
//
|
|
201
|
-
let hypothesis = `attempt ${
|
|
321
|
+
// Plan the change
|
|
322
|
+
let hypothesis = `attempt ${attemptNum}`;
|
|
202
323
|
let changePrompt = charter.goal;
|
|
203
324
|
if (handlers.planChange) {
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
325
|
+
try {
|
|
326
|
+
const plan = await handlers.planChange({
|
|
327
|
+
goal: charter.goal,
|
|
328
|
+
constraints: charter.constraints,
|
|
329
|
+
metric: { command: metric.command, direction: metric.direction, currentBest },
|
|
330
|
+
previousAttempts: attempts,
|
|
331
|
+
simplicityBias: charter.simplicityBias !== false,
|
|
332
|
+
cwd
|
|
333
|
+
});
|
|
334
|
+
hypothesis = plan.hypothesis || hypothesis;
|
|
335
|
+
changePrompt = plan.prompt || changePrompt;
|
|
336
|
+
} catch {
|
|
337
|
+
// Planner failed — use generic prompt
|
|
338
|
+
}
|
|
213
339
|
}
|
|
214
340
|
|
|
215
|
-
if (handlers.onAttemptStart) handlers.onAttemptStart(
|
|
341
|
+
if (handlers.onAttemptStart) handlers.onAttemptStart(attemptNum, hypothesis);
|
|
216
342
|
|
|
217
|
-
// Execute the change
|
|
218
|
-
|
|
343
|
+
// Execute the change (with crash recovery)
|
|
344
|
+
let executed = false;
|
|
345
|
+
let crashError = null;
|
|
346
|
+
for (let retry = 0; retry <= MAX_CRASH_RETRIES; retry++) {
|
|
219
347
|
try {
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
348
|
+
if (handlers.executeChange) {
|
|
349
|
+
await handlers.executeChange({ prompt: changePrompt, cwd });
|
|
350
|
+
}
|
|
351
|
+
executed = true;
|
|
352
|
+
break;
|
|
353
|
+
} catch (err) {
|
|
354
|
+
crashError = err;
|
|
355
|
+
// Try to fix the crash if handler is available
|
|
356
|
+
if (retry < MAX_CRASH_RETRIES && handlers.fixCrash) {
|
|
357
|
+
const fixed = await handlers.fixCrash({
|
|
358
|
+
error: err instanceof Error ? err.message : String(err),
|
|
359
|
+
prompt: changePrompt,
|
|
360
|
+
cwd
|
|
361
|
+
});
|
|
362
|
+
if (!fixed) break;
|
|
363
|
+
// Fixed — retry execution
|
|
364
|
+
}
|
|
227
365
|
}
|
|
228
366
|
}
|
|
229
367
|
|
|
368
|
+
if (!executed) {
|
|
369
|
+
// Crash — revert and log
|
|
370
|
+
await gitRevert({ cwd, sha: lastGoodSha });
|
|
371
|
+
const attempt = { number: attemptNum, hypothesis, value: null, baseline: currentBest, status: "crash", error: true };
|
|
372
|
+
attempts.push(attempt);
|
|
373
|
+
await appendResult({ cwd, commit: "-------", value: 0, status: "crash", description: hypothesis });
|
|
374
|
+
if (handlers.onAttemptEnd) handlers.onAttemptEnd(attempt);
|
|
375
|
+
continue;
|
|
376
|
+
}
|
|
377
|
+
|
|
230
378
|
// Measure result
|
|
231
379
|
const result = await runMetric({ command: metric.command, extract: metric.extract, cwd });
|
|
380
|
+
|
|
381
|
+
if (!result.ok) {
|
|
382
|
+
// Metric failed (runtime crash) — revert
|
|
383
|
+
await gitRevert({ cwd, sha: lastGoodSha });
|
|
384
|
+
const attempt = { number: attemptNum, hypothesis, value: null, baseline: currentBest, status: "crash", error: true };
|
|
385
|
+
attempts.push(attempt);
|
|
386
|
+
await appendResult({ cwd, commit: "-------", value: 0, status: "crash", description: `${hypothesis} (metric failed)` });
|
|
387
|
+
if (handlers.onAttemptEnd) handlers.onAttemptEnd(attempt);
|
|
388
|
+
continue;
|
|
389
|
+
}
|
|
390
|
+
|
|
232
391
|
const attempt = {
|
|
233
|
-
number:
|
|
392
|
+
number: attemptNum,
|
|
234
393
|
hypothesis,
|
|
235
|
-
value: result.
|
|
394
|
+
value: result.value,
|
|
236
395
|
baseline: currentBest,
|
|
237
|
-
|
|
238
|
-
error:
|
|
396
|
+
status: "discard",
|
|
397
|
+
error: false
|
|
239
398
|
};
|
|
240
399
|
|
|
241
|
-
if (
|
|
242
|
-
// Keep —
|
|
243
|
-
attempt.
|
|
400
|
+
if (isBetter(result.value, currentBest, metric.direction)) {
|
|
401
|
+
// Keep — advance the branch
|
|
402
|
+
attempt.status = "keep";
|
|
244
403
|
currentBest = result.value;
|
|
245
404
|
bestAttempt = attempt;
|
|
246
|
-
const cp = await gitCheckpoint({ cwd, label: `attempt ${
|
|
405
|
+
const cp = await gitCheckpoint({ cwd, label: `attempt ${attemptNum}: ${hypothesis} (${result.value})` });
|
|
247
406
|
lastGoodSha = cp.sha || lastGoodSha;
|
|
407
|
+
await appendResult({ cwd, commit: cp.sha, value: result.value, status: "keep", description: hypothesis });
|
|
248
408
|
} else {
|
|
249
|
-
//
|
|
409
|
+
// Discard — revert to last good state
|
|
250
410
|
await gitRevert({ cwd, sha: lastGoodSha });
|
|
411
|
+
await appendResult({ cwd, commit: lastGoodSha, value: result.value, status: "discard", description: hypothesis });
|
|
251
412
|
}
|
|
252
413
|
|
|
253
414
|
attempts.push(attempt);
|
|
@@ -260,7 +421,9 @@ export async function runExperimentLoop({ charter, cwd, handlers = {} }) {
|
|
|
260
421
|
attempts,
|
|
261
422
|
bestValue: currentBest,
|
|
262
423
|
bestAttempt,
|
|
263
|
-
totalElapsed: Date.now() - startTime
|
|
424
|
+
totalElapsed: Date.now() - startTime,
|
|
425
|
+
branch,
|
|
426
|
+
originalBranch
|
|
264
427
|
};
|
|
265
428
|
|
|
266
429
|
if (handlers.onDone) handlers.onDone(results);
|