@bradtaylorsf/alpha-loop 1.4.2 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -3
- package/dist/cli.js +73 -6
- package/dist/cli.js.map +1 -1
- package/dist/commands/eval.d.ts +59 -4
- package/dist/commands/eval.js +370 -55
- package/dist/commands/eval.js.map +1 -1
- package/dist/commands/evolve.d.ts +43 -4
- package/dist/commands/evolve.js +444 -66
- package/dist/commands/evolve.js.map +1 -1
- package/dist/commands/init.js +3 -7
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/plan.d.ts +11 -0
- package/dist/commands/plan.js +298 -0
- package/dist/commands/plan.js.map +1 -0
- package/dist/commands/roadmap.d.ts +9 -0
- package/dist/commands/roadmap.js +202 -0
- package/dist/commands/roadmap.js.map +1 -0
- package/dist/commands/triage.d.ts +9 -0
- package/dist/commands/triage.js +226 -0
- package/dist/commands/triage.js.map +1 -0
- package/dist/commands/vision.js +1 -0
- package/dist/commands/vision.js.map +1 -1
- package/dist/lib/config.d.ts +20 -0
- package/dist/lib/config.js +55 -0
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/eval-checks.d.ts +11 -1
- package/dist/lib/eval-checks.js +39 -0
- package/dist/lib/eval-checks.js.map +1 -1
- package/dist/lib/eval-fixtures.d.ts +55 -0
- package/dist/lib/eval-fixtures.js +172 -0
- package/dist/lib/eval-fixtures.js.map +1 -0
- package/dist/lib/eval-runner.d.ts +26 -2
- package/dist/lib/eval-runner.js +202 -17
- package/dist/lib/eval-runner.js.map +1 -1
- package/dist/lib/eval-skill-bridge.d.ts +53 -0
- package/dist/lib/eval-skill-bridge.js +121 -0
- package/dist/lib/eval-skill-bridge.js.map +1 -0
- package/dist/lib/eval-swebench.d.ts +68 -0
- package/dist/lib/eval-swebench.js +274 -0
- package/dist/lib/eval-swebench.js.map +1 -0
- package/dist/lib/eval.d.ts +9 -1
- package/dist/lib/eval.js +27 -7
- package/dist/lib/eval.js.map +1 -1
- package/dist/lib/github.d.ts +46 -0
- package/dist/lib/github.js +179 -0
- package/dist/lib/github.js.map +1 -1
- package/dist/lib/pipeline.js +44 -2
- package/dist/lib/pipeline.js.map +1 -1
- package/dist/lib/planning.d.ts +91 -0
- package/dist/lib/planning.js +315 -0
- package/dist/lib/planning.js.map +1 -0
- package/dist/lib/prompts.d.ts +79 -0
- package/dist/lib/prompts.js +151 -2
- package/dist/lib/prompts.js.map +1 -1
- package/dist/lib/score.d.ts +24 -2
- package/dist/lib/score.js +162 -3
- package/dist/lib/score.js.map +1 -1
- package/package.json +2 -1
|
@@ -2,7 +2,50 @@ export type EvolveOptions = {
|
|
|
2
2
|
maxIterations?: string;
|
|
3
3
|
dryRun?: boolean;
|
|
4
4
|
verbose?: boolean;
|
|
5
|
+
continuous?: boolean;
|
|
6
|
+
surface?: string;
|
|
7
|
+
resume?: boolean;
|
|
5
8
|
};
|
|
9
|
+
/** Optimization surface levels — what the proposer is allowed to modify. */
|
|
10
|
+
export type SurfaceLevel = 'prompts' | 'skills' | 'config' | 'all';
|
|
11
|
+
/** Allowed target paths per surface level. */
|
|
12
|
+
export declare const SURFACE_TARGETS: Record<SurfaceLevel, string[]>;
|
|
13
|
+
/** Path to the evolve log TSV file. */
|
|
14
|
+
export declare const EVOLVE_LOG_PATH = ".alpha-loop/evals/evolve-log.tsv";
|
|
15
|
+
/** A single entry in the evolve log. */
|
|
16
|
+
export type EvolveLogEntry = {
|
|
17
|
+
commit: string;
|
|
18
|
+
score: number;
|
|
19
|
+
cost: number;
|
|
20
|
+
status: 'baseline' | 'keep' | 'discard' | 'crash';
|
|
21
|
+
iteration: number;
|
|
22
|
+
description: string;
|
|
23
|
+
};
|
|
24
|
+
/**
|
|
25
|
+
* Append an entry to the evolve log TSV.
|
|
26
|
+
*/
|
|
27
|
+
export declare function appendEvolveLog(entry: EvolveLogEntry, cwd?: string): void;
|
|
28
|
+
/**
|
|
29
|
+
* Read all entries from the evolve log TSV.
|
|
30
|
+
*/
|
|
31
|
+
export declare function readEvolveLog(cwd?: string): EvolveLogEntry[];
|
|
32
|
+
/**
|
|
33
|
+
* Run pre-checks before expensive eval.
|
|
34
|
+
* Returns { passed, error } indicating whether the code is safe to eval.
|
|
35
|
+
*/
|
|
36
|
+
export declare function runPreChecks(surface: SurfaceLevel, cwd?: string): Promise<{
|
|
37
|
+
passed: boolean;
|
|
38
|
+
error?: string;
|
|
39
|
+
}>;
|
|
40
|
+
/**
|
|
41
|
+
* Decide whether to keep or discard based on score comparison.
|
|
42
|
+
* Returns 'keep' if newScore > bestScore, 'discard' otherwise.
|
|
43
|
+
*/
|
|
44
|
+
export declare function keepOrDiscard(newScore: number, bestScore: number): 'keep' | 'discard';
|
|
45
|
+
/**
|
|
46
|
+
* Check if a proposed path is safe to modify for a given surface level.
|
|
47
|
+
*/
|
|
48
|
+
export declare function isSafePath(filePath: string, surface?: SurfaceLevel): boolean;
|
|
6
49
|
/**
|
|
7
50
|
* Run the evolve loop: propose → eval → keep/discard.
|
|
8
51
|
*/
|
|
@@ -18,8 +61,4 @@ type ProposedChange = {
|
|
|
18
61
|
* Expects a JSON array in the output.
|
|
19
62
|
*/
|
|
20
63
|
export declare function parseProposedChanges(output: string): ProposedChange[];
|
|
21
|
-
/**
|
|
22
|
-
* Check if a proposed path is safe to modify.
|
|
23
|
-
*/
|
|
24
|
-
export declare function isSafePath(filePath: string): boolean;
|
|
25
64
|
export {};
|
package/dist/commands/evolve.js
CHANGED
|
@@ -12,57 +12,251 @@
|
|
|
12
12
|
* Key insight from Meta-Harness: full trace access (not summaries) is critical.
|
|
13
13
|
* Key insight from autoresearch: fixed eval metric + autonomous iteration.
|
|
14
14
|
*/
|
|
15
|
-
import { existsSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
|
|
16
|
-
import { join } from 'node:path';
|
|
15
|
+
import { existsSync, readFileSync, readdirSync, writeFileSync, mkdirSync, appendFileSync, unlinkSync, rmdirSync } from 'node:fs';
|
|
16
|
+
import { join, dirname } from 'node:path';
|
|
17
17
|
import { log } from '../lib/logger.js';
|
|
18
18
|
import { loadConfig } from '../lib/config.js';
|
|
19
19
|
import { spawnAgent } from '../lib/agent.js';
|
|
20
20
|
import { loadEvalCases, evalsDir } from '../lib/eval.js';
|
|
21
21
|
import { readScores, latestScore, formatScoreEntry } from '../lib/score.js';
|
|
22
22
|
import { listTraces, readTrace } from '../lib/traces.js';
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
23
|
+
import { exec } from '../lib/shell.js';
|
|
24
|
+
import { runEvalSuite } from '../lib/eval-runner.js';
|
|
25
|
+
/**
 * Allowed target paths per surface level.
 *
 * Entries ending in `/` are directory prefixes; all other entries are exact
 * file paths. Each level is a superset of the one before it.
 */
export const SURFACE_TARGETS = (() => {
    const agentTemplates = '.alpha-loop/templates/agents/';
    const skillTemplates = '.alpha-loop/templates/skills/';
    const projectConfig = '.alpha-loop.yaml';
    return {
        prompts: [agentTemplates],
        skills: [agentTemplates, skillTemplates],
        config: [agentTemplates, skillTemplates, projectConfig],
        all: [
            agentTemplates,
            skillTemplates,
            projectConfig,
            'src/lib/prompts.ts',
            'src/lib/pipeline.ts',
        ],
    };
})();
/** Names of the valid surface levels, in the order they are declared above. */
const SURFACE_LEVELS = Object.keys(SURFACE_TARGETS);
|
|
48
|
+
/** Path to the evolve log TSV file, relative to the project directory. */
export const EVOLVE_LOG_PATH = '.alpha-loop/evals/evolve-log.tsv';
/** Column header written as the first line of a newly created evolve log. */
const EVOLVE_LOG_HEADER = 'commit\tscore\tcost\tstatus\titeration\tdescription';
/** Status values a log row may carry; rows with any other status are skipped on read. */
const EVOLVE_LOG_STATUSES = ['baseline', 'keep', 'discard', 'crash'];
/**
 * Append an entry to the evolve log TSV.
 *
 * Creates the log directory and the header line on first use. The log is
 * line-oriented, so line breaks in `description` (and tabs/line breaks in
 * `commit`) are collapsed to a single space; otherwise one entry would be
 * split across several rows and become unreadable for `readEvolveLog`.
 *
 * @param entry Log entry with `commit`, `score`, `cost`, `status`, `iteration`, `description`.
 * @param cwd Project directory; defaults to `process.cwd()`.
 */
export function appendEvolveLog(entry, cwd) {
    const logPath = join(cwd ?? process.cwd(), EVOLVE_LOG_PATH);
    // Derive the directory from the log path so the two cannot drift apart.
    mkdirSync(dirname(logPath), { recursive: true });
    if (!existsSync(logPath)) {
        writeFileSync(logPath, EVOLVE_LOG_HEADER + '\n');
    }
    const line = [
        String(entry.commit ?? '').replace(/[\t\r\n]+/g, ' '),
        entry.score.toFixed(2),
        entry.cost.toFixed(2),
        entry.status,
        String(entry.iteration),
        // Tabs are tolerated here (the reader re-joins trailing columns); newlines are not.
        String(entry.description ?? '').replace(/[\r\n]+/g, ' '),
    ].join('\t');
    appendFileSync(logPath, line + '\n');
}
/**
 * Read all entries from the evolve log TSV.
 *
 * Returns an empty array when the log does not exist. The header line is
 * skipped when present. Rows that are malformed (fewer than six columns, a
 * non-numeric score/cost/iteration, or an unknown status) are skipped rather
 * than returned with `NaN` fields, because callers feed the scores into
 * `Math.max` and sum the costs.
 *
 * @param cwd Project directory; defaults to `process.cwd()`.
 * @returns Parsed entries in file order.
 */
export function readEvolveLog(cwd) {
    const logPath = join(cwd ?? process.cwd(), EVOLVE_LOG_PATH);
    if (!existsSync(logPath))
        return [];
    // Split per line (LF or CRLF) without trimming the whole file: a final row
    // with an empty description ends in a tab that must survive.
    const lines = readFileSync(logPath, 'utf-8')
        .split(/\r?\n/)
        .filter((line) => line.trim() !== '');
    const entries = [];
    for (const [index, line] of lines.entries()) {
        if (index === 0 && line.trim() === EVOLVE_LOG_HEADER)
            continue;
        const parts = line.split('\t');
        if (parts.length < 6)
            continue;
        const score = Number.parseFloat(parts[1]);
        const cost = Number.parseFloat(parts[2]);
        const iteration = Number.parseInt(parts[4], 10);
        const status = parts[3];
        if (!Number.isFinite(score) ||
            !Number.isFinite(cost) ||
            !Number.isInteger(iteration) ||
            !EVOLVE_LOG_STATUSES.includes(status)) {
            continue;
        }
        entries.push({
            commit: parts[0],
            score,
            cost,
            status,
            iteration,
            description: parts.slice(5).join('\t'), // description may contain tabs
        });
    }
    return entries;
}
|
|
97
|
+
/**
 * Run the cheap gate checks before spending money on an eval run.
 *
 * The TypeScript compile check only runs for the `all` surface; the unit
 * tests run for every surface. Checks run in order and the first failure
 * stops the sequence.
 *
 * @param surface Surface level the proposer was allowed to modify.
 * @param cwd Project directory; defaults to `process.cwd()`.
 * @returns `{ passed: true }`, or `{ passed: false, error }` where `error`
 *   carries the failing command's stderr (or stdout when stderr is empty).
 */
export async function runPreChecks(surface, cwd) {
    const workDir = cwd ?? process.cwd();
    const checks = [];
    if (surface === 'all') {
        checks.push({ command: 'pnpm tsc --noEmit', timeout: 60_000, label: 'TypeScript compilation failed' });
    }
    checks.push({ command: 'pnpm test', timeout: 120_000, label: 'Unit tests failed' });
    for (const { command, timeout, label } of checks) {
        const outcome = exec(command, { cwd: workDir, timeout });
        if (outcome.exitCode !== 0) {
            return { passed: false, error: `${label}:\n${outcome.stderr || outcome.stdout}` };
        }
    }
    return { passed: true };
}
|
|
117
|
+
/**
 * Decide the fate of a candidate change from its score.
 *
 * Only a strict improvement is kept; a tie with the current best is discarded.
 *
 * @param newScore Score measured with the candidate change applied.
 * @param bestScore Best score seen so far.
 * @returns `'keep'` when `newScore > bestScore`, otherwise `'discard'`.
 */
export function keepOrDiscard(newScore, bestScore) {
    if (newScore > bestScore) {
        return 'keep';
    }
    return 'discard';
}
|
|
124
|
+
/**
 * Look up the short hash of the current git HEAD.
 *
 * @param cwd Directory to run git in; defaults to `process.cwd()`.
 * @returns The trimmed output of `git rev-parse --short HEAD`, or `'unknown'`
 *   when the command throws or prints nothing.
 */
function getCommitHash(cwd) {
    const fallback = 'unknown';
    let hash;
    try {
        const { stdout } = exec('git rev-parse --short HEAD', { cwd: cwd ?? process.cwd(), timeout: 5000 });
        hash = stdout.trim();
    }
    catch {
        return fallback;
    }
    return hash || fallback;
}
|
|
136
|
+
/**
 * Check if a proposed path is safe to modify for a given surface level.
 *
 * The path comes from proposer-agent output and is treated as untrusted:
 * anything that is not a non-empty string is rejected instead of throwing.
 * Absolute paths and any path containing `..` are rejected. The path must
 * then either equal one of the surface's allowed files, or name a file
 * strictly inside one of its allowed directories (the directory itself, or a
 * path ending in `/`, is not a writable file and is rejected).
 *
 * @param filePath Project-relative path proposed for modification.
 * @param surface Surface level; defaults to `'prompts'`. An unknown level allows nothing.
 * @returns `true` when the path may be written for this surface.
 */
export function isSafePath(filePath, surface) {
    if (typeof filePath !== 'string' || filePath.length === 0)
        return false;
    // Reject absolute paths and path traversal
    if (filePath.startsWith('/') || filePath.includes('..'))
        return false;
    const targets = SURFACE_TARGETS[surface ?? 'prompts'];
    if (!Array.isArray(targets))
        return false;
    // Must be in allowed targets: directory prefixes use startsWith, files use exact match
    return targets.some((prefix) => prefix.endsWith('/')
        ? filePath.startsWith(prefix) && filePath.length > prefix.length && !filePath.endsWith('/')
        : filePath === prefix);
}
|
|
29
147
|
/**
|
|
30
148
|
* Run the evolve loop: propose → eval → keep/discard.
|
|
31
149
|
*/
|
|
32
150
|
export async function evolveCommand(options) {
|
|
33
151
|
const config = loadConfig({ dryRun: options.dryRun });
|
|
34
|
-
const
|
|
152
|
+
const surface = parseSurface(options.surface);
|
|
153
|
+
const continuous = options.continuous ?? false;
|
|
154
|
+
const maxIterations = continuous ? Infinity : parseInt(options.maxIterations ?? '5', 10);
|
|
35
155
|
// Validate prerequisites
|
|
36
|
-
const
|
|
37
|
-
if (
|
|
156
|
+
const allCases = loadEvalCases();
|
|
157
|
+
if (allCases.length === 0) {
|
|
38
158
|
log.warn('No eval cases found. Create eval cases first with `alpha-loop eval capture`.');
|
|
39
159
|
return;
|
|
40
160
|
}
|
|
41
|
-
const
|
|
42
|
-
const
|
|
161
|
+
const evalDir = evalsDir(undefined, config.evalDir);
|
|
162
|
+
const scores = readScores(evalDir);
|
|
163
|
+
const baseline = latestScore(evalDir);
|
|
164
|
+
// Resume support: pick up where we left off
|
|
165
|
+
let startIteration = 1;
|
|
166
|
+
let bestScore = baseline?.composite ?? 0;
|
|
167
|
+
let totalKept = 0;
|
|
168
|
+
let totalDiscarded = 0;
|
|
169
|
+
let totalCrashed = 0;
|
|
170
|
+
let totalCost = 0;
|
|
171
|
+
if (options.resume) {
|
|
172
|
+
const priorLog = readEvolveLog();
|
|
173
|
+
if (priorLog.length > 0) {
|
|
174
|
+
const lastEntry = priorLog[priorLog.length - 1];
|
|
175
|
+
startIteration = lastEntry.iteration + 1;
|
|
176
|
+
// Find best score from kept entries
|
|
177
|
+
const keptEntries = priorLog.filter((e) => e.status === 'keep');
|
|
178
|
+
if (keptEntries.length > 0) {
|
|
179
|
+
bestScore = Math.max(...keptEntries.map((e) => e.score));
|
|
180
|
+
}
|
|
181
|
+
totalKept = priorLog.filter((e) => e.status === 'keep').length;
|
|
182
|
+
totalDiscarded = priorLog.filter((e) => e.status === 'discard').length;
|
|
183
|
+
totalCrashed = priorLog.filter((e) => e.status === 'crash').length;
|
|
184
|
+
totalCost = priorLog.reduce((sum, e) => sum + e.cost, 0);
|
|
185
|
+
log.info(`Resuming from iteration ${startIteration} (best score: ${bestScore.toFixed(2)})`);
|
|
186
|
+
}
|
|
187
|
+
else {
|
|
188
|
+
log.info('No prior evolve log found. Starting fresh.');
|
|
189
|
+
}
|
|
190
|
+
}
|
|
43
191
|
log.step('Alpha Loop Evolve — Meta-Harness Optimization');
|
|
44
192
|
console.log('');
|
|
45
|
-
console.log(` Eval cases: ${
|
|
193
|
+
console.log(` Eval cases: ${allCases.length}`);
|
|
46
194
|
console.log(` Score history: ${scores.length} entries`);
|
|
47
|
-
console.log(` Baseline score: ${
|
|
48
|
-
console.log(`
|
|
195
|
+
console.log(` Baseline score: ${bestScore > 0 ? bestScore.toFixed(2) : 'none (will run baseline)'}`);
|
|
196
|
+
console.log(` Iterations: ${continuous ? 'continuous (until stopped)' : maxIterations}`);
|
|
197
|
+
console.log(` Surface: ${surface}`);
|
|
49
198
|
console.log(` Agent: ${config.agent}`);
|
|
50
199
|
console.log(` Model: ${config.model || 'default'}`);
|
|
200
|
+
if (options.resume)
|
|
201
|
+
console.log(` Resuming from: iteration ${startIteration}`);
|
|
51
202
|
console.log('');
|
|
52
|
-
//
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
203
|
+
// Graceful shutdown for --continuous
|
|
204
|
+
let shutdownRequested = false;
|
|
205
|
+
if (continuous) {
|
|
206
|
+
const handler = () => {
|
|
207
|
+
log.info('Shutdown requested. Finishing current iteration...');
|
|
208
|
+
shutdownRequested = true;
|
|
209
|
+
};
|
|
210
|
+
process.on('SIGINT', handler);
|
|
211
|
+
process.on('SIGTERM', handler);
|
|
212
|
+
}
|
|
213
|
+
// Gather context for the proposer (refreshed every 5 iterations)
|
|
214
|
+
let recentTraces = listTraces().slice(0, 10);
|
|
215
|
+
log.info(`Recent traces: ${recentTraces.length}`);
|
|
216
|
+
// Step 0: Run baseline eval if no baseline score exists
|
|
217
|
+
if (bestScore === 0 && !options.dryRun) {
|
|
218
|
+
log.step('Running baseline eval...');
|
|
219
|
+
const stepCases = loadEvalCases({ type: 'step' });
|
|
220
|
+
if (stepCases.length > 0) {
|
|
221
|
+
const baselineResult = await runEvalSuite(stepCases, config, { verbose: options.verbose });
|
|
222
|
+
bestScore = baselineResult.composite;
|
|
223
|
+
const commit = getCommitHash();
|
|
224
|
+
appendEvolveLog({
|
|
225
|
+
commit,
|
|
226
|
+
score: bestScore,
|
|
227
|
+
cost: 0,
|
|
228
|
+
status: 'baseline',
|
|
229
|
+
iteration: 0,
|
|
230
|
+
description: 'initial baseline eval',
|
|
231
|
+
});
|
|
232
|
+
log.info(`Baseline score: ${bestScore.toFixed(2)} (${baselineResult.passCount}/${stepCases.length} passing)`);
|
|
233
|
+
}
|
|
234
|
+
else {
|
|
235
|
+
log.warn('No step-level eval cases found. Using full cases for eval.');
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
for (let iteration = startIteration; iteration <= (startIteration + maxIterations - 1); iteration++) {
|
|
239
|
+
if (shutdownRequested) {
|
|
240
|
+
log.info('Graceful shutdown: stopping before next iteration.');
|
|
241
|
+
break;
|
|
242
|
+
}
|
|
243
|
+
log.step(`Iteration ${iteration}${continuous ? '' : `/${startIteration + maxIterations - 1}`}`);
|
|
58
244
|
if (config.dryRun) {
|
|
59
245
|
log.dry('Would invoke proposer agent with full trace access');
|
|
60
|
-
log.dry(
|
|
246
|
+
log.dry(`Would modify files in surface: ${surface}`);
|
|
247
|
+
log.dry('Would run pre-checks (compile + tests)');
|
|
248
|
+
log.dry('Would run step-level eval, then e2e eval if step passes');
|
|
61
249
|
log.dry('Would keep if score improves, revert if not');
|
|
62
250
|
continue;
|
|
63
251
|
}
|
|
252
|
+
// Refresh traces every 5 iterations so the proposer sees recent data
|
|
253
|
+
if ((iteration - startIteration) > 0 && (iteration - startIteration) % 5 === 0) {
|
|
254
|
+
recentTraces = listTraces().slice(0, 10);
|
|
255
|
+
}
|
|
256
|
+
// Read evolve log for proposer context
|
|
257
|
+
const evolveLog = readEvolveLog();
|
|
64
258
|
// Build the proposer prompt with full filesystem context
|
|
65
|
-
const prompt = buildProposerPrompt(config, recentTraces, scores,
|
|
259
|
+
const prompt = buildProposerPrompt(config, recentTraces, scores, allCases.length, surface, evolveLog);
|
|
66
260
|
// Invoke proposer agent
|
|
67
261
|
log.info('Invoking proposer agent...');
|
|
68
262
|
const result = await spawnAgent({
|
|
@@ -73,26 +267,46 @@ export async function evolveCommand(options) {
|
|
|
73
267
|
logFile: undefined,
|
|
74
268
|
});
|
|
75
269
|
if (result.exitCode !== 0 || !result.output.trim()) {
|
|
76
|
-
log.warn(`Proposer failed (exit ${result.exitCode}).
|
|
77
|
-
|
|
270
|
+
log.warn(`Proposer failed (exit ${result.exitCode}). Skipping iteration.`);
|
|
271
|
+
appendEvolveLog({
|
|
272
|
+
commit: getCommitHash(),
|
|
273
|
+
score: bestScore,
|
|
274
|
+
cost: 0,
|
|
275
|
+
status: 'crash',
|
|
276
|
+
iteration,
|
|
277
|
+
description: 'proposer agent failed',
|
|
278
|
+
});
|
|
279
|
+
totalCrashed++;
|
|
280
|
+
continue;
|
|
78
281
|
}
|
|
79
282
|
// Parse proposed changes from agent output
|
|
80
283
|
const changes = parseProposedChanges(result.output);
|
|
81
284
|
if (changes.length === 0) {
|
|
82
|
-
log.info('Proposer returned no changes. Optimization complete.');
|
|
83
|
-
|
|
285
|
+
log.info('Proposer returned no changes. Optimization may be complete.');
|
|
286
|
+
if (!continuous)
|
|
287
|
+
break;
|
|
288
|
+
continue;
|
|
84
289
|
}
|
|
85
290
|
log.info(`Proposer suggested ${changes.length} change(s):`);
|
|
86
291
|
for (const change of changes) {
|
|
87
292
|
console.log(` - ${change.path}: ${change.reason}`);
|
|
88
293
|
}
|
|
89
|
-
// Validate all paths are safe
|
|
90
|
-
const unsafeChanges = changes.filter((c) => !isSafePath(c.path));
|
|
294
|
+
// Validate all paths are safe for the current surface level
|
|
295
|
+
const unsafeChanges = changes.filter((c) => !isSafePath(c.path, surface));
|
|
91
296
|
if (unsafeChanges.length > 0) {
|
|
92
|
-
log.warn('Proposer suggested changes to
|
|
297
|
+
log.warn('Proposer suggested changes to paths outside surface — skipping:');
|
|
93
298
|
for (const c of unsafeChanges) {
|
|
94
299
|
console.log(` - ${c.path}`);
|
|
95
300
|
}
|
|
301
|
+
appendEvolveLog({
|
|
302
|
+
commit: getCommitHash(),
|
|
303
|
+
score: bestScore,
|
|
304
|
+
cost: 0,
|
|
305
|
+
status: 'crash',
|
|
306
|
+
iteration,
|
|
307
|
+
description: `unsafe paths: ${unsafeChanges.map((c) => c.path).join(', ')}`,
|
|
308
|
+
});
|
|
309
|
+
totalCrashed++;
|
|
96
310
|
continue;
|
|
97
311
|
}
|
|
98
312
|
// Backup current files
|
|
@@ -101,45 +315,179 @@ export async function evolveCommand(options) {
|
|
|
101
315
|
if (existsSync(change.path)) {
|
|
102
316
|
backups.set(change.path, readFileSync(change.path, 'utf-8'));
|
|
103
317
|
}
|
|
318
|
+
else {
|
|
319
|
+
backups.set(change.path, null); // file didn't exist before
|
|
320
|
+
}
|
|
104
321
|
}
|
|
105
322
|
// Apply changes
|
|
106
323
|
for (const change of changes) {
|
|
324
|
+
const dir = dirname(change.path);
|
|
325
|
+
if (!existsSync(dir))
|
|
326
|
+
mkdirSync(dir, { recursive: true });
|
|
107
327
|
writeFileSync(change.path, change.content);
|
|
108
328
|
log.info(`Applied: ${change.path}`);
|
|
109
329
|
}
|
|
110
|
-
//
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
330
|
+
// Pre-checks: compile + unit tests
|
|
331
|
+
log.info('Running pre-checks...');
|
|
332
|
+
const preCheckResult = await runPreChecks(surface);
|
|
333
|
+
if (!preCheckResult.passed) {
|
|
334
|
+
log.warn(`Pre-checks failed: ${preCheckResult.error?.split('\n')[0]}`);
|
|
335
|
+
revertChanges(backups);
|
|
336
|
+
appendEvolveLog({
|
|
337
|
+
commit: getCommitHash(),
|
|
338
|
+
score: bestScore,
|
|
339
|
+
cost: 0,
|
|
340
|
+
status: 'crash',
|
|
341
|
+
iteration,
|
|
342
|
+
description: `pre-check failed: ${preCheckResult.error?.split('\n')[0] ?? 'unknown'}`,
|
|
343
|
+
});
|
|
344
|
+
totalCrashed++;
|
|
345
|
+
continue;
|
|
346
|
+
}
|
|
347
|
+
log.info('Pre-checks passed.');
|
|
348
|
+
// Step-level eval (fast gate)
|
|
349
|
+
log.info('Running step-level eval...');
|
|
350
|
+
const stepCases = loadEvalCases({ type: 'step' });
|
|
351
|
+
let stepScore = bestScore;
|
|
352
|
+
let iterationCost = 0;
|
|
353
|
+
if (stepCases.length > 0) {
|
|
354
|
+
const stepResult = await runEvalSuite(stepCases, config, { verbose: options.verbose });
|
|
355
|
+
stepScore = stepResult.composite;
|
|
356
|
+
iterationCost += stepResult.totalCost;
|
|
357
|
+
if (stepScore < bestScore) {
|
|
358
|
+
log.warn(`Step-level eval regressed: ${stepScore.toFixed(2)} < ${bestScore.toFixed(2)}. Discarding.`);
|
|
359
|
+
revertChanges(backups);
|
|
360
|
+
appendEvolveLog({
|
|
361
|
+
commit: getCommitHash(),
|
|
362
|
+
score: stepScore,
|
|
363
|
+
cost: iterationCost,
|
|
364
|
+
status: 'discard',
|
|
365
|
+
iteration,
|
|
366
|
+
description: `step-level regression: ${changes.map((c) => c.reason).join('; ')}`,
|
|
367
|
+
});
|
|
368
|
+
totalDiscarded++;
|
|
369
|
+
totalCost += iterationCost;
|
|
370
|
+
continue;
|
|
371
|
+
}
|
|
372
|
+
log.info(`Step-level eval: ${stepScore.toFixed(2)} (baseline: ${bestScore.toFixed(2)})`);
|
|
373
|
+
}
|
|
374
|
+
// E2E eval (slow, full validation)
|
|
375
|
+
const fullCases = loadEvalCases({ type: 'full' });
|
|
376
|
+
let compositeScore = stepScore;
|
|
377
|
+
if (fullCases.length > 0) {
|
|
378
|
+
log.info('Running e2e eval...');
|
|
379
|
+
const e2eResult = await runEvalSuite(fullCases, config, { verbose: options.verbose });
|
|
380
|
+
compositeScore = e2eResult.composite;
|
|
381
|
+
iterationCost += e2eResult.totalCost;
|
|
382
|
+
}
|
|
383
|
+
// Keep or discard
|
|
384
|
+
const decision = keepOrDiscard(compositeScore, bestScore);
|
|
385
|
+
totalCost += iterationCost;
|
|
386
|
+
if (decision === 'keep') {
|
|
387
|
+
log.info(`Score improved: ${compositeScore.toFixed(2)} > ${bestScore.toFixed(2)}. Keeping changes.`);
|
|
388
|
+
// Commit the changes
|
|
389
|
+
const description = changes.map((c) => c.reason).join('; ');
|
|
390
|
+
for (const change of changes) {
|
|
391
|
+
exec(`git add "${change.path}"`, { cwd: process.cwd() });
|
|
392
|
+
}
|
|
393
|
+
// Write commit message to a temp file to avoid shell injection
|
|
394
|
+
const commitMsg = `evolve(${iteration}): ${description.slice(0, 200)}`;
|
|
395
|
+
const commitMsgFile = join(process.cwd(), '.alpha-loop', 'evals', '.commit-msg.tmp');
|
|
396
|
+
writeFileSync(commitMsgFile, commitMsg);
|
|
397
|
+
exec(`git commit --file "${commitMsgFile}"`, { cwd: process.cwd() });
|
|
398
|
+
try {
|
|
399
|
+
unlinkSync(commitMsgFile);
|
|
400
|
+
}
|
|
401
|
+
catch { /* non-fatal */ }
|
|
402
|
+
bestScore = compositeScore;
|
|
403
|
+
totalKept++;
|
|
404
|
+
appendEvolveLog({
|
|
405
|
+
commit: getCommitHash(),
|
|
406
|
+
score: compositeScore,
|
|
407
|
+
cost: iterationCost,
|
|
408
|
+
status: 'keep',
|
|
409
|
+
iteration,
|
|
410
|
+
description,
|
|
411
|
+
});
|
|
412
|
+
}
|
|
413
|
+
else {
|
|
414
|
+
log.info(`Score did not improve: ${compositeScore.toFixed(2)} <= ${bestScore.toFixed(2)}. Discarding.`);
|
|
415
|
+
revertChanges(backups);
|
|
416
|
+
totalDiscarded++;
|
|
417
|
+
appendEvolveLog({
|
|
418
|
+
commit: getCommitHash(),
|
|
419
|
+
score: compositeScore,
|
|
420
|
+
cost: iterationCost,
|
|
421
|
+
status: 'discard',
|
|
422
|
+
iteration,
|
|
423
|
+
description: changes.map((c) => c.reason).join('; '),
|
|
424
|
+
});
|
|
425
|
+
}
|
|
114
426
|
console.log('');
|
|
115
|
-
// In a full implementation, we would:
|
|
116
|
-
// 1. Run the eval suite
|
|
117
|
-
// 2. Compare composite score to baseline
|
|
118
|
-
// 3. If improved: commit and update baseline
|
|
119
|
-
// 4. If not: revert all changes from backups
|
|
120
|
-
// 5. Continue to next iteration
|
|
121
|
-
// For now, stop after first proposal (eval execution requires fixture repos)
|
|
122
|
-
log.info('Stopping after first proposal. Full automated loop requires eval fixtures.');
|
|
123
|
-
break;
|
|
124
427
|
}
|
|
125
428
|
// Summary
|
|
126
429
|
console.log('');
|
|
127
430
|
log.step('Evolve Summary');
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
}
|
|
131
|
-
log
|
|
431
|
+
const totalIterations = totalKept + totalDiscarded + totalCrashed;
|
|
432
|
+
console.log(` Iterations run: ${totalIterations}`);
|
|
433
|
+
console.log(` Kept: ${totalKept}`);
|
|
434
|
+
console.log(` Discarded: ${totalDiscarded}`);
|
|
435
|
+
console.log(` Crashed: ${totalCrashed}`);
|
|
436
|
+
console.log(` Best score: ${bestScore.toFixed(2)}`);
|
|
437
|
+
console.log(` Total cost: ~$${totalCost.toFixed(2)}`);
|
|
438
|
+
console.log('');
|
|
132
439
|
log.info('Run `alpha-loop eval scores` to view score history.');
|
|
440
|
+
log.info(`Evolve log: ${EVOLVE_LOG_PATH}`);
|
|
441
|
+
}
|
|
442
|
+
/**
|
|
443
|
+
* Revert file changes from backups.
|
|
444
|
+
* For new files, also removes empty parent directories that were created.
|
|
445
|
+
*/
|
|
446
|
+
function revertChanges(backups) {
|
|
447
|
+
for (const [path, content] of backups) {
|
|
448
|
+
if (content === null) {
|
|
449
|
+
// File didn't exist before — remove it and clean empty parents
|
|
450
|
+
try {
|
|
451
|
+
unlinkSync(path);
|
|
452
|
+
// Walk up removing empty directories until we hit a non-empty one
|
|
453
|
+
let dir = dirname(path);
|
|
454
|
+
while (dir && dir !== '.' && dir !== '/') {
|
|
455
|
+
try {
|
|
456
|
+
rmdirSync(dir); // throws if non-empty
|
|
457
|
+
dir = dirname(dir);
|
|
458
|
+
}
|
|
459
|
+
catch {
|
|
460
|
+
break; // directory not empty, stop
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
catch { /* ignore */ }
|
|
465
|
+
}
|
|
466
|
+
else {
|
|
467
|
+
writeFileSync(path, content);
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
/**
 * Resolve the `--surface` option to a valid surface level.
 *
 * @param surface Raw option value; may be missing.
 * @returns The value itself when it is a known level. Otherwise `'prompts'`:
 *   silently when no value was given, with a warning when the value is not
 *   one of the known levels.
 */
function parseSurface(surface) {
    const isKnown = Boolean(surface) && SURFACE_LEVELS.includes(surface);
    if (isKnown) {
        return surface;
    }
    if (surface) {
        log.warn(`Unknown surface level '${surface}'. Using 'prompts'. Valid: ${SURFACE_LEVELS.join(', ')}`);
    }
    return 'prompts';
}
|
|
134
482
|
/**
|
|
135
483
|
* Build the proposer prompt with full trace context (Meta-Harness style).
|
|
136
484
|
*/
|
|
137
|
-
function buildProposerPrompt(config, traces, scores, evalCaseCount) {
|
|
485
|
+
function buildProposerPrompt(config, traces, scores, evalCaseCount, surface, evolveLog) {
|
|
138
486
|
const sections = [];
|
|
139
|
-
|
|
487
|
+
const targets = SURFACE_TARGETS[surface];
|
|
488
|
+
sections.push(`# Harness Optimization Skill
|
|
140
489
|
|
|
141
|
-
You are
|
|
142
|
-
by analyzing execution traces, scores, and source code, then proposing targeted changes.
|
|
490
|
+
You are optimizing AlphaLoop's harness configuration to improve its eval score.
|
|
143
491
|
|
|
144
492
|
## Current State
|
|
145
493
|
- Eval cases: ${evalCaseCount}
|
|
@@ -147,7 +495,35 @@ by analyzing execution traces, scores, and source code, then proposing targeted
|
|
|
147
495
|
- Recent traces: ${traces.length}
|
|
148
496
|
- Agent: ${config.agent}
|
|
149
497
|
- Model: ${config.model || 'default'}
|
|
498
|
+
- Optimization surface: ${surface}
|
|
499
|
+
|
|
500
|
+
## Your Environment
|
|
501
|
+
- \`.alpha-loop/evals/results/\` — filesystem of ALL prior eval runs
|
|
502
|
+
- Each run has: harness snapshot, scores, costs, and full execution traces
|
|
503
|
+
- Use grep/cat to inspect prior code, traces, and scores
|
|
504
|
+
- \`.alpha-loop/templates/\` — current prompts and skills (YOUR optimization target)
|
|
505
|
+
${surface === 'all' ? '- `src/lib/prompts.ts` — prompt builder functions (modifiable)\n- `src/lib/pipeline.ts` — pipeline orchestration (modifiable)\n' : ''}
|
|
506
|
+
## What You Can Modify
|
|
507
|
+
${targets.map((t) => `- \`${t}\``).join('\n')}
|
|
508
|
+
|
|
509
|
+
## What You CANNOT Modify
|
|
510
|
+
- \`.alpha-loop/evals/\` — eval cases and results (read-only)
|
|
511
|
+
- Test files
|
|
512
|
+
- Any file not listed above
|
|
150
513
|
`);
|
|
514
|
+
// Add evolve log history
|
|
515
|
+
if (evolveLog.length > 0) {
|
|
516
|
+
sections.push('## Prior Evolve Iterations');
|
|
517
|
+
sections.push('Learn from both successes and failures:');
|
|
518
|
+
sections.push('');
|
|
519
|
+
sections.push('```');
|
|
520
|
+
sections.push(EVOLVE_LOG_HEADER);
|
|
521
|
+
for (const entry of evolveLog.slice(-20)) {
|
|
522
|
+
sections.push(`${entry.commit}\t${entry.score.toFixed(2)}\t${entry.cost.toFixed(2)}\t${entry.status}\t${entry.iteration}\t${entry.description}`);
|
|
523
|
+
}
|
|
524
|
+
sections.push('```');
|
|
525
|
+
sections.push('');
|
|
526
|
+
}
|
|
151
527
|
// Add score history
|
|
152
528
|
if (scores.length > 0) {
|
|
153
529
|
sections.push('## Score History (most recent first)');
|
|
@@ -205,10 +581,22 @@ by analyzing execution traces, scores, and source code, then proposing targeted
|
|
|
205
581
|
readDir(join(templatesDir, 'agents'), '.alpha-loop/templates/agents/');
|
|
206
582
|
readDir(join(templatesDir, 'skills'), '.alpha-loop/templates/skills/');
|
|
207
583
|
}
|
|
208
|
-
sections.push(`## Your
|
|
584
|
+
sections.push(`## Your Process
|
|
585
|
+
1. Read at least 5 prior runs (traces, scores, evolve log) before proposing a change
|
|
586
|
+
2. Identify failure patterns and form hypotheses
|
|
587
|
+
3. Compare traces from passing vs failing cases
|
|
588
|
+
4. Propose targeted, additive changes (prefer adding info over changing flow)
|
|
589
|
+
5. Explain your reasoning clearly
|
|
590
|
+
|
|
591
|
+
## Key Lessons from Meta-Harness
|
|
592
|
+
- Additive changes are safer than structural rewrites (iteration 7 won by ADDING info, not changing flow)
|
|
593
|
+
- Prompt edits that modify control flow are high-risk (5 of 7 regressions came from these)
|
|
594
|
+
- If multiple prior changes regressed, the common factor is the problem (confound detection)
|
|
209
595
|
|
|
210
|
-
|
|
211
|
-
|
|
596
|
+
## Your Task
|
|
597
|
+
|
|
598
|
+
Analyze the traces, scores, and evolve log above. Identify patterns in failures and propose
|
|
599
|
+
specific changes that would improve the composite score.
|
|
212
600
|
|
|
213
601
|
Output your proposed changes as a JSON array:
|
|
214
602
|
|
|
@@ -223,7 +611,7 @@ Output your proposed changes as a JSON array:
|
|
|
223
611
|
\`\`\`
|
|
224
612
|
|
|
225
613
|
Rules:
|
|
226
|
-
- Only modify files under: ${
|
|
614
|
+
- Only modify files under: ${targets.join(', ')}
|
|
227
615
|
- Each change must include a clear reason
|
|
228
616
|
- Focus on the highest-impact changes first
|
|
229
617
|
- If no changes would help, output an empty array: []
|
|
@@ -257,14 +645,4 @@ export function parseProposedChanges(output) {
|
|
|
257
645
|
return [];
|
|
258
646
|
}
|
|
259
647
|
}
|
|
260
|
-
/**
|
|
261
|
-
* Check if a proposed path is safe to modify.
|
|
262
|
-
*/
|
|
263
|
-
export function isSafePath(filePath) {
|
|
264
|
-
// Reject absolute paths and path traversal
|
|
265
|
-
if (filePath.startsWith('/') || filePath.includes('..'))
|
|
266
|
-
return false;
|
|
267
|
-
// Must be in allowed targets: directory prefixes use startsWith, files use exact match
|
|
268
|
-
return ALLOWED_TARGETS.some((prefix) => prefix.endsWith('/') ? filePath.startsWith(prefix) : filePath === prefix);
|
|
269
|
-
}
|
|
270
648
|
//# sourceMappingURL=evolve.js.map
|