@bradtaylorsf/alpha-loop 1.4.2 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +136 -3
  2. package/dist/cli.js +73 -6
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/eval.d.ts +59 -4
  5. package/dist/commands/eval.js +370 -55
  6. package/dist/commands/eval.js.map +1 -1
  7. package/dist/commands/evolve.d.ts +43 -4
  8. package/dist/commands/evolve.js +444 -66
  9. package/dist/commands/evolve.js.map +1 -1
  10. package/dist/commands/init.js +3 -7
  11. package/dist/commands/init.js.map +1 -1
  12. package/dist/commands/plan.d.ts +11 -0
  13. package/dist/commands/plan.js +298 -0
  14. package/dist/commands/plan.js.map +1 -0
  15. package/dist/commands/roadmap.d.ts +9 -0
  16. package/dist/commands/roadmap.js +202 -0
  17. package/dist/commands/roadmap.js.map +1 -0
  18. package/dist/commands/triage.d.ts +9 -0
  19. package/dist/commands/triage.js +226 -0
  20. package/dist/commands/triage.js.map +1 -0
  21. package/dist/commands/vision.js +1 -0
  22. package/dist/commands/vision.js.map +1 -1
  23. package/dist/lib/config.d.ts +20 -0
  24. package/dist/lib/config.js +55 -0
  25. package/dist/lib/config.js.map +1 -1
  26. package/dist/lib/eval-checks.d.ts +11 -1
  27. package/dist/lib/eval-checks.js +39 -0
  28. package/dist/lib/eval-checks.js.map +1 -1
  29. package/dist/lib/eval-fixtures.d.ts +55 -0
  30. package/dist/lib/eval-fixtures.js +172 -0
  31. package/dist/lib/eval-fixtures.js.map +1 -0
  32. package/dist/lib/eval-runner.d.ts +26 -2
  33. package/dist/lib/eval-runner.js +202 -17
  34. package/dist/lib/eval-runner.js.map +1 -1
  35. package/dist/lib/eval-skill-bridge.d.ts +53 -0
  36. package/dist/lib/eval-skill-bridge.js +121 -0
  37. package/dist/lib/eval-skill-bridge.js.map +1 -0
  38. package/dist/lib/eval-swebench.d.ts +68 -0
  39. package/dist/lib/eval-swebench.js +274 -0
  40. package/dist/lib/eval-swebench.js.map +1 -0
  41. package/dist/lib/eval.d.ts +9 -1
  42. package/dist/lib/eval.js +27 -7
  43. package/dist/lib/eval.js.map +1 -1
  44. package/dist/lib/github.d.ts +46 -0
  45. package/dist/lib/github.js +179 -0
  46. package/dist/lib/github.js.map +1 -1
  47. package/dist/lib/pipeline.js +44 -2
  48. package/dist/lib/pipeline.js.map +1 -1
  49. package/dist/lib/planning.d.ts +91 -0
  50. package/dist/lib/planning.js +315 -0
  51. package/dist/lib/planning.js.map +1 -0
  52. package/dist/lib/prompts.d.ts +79 -0
  53. package/dist/lib/prompts.js +151 -2
  54. package/dist/lib/prompts.js.map +1 -1
  55. package/dist/lib/score.d.ts +24 -2
  56. package/dist/lib/score.js +162 -3
  57. package/dist/lib/score.js.map +1 -1
  58. package/package.json +2 -1
@@ -2,7 +2,50 @@ export type EvolveOptions = {
2
2
  maxIterations?: string;
3
3
  dryRun?: boolean;
4
4
  verbose?: boolean;
5
+ continuous?: boolean;
6
+ surface?: string;
7
+ resume?: boolean;
5
8
  };
9
+ /** Optimization surface levels — what the proposer is allowed to modify. */
10
+ export type SurfaceLevel = 'prompts' | 'skills' | 'config' | 'all';
11
+ /** Allowed target paths per surface level. */
12
+ export declare const SURFACE_TARGETS: Record<SurfaceLevel, string[]>;
13
+ /** Path to the evolve log TSV file. */
14
+ export declare const EVOLVE_LOG_PATH = ".alpha-loop/evals/evolve-log.tsv";
15
+ /** A single entry in the evolve log. */
16
+ export type EvolveLogEntry = {
17
+ commit: string;
18
+ score: number;
19
+ cost: number;
20
+ status: 'baseline' | 'keep' | 'discard' | 'crash';
21
+ iteration: number;
22
+ description: string;
23
+ };
24
+ /**
25
+ * Append an entry to the evolve log TSV.
26
+ */
27
+ export declare function appendEvolveLog(entry: EvolveLogEntry, cwd?: string): void;
28
+ /**
29
+ * Read all entries from the evolve log TSV.
30
+ */
31
+ export declare function readEvolveLog(cwd?: string): EvolveLogEntry[];
32
+ /**
33
+ * Run pre-checks before expensive eval.
34
+ * Returns { passed, error } indicating whether the code is safe to eval.
35
+ */
36
+ export declare function runPreChecks(surface: SurfaceLevel, cwd?: string): Promise<{
37
+ passed: boolean;
38
+ error?: string;
39
+ }>;
40
+ /**
41
+ * Decide whether to keep or discard based on score comparison.
42
+ * Returns 'keep' if newScore > bestScore, 'discard' otherwise.
43
+ */
44
+ export declare function keepOrDiscard(newScore: number, bestScore: number): 'keep' | 'discard';
45
+ /**
46
+ * Check if a proposed path is safe to modify for a given surface level.
47
+ */
48
+ export declare function isSafePath(filePath: string, surface?: SurfaceLevel): boolean;
6
49
  /**
7
50
  * Run the evolve loop: propose → eval → keep/discard.
8
51
  */
@@ -18,8 +61,4 @@ type ProposedChange = {
18
61
  * Expects a JSON array in the output.
19
62
  */
20
63
  export declare function parseProposedChanges(output: string): ProposedChange[];
21
- /**
22
- * Check if a proposed path is safe to modify.
23
- */
24
- export declare function isSafePath(filePath: string): boolean;
25
64
  export {};
@@ -12,57 +12,251 @@
12
12
  * Key insight from Meta-Harness: full trace access (not summaries) is critical.
13
13
  * Key insight from autoresearch: fixed eval metric + autonomous iteration.
14
14
  */
15
- import { existsSync, readFileSync, readdirSync, writeFileSync } from 'node:fs';
16
- import { join } from 'node:path';
15
+ import { existsSync, readFileSync, readdirSync, writeFileSync, mkdirSync, appendFileSync, unlinkSync, rmdirSync } from 'node:fs';
16
+ import { join, dirname } from 'node:path';
17
17
  import { log } from '../lib/logger.js';
18
18
  import { loadConfig } from '../lib/config.js';
19
19
  import { spawnAgent } from '../lib/agent.js';
20
20
  import { loadEvalCases, evalsDir } from '../lib/eval.js';
21
21
  import { readScores, latestScore, formatScoreEntry } from '../lib/score.js';
22
22
  import { listTraces, readTrace } from '../lib/traces.js';
23
- /** Files that the proposer is allowed to modify. */
24
- const ALLOWED_TARGETS = [
25
- '.alpha-loop/templates/skills/',
26
- '.alpha-loop/templates/agents/',
27
- '.alpha-loop.yaml',
28
- ];
23
+ import { exec } from '../lib/shell.js';
24
+ import { runEvalSuite } from '../lib/eval-runner.js';
25
/** Recognised optimization surface names, ordered from narrowest to widest. */
const SURFACE_LEVELS = ['prompts', 'skills', 'config', 'all'];
/**
 * Allowed target paths per surface level.
 *
 * Each level is a superset of the previous one, so the lists are built
 * cumulatively. Entries ending in `/` are directory prefixes; all other
 * entries are exact file paths.
 */
export const SURFACE_TARGETS = (() => {
    const prompts = ['.alpha-loop/templates/agents/'];
    const skills = [...prompts, '.alpha-loop/templates/skills/'];
    const config = [...skills, '.alpha-loop.yaml'];
    const all = [...config, 'src/lib/prompts.ts', 'src/lib/pipeline.ts'];
    return { prompts, skills, config, all };
})();
48
/** Path to the evolve log TSV file, relative to the project root. */
export const EVOLVE_LOG_PATH = '.alpha-loop/evals/evolve-log.tsv';
/** Header row written as the first line of a newly created evolve log. */
const EVOLVE_LOG_HEADER = 'commit\tscore\tcost\tstatus\titeration\tdescription';
/**
 * Append an entry to the evolve log TSV.
 *
 * Creates the log's parent directory and the header row on first use.
 * Score and cost are written with two decimals.
 *
 * @param entry Record to append (commit, score, cost, status, iteration, description).
 * @param cwd   Project root; defaults to `process.cwd()`.
 */
export function appendEvolveLog(entry, cwd) {
    const logPath = join(cwd ?? process.cwd(), EVOLVE_LOG_PATH);
    // Derive the directory from the log path so the two can never drift apart.
    mkdirSync(dirname(logPath), { recursive: true });
    if (!existsSync(logPath)) {
        writeFileSync(logPath, EVOLVE_LOG_HEADER + '\n');
    }
    // Descriptions are built from agent-supplied text. A line break inside one
    // would split the record across rows, so collapse it to a single space.
    // Tabs are left alone: the reader re-joins everything after column five.
    const description = String(entry.description ?? '').replace(/\r\n|\r|\n/g, ' ');
    const line = [
        entry.commit,
        entry.score.toFixed(2),
        entry.cost.toFixed(2),
        entry.status,
        String(entry.iteration),
        description,
    ].join('\t');
    appendFileSync(logPath, line + '\n');
}
/**
 * Read all entries from the evolve log TSV.
 *
 * The first line is treated as the header and skipped. Rows with fewer than
 * six columns, or whose score/cost/iteration do not parse as numbers, are
 * ignored so a damaged row cannot leak `NaN` into callers.
 *
 * @param cwd Project root; defaults to `process.cwd()`.
 * @returns Parsed entries in file order; an empty array when no log exists.
 */
export function readEvolveLog(cwd) {
    const logPath = join(cwd ?? process.cwd(), EVOLVE_LOG_PATH);
    if (!existsSync(logPath))
        return [];
    // Split without trimming the whole file: trimming would strip the trailing
    // tab of a final row whose description is empty and make it look short.
    const rows = readFileSync(logPath, 'utf-8')
        .split(/\r?\n/)
        .filter((row) => row.length > 0);
    const entries = [];
    // Skip header
    for (const row of rows.slice(1)) {
        const parts = row.split('\t');
        if (parts.length < 6)
            continue;
        const score = parseFloat(parts[1]);
        const cost = parseFloat(parts[2]);
        const iteration = parseInt(parts[4], 10);
        if (!Number.isFinite(score) || !Number.isFinite(cost) || !Number.isInteger(iteration))
            continue;
        entries.push({
            commit: parts[0],
            score,
            cost,
            status: parts[3],
            iteration,
            description: parts.slice(5).join('\t'), // description may contain tabs
        });
    }
    return entries;
}
97
/**
 * Cheap sanity gate run before the expensive eval suite.
 *
 * For the `all` surface a TypeScript no-emit compile is run first. The unit
 * tests are then run for every surface. The first failing command ends the
 * check and its stderr (or stdout when stderr is empty) is returned as the
 * error text.
 *
 * @param surface Active optimization surface.
 * @param cwd     Project root; defaults to `process.cwd()`.
 * @returns `{ passed: true }` on success, otherwise `{ passed: false, error }`.
 */
export async function runPreChecks(surface, cwd) {
    const projectDir = cwd ?? process.cwd();
    // Shared shape for a failed gate: a label followed by the command's output.
    const failure = (label, outcome) => ({
        passed: false,
        error: `${label}:\n${outcome.stderr || outcome.stdout}`,
    });
    // Only the `all` surface can touch source files, so only it needs a compile check.
    if (surface === 'all') {
        const compile = exec('pnpm tsc --noEmit', { cwd: projectDir, timeout: 60_000 });
        if (compile.exitCode !== 0) {
            return failure('TypeScript compilation failed', compile);
        }
    }
    const unitTests = exec('pnpm test', { cwd: projectDir, timeout: 120_000 });
    if (unitTests.exitCode !== 0) {
        return failure('Unit tests failed', unitTests);
    }
    return { passed: true };
}
117
/**
 * Keep/discard decision for a candidate change.
 *
 * Only a strict improvement is kept; a tie, a regression, or a score that
 * does not compare as greater (for example `NaN`) is discarded.
 *
 * @param newScore  Score measured with the candidate change applied.
 * @param bestScore Best score seen so far.
 * @returns `'keep'` when `newScore > bestScore`, otherwise `'discard'`.
 */
export function keepOrDiscard(newScore, bestScore) {
    if (newScore > bestScore) {
        return 'keep';
    }
    return 'discard';
}
124
/**
 * Short hash of the current git HEAD.
 *
 * @param cwd Directory to run git in; defaults to `process.cwd()`.
 * @returns The trimmed output of `git rev-parse --short HEAD`, or `'unknown'`
 *          when that output is empty or the call throws.
 */
function getCommitHash(cwd) {
    const fallback = 'unknown';
    try {
        const { stdout } = exec('git rev-parse --short HEAD', { cwd: cwd ?? process.cwd(), timeout: 5000 });
        const hash = stdout.trim();
        return hash === '' ? fallback : hash;
    }
    catch {
        return fallback;
    }
}
136
/**
 * Check if a proposed path is safe to modify for a given surface level.
 *
 * A path is accepted only when it is a non-empty relative file path with no
 * `..` sequence, and it either sits under one of the surface's directory
 * prefixes or exactly equals one of its listed files.
 *
 * @param filePath Path proposed by the agent (untrusted input).
 * @param surface  Surface level; defaults to `'prompts'`.
 * @returns `true` when the path may be written, `false` otherwise. Never throws.
 */
export function isSafePath(filePath, surface) {
    // The path comes from parsed agent output, so it may be missing or not a string.
    if (typeof filePath !== 'string' || filePath.length === 0)
        return false;
    // Reject absolute paths (POSIX, UNC/backslash-rooted, drive-letter) and path traversal
    if (filePath.startsWith('/') || filePath.startsWith('\\') || /^[A-Za-z]:/.test(filePath))
        return false;
    if (filePath.includes('..'))
        return false;
    // A trailing slash names a directory (e.g. the bare prefix itself), which
    // cannot be written as a file.
    if (filePath.endsWith('/'))
        return false;
    const targets = SURFACE_TARGETS[surface ?? 'prompts'];
    // Unknown surface names (or inherited object keys) grant nothing.
    if (!Array.isArray(targets))
        return false;
    // Must be in allowed targets: directory prefixes use startsWith, files use exact match
    return targets.some((prefix) => prefix.endsWith('/') ? filePath.startsWith(prefix) : filePath === prefix);
}
29
147
  /**
30
148
  * Run the evolve loop: propose → eval → keep/discard.
31
149
  */
32
150
  export async function evolveCommand(options) {
33
151
  const config = loadConfig({ dryRun: options.dryRun });
34
- const maxIterations = parseInt(options.maxIterations ?? '5', 10);
152
+ const surface = parseSurface(options.surface);
153
+ const continuous = options.continuous ?? false;
154
+ const maxIterations = continuous ? Infinity : parseInt(options.maxIterations ?? '5', 10);
35
155
  // Validate prerequisites
36
- const cases = loadEvalCases();
37
- if (cases.length === 0) {
156
+ const allCases = loadEvalCases();
157
+ if (allCases.length === 0) {
38
158
  log.warn('No eval cases found. Create eval cases first with `alpha-loop eval capture`.');
39
159
  return;
40
160
  }
41
- const scores = readScores(evalsDir(undefined, config.evalDir));
42
- const baseline = latestScore(evalsDir(undefined, config.evalDir));
161
+ const evalDir = evalsDir(undefined, config.evalDir);
162
+ const scores = readScores(evalDir);
163
+ const baseline = latestScore(evalDir);
164
+ // Resume support: pick up where we left off
165
+ let startIteration = 1;
166
+ let bestScore = baseline?.composite ?? 0;
167
+ let totalKept = 0;
168
+ let totalDiscarded = 0;
169
+ let totalCrashed = 0;
170
+ let totalCost = 0;
171
+ if (options.resume) {
172
+ const priorLog = readEvolveLog();
173
+ if (priorLog.length > 0) {
174
+ const lastEntry = priorLog[priorLog.length - 1];
175
+ startIteration = lastEntry.iteration + 1;
176
+ // Find best score from kept entries
177
+ const keptEntries = priorLog.filter((e) => e.status === 'keep');
178
+ if (keptEntries.length > 0) {
179
+ bestScore = Math.max(...keptEntries.map((e) => e.score));
180
+ }
181
+ totalKept = priorLog.filter((e) => e.status === 'keep').length;
182
+ totalDiscarded = priorLog.filter((e) => e.status === 'discard').length;
183
+ totalCrashed = priorLog.filter((e) => e.status === 'crash').length;
184
+ totalCost = priorLog.reduce((sum, e) => sum + e.cost, 0);
185
+ log.info(`Resuming from iteration ${startIteration} (best score: ${bestScore.toFixed(2)})`);
186
+ }
187
+ else {
188
+ log.info('No prior evolve log found. Starting fresh.');
189
+ }
190
+ }
43
191
  log.step('Alpha Loop Evolve — Meta-Harness Optimization');
44
192
  console.log('');
45
- console.log(` Eval cases: ${cases.length}`);
193
+ console.log(` Eval cases: ${allCases.length}`);
46
194
  console.log(` Score history: ${scores.length} entries`);
47
- console.log(` Baseline score: ${baseline ? baseline.composite.toFixed(2) : 'none'}`);
48
- console.log(` Max iterations: ${maxIterations}`);
195
+ console.log(` Baseline score: ${bestScore > 0 ? bestScore.toFixed(2) : 'none (will run baseline)'}`);
196
+ console.log(` Iterations: ${continuous ? 'continuous (until stopped)' : maxIterations}`);
197
+ console.log(` Surface: ${surface}`);
49
198
  console.log(` Agent: ${config.agent}`);
50
199
  console.log(` Model: ${config.model || 'default'}`);
200
+ if (options.resume)
201
+ console.log(` Resuming from: iteration ${startIteration}`);
51
202
  console.log('');
52
- // Gather context for the proposer
53
- const traces = listTraces();
54
- const recentTraces = traces.slice(0, 10);
55
- log.info(`Recent traces: ${recentTraces.length} (from ${traces.length} total)`);
56
- for (let iteration = 1; iteration <= maxIterations; iteration++) {
57
- log.step(`Iteration ${iteration}/${maxIterations}`);
203
+ // Graceful shutdown for --continuous
204
+ let shutdownRequested = false;
205
+ if (continuous) {
206
+ const handler = () => {
207
+ log.info('Shutdown requested. Finishing current iteration...');
208
+ shutdownRequested = true;
209
+ };
210
+ process.on('SIGINT', handler);
211
+ process.on('SIGTERM', handler);
212
+ }
213
+ // Gather context for the proposer (refreshed every 5 iterations)
214
+ let recentTraces = listTraces().slice(0, 10);
215
+ log.info(`Recent traces: ${recentTraces.length}`);
216
+ // Step 0: Run baseline eval if no baseline score exists
217
+ if (bestScore === 0 && !options.dryRun) {
218
+ log.step('Running baseline eval...');
219
+ const stepCases = loadEvalCases({ type: 'step' });
220
+ if (stepCases.length > 0) {
221
+ const baselineResult = await runEvalSuite(stepCases, config, { verbose: options.verbose });
222
+ bestScore = baselineResult.composite;
223
+ const commit = getCommitHash();
224
+ appendEvolveLog({
225
+ commit,
226
+ score: bestScore,
227
+ cost: 0,
228
+ status: 'baseline',
229
+ iteration: 0,
230
+ description: 'initial baseline eval',
231
+ });
232
+ log.info(`Baseline score: ${bestScore.toFixed(2)} (${baselineResult.passCount}/${stepCases.length} passing)`);
233
+ }
234
+ else {
235
+ log.warn('No step-level eval cases found. Using full cases for eval.');
236
+ }
237
+ }
238
+ for (let iteration = startIteration; iteration <= (startIteration + maxIterations - 1); iteration++) {
239
+ if (shutdownRequested) {
240
+ log.info('Graceful shutdown: stopping before next iteration.');
241
+ break;
242
+ }
243
+ log.step(`Iteration ${iteration}${continuous ? '' : `/${startIteration + maxIterations - 1}`}`);
58
244
  if (config.dryRun) {
59
245
  log.dry('Would invoke proposer agent with full trace access');
60
- log.dry('Would run eval suite on proposed changes');
246
+ log.dry(`Would modify files in surface: ${surface}`);
247
+ log.dry('Would run pre-checks (compile + tests)');
248
+ log.dry('Would run step-level eval, then e2e eval if step passes');
61
249
  log.dry('Would keep if score improves, revert if not');
62
250
  continue;
63
251
  }
252
+ // Refresh traces every 5 iterations so the proposer sees recent data
253
+ if ((iteration - startIteration) > 0 && (iteration - startIteration) % 5 === 0) {
254
+ recentTraces = listTraces().slice(0, 10);
255
+ }
256
+ // Read evolve log for proposer context
257
+ const evolveLog = readEvolveLog();
64
258
  // Build the proposer prompt with full filesystem context
65
- const prompt = buildProposerPrompt(config, recentTraces, scores, cases.length);
259
+ const prompt = buildProposerPrompt(config, recentTraces, scores, allCases.length, surface, evolveLog);
66
260
  // Invoke proposer agent
67
261
  log.info('Invoking proposer agent...');
68
262
  const result = await spawnAgent({
@@ -73,26 +267,46 @@ export async function evolveCommand(options) {
73
267
  logFile: undefined,
74
268
  });
75
269
  if (result.exitCode !== 0 || !result.output.trim()) {
76
- log.warn(`Proposer failed (exit ${result.exitCode}). Stopping.`);
77
- break;
270
+ log.warn(`Proposer failed (exit ${result.exitCode}). Skipping iteration.`);
271
+ appendEvolveLog({
272
+ commit: getCommitHash(),
273
+ score: bestScore,
274
+ cost: 0,
275
+ status: 'crash',
276
+ iteration,
277
+ description: 'proposer agent failed',
278
+ });
279
+ totalCrashed++;
280
+ continue;
78
281
  }
79
282
  // Parse proposed changes from agent output
80
283
  const changes = parseProposedChanges(result.output);
81
284
  if (changes.length === 0) {
82
- log.info('Proposer returned no changes. Optimization complete.');
83
- break;
285
+ log.info('Proposer returned no changes. Optimization may be complete.');
286
+ if (!continuous)
287
+ break;
288
+ continue;
84
289
  }
85
290
  log.info(`Proposer suggested ${changes.length} change(s):`);
86
291
  for (const change of changes) {
87
292
  console.log(` - ${change.path}: ${change.reason}`);
88
293
  }
89
- // Validate all paths are safe
90
- const unsafeChanges = changes.filter((c) => !isSafePath(c.path));
294
+ // Validate all paths are safe for the current surface level
295
+ const unsafeChanges = changes.filter((c) => !isSafePath(c.path, surface));
91
296
  if (unsafeChanges.length > 0) {
92
- log.warn('Proposer suggested changes to unsafe paths — skipping:');
297
+ log.warn('Proposer suggested changes to paths outside surface — skipping:');
93
298
  for (const c of unsafeChanges) {
94
299
  console.log(` - ${c.path}`);
95
300
  }
301
+ appendEvolveLog({
302
+ commit: getCommitHash(),
303
+ score: bestScore,
304
+ cost: 0,
305
+ status: 'crash',
306
+ iteration,
307
+ description: `unsafe paths: ${unsafeChanges.map((c) => c.path).join(', ')}`,
308
+ });
309
+ totalCrashed++;
96
310
  continue;
97
311
  }
98
312
  // Backup current files
@@ -101,45 +315,179 @@ export async function evolveCommand(options) {
101
315
  if (existsSync(change.path)) {
102
316
  backups.set(change.path, readFileSync(change.path, 'utf-8'));
103
317
  }
318
+ else {
319
+ backups.set(change.path, null); // file didn't exist before
320
+ }
104
321
  }
105
322
  // Apply changes
106
323
  for (const change of changes) {
324
+ const dir = dirname(change.path);
325
+ if (!existsSync(dir))
326
+ mkdirSync(dir, { recursive: true });
107
327
  writeFileSync(change.path, change.content);
108
328
  log.info(`Applied: ${change.path}`);
109
329
  }
110
- // TODO: Run eval suite and compare scores
111
- // For now, log what would happen
112
- log.info('Changes applied. Run `alpha-loop eval` to measure impact.');
113
- log.info('If score improves: keep changes. If not: revert with git.');
330
+ // Pre-checks: compile + unit tests
331
+ log.info('Running pre-checks...');
332
+ const preCheckResult = await runPreChecks(surface);
333
+ if (!preCheckResult.passed) {
334
+ log.warn(`Pre-checks failed: ${preCheckResult.error?.split('\n')[0]}`);
335
+ revertChanges(backups);
336
+ appendEvolveLog({
337
+ commit: getCommitHash(),
338
+ score: bestScore,
339
+ cost: 0,
340
+ status: 'crash',
341
+ iteration,
342
+ description: `pre-check failed: ${preCheckResult.error?.split('\n')[0] ?? 'unknown'}`,
343
+ });
344
+ totalCrashed++;
345
+ continue;
346
+ }
347
+ log.info('Pre-checks passed.');
348
+ // Step-level eval (fast gate)
349
+ log.info('Running step-level eval...');
350
+ const stepCases = loadEvalCases({ type: 'step' });
351
+ let stepScore = bestScore;
352
+ let iterationCost = 0;
353
+ if (stepCases.length > 0) {
354
+ const stepResult = await runEvalSuite(stepCases, config, { verbose: options.verbose });
355
+ stepScore = stepResult.composite;
356
+ iterationCost += stepResult.totalCost;
357
+ if (stepScore < bestScore) {
358
+ log.warn(`Step-level eval regressed: ${stepScore.toFixed(2)} < ${bestScore.toFixed(2)}. Discarding.`);
359
+ revertChanges(backups);
360
+ appendEvolveLog({
361
+ commit: getCommitHash(),
362
+ score: stepScore,
363
+ cost: iterationCost,
364
+ status: 'discard',
365
+ iteration,
366
+ description: `step-level regression: ${changes.map((c) => c.reason).join('; ')}`,
367
+ });
368
+ totalDiscarded++;
369
+ totalCost += iterationCost;
370
+ continue;
371
+ }
372
+ log.info(`Step-level eval: ${stepScore.toFixed(2)} (baseline: ${bestScore.toFixed(2)})`);
373
+ }
374
+ // E2E eval (slow, full validation)
375
+ const fullCases = loadEvalCases({ type: 'full' });
376
+ let compositeScore = stepScore;
377
+ if (fullCases.length > 0) {
378
+ log.info('Running e2e eval...');
379
+ const e2eResult = await runEvalSuite(fullCases, config, { verbose: options.verbose });
380
+ compositeScore = e2eResult.composite;
381
+ iterationCost += e2eResult.totalCost;
382
+ }
383
+ // Keep or discard
384
+ const decision = keepOrDiscard(compositeScore, bestScore);
385
+ totalCost += iterationCost;
386
+ if (decision === 'keep') {
387
+ log.info(`Score improved: ${compositeScore.toFixed(2)} > ${bestScore.toFixed(2)}. Keeping changes.`);
388
+ // Commit the changes
389
+ const description = changes.map((c) => c.reason).join('; ');
390
+ for (const change of changes) {
391
+ exec(`git add "${change.path}"`, { cwd: process.cwd() });
392
+ }
393
+ // Write commit message to a temp file to avoid shell injection
394
+ const commitMsg = `evolve(${iteration}): ${description.slice(0, 200)}`;
395
+ const commitMsgFile = join(process.cwd(), '.alpha-loop', 'evals', '.commit-msg.tmp');
396
+ writeFileSync(commitMsgFile, commitMsg);
397
+ exec(`git commit --file "${commitMsgFile}"`, { cwd: process.cwd() });
398
+ try {
399
+ unlinkSync(commitMsgFile);
400
+ }
401
+ catch { /* non-fatal */ }
402
+ bestScore = compositeScore;
403
+ totalKept++;
404
+ appendEvolveLog({
405
+ commit: getCommitHash(),
406
+ score: compositeScore,
407
+ cost: iterationCost,
408
+ status: 'keep',
409
+ iteration,
410
+ description,
411
+ });
412
+ }
413
+ else {
414
+ log.info(`Score did not improve: ${compositeScore.toFixed(2)} <= ${bestScore.toFixed(2)}. Discarding.`);
415
+ revertChanges(backups);
416
+ totalDiscarded++;
417
+ appendEvolveLog({
418
+ commit: getCommitHash(),
419
+ score: compositeScore,
420
+ cost: iterationCost,
421
+ status: 'discard',
422
+ iteration,
423
+ description: changes.map((c) => c.reason).join('; '),
424
+ });
425
+ }
114
426
  console.log('');
115
- // In a full implementation, we would:
116
- // 1. Run the eval suite
117
- // 2. Compare composite score to baseline
118
- // 3. If improved: commit and update baseline
119
- // 4. If not: revert all changes from backups
120
- // 5. Continue to next iteration
121
- // For now, stop after first proposal (eval execution requires fixture repos)
122
- log.info('Stopping after first proposal. Full automated loop requires eval fixtures.');
123
- break;
124
427
  }
125
428
  // Summary
126
429
  console.log('');
127
430
  log.step('Evolve Summary');
128
- if (baseline) {
129
- console.log(` Baseline score: ${baseline.composite.toFixed(2)}`);
130
- }
131
- log.info('Run `alpha-loop eval` to measure current score after changes.');
431
+ const totalIterations = totalKept + totalDiscarded + totalCrashed;
432
+ console.log(` Iterations run: ${totalIterations}`);
433
+ console.log(` Kept: ${totalKept}`);
434
+ console.log(` Discarded: ${totalDiscarded}`);
435
+ console.log(` Crashed: ${totalCrashed}`);
436
+ console.log(` Best score: ${bestScore.toFixed(2)}`);
437
+ console.log(` Total cost: ~$${totalCost.toFixed(2)}`);
438
+ console.log('');
132
439
  log.info('Run `alpha-loop eval scores` to view score history.');
440
+ log.info(`Evolve log: ${EVOLVE_LOG_PATH}`);
441
+ }
442
+ /**
443
+ * Revert file changes from backups.
444
+ * For new files, also removes empty parent directories that were created.
445
+ */
446
+ function revertChanges(backups) {
447
+ for (const [path, content] of backups) {
448
+ if (content === null) {
449
+ // File didn't exist before — remove it and clean empty parents
450
+ try {
451
+ unlinkSync(path);
452
+ // Walk up removing empty directories until we hit a non-empty one
453
+ let dir = dirname(path);
454
+ while (dir && dir !== '.' && dir !== '/') {
455
+ try {
456
+ rmdirSync(dir); // throws if non-empty
457
+ dir = dirname(dir);
458
+ }
459
+ catch {
460
+ break; // directory not empty, stop
461
+ }
462
+ }
463
+ }
464
+ catch { /* ignore */ }
465
+ }
466
+ else {
467
+ writeFileSync(path, content);
468
+ }
469
+ }
470
+ }
471
/**
 * Resolve the `--surface` option to a known surface level.
 *
 * @param surface Raw option value; may be undefined or empty.
 * @returns The value itself when it is one of `SURFACE_LEVELS`; otherwise
 *          `'prompts'`. A warning is logged for a non-empty unknown value.
 */
function parseSurface(surface) {
    const fallback = 'prompts';
    if (!surface) {
        return fallback;
    }
    if (!SURFACE_LEVELS.includes(surface)) {
        log.warn(`Unknown surface level '${surface}'. Using 'prompts'. Valid: ${SURFACE_LEVELS.join(', ')}`);
        return fallback;
    }
    return surface;
}
134
482
  /**
135
483
  * Build the proposer prompt with full trace context (Meta-Harness style).
136
484
  */
137
- function buildProposerPrompt(config, traces, scores, evalCaseCount) {
485
+ function buildProposerPrompt(config, traces, scores, evalCaseCount, surface, evolveLog) {
138
486
  const sections = [];
139
- sections.push(`# Alpha Loop Optimization Proposer
487
+ const targets = SURFACE_TARGETS[surface];
488
+ sections.push(`# Harness Optimization Skill
140
489
 
141
- You are an optimization agent. Your goal is to improve AlphaLoop's pipeline performance
142
- by analyzing execution traces, scores, and source code, then proposing targeted changes.
490
+ You are optimizing AlphaLoop's harness configuration to improve its eval score.
143
491
 
144
492
  ## Current State
145
493
  - Eval cases: ${evalCaseCount}
@@ -147,7 +495,35 @@ by analyzing execution traces, scores, and source code, then proposing targeted
147
495
  - Recent traces: ${traces.length}
148
496
  - Agent: ${config.agent}
149
497
  - Model: ${config.model || 'default'}
498
+ - Optimization surface: ${surface}
499
+
500
+ ## Your Environment
501
+ - \`.alpha-loop/evals/results/\` — filesystem of ALL prior eval runs
502
+ - Each run has: harness snapshot, scores, costs, and full execution traces
503
+ - Use grep/cat to inspect prior code, traces, and scores
504
+ - \`.alpha-loop/templates/\` — current prompts and skills (YOUR optimization target)
505
+ ${surface === 'all' ? '- `src/lib/prompts.ts` — prompt builder functions (modifiable)\n- `src/lib/pipeline.ts` — pipeline orchestration (modifiable)\n' : ''}
506
+ ## What You Can Modify
507
+ ${targets.map((t) => `- \`${t}\``).join('\n')}
508
+
509
+ ## What You CANNOT Modify
510
+ - \`.alpha-loop/evals/\` — eval cases and results (read-only)
511
+ - Test files
512
+ - Any file not listed above
150
513
  `);
514
+ // Add evolve log history
515
+ if (evolveLog.length > 0) {
516
+ sections.push('## Prior Evolve Iterations');
517
+ sections.push('Learn from both successes and failures:');
518
+ sections.push('');
519
+ sections.push('```');
520
+ sections.push(EVOLVE_LOG_HEADER);
521
+ for (const entry of evolveLog.slice(-20)) {
522
+ sections.push(`${entry.commit}\t${entry.score.toFixed(2)}\t${entry.cost.toFixed(2)}\t${entry.status}\t${entry.iteration}\t${entry.description}`);
523
+ }
524
+ sections.push('```');
525
+ sections.push('');
526
+ }
151
527
  // Add score history
152
528
  if (scores.length > 0) {
153
529
  sections.push('## Score History (most recent first)');
@@ -205,10 +581,22 @@ by analyzing execution traces, scores, and source code, then proposing targeted
205
581
  readDir(join(templatesDir, 'agents'), '.alpha-loop/templates/agents/');
206
582
  readDir(join(templatesDir, 'skills'), '.alpha-loop/templates/skills/');
207
583
  }
208
- sections.push(`## Your Task
584
+ sections.push(`## Your Process
585
+ 1. Read at least 5 prior runs (traces, scores, evolve log) before proposing a change
586
+ 2. Identify failure patterns and form hypotheses
587
+ 3. Compare traces from passing vs failing cases
588
+ 4. Propose targeted, additive changes (prefer adding info over changing flow)
589
+ 5. Explain your reasoning clearly
590
+
591
+ ## Key Lessons from Meta-Harness
592
+ - Additive changes are safer than structural rewrites (iteration 7 won by ADDING info, not changing flow)
593
+ - Prompt edits that modify control flow are high-risk (5 of 7 regressions came from these)
594
+ - If multiple prior changes regressed, the common factor is the problem (confound detection)
209
595
 
210
- Analyze the traces and scores above. Identify patterns in failures and propose specific
211
- changes to agent prompts, skills, or config that would improve the composite score.
596
+ ## Your Task
597
+
598
+ Analyze the traces, scores, and evolve log above. Identify patterns in failures and propose
599
+ specific changes that would improve the composite score.
212
600
 
213
601
  Output your proposed changes as a JSON array:
214
602
 
@@ -223,7 +611,7 @@ Output your proposed changes as a JSON array:
223
611
  \`\`\`
224
612
 
225
613
  Rules:
226
- - Only modify files under: ${ALLOWED_TARGETS.join(', ')}
614
+ - Only modify files under: ${targets.join(', ')}
227
615
  - Each change must include a clear reason
228
616
  - Focus on the highest-impact changes first
229
617
  - If no changes would help, output an empty array: []
@@ -257,14 +645,4 @@ export function parseProposedChanges(output) {
257
645
  return [];
258
646
  }
259
647
  }
260
- /**
261
- * Check if a proposed path is safe to modify.
262
- */
263
- export function isSafePath(filePath) {
264
- // Reject absolute paths and path traversal
265
- if (filePath.startsWith('/') || filePath.includes('..'))
266
- return false;
267
- // Must be in allowed targets: directory prefixes use startsWith, files use exact match
268
- return ALLOWED_TARGETS.some((prefix) => prefix.endsWith('/') ? filePath.startsWith(prefix) : filePath === prefix);
269
- }
270
648
  //# sourceMappingURL=evolve.js.map