evalbuff 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/README.md +79 -0
  2. package/dist/carve-features.d.ts +42 -0
  3. package/dist/carve-features.d.ts.map +1 -0
  4. package/dist/carve-features.js +305 -0
  5. package/dist/carve-features.js.map +1 -0
  6. package/dist/cli.d.ts +3 -0
  7. package/dist/cli.d.ts.map +1 -0
  8. package/dist/cli.js +42 -0
  9. package/dist/cli.js.map +1 -0
  10. package/dist/docs-refactor.d.ts +4 -0
  11. package/dist/docs-refactor.d.ts.map +1 -0
  12. package/dist/docs-refactor.js +122 -0
  13. package/dist/docs-refactor.js.map +1 -0
  14. package/dist/docs-writer.d.ts +4 -0
  15. package/dist/docs-writer.d.ts.map +1 -0
  16. package/dist/docs-writer.js +122 -0
  17. package/dist/docs-writer.js.map +1 -0
  18. package/dist/eval-helpers.d.ts +19 -0
  19. package/dist/eval-helpers.d.ts.map +1 -0
  20. package/dist/eval-helpers.js +327 -0
  21. package/dist/eval-helpers.js.map +1 -0
  22. package/dist/eval-runner.d.ts +42 -0
  23. package/dist/eval-runner.d.ts.map +1 -0
  24. package/dist/eval-runner.js +193 -0
  25. package/dist/eval-runner.js.map +1 -0
  26. package/dist/judge.d.ts +22 -0
  27. package/dist/judge.d.ts.map +1 -0
  28. package/dist/judge.js +284 -0
  29. package/dist/judge.js.map +1 -0
  30. package/dist/perfect-feature.d.ts +2 -0
  31. package/dist/perfect-feature.d.ts.map +1 -0
  32. package/dist/perfect-feature.js +666 -0
  33. package/dist/perfect-feature.js.map +1 -0
  34. package/dist/report.d.ts +31 -0
  35. package/dist/report.d.ts.map +1 -0
  36. package/dist/report.js +249 -0
  37. package/dist/report.js.map +1 -0
  38. package/dist/run-evalbuff.d.ts +12 -0
  39. package/dist/run-evalbuff.d.ts.map +1 -0
  40. package/dist/run-evalbuff.js +383 -0
  41. package/dist/run-evalbuff.js.map +1 -0
  42. package/dist/runners/claude.d.ts +10 -0
  43. package/dist/runners/claude.d.ts.map +1 -0
  44. package/dist/runners/claude.js +80 -0
  45. package/dist/runners/claude.js.map +1 -0
  46. package/dist/runners/codebuff.d.ts +24 -0
  47. package/dist/runners/codebuff.d.ts.map +1 -0
  48. package/dist/runners/codebuff.js +88 -0
  49. package/dist/runners/codebuff.js.map +1 -0
  50. package/dist/runners/codex.d.ts +8 -0
  51. package/dist/runners/codex.d.ts.map +1 -0
  52. package/dist/runners/codex.js +131 -0
  53. package/dist/runners/codex.js.map +1 -0
  54. package/dist/runners/index.d.ts +5 -0
  55. package/dist/runners/index.d.ts.map +1 -0
  56. package/dist/runners/index.js +4 -0
  57. package/dist/runners/index.js.map +1 -0
  58. package/dist/runners/runner.d.ts +11 -0
  59. package/dist/runners/runner.d.ts.map +1 -0
  60. package/dist/runners/runner.js +2 -0
  61. package/dist/runners/runner.js.map +1 -0
  62. package/dist/test-repo-utils.d.ts +21 -0
  63. package/dist/test-repo-utils.d.ts.map +1 -0
  64. package/dist/test-repo-utils.js +109 -0
  65. package/dist/test-repo-utils.js.map +1 -0
  66. package/dist/trace-compressor.d.ts +130 -0
  67. package/dist/trace-compressor.d.ts.map +1 -0
  68. package/dist/trace-compressor.js +680 -0
  69. package/dist/trace-compressor.js.map +1 -0
  70. package/dist/tui/data.d.ts +84 -0
  71. package/dist/tui/data.d.ts.map +1 -0
  72. package/dist/tui/data.js +80 -0
  73. package/dist/tui/data.js.map +1 -0
  74. package/dist/tui/events.d.ts +86 -0
  75. package/dist/tui/events.d.ts.map +1 -0
  76. package/dist/tui/events.js +52 -0
  77. package/dist/tui/events.js.map +1 -0
  78. package/dist/vendor/error.d.ts +18 -0
  79. package/dist/vendor/error.d.ts.map +1 -0
  80. package/dist/vendor/error.js +64 -0
  81. package/dist/vendor/error.js.map +1 -0
  82. package/dist/vendor/print-mode.d.ts +75 -0
  83. package/dist/vendor/print-mode.d.ts.map +1 -0
  84. package/dist/vendor/print-mode.js +2 -0
  85. package/dist/vendor/print-mode.js.map +1 -0
  86. package/package.json +46 -0
@@ -0,0 +1,666 @@
1
+ /**
2
+ * Perfect Feature — iteratively rebuild a single feature toward a 10/10 score.
3
+ *
4
+ * Unlike run-evalbuff (which runs many features and does holistic doc improvement),
5
+ * this script focuses on ONE feature and tries a series of doc strategies to help
6
+ * the rebuild agent achieve a perfect score — without giving away the answer.
7
+ *
8
+ * Strategies are applied in stages:
9
+ * Rounds 1-2: General design & style principles
10
+ * Rounds 3-4: Project knowledge (utilities, framework, common patterns)
11
+ * Rounds 5-6: Process instructions (e2e testing workflow, verification)
12
+ * Rounds 7+: Subagent instructions (spawn a critic/planner/reviewer)
13
+ *
14
+ * Usage:
15
+ * bun run src/perfect-feature.ts \
16
+ * --repo /path/to/repo \
17
+ * --features features.json \
18
+ * --feature-id my-feature-id \
19
+ * [--max-rounds 10] \
20
+ * [--coding-model sonnet] \
21
+ * [--judge-model opus] \
22
+ * [--init-command "npm install"]
23
+ */
24
+ import { execSync } from 'child_process';
25
+ import fs from 'fs';
26
+ import os from 'os';
27
+ import path from 'path';
28
+ import { ClaudeRunner } from './runners/claude';
29
+ import { applyCarveOperations, copyDocsIntoRepo, ensureGitIdentity, getDocsSnapshot, getGroundTruthDiff, computeDocsDiffText, syncDocsIntoRepo, truncateDiff, } from './eval-helpers';
30
+ // ---------------------------------------------------------------------------
31
+ // Doc improvement strategies — all available to the analyzer every round
32
+ // ---------------------------------------------------------------------------
33
// Strategy menu injected into every analyzer prompt. It enumerates the four
// categories of doc improvements the analyzer may draw on (design/style,
// project knowledge, process/verification, subagent self-review) and when each
// applies; the analyzer is told to pick whichever categories fit its diagnosis
// rather than being forced through them in a fixed order.
const ANALYZER_STRATEGY_GUIDE = `You have several categories of doc improvements available. Use whichever ones address the actual failure — often multiple categories apply at once. Use your judgment about which will have the most impact given the diagnosis.

### 1. Design & Style Principles
When to use: The agent's code works but doesn't match project conventions, or the agent made bad structural decisions.
- Code style conventions (naming, file organization, export patterns)
- UI/UX design principles the project follows (if applicable)
- Error handling patterns
- Type conventions and data modeling patterns
- How new features should be structured to match existing code

### 2. Project Knowledge (Utilities, Framework, Reusable Patterns)
When to use: The agent reinvented something that already exists, used the wrong abstraction, or didn't know about a key utility.
- Shared utility functions and where they live
- Framework abstractions (routing, state management, DB access, etc.)
- Common imports and their usage patterns
- Configuration and environment setup
- How existing features compose these building blocks

### 3. Process Instructions (Workflow, Verification, E2E Testing)
When to use: The agent produced code that doesn't build, doesn't pass tests, or has bugs it could have caught by testing.
- A step-by-step workflow: read docs → plan → implement → test → fix
- How to run and verify changes (build commands, test commands, dev server)
- E2E testing steps the agent should perform before declaring done
- How to check for common mistakes (missing imports, unregistered routes, etc.)
- A checklist of things to verify before finishing

### 4. Subagent & Self-Review Instructions
When to use: The agent's first-pass implementation has issues it could catch with a review step, or the task is complex enough to benefit from planning.
- Suggest the agent spawn a "critic" subagent to review its own work before finishing
- Suggest the agent spawn a "planner" subagent before starting implementation
- Suggest the agent re-read its own diff and look for issues
- Suggest the agent run the test suite and fix any failures before finishing
- Suggest the agent use a checklist-driven review process at the end`;
66
+ // ---------------------------------------------------------------------------
67
+ // Custom judge — flexible, allows better-than-ground-truth solutions
68
+ // ---------------------------------------------------------------------------
69
/**
 * Build the prompt for the "flexible" judge agent.
 *
 * The judge reviews the rebuild agent's diff against the task prompt, treating
 * the ground-truth diff only as one valid reference implementation (a
 * divergent but working solution may still score 10/10), and is instructed to
 * write its verdict to evalbuff-review-result.json — the file read back by
 * runFlexibleJudge.
 *
 * @param {{taskPrompt: string, agentDiff: string, groundTruthDiff: string, round: number}} input
 *   NOTE(review): callers appear to pass the 0-based loop index as `round`,
 *   so the first prompt says "round 0" — confirm that is intended.
 * @returns {string} the complete judge prompt
 */
function buildFlexibleJudgePrompt(input) {
    const { taskPrompt, agentDiff: rawAgentDiff, groundTruthDiff: rawGroundTruthDiff, round } = input;
    // Both diffs go through truncateDiff before interpolation — presumably to
    // keep the prompt within context limits; see eval-helpers for the cap.
    const agentDiff = truncateDiff(rawAgentDiff);
    const groundTruthDiff = truncateDiff(rawGroundTruthDiff);
    return `You are a senior engineer performing a thorough code review with hands-on E2E testing.

## Your Mission

An AI coding agent was given a task and produced changes. You must judge how well it did.

**CRITICAL: The ground truth diff below is just ONE valid implementation — a reference, not the answer key.**
The agent's solution may be DIFFERENT from the ground truth and still be PERFECT (10/10).
The agent's solution may even be BETTER than the ground truth.
Do NOT penalize the agent for:
- Using different variable names, file structure, or code organization
- Taking a different architectural approach that achieves the same result
- Adding extra features, tests, or error handling beyond what was asked
- Using different libraries or utilities to accomplish the same thing

DO penalize the agent for:
- Missing functionality (the feature doesn't work or is incomplete)
- Bugs (runtime errors, logic errors, broken edge cases)
- Build/type errors
- Not following the project's existing conventions (if docs describe them)
- Leaving dead code, TODO comments, or unfinished scaffolding

## How to Judge

1. **Read the project docs** (docs/, AGENTS.md, CLAUDE.md) to understand conventions
2. **Review the agent's diff** for completeness and correctness
3. **Actually test the changes end-to-end:**
- Run the build/compile step
- Run the test suite
- Start the dev server if applicable
- Exercise the feature manually (browser tools, curl, CLI)
- Check logs for errors
- Test edge cases
4. **Compare against ground truth** only to understand what SHOULD work, not to require identical code
5. **Write your judgment** to evalbuff-review-result.json

## User Prompt (What the agent was asked to do)
${taskPrompt}

## Ground Truth (One valid reference implementation — NOT the required approach)
\`\`\`diff
${groundTruthDiff}
\`\`\`

## Agent's Changes
\`\`\`diff
${agentDiff || '(No changes made)'}
\`\`\`

## Scoring Guide

- **10/10**: Feature works completely. Builds, passes tests, works end-to-end. May differ from ground truth.
- **8-9/10**: Feature mostly works but has minor issues (cosmetic bugs, missing edge case, slight convention mismatch).
- **6-7/10**: Core feature works but significant issues (broken edge cases, missing pieces, convention violations).
- **4-5/10**: Partially working — some functionality present but major gaps.
- **1-3/10**: Barely functional or fundamentally broken.
- **0/10**: Nothing useful produced.

## Required Output

Write your judgment to \`evalbuff-review-result.json\`:

\`\`\`json
{
"analysis": "Detailed analysis of what you tested and found...",
"strengths": ["strength 1", "strength 2"],
"weaknesses": ["weakness 1", "weakness 2"],
"e2eTestsPerformed": ["Test 1", "Test 2"],
"completionScore": 8,
"codeQualityScore": 9,
"e2eScore": 7,
"overallScore": 8,
"docSuggestions": ["Suggestion 1", "Suggestion 2"]
}
\`\`\`

## Documentation Suggestions

This is round ${round} of an iterative improvement process. Based on what you find, suggest doc changes that would help a coding agent do better WITHOUT giving away the specific implementation.

Good: "Document that all route handlers must be registered in src/routes/index.ts"
Bad: "Tell the agent to add a UserProfile route to src/routes/index.ts"

Focus on GENERAL PATTERNS that would help with ANY feature, not just this one.

IMPORTANT: You MUST write the result file. Do it as your very last action.`;
}
160
/**
 * Run the flexible judge agent inside `repoDir` and collect its verdict.
 *
 * The judge is expected to write evalbuff-review-result.json into the repo;
 * that file is parsed and normalized (missing lists become [], missing
 * per-dimension scores fall back to the overall score, then to 0). A runner
 * crash or a missing/unparsable result file yields a zero-score verdict
 * rather than throwing.
 *
 * @param {string} repoDir - working tree the judge operates in
 * @param {{taskPrompt: string, agentDiff: string, groundTruthDiff: string, round: number}} input
 * @param {string} model - model name handed to ClaudeRunner
 * @returns {Promise<object>} normalized judging result
 */
async function runFlexibleJudge(repoDir, input, model) {
    console.log(` [Judge] Running flexible Claude judge (${model})...`);
    const judge = new ClaudeRunner(repoDir, {}, model, 'high');
    try {
        // buildFlexibleJudgePrompt is pure string assembly, so building it
        // here (after the log line) does not change observable behavior.
        await judge.run(buildFlexibleJudgePrompt(input));
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(` [Judge] Runner failed: ${msg.slice(0, 200)}`);
    }
    const resultPath = path.join(repoDir, 'evalbuff-review-result.json');
    try {
        if (fs.existsSync(resultPath)) {
            const raw = JSON.parse(fs.readFileSync(resultPath, 'utf-8'));
            const asList = (value) => (Array.isArray(value) ? value : []);
            const asScore = (value, fallback) => (typeof value === 'number' ? value : fallback);
            // Per-dimension scores inherit the overall score when absent.
            const defaultScore = raw.overallScore ?? 0;
            return {
                analysis: raw.analysis || 'No analysis',
                strengths: asList(raw.strengths),
                weaknesses: asList(raw.weaknesses),
                e2eTestsPerformed: asList(raw.e2eTestsPerformed),
                completionScore: asScore(raw.completionScore, defaultScore),
                codeQualityScore: asScore(raw.codeQualityScore, defaultScore),
                e2eScore: asScore(raw.e2eScore, defaultScore),
                overallScore: asScore(raw.overallScore, 0),
                // Deliberately undefined (not []) when absent — callers test
                // for presence with `judging.docSuggestions || []`.
                docSuggestions: Array.isArray(raw.docSuggestions) ? raw.docSuggestions : undefined,
            };
        }
    }
    catch (err) {
        console.warn(` [Judge] Failed to parse result: ${err}`);
    }
    // No result file, or it was unreadable: report total failure.
    return {
        analysis: 'Judge failed to produce result file',
        strengths: [],
        weaknesses: ['Judge failed'],
        e2eTestsPerformed: [],
        completionScore: 0,
        codeQualityScore: 0,
        e2eScore: 0,
        overallScore: 0,
    };
}
203
+ // ---------------------------------------------------------------------------
204
+ // Analyzer — diagnoses WHY score isn't 10/10 and suggests doc improvements
205
+ // ---------------------------------------------------------------------------
206
/**
 * Build the prompt for the analyzer agent.
 *
 * The analyzer sees the task, the agent's (truncated) diff, the judge's
 * feedback, the ground-truth diff, all previous diagnoses (so it does not
 * repeat itself), the current docs snapshot, and the strategy guide. It is
 * instructed to diagnose the root cause and write general — never
 * answer-revealing — doc suggestions to analyzer-result.json, which
 * runAnalyzer reads back.
 *
 * @param {{taskPrompt: string, agentDiff: string, groundTruthDiff: string,
 *          judging: object, round: number, previousDiagnoses: string[],
 *          currentDocs: Record<string, string>}} input
 * @returns {string} the complete analyzer prompt
 */
function buildAnalyzerPrompt(input) {
    const { taskPrompt, agentDiff: rawAgentDiff, groundTruthDiff: rawGroundTruthDiff, judging, round, previousDiagnoses, currentDocs } = input;
    // Diffs are truncated before interpolation, same as in the judge prompt.
    const agentDiff = truncateDiff(rawAgentDiff);
    const groundTruthDiff = truncateDiff(rawGroundTruthDiff);
    // Earlier rounds' diagnoses are replayed so the analyzer finds new angles
    // instead of re-suggesting what has already been tried.
    const prevSection = previousDiagnoses.length > 0
        ? `## Previous Diagnoses (what we already tried)\n${previousDiagnoses.map((d, i) => `Round ${i + 1}: ${d}`).join('\n\n')}\n\nDo NOT repeat suggestions that were already tried. Find NEW angles.`
        : '';
    const docsSection = Object.keys(currentDocs).length > 0
        ? `## Current Documentation\n${Object.entries(currentDocs).map(([f, c]) => `### ${f}\n${c}`).join('\n\n')}`
        : '## Current Documentation\n(No docs exist yet)';
    return `You are an expert at analyzing why an AI coding agent failed to perfectly implement a feature, and at writing documentation that would help it succeed next time — WITHOUT giving away the specific answer.

## Context

A coding agent was asked to implement a feature. It scored ${judging.overallScore}/10. This is round ${round} of an iterative improvement process. Your job is to figure out WHY it didn't get 10/10 and suggest documentation changes that would help it (or any agent) do better.

**CRITICAL RULES:**
1. Your doc suggestions must be GENERAL — they should help an agent build ANY feature, not just this one.
2. NEVER include the specific implementation, specific file contents, or specific code that the agent should write.
3. DO document patterns, conventions, architectural rules, utility functions, and workflows.
4. Think about what KNOWLEDGE GAP caused the failure, then fill that gap with general knowledge.

## The Task
${taskPrompt}

## Agent's Attempt (scored ${judging.overallScore}/10)
\`\`\`diff
${agentDiff || '(No changes)'}
\`\`\`

## Judge's Feedback
**Analysis:** ${judging.analysis}
**Strengths:** ${judging.strengths.join(', ') || 'None listed'}
**Weaknesses:** ${judging.weaknesses.join(', ') || 'None listed'}
**E2E tests performed:** ${judging.e2eTestsPerformed.join(', ') || 'None'}
**Judge's doc suggestions:** ${judging.docSuggestions?.join('\n- ') || 'None'}

## Ground Truth (reference only — the agent should NOT be told this)
\`\`\`diff
${groundTruthDiff}
\`\`\`

${prevSection}

${docsSection}

## Available Improvement Strategies

${ANALYZER_STRATEGY_GUIDE}

## Your Output

Diagnose the root cause, then pick whichever strategies (one or more) best address the failure. Write your result to \`analyzer-result.json\`:

\`\`\`json
{
"diagnosis": "A 2-3 sentence explanation of the root cause — what knowledge gap or process failure led to the imperfect score",
"docSuggestions": [
"Each suggestion should specify which file to create/update AND include the full content. E.g.: 'Create docs/routing.md: All routes must be registered in src/routes/index.ts by calling registerRoute()...'",
"Use whichever strategy categories are most relevant to the actual failure"
]
}
\`\`\`

Remember: The goal is to make docs that help an agent build ANY feature perfectly, not to encode the answer to THIS specific feature. If the agent's failure was highly specific and can't be generalized, say so in your diagnosis and provide minimal/no suggestions.

IMPORTANT: You MUST write analyzer-result.json. Do it as your very last action.`;
}
274
/**
 * Run the analyzer agent inside `repoDir` and collect its diagnosis.
 *
 * The analyzer is expected to write analyzer-result.json into the repo; the
 * file is parsed and normalized. A runner crash or a missing/unparsable
 * result file yields a placeholder diagnosis with no suggestions rather than
 * throwing.
 *
 * @param {string} repoDir - working tree the analyzer operates in
 * @param {object} input - see buildAnalyzerPrompt
 * @param {string} model - model name handed to ClaudeRunner
 * @returns {Promise<{diagnosis: string, docSuggestions: string[]}>}
 */
async function runAnalyzer(repoDir, input, model) {
    console.log(` [Analyzer] Diagnosing failure (${model})...`);
    const analyzer = new ClaudeRunner(repoDir, {}, model, 'high');
    try {
        // Prompt construction is pure, so building it inline is equivalent
        // to precomputing it before the log line.
        await analyzer.run(buildAnalyzerPrompt(input));
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(` [Analyzer] Runner failed: ${reason.slice(0, 200)}`);
    }
    const outFile = path.join(repoDir, 'analyzer-result.json');
    try {
        if (fs.existsSync(outFile)) {
            const parsed = JSON.parse(fs.readFileSync(outFile, 'utf-8'));
            return {
                diagnosis: parsed.diagnosis || 'No diagnosis produced',
                docSuggestions: Array.isArray(parsed.docSuggestions) ? parsed.docSuggestions : [],
            };
        }
    }
    catch (err) {
        console.warn(` [Analyzer] Failed to parse result: ${err}`);
    }
    // Missing or unreadable result file: report failure, suggest nothing.
    return { diagnosis: 'Analyzer failed to produce results', docSuggestions: [] };
}
300
+ // ---------------------------------------------------------------------------
301
+ // Docs writer — applies suggestions from the analyzer
302
+ // ---------------------------------------------------------------------------
303
/**
 * Apply the collected doc suggestions to the canonical repo's docs.
 *
 * Works in a throwaway clone: clones `repoPath` at its current HEAD, syncs
 * the live docs in, lets a ClaudeRunner agent edit ONLY docs/, AGENTS.md, or
 * CLAUDE.md per the suggestions, then syncs the (possibly updated) docs back
 * into `repoPath`. Best-effort: failures are logged and swallowed, and the
 * temp clone is always removed.
 *
 * @param {string} repoPath - canonical git repo whose docs get updated
 * @param {string[]} suggestions - doc-change instructions; no-op when empty
 * @param {string} model - model name handed to ClaudeRunner
 */
async function runDocsWriter(repoPath, suggestions, model) {
    if (suggestions.length === 0) {
        console.log(` [DocsWriter] No suggestions to apply, skipping.`);
        return;
    }
    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-'));
    const repoDir = path.join(tempDir, 'repo');
    const prompt = `You are a documentation writer for a coding project. Your job is to update the project docs to help AI coding agents build features successfully.

## Suggestions to Apply

${suggestions.map((s, i) => `${i + 1}. ${s}`).join('\n')}

## Rules

1. ONLY modify files in docs/, AGENTS.md, or CLAUDE.md. Do NOT modify source code.
2. Each suggestion tells you which file to create or update — follow those instructions.
3. If a suggestion says to update an existing file, make targeted edits rather than rewriting.
4. If multiple suggestions overlap, merge them into one cohesive doc.
5. Keep docs concise and actionable. Dense information beats verbose explanations.
6. Before documenting any function or file path, grep to confirm it exists.
7. Never document aspirational/future behavior — only what exists NOW.
8. Remove or update any existing docs that conflict with the new information.

## Verification

After making changes, read back each modified file to verify it's coherent and accurate.`;
    try {
        // Clone without a checkout, then explicitly check out the source
        // repo's current HEAD so the temp tree matches repoPath exactly.
        execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' });
        const headSha = execSync('git rev-parse HEAD', { cwd: repoPath, encoding: 'utf-8' }).trim();
        execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' });
        // Bring in any uncommitted doc state, run the writer, then push the
        // edited docs back to the canonical repo.
        syncDocsIntoRepo(repoPath, repoDir);
        const runner = new ClaudeRunner(repoDir, {}, model, 'high');
        await runner.run(prompt);
        syncDocsIntoRepo(repoDir, repoPath);
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(` [DocsWriter] Failed: ${msg.slice(0, 200)}`);
    }
    finally {
        try {
            fs.rmSync(tempDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
    }
}
350
+ // ---------------------------------------------------------------------------
351
+ // Single rebuild + judge cycle
352
+ // ---------------------------------------------------------------------------
353
/**
 * One rebuild + judge cycle in a throwaway clone.
 *
 * Clones `repoPath` at HEAD, applies the feature's carve operations (removing
 * the feature from the tree) and commits that carved state, copies the
 * current docs in, optionally runs an init command (e.g. dependency install),
 * then asks the coding agent to rebuild the feature and the flexible judge
 * to score the result. The temp clone is always deleted via the finally
 * block, even on error.
 *
 * @param {{repoPath: string, feature: object, groundTruthDiff: string,
 *          round: number, codingModel: string, judgeModel: string,
 *          initCommand?: string}} opts
 * @returns {Promise<{judging: object, diff: string, costEstimate: number}>}
 *   judge verdict, the agent's diff, and the runner-reported cost in USD.
 *   NOTE(review): costEstimate is result.totalCostUsd from the coding runner
 *   only — the judge's cost does not appear to be counted; confirm intended.
 *   When the coding agent itself throws, a zero-score verdict with an empty
 *   diff and zero cost is returned instead.
 */
async function runRebuildAndJudge(opts) {
    const { repoPath, feature, groundTruthDiff, round, codingModel, judgeModel, initCommand } = opts;
    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-perfect-'));
    const repoDir = path.join(tempDir, 'repo');
    try {
        // Clone and carve: check out the source repo's HEAD, then remove the
        // feature and commit so the agent's later diff is only its own work.
        execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' });
        const headSha = execSync('git rev-parse HEAD', { cwd: repoPath, encoding: 'utf-8' }).trim();
        execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' });
        ensureGitIdentity(repoDir);
        applyCarveOperations(repoDir, feature.operations);
        execSync('git add -A', { cwd: repoDir, stdio: 'ignore' });
        execSync(`git commit -m "carve: remove ${feature.id}" --allow-empty`, { cwd: repoDir, stdio: 'ignore' });
        // Copy docs — includes any improvements made in earlier rounds.
        copyDocsIntoRepo(repoPath, repoDir);
        // Init command (best-effort, capped at 120s).
        if (initCommand) {
            try {
                execSync(initCommand, { cwd: repoDir, stdio: 'ignore', timeout: 120000 });
            }
            catch (e) {
                console.warn(` [Rebuild] Init command failed: ${e}`);
            }
        }
        // Run rebuild agent
        console.log(` [Rebuild] Round ${round}: Running claude (${codingModel})...`);
        const runner = new ClaudeRunner(repoDir, {}, codingModel, 'medium');
        let result;
        try {
            result = await runner.run(feature.prompt);
        }
        catch (err) {
            // Agent crash: synthesize a zero-score verdict so the caller's
            // loop keeps running instead of aborting the whole session.
            const msg = err instanceof Error ? err.message : String(err);
            return {
                judging: {
                    analysis: `Agent failed: ${msg.slice(0, 500)}`,
                    strengths: [],
                    weaknesses: ['Agent failed'],
                    e2eTestsPerformed: [],
                    completionScore: 0,
                    codeQualityScore: 0,
                    e2eScore: 0,
                    overallScore: 0,
                },
                diff: '',
                costEstimate: 0,
            };
        }
        // Judge the agent's diff inside the same working tree it modified.
        const judging = await runFlexibleJudge(repoDir, {
            taskPrompt: feature.prompt,
            agentDiff: result.diff,
            groundTruthDiff,
            round,
        }, judgeModel);
        return {
            judging,
            diff: result.diff,
            costEstimate: result.totalCostUsd,
        };
    }
    finally {
        try {
            fs.rmSync(tempDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
    }
}
421
+ // ---------------------------------------------------------------------------
422
+ // Main loop
423
+ // ---------------------------------------------------------------------------
424
/**
 * Main loop: repeatedly rebuild one feature, judge it, diagnose the failure,
 * and improve the repo docs until a perfect score or opts.maxRounds.
 *
 * Per round: snapshot docs → runRebuildAndJudge → log artifacts under a
 * per-round directory → stop on a 10/10 score; otherwise run the analyzer
 * against a fresh clone, merge its doc suggestions with the judge's, apply
 * them via runDocsWriter, and record the outcome. A markdown report is
 * written to the log dir at the end. Rounds are 0-based throughout (logs,
 * directories, and the `round` value interpolated into prompts).
 *
 * @param {{repoPath: string, featuresPath: string, featureId: string,
 *          maxRounds: number, codingModel: string, judgeModel: string,
 *          analyzerModel: string, docsModel: string, initCommand?: string}} opts
 */
async function perfectFeature(opts) {
    const startTime = new Date().toISOString();
    // Timestamped log dir; ':' is stripped for filesystem compatibility.
    const logDir = path.join(os.tmpdir(), `evalbuff-perfect-${opts.featureId}-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`);
    fs.mkdirSync(logDir, { recursive: true });
    console.log(`\nPerfect Feature`);
    console.log(` Repo: ${opts.repoPath}`);
    console.log(` Feature: ${opts.featureId}`);
    console.log(` Max rounds: ${opts.maxRounds}`);
    console.log(` Coding model: ${opts.codingModel}`);
    console.log(` Judge model: ${opts.judgeModel}`);
    console.log(` Analyzer model: ${opts.analyzerModel}`);
    console.log(` Docs model: ${opts.docsModel}`);
    console.log(` Log dir: ${logDir}`);
    // Load feature; exit with the available ids when the requested one is absent.
    const allFeatures = JSON.parse(fs.readFileSync(opts.featuresPath, 'utf-8'));
    const feature = allFeatures.find(f => f.id === opts.featureId);
    if (!feature) {
        const ids = allFeatures.map(f => f.id).join(', ');
        console.error(`Feature "${opts.featureId}" not found. Available: ${ids}`);
        process.exit(1);
    }
    const groundTruthDiff = getGroundTruthDiff(feature);
    fs.writeFileSync(path.join(logDir, 'feature.json'), JSON.stringify(feature, null, 2));
    fs.writeFileSync(path.join(logDir, 'ground-truth.diff'), groundTruthDiff);
    const outcomes = [];
    const diagnoses = [];
    let totalCost = 0;
    let bestScore = 0;
    for (let round = 0; round < opts.maxRounds; round++) {
        console.log(`\n${'='.repeat(60)}`);
        console.log(`ROUND ${round}`);
        console.log(`${'='.repeat(60)}`);
        // Save docs state before this round (also used later to detect whether
        // the docs writer actually changed anything).
        const docsBefore = getDocsSnapshot(opts.repoPath);
        fs.writeFileSync(path.join(logDir, `docs-before-round-${round}.json`), JSON.stringify(docsBefore, null, 2));
        // Run rebuild + judge
        const { judging, diff, costEstimate } = await runRebuildAndJudge({
            repoPath: opts.repoPath,
            feature,
            groundTruthDiff,
            round,
            codingModel: opts.codingModel,
            judgeModel: opts.judgeModel,
            initCommand: opts.initCommand,
        });
        totalCost += costEstimate;
        const score = judging.overallScore;
        if (score > bestScore)
            bestScore = score;
        console.log(`\n Score: ${score}/10 (best: ${bestScore}/10)`);
        console.log(` Strengths: ${judging.strengths.join('; ') || 'none'}`);
        console.log(` Weaknesses: ${judging.weaknesses.join('; ') || 'none'}`);
        // Save round results
        const roundDir = path.join(logDir, `round-${round}`);
        fs.mkdirSync(roundDir, { recursive: true });
        fs.writeFileSync(path.join(roundDir, 'judging.json'), JSON.stringify(judging, null, 2));
        fs.writeFileSync(path.join(roundDir, 'diff.txt'), diff);
        fs.writeFileSync(path.join(roundDir, 'score.txt'), score.toString());
        // Check for perfection — a 10/10 ends the loop immediately.
        if (score >= 10) {
            console.log(`\n PERFECT SCORE achieved in round ${round}!`);
            outcomes.push({ round, score, judging, diff, diagnosis: '', docsChanged: false, costEstimate });
            break;
        }
        // Analyze failure in a clean clone of the canonical repo (so the
        // analyzer can inspect real source, not the carved rebuild tree).
        const analyzerRepoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-analyzer-'));
        const analyzerRepo = path.join(analyzerRepoDir, 'repo');
        try {
            execSync(`git clone --no-checkout "${opts.repoPath}" "${analyzerRepo}"`, { stdio: 'ignore' });
            const headSha = execSync('git rev-parse HEAD', { cwd: opts.repoPath, encoding: 'utf-8' }).trim();
            execSync(`git checkout ${headSha}`, { cwd: analyzerRepo, stdio: 'ignore' });
        }
        catch { /* ignore clone errors */ }
        const analysis = await runAnalyzer(analyzerRepo, {
            taskPrompt: feature.prompt,
            agentDiff: diff,
            groundTruthDiff,
            judging,
            round,
            previousDiagnoses: diagnoses,
            currentDocs: docsBefore,
        }, opts.analyzerModel);
        try {
            fs.rmSync(analyzerRepoDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
        diagnoses.push(analysis.diagnosis);
        console.log(`\n Diagnosis: ${analysis.diagnosis}`);
        console.log(` Suggestions: ${analysis.docSuggestions.length}`);
        fs.writeFileSync(path.join(roundDir, 'diagnosis.json'), JSON.stringify(analysis, null, 2));
        // Combine analyzer suggestions with judge suggestions
        const allSuggestions = [
            ...analysis.docSuggestions,
            ...(judging.docSuggestions || []),
        ];
        // Apply doc improvements; docsChanged is detected by diffing the
        // before/after snapshots, not by trusting the writer's exit status.
        let docsChanged = false;
        if (allSuggestions.length > 0) {
            console.log(`\n Applying ${allSuggestions.length} doc suggestions...`);
            await runDocsWriter(opts.repoPath, allSuggestions, opts.docsModel);
            const docsAfter = getDocsSnapshot(opts.repoPath);
            const docsDiff = computeDocsDiffText(docsBefore, docsAfter);
            docsChanged = docsDiff.trim().length > 0;
            fs.writeFileSync(path.join(roundDir, 'docs-diff.txt'), docsDiff);
            fs.writeFileSync(path.join(roundDir, 'docs-after.json'), JSON.stringify(docsAfter, null, 2));
            if (docsChanged) {
                console.log(` Docs updated.`);
            }
            else {
                console.log(` Docs writer ran but made no changes.`);
            }
        }
        outcomes.push({ round, score, judging, diff, diagnosis: analysis.diagnosis, docsChanged, costEstimate });
        // If we've been stuck at the same score for 3 rounds, skip ahead in strategy
        // (currently advisory only — it prints a hint but does not change behavior).
        if (outcomes.length >= 3) {
            const lastThree = outcomes.slice(-3);
            const allSameScore = lastThree.every(o => o.score === lastThree[0].score);
            if (allSameScore && !docsChanged) {
                console.log(`\n Stuck at ${score}/10 for 3 rounds. Consider trying a different approach.`);
            }
        }
    }
    // Write final report
    const endTime = new Date().toISOString();
    const finalDocs = getDocsSnapshot(opts.repoPath);
    const report = generateReport(opts, outcomes, totalCost, startTime, endTime, finalDocs);
    fs.writeFileSync(path.join(logDir, 'report.md'), report);
    console.log(`\n${'='.repeat(60)}`);
    console.log('PERFECT FEATURE RUN COMPLETE');
    console.log(`${'='.repeat(60)}`);
    console.log(` Feature: ${opts.featureId}`);
    console.log(` Rounds: ${outcomes.length}`);
    console.log(` Score progression: ${outcomes.map(o => o.score.toFixed(1)).join(' → ')}`);
    console.log(` Best score: ${bestScore}/10`);
    console.log(` Total cost: $${totalCost.toFixed(2)}`);
    console.log(` Log dir: ${logDir}`);
    console.log(` Report: ${path.join(logDir, 'report.md')}`);
}
562
+ // ---------------------------------------------------------------------------
563
+ // Report generation
564
+ // ---------------------------------------------------------------------------
565
/**
 * Render a markdown report summarizing a perfect-feature run.
 *
 * Sections: an overview table, an ASCII score-progression chart, a detail
 * section per round (scores, analysis, strengths/weaknesses, diagnosis,
 * docs-changed flag, cost), and the final documentation snapshot.
 *
 * Pure function — no I/O; the caller writes the returned string to disk.
 *
 * @param {object} opts - run options (featureId, repoPath, codingModel, judgeModel)
 * @param {Array<object>} outcomes - per-round outcome records; may be empty
 * @param {number} totalCost - accumulated cost in USD
 * @param {string} startTime - ISO timestamp of run start
 * @param {string} endTime - ISO timestamp of run end
 * @param {Record<string, string>} finalDocs - doc filename → content snapshot
 * @returns {string} the report as a single markdown string
 */
function generateReport(opts, outcomes, totalCost, startTime, endTime, finalDocs) {
    const L = [];
    L.push('# Perfect Feature Report', '');
    L.push('## Overview', '');
    L.push(`| | |`);
    L.push(`|---|---|`);
    L.push(`| **Feature** | ${opts.featureId} |`);
    L.push(`| **Repo** | \`${opts.repoPath}\` |`);
    L.push(`| **Start** | ${startTime} |`);
    L.push(`| **End** | ${endTime} |`);
    L.push(`| **Rounds** | ${outcomes.length} |`);
    // Guard the empty case: Math.max() with no arguments returns -Infinity,
    // which would render as "| **Best score** | -Infinity/10 |".
    const bestScore = outcomes.length > 0 ? Math.max(...outcomes.map(o => o.score)) : 0;
    L.push(`| **Best score** | ${bestScore}/10 |`);
    L.push(`| **Total cost** | $${totalCost.toFixed(2)} |`);
    L.push(`| **Coding model** | ${opts.codingModel} |`);
    L.push(`| **Judge model** | ${opts.judgeModel} |`);
    L.push('');
    // Score progression — one bar per round, two blocks per score point.
    L.push('## Score Progression', '');
    L.push('```');
    for (const o of outcomes) {
        const bar = '█'.repeat(Math.round(o.score * 2));
        L.push(`Round ${o.round.toString().padStart(2)} ${o.score.toFixed(1).padStart(5)}/10 ${bar}`);
    }
    L.push('```', '');
    // Per-round detail
    for (const o of outcomes) {
        L.push(`## Round ${o.round} — ${o.score.toFixed(1)}/10`, '');
        L.push(`| Completion | Code Quality | E2E | Overall |`);
        L.push(`|---|---|---|---|`);
        L.push(`| ${o.judging.completionScore} | ${o.judging.codeQualityScore} | ${o.judging.e2eScore} | ${o.judging.overallScore} |`);
        L.push('');
        L.push(`**Analysis:** ${o.judging.analysis}`, '');
        if (o.judging.strengths.length > 0) {
            L.push('**Strengths:**');
            for (const s of o.judging.strengths)
                L.push(`- ${s}`);
            L.push('');
        }
        if (o.judging.weaknesses.length > 0) {
            L.push('**Weaknesses:**');
            for (const w of o.judging.weaknesses)
                L.push(`- ${w}`);
            L.push('');
        }
        // Diagnosis is empty for a perfect round (the loop breaks before analysis).
        if (o.diagnosis) {
            L.push(`**Diagnosis:** ${o.diagnosis}`, '');
        }
        L.push(`**Docs changed:** ${o.docsChanged ? 'Yes' : 'No'}`);
        L.push(`**Cost:** $${o.costEstimate.toFixed(2)}`, '');
    }
    // Final docs — embedded verbatim inside markdown fences, sorted by filename.
    const docKeys = Object.keys(finalDocs).sort();
    if (docKeys.length > 0) {
        L.push('## Final Documentation', '');
        for (const key of docKeys) {
            L.push(`### ${key}`, '');
            L.push('```markdown');
            L.push(finalDocs[key]);
            L.push('```', '');
        }
    }
    return L.join('\n');
}
628
+ // ---------------------------------------------------------------------------
629
+ // CLI
630
+ // ---------------------------------------------------------------------------
631
// CLI entry point (Bun's import.meta.main guard: only runs when this file is
// executed directly, not when imported).
if (import.meta.main) {
    const args = process.argv.slice(2);
    // Return the value following `--name`, else the default if provided,
    // else throw for a missing required argument.
    const getArg = (name, defaultValue) => {
        const idx = args.indexOf(`--${name}`);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        if (defaultValue !== undefined)
            return defaultValue;
        throw new Error(`Missing required argument: --${name}`);
    };
    const hasArg = (name) => args.includes(`--${name}`);
    const repoPath = getArg('repo');
    const featuresPath = getArg('features');
    const featureId = getArg('feature-id');
    // Parse with an explicit radix and reject non-numeric input up front:
    // a NaN maxRounds would otherwise silently run zero rounds, because
    // `round < NaN` is always false.
    const maxRounds = Number.parseInt(getArg('max-rounds', '10'), 10);
    if (Number.isNaN(maxRounds)) {
        console.error('Invalid --max-rounds value: expected an integer.');
        process.exit(1);
    }
    const codingModel = getArg('coding-model', 'sonnet');
    const judgeModel = getArg('judge-model', 'opus');
    const analyzerModel = getArg('analyzer-model', 'opus');
    const docsModel = getArg('docs-model', 'opus');
    // Optional: only look up the value when the flag is actually present,
    // so getArg's "required" throw is never triggered for it.
    const initCommand = hasArg('init-command') ? getArg('init-command') : undefined;
    perfectFeature({
        repoPath,
        featuresPath,
        featureId,
        maxRounds,
        codingModel,
        judgeModel,
        analyzerModel,
        docsModel,
        initCommand,
    }).catch((error) => {
        console.error('Perfect feature run failed:', error);
        process.exit(1);
    });
}
666
+ //# sourceMappingURL=perfect-feature.js.map