evalbuff 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -0
- package/dist/carve-features.d.ts +42 -0
- package/dist/carve-features.d.ts.map +1 -0
- package/dist/carve-features.js +305 -0
- package/dist/carve-features.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +42 -0
- package/dist/cli.js.map +1 -0
- package/dist/docs-refactor.d.ts +4 -0
- package/dist/docs-refactor.d.ts.map +1 -0
- package/dist/docs-refactor.js +122 -0
- package/dist/docs-refactor.js.map +1 -0
- package/dist/docs-writer.d.ts +4 -0
- package/dist/docs-writer.d.ts.map +1 -0
- package/dist/docs-writer.js +122 -0
- package/dist/docs-writer.js.map +1 -0
- package/dist/eval-helpers.d.ts +19 -0
- package/dist/eval-helpers.d.ts.map +1 -0
- package/dist/eval-helpers.js +327 -0
- package/dist/eval-helpers.js.map +1 -0
- package/dist/eval-runner.d.ts +42 -0
- package/dist/eval-runner.d.ts.map +1 -0
- package/dist/eval-runner.js +193 -0
- package/dist/eval-runner.js.map +1 -0
- package/dist/judge.d.ts +22 -0
- package/dist/judge.d.ts.map +1 -0
- package/dist/judge.js +284 -0
- package/dist/judge.js.map +1 -0
- package/dist/perfect-feature.d.ts +2 -0
- package/dist/perfect-feature.d.ts.map +1 -0
- package/dist/perfect-feature.js +666 -0
- package/dist/perfect-feature.js.map +1 -0
- package/dist/report.d.ts +31 -0
- package/dist/report.d.ts.map +1 -0
- package/dist/report.js +249 -0
- package/dist/report.js.map +1 -0
- package/dist/run-evalbuff.d.ts +12 -0
- package/dist/run-evalbuff.d.ts.map +1 -0
- package/dist/run-evalbuff.js +383 -0
- package/dist/run-evalbuff.js.map +1 -0
- package/dist/runners/claude.d.ts +10 -0
- package/dist/runners/claude.d.ts.map +1 -0
- package/dist/runners/claude.js +80 -0
- package/dist/runners/claude.js.map +1 -0
- package/dist/runners/codebuff.d.ts +24 -0
- package/dist/runners/codebuff.d.ts.map +1 -0
- package/dist/runners/codebuff.js +88 -0
- package/dist/runners/codebuff.js.map +1 -0
- package/dist/runners/codex.d.ts +8 -0
- package/dist/runners/codex.d.ts.map +1 -0
- package/dist/runners/codex.js +131 -0
- package/dist/runners/codex.js.map +1 -0
- package/dist/runners/index.d.ts +5 -0
- package/dist/runners/index.d.ts.map +1 -0
- package/dist/runners/index.js +4 -0
- package/dist/runners/index.js.map +1 -0
- package/dist/runners/runner.d.ts +11 -0
- package/dist/runners/runner.d.ts.map +1 -0
- package/dist/runners/runner.js +2 -0
- package/dist/runners/runner.js.map +1 -0
- package/dist/test-repo-utils.d.ts +21 -0
- package/dist/test-repo-utils.d.ts.map +1 -0
- package/dist/test-repo-utils.js +109 -0
- package/dist/test-repo-utils.js.map +1 -0
- package/dist/trace-compressor.d.ts +130 -0
- package/dist/trace-compressor.d.ts.map +1 -0
- package/dist/trace-compressor.js +680 -0
- package/dist/trace-compressor.js.map +1 -0
- package/dist/tui/data.d.ts +84 -0
- package/dist/tui/data.d.ts.map +1 -0
- package/dist/tui/data.js +80 -0
- package/dist/tui/data.js.map +1 -0
- package/dist/tui/events.d.ts +86 -0
- package/dist/tui/events.d.ts.map +1 -0
- package/dist/tui/events.js +52 -0
- package/dist/tui/events.js.map +1 -0
- package/dist/vendor/error.d.ts +18 -0
- package/dist/vendor/error.d.ts.map +1 -0
- package/dist/vendor/error.js +64 -0
- package/dist/vendor/error.js.map +1 -0
- package/dist/vendor/print-mode.d.ts +75 -0
- package/dist/vendor/print-mode.d.ts.map +1 -0
- package/dist/vendor/print-mode.js +2 -0
- package/dist/vendor/print-mode.js.map +1 -0
- package/package.json +46 -0
|
@@ -0,0 +1,666 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Perfect Feature — iteratively rebuild a single feature toward a 10/10 score.
|
|
3
|
+
*
|
|
4
|
+
* Unlike run-evalbuff (which runs many features and does holistic doc improvement),
|
|
5
|
+
* this script focuses on ONE feature and tries a series of doc strategies to help
|
|
6
|
+
* the rebuild agent achieve a perfect score — without giving away the answer.
|
|
7
|
+
*
|
|
8
|
+
* Strategies are applied in stages:
|
|
9
|
+
* Rounds 1-2: General design & style principles
|
|
10
|
+
* Rounds 3-4: Project knowledge (utilities, framework, common patterns)
|
|
11
|
+
* Rounds 5-6: Process instructions (e2e testing workflow, verification)
|
|
12
|
+
* Rounds 7+: Subagent instructions (spawn a critic/planner/reviewer)
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* bun run src/perfect-feature.ts \
|
|
16
|
+
* --repo /path/to/repo \
|
|
17
|
+
* --features features.json \
|
|
18
|
+
* --feature-id my-feature-id \
|
|
19
|
+
* [--max-rounds 10] \
|
|
20
|
+
* [--coding-model sonnet] \
|
|
21
|
+
* [--judge-model opus] \
|
|
22
|
+
* [--init-command "npm install"]
|
|
23
|
+
*/
|
|
24
|
+
import { execSync } from 'child_process';
|
|
25
|
+
import fs from 'fs';
|
|
26
|
+
import os from 'os';
|
|
27
|
+
import path from 'path';
|
|
28
|
+
import { ClaudeRunner } from './runners/claude';
|
|
29
|
+
import { applyCarveOperations, copyDocsIntoRepo, ensureGitIdentity, getDocsSnapshot, getGroundTruthDiff, computeDocsDiffText, syncDocsIntoRepo, truncateDiff, } from './eval-helpers';
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Doc improvement strategies — all available to the analyzer every round
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
const ANALYZER_STRATEGY_GUIDE = `You have several categories of doc improvements available. Use whichever ones address the actual failure — often multiple categories apply at once. Use your judgment about which will have the most impact given the diagnosis.
|
|
34
|
+
|
|
35
|
+
### 1. Design & Style Principles
|
|
36
|
+
When to use: The agent's code works but doesn't match project conventions, or the agent made bad structural decisions.
|
|
37
|
+
- Code style conventions (naming, file organization, export patterns)
|
|
38
|
+
- UI/UX design principles the project follows (if applicable)
|
|
39
|
+
- Error handling patterns
|
|
40
|
+
- Type conventions and data modeling patterns
|
|
41
|
+
- How new features should be structured to match existing code
|
|
42
|
+
|
|
43
|
+
### 2. Project Knowledge (Utilities, Framework, Reusable Patterns)
|
|
44
|
+
When to use: The agent reinvented something that already exists, used the wrong abstraction, or didn't know about a key utility.
|
|
45
|
+
- Shared utility functions and where they live
|
|
46
|
+
- Framework abstractions (routing, state management, DB access, etc.)
|
|
47
|
+
- Common imports and their usage patterns
|
|
48
|
+
- Configuration and environment setup
|
|
49
|
+
- How existing features compose these building blocks
|
|
50
|
+
|
|
51
|
+
### 3. Process Instructions (Workflow, Verification, E2E Testing)
|
|
52
|
+
When to use: The agent produced code that doesn't build, doesn't pass tests, or has bugs it could have caught by testing.
|
|
53
|
+
- A step-by-step workflow: read docs → plan → implement → test → fix
|
|
54
|
+
- How to run and verify changes (build commands, test commands, dev server)
|
|
55
|
+
- E2E testing steps the agent should perform before declaring done
|
|
56
|
+
- How to check for common mistakes (missing imports, unregistered routes, etc.)
|
|
57
|
+
- A checklist of things to verify before finishing
|
|
58
|
+
|
|
59
|
+
### 4. Subagent & Self-Review Instructions
|
|
60
|
+
When to use: The agent's first-pass implementation has issues it could catch with a review step, or the task is complex enough to benefit from planning.
|
|
61
|
+
- Suggest the agent spawn a "critic" subagent to review its own work before finishing
|
|
62
|
+
- Suggest the agent spawn a "planner" subagent before starting implementation
|
|
63
|
+
- Suggest the agent re-read its own diff and look for issues
|
|
64
|
+
- Suggest the agent run the test suite and fix any failures before finishing
|
|
65
|
+
- Suggest the agent use a checklist-driven review process at the end`;
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Custom judge — flexible, allows better-than-ground-truth solutions
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
/**
 * Build the prompt for the "flexible" judge agent.
 *
 * The judge is explicitly told to treat the ground-truth diff as ONE valid
 * reference implementation rather than an answer key, so an agent solution
 * that differs from (or improves on) the ground truth can still score 10/10.
 * Both diffs are truncated via truncateDiff() to bound prompt size.
 *
 * @param {{taskPrompt: string, agentDiff: string, groundTruthDiff: string, round: number}} input
 * @returns {string} the complete judge prompt text
 */
function buildFlexibleJudgePrompt(input) {
    const { taskPrompt, agentDiff: rawAgentDiff, groundTruthDiff: rawGroundTruthDiff, round } = input;
    // Truncate both diffs so very large changes don't blow the context window.
    const agentDiff = truncateDiff(rawAgentDiff);
    const groundTruthDiff = truncateDiff(rawGroundTruthDiff);
    return `You are a senior engineer performing a thorough code review with hands-on E2E testing.

## Your Mission

An AI coding agent was given a task and produced changes. You must judge how well it did.

**CRITICAL: The ground truth diff below is just ONE valid implementation — a reference, not the answer key.**
The agent's solution may be DIFFERENT from the ground truth and still be PERFECT (10/10).
The agent's solution may even be BETTER than the ground truth.
Do NOT penalize the agent for:
- Using different variable names, file structure, or code organization
- Taking a different architectural approach that achieves the same result
- Adding extra features, tests, or error handling beyond what was asked
- Using different libraries or utilities to accomplish the same thing

DO penalize the agent for:
- Missing functionality (the feature doesn't work or is incomplete)
- Bugs (runtime errors, logic errors, broken edge cases)
- Build/type errors
- Not following the project's existing conventions (if docs describe them)
- Leaving dead code, TODO comments, or unfinished scaffolding

## How to Judge

1. **Read the project docs** (docs/, AGENTS.md, CLAUDE.md) to understand conventions
2. **Review the agent's diff** for completeness and correctness
3. **Actually test the changes end-to-end:**
- Run the build/compile step
- Run the test suite
- Start the dev server if applicable
- Exercise the feature manually (browser tools, curl, CLI)
- Check logs for errors
- Test edge cases
4. **Compare against ground truth** only to understand what SHOULD work, not to require identical code
5. **Write your judgment** to evalbuff-review-result.json

## User Prompt (What the agent was asked to do)
${taskPrompt}

## Ground Truth (One valid reference implementation — NOT the required approach)
\`\`\`diff
${groundTruthDiff}
\`\`\`

## Agent's Changes
\`\`\`diff
${agentDiff || '(No changes made)'}
\`\`\`

## Scoring Guide

- **10/10**: Feature works completely. Builds, passes tests, works end-to-end. May differ from ground truth.
- **8-9/10**: Feature mostly works but has minor issues (cosmetic bugs, missing edge case, slight convention mismatch).
- **6-7/10**: Core feature works but significant issues (broken edge cases, missing pieces, convention violations).
- **4-5/10**: Partially working — some functionality present but major gaps.
- **1-3/10**: Barely functional or fundamentally broken.
- **0/10**: Nothing useful produced.

## Required Output

Write your judgment to \`evalbuff-review-result.json\`:

\`\`\`json
{
"analysis": "Detailed analysis of what you tested and found...",
"strengths": ["strength 1", "strength 2"],
"weaknesses": ["weakness 1", "weakness 2"],
"e2eTestsPerformed": ["Test 1", "Test 2"],
"completionScore": 8,
"codeQualityScore": 9,
"e2eScore": 7,
"overallScore": 8,
"docSuggestions": ["Suggestion 1", "Suggestion 2"]
}
\`\`\`

## Documentation Suggestions

This is round ${round} of an iterative improvement process. Based on what you find, suggest doc changes that would help a coding agent do better WITHOUT giving away the specific implementation.

Good: "Document that all route handlers must be registered in src/routes/index.ts"
Bad: "Tell the agent to add a UserProfile route to src/routes/index.ts"

Focus on GENERAL PATTERNS that would help with ANY feature, not just this one.

IMPORTANT: You MUST write the result file. Do it as your very last action.`;
}
|
|
160
|
+
/**
 * Run the flexible judge agent against a repo and parse its verdict.
 *
 * Launches a ClaudeRunner with the judge prompt, then reads the
 * `evalbuff-review-result.json` file the prompt instructs the judge to write.
 * Never throws: a runner crash, a missing result file, or malformed JSON all
 * degrade to a zero-score fallback judgment so the calling loop can continue.
 *
 * @param {string} repoDir - working copy the judge inspects and E2E-tests
 * @param {object} input - prompt inputs; see buildFlexibleJudgePrompt()
 * @param {string} model - model identifier for the judge runner
 * @returns {Promise<object>} normalized judging result (scores default to 0)
 */
async function runFlexibleJudge(repoDir, input, model) {
    const prompt = buildFlexibleJudgePrompt(input);
    console.log(` [Judge] Running flexible Claude judge (${model})...`);
    const runner = new ClaudeRunner(repoDir, {}, model, 'high');
    try {
        await runner.run(prompt);
    }
    catch (err) {
        // A crashed runner may still have written the result file before
        // failing, so log and fall through to the read attempt anyway.
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(` [Judge] Runner failed: ${msg.slice(0, 200)}`);
    }
    // Coerce one raw sub-score to a number, falling back to the raw overall
    // score and finally to 0. (Previously this ternary was duplicated inline
    // for each of the three sub-scores.)
    const asScore = (value, raw) => (typeof value === 'number' ? value : raw.overallScore ?? 0);
    // Read the result file the judge was instructed to write.
    const resultPath = path.join(repoDir, 'evalbuff-review-result.json');
    try {
        if (fs.existsSync(resultPath)) {
            const raw = JSON.parse(fs.readFileSync(resultPath, 'utf-8'));
            return {
                analysis: raw.analysis || 'No analysis',
                strengths: Array.isArray(raw.strengths) ? raw.strengths : [],
                weaknesses: Array.isArray(raw.weaknesses) ? raw.weaknesses : [],
                e2eTestsPerformed: Array.isArray(raw.e2eTestsPerformed) ? raw.e2eTestsPerformed : [],
                completionScore: asScore(raw.completionScore, raw),
                codeQualityScore: asScore(raw.codeQualityScore, raw),
                e2eScore: asScore(raw.e2eScore, raw),
                overallScore: typeof raw.overallScore === 'number' ? raw.overallScore : 0,
                // undefined (not []) when absent so callers can distinguish
                // "no suggestions given" from "empty suggestion list".
                docSuggestions: Array.isArray(raw.docSuggestions) ? raw.docSuggestions : undefined,
            };
        }
    }
    catch (err) {
        console.warn(` [Judge] Failed to parse result: ${err}`);
    }
    // Fallback: the judge produced nothing usable — score everything zero.
    return {
        analysis: 'Judge failed to produce result file',
        strengths: [],
        weaknesses: ['Judge failed'],
        e2eTestsPerformed: [],
        completionScore: 0,
        codeQualityScore: 0,
        e2eScore: 0,
        overallScore: 0,
    };
}
|
|
203
|
+
// ---------------------------------------------------------------------------
|
|
204
|
+
// Analyzer — diagnoses WHY score isn't 10/10 and suggests doc improvements
|
|
205
|
+
// ---------------------------------------------------------------------------
|
|
206
|
+
/**
 * Build the prompt for the analyzer agent that diagnoses WHY a rebuild round
 * fell short of 10/10 and proposes documentation improvements.
 *
 * The prompt embeds the task, the agent's (truncated) diff, the judge's
 * feedback, the (truncated) ground-truth diff, all previous diagnoses (so the
 * analyzer does not repeat itself), the current docs snapshot, and the shared
 * strategy guide. The analyzer is instructed to keep suggestions GENERAL and
 * to write its result to analyzer-result.json.
 *
 * @param {{taskPrompt: string, agentDiff: string, groundTruthDiff: string,
 *          judging: object, round: number, previousDiagnoses: string[],
 *          currentDocs: Record<string, string>}} input
 * @returns {string} the complete analyzer prompt text
 */
function buildAnalyzerPrompt(input) {
    const { taskPrompt, agentDiff: rawAgentDiff, groundTruthDiff: rawGroundTruthDiff, judging, round, previousDiagnoses, currentDocs } = input;
    // Truncate both diffs to bound prompt size.
    const agentDiff = truncateDiff(rawAgentDiff);
    const groundTruthDiff = truncateDiff(rawGroundTruthDiff);
    // Earlier rounds' diagnoses, so the analyzer avoids repeating suggestions.
    const prevSection = previousDiagnoses.length > 0
        ? `## Previous Diagnoses (what we already tried)\n${previousDiagnoses.map((d, i) => `Round ${i + 1}: ${d}`).join('\n\n')}\n\nDo NOT repeat suggestions that were already tried. Find NEW angles.`
        : '';
    // Current docs snapshot (filename -> content), or a placeholder when none.
    const docsSection = Object.keys(currentDocs).length > 0
        ? `## Current Documentation\n${Object.entries(currentDocs).map(([f, c]) => `### ${f}\n${c}`).join('\n\n')}`
        : '## Current Documentation\n(No docs exist yet)';
    return `You are an expert at analyzing why an AI coding agent failed to perfectly implement a feature, and at writing documentation that would help it succeed next time — WITHOUT giving away the specific answer.

## Context

A coding agent was asked to implement a feature. It scored ${judging.overallScore}/10. This is round ${round} of an iterative improvement process. Your job is to figure out WHY it didn't get 10/10 and suggest documentation changes that would help it (or any agent) do better.

**CRITICAL RULES:**
1. Your doc suggestions must be GENERAL — they should help an agent build ANY feature, not just this one.
2. NEVER include the specific implementation, specific file contents, or specific code that the agent should write.
3. DO document patterns, conventions, architectural rules, utility functions, and workflows.
4. Think about what KNOWLEDGE GAP caused the failure, then fill that gap with general knowledge.

## The Task
${taskPrompt}

## Agent's Attempt (scored ${judging.overallScore}/10)
\`\`\`diff
${agentDiff || '(No changes)'}
\`\`\`

## Judge's Feedback
**Analysis:** ${judging.analysis}
**Strengths:** ${judging.strengths.join(', ') || 'None listed'}
**Weaknesses:** ${judging.weaknesses.join(', ') || 'None listed'}
**E2E tests performed:** ${judging.e2eTestsPerformed.join(', ') || 'None'}
**Judge's doc suggestions:** ${judging.docSuggestions?.join('\n- ') || 'None'}

## Ground Truth (reference only — the agent should NOT be told this)
\`\`\`diff
${groundTruthDiff}
\`\`\`

${prevSection}

${docsSection}

## Available Improvement Strategies

${ANALYZER_STRATEGY_GUIDE}

## Your Output

Diagnose the root cause, then pick whichever strategies (one or more) best address the failure. Write your result to \`analyzer-result.json\`:

\`\`\`json
{
"diagnosis": "A 2-3 sentence explanation of the root cause — what knowledge gap or process failure led to the imperfect score",
"docSuggestions": [
"Each suggestion should specify which file to create/update AND include the full content. E.g.: 'Create docs/routing.md: All routes must be registered in src/routes/index.ts by calling registerRoute()...'",
"Use whichever strategy categories are most relevant to the actual failure"
]
}
\`\`\`

Remember: The goal is to make docs that help an agent build ANY feature perfectly, not to encode the answer to THIS specific feature. If the agent's failure was highly specific and can't be generalized, say so in your diagnosis and provide minimal/no suggestions.

IMPORTANT: You MUST write analyzer-result.json. Do it as your very last action.`;
}
|
|
274
|
+
/**
 * Run the analyzer agent and collect its diagnosis.
 *
 * Drives a ClaudeRunner with the analyzer prompt, then reads back the
 * analyzer-result.json file the prompt asks for. Never throws: runner
 * failures and missing/unparseable result files both collapse into a
 * fallback result with an empty suggestion list.
 *
 * @param {string} repoDir - checkout the analyzer may inspect
 * @param {object} input - see buildAnalyzerPrompt()
 * @param {string} model - model identifier for the analyzer runner
 * @returns {Promise<{diagnosis: string, docSuggestions: string[]}>}
 */
async function runAnalyzer(repoDir, input, model) {
    const analyzerPrompt = buildAnalyzerPrompt(input);
    console.log(` [Analyzer] Diagnosing failure (${model})...`);
    const analyzer = new ClaudeRunner(repoDir, {}, model, 'high');
    try {
        await analyzer.run(analyzerPrompt);
    }
    catch (runError) {
        // The result file may exist despite a runner crash; keep going.
        const reason = runError instanceof Error ? runError.message : String(runError);
        console.warn(` [Analyzer] Runner failed: ${reason.slice(0, 200)}`);
    }
    const outputFile = path.join(repoDir, 'analyzer-result.json');
    try {
        if (fs.existsSync(outputFile)) {
            const parsed = JSON.parse(fs.readFileSync(outputFile, 'utf-8'));
            const docSuggestions = Array.isArray(parsed.docSuggestions) ? parsed.docSuggestions : [];
            const diagnosis = parsed.diagnosis || 'No diagnosis produced';
            return { diagnosis, docSuggestions };
        }
    }
    catch (parseError) {
        console.warn(` [Analyzer] Failed to parse result: ${parseError}`);
    }
    // No readable result — report failure with zero suggestions.
    return { diagnosis: 'Analyzer failed to produce results', docSuggestions: [] };
}
|
|
300
|
+
// ---------------------------------------------------------------------------
|
|
301
|
+
// Docs writer — applies suggestions from the analyzer
|
|
302
|
+
// ---------------------------------------------------------------------------
|
|
303
|
+
/**
 * Apply the round's documentation suggestions to the source repo.
 *
 * Runs a docs-writer agent inside a temporary clone of `repoPath` (so the
 * agent cannot touch the real working tree), then syncs only the resulting
 * docs back into `repoPath` via syncDocsIntoRepo. No-op when there are no
 * suggestions. Failures are logged, never thrown; the temp clone is always
 * removed in the finally block.
 *
 * @param {string} repoPath - path to the real repository whose docs to update
 * @param {string[]} suggestions - doc-change suggestions from analyzer/judge
 * @param {string} model - model identifier for the docs-writer runner
 * @returns {Promise<void>}
 */
async function runDocsWriter(repoPath, suggestions, model) {
    if (suggestions.length === 0) {
        console.log(` [DocsWriter] No suggestions to apply, skipping.`);
        return;
    }
    // Scratch area: a fresh clone isolates the writer from the real repo.
    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-'));
    const repoDir = path.join(tempDir, 'repo');
    const prompt = `You are a documentation writer for a coding project. Your job is to update the project docs to help AI coding agents build features successfully.

## Suggestions to Apply

${suggestions.map((s, i) => `${i + 1}. ${s}`).join('\n')}

## Rules

1. ONLY modify files in docs/, AGENTS.md, or CLAUDE.md. Do NOT modify source code.
2. Each suggestion tells you which file to create or update — follow those instructions.
3. If a suggestion says to update an existing file, make targeted edits rather than rewriting.
4. If multiple suggestions overlap, merge them into one cohesive doc.
5. Keep docs concise and actionable. Dense information beats verbose explanations.
6. Before documenting any function or file path, grep to confirm it exists.
7. Never document aspirational/future behavior — only what exists NOW.
8. Remove or update any existing docs that conflict with the new information.

## Verification

After making changes, read back each modified file to verify it's coherent and accurate.`;
    try {
        // Clone without checkout, then check out the source repo's current
        // HEAD so the scratch copy matches it even with a dirty index.
        execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' });
        const headSha = execSync('git rev-parse HEAD', { cwd: repoPath, encoding: 'utf-8' }).trim();
        execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' });
        // Bring over uncommitted doc state so the writer edits the latest docs.
        syncDocsIntoRepo(repoPath, repoDir);
        const runner = new ClaudeRunner(repoDir, {}, model, 'high');
        await runner.run(prompt);
        // Only docs flow back; any stray source edits in the clone are dropped.
        syncDocsIntoRepo(repoDir, repoPath);
    }
    catch (err) {
        // Best-effort step: a failed docs write should not abort the round.
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(` [DocsWriter] Failed: ${msg.slice(0, 200)}`);
    }
    finally {
        try {
            fs.rmSync(tempDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
    }
}
|
|
350
|
+
// ---------------------------------------------------------------------------
|
|
351
|
+
// Single rebuild + judge cycle
|
|
352
|
+
// ---------------------------------------------------------------------------
|
|
353
|
+
/**
 * One rebuild + judge cycle for a single feature.
 *
 * In a temporary clone of the repo: carves the feature out (applying the
 * feature's carve operations and committing the removal), copies current
 * docs in, optionally runs an init command, asks the coding agent to rebuild
 * the feature from its prompt, then has the flexible judge score the result
 * in the same working copy. The temp clone is always removed in finally.
 *
 * @param {{repoPath: string, feature: object, groundTruthDiff: string,
 *          round: number, codingModel: string, judgeModel: string,
 *          initCommand?: string}} opts
 * @returns {Promise<{judging: object, diff: string, costEstimate: number}>}
 *          zero-score judging (with empty diff and cost) if the agent crashes
 */
async function runRebuildAndJudge(opts) {
    const { repoPath, feature, groundTruthDiff, round, codingModel, judgeModel, initCommand } = opts;
    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-perfect-'));
    const repoDir = path.join(tempDir, 'repo');
    try {
        // Clone at the source repo's current HEAD, then carve the feature out
        // and commit the removal so the agent starts from a clean baseline.
        execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' });
        const headSha = execSync('git rev-parse HEAD', { cwd: repoPath, encoding: 'utf-8' }).trim();
        execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' });
        ensureGitIdentity(repoDir);
        applyCarveOperations(repoDir, feature.operations);
        execSync('git add -A', { cwd: repoDir, stdio: 'ignore' });
        // --allow-empty: carve ops may be a no-op; the commit must still exist
        // so the agent's later diff is measured against this point.
        execSync(`git commit -m "carve: remove ${feature.id}" --allow-empty`, { cwd: repoDir, stdio: 'ignore' });
        // Bring the (possibly uncommitted) current docs into the working copy.
        copyDocsIntoRepo(repoPath, repoDir);
        // Optional setup (e.g. dependency install); best-effort with a
        // 2-minute cap so a hung install can't stall the round.
        if (initCommand) {
            try {
                execSync(initCommand, { cwd: repoDir, stdio: 'ignore', timeout: 120000 });
            }
            catch (e) {
                console.warn(` [Rebuild] Init command failed: ${e}`);
            }
        }
        // Ask the coding agent to rebuild the carved feature from its prompt.
        console.log(` [Rebuild] Round ${round}: Running claude (${codingModel})...`);
        const runner = new ClaudeRunner(repoDir, {}, codingModel, 'medium');
        let result;
        try {
            result = await runner.run(feature.prompt);
        }
        catch (err) {
            // Agent crashed: synthesize a zero-score judgment instead of
            // throwing, so the outer loop records the round and continues.
            const msg = err instanceof Error ? err.message : String(err);
            return {
                judging: {
                    analysis: `Agent failed: ${msg.slice(0, 500)}`,
                    strengths: [],
                    weaknesses: ['Agent failed'],
                    e2eTestsPerformed: [],
                    completionScore: 0,
                    codeQualityScore: 0,
                    e2eScore: 0,
                    overallScore: 0,
                },
                diff: '',
                costEstimate: 0,
            };
        }
        // Judge in the same working copy so it can build/test the changes.
        const judging = await runFlexibleJudge(repoDir, {
            taskPrompt: feature.prompt,
            agentDiff: result.diff,
            groundTruthDiff,
            round,
        }, judgeModel);
        return {
            judging,
            diff: result.diff,
            costEstimate: result.totalCostUsd,
        };
    }
    finally {
        try {
            fs.rmSync(tempDir, { recursive: true, force: true });
        }
        catch { /* ignore */ }
    }
}
|
|
421
|
+
// ---------------------------------------------------------------------------
|
|
422
|
+
// Main loop
|
|
423
|
+
// ---------------------------------------------------------------------------
|
|
424
|
+
async function perfectFeature(opts) {
|
|
425
|
+
const startTime = new Date().toISOString();
|
|
426
|
+
const logDir = path.join(os.tmpdir(), `evalbuff-perfect-${opts.featureId}-${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}`);
|
|
427
|
+
fs.mkdirSync(logDir, { recursive: true });
|
|
428
|
+
console.log(`\nPerfect Feature`);
|
|
429
|
+
console.log(` Repo: ${opts.repoPath}`);
|
|
430
|
+
console.log(` Feature: ${opts.featureId}`);
|
|
431
|
+
console.log(` Max rounds: ${opts.maxRounds}`);
|
|
432
|
+
console.log(` Coding model: ${opts.codingModel}`);
|
|
433
|
+
console.log(` Judge model: ${opts.judgeModel}`);
|
|
434
|
+
console.log(` Analyzer model: ${opts.analyzerModel}`);
|
|
435
|
+
console.log(` Docs model: ${opts.docsModel}`);
|
|
436
|
+
console.log(` Log dir: ${logDir}`);
|
|
437
|
+
// Load feature
|
|
438
|
+
const allFeatures = JSON.parse(fs.readFileSync(opts.featuresPath, 'utf-8'));
|
|
439
|
+
const feature = allFeatures.find(f => f.id === opts.featureId);
|
|
440
|
+
if (!feature) {
|
|
441
|
+
const ids = allFeatures.map(f => f.id).join(', ');
|
|
442
|
+
console.error(`Feature "${opts.featureId}" not found. Available: ${ids}`);
|
|
443
|
+
process.exit(1);
|
|
444
|
+
}
|
|
445
|
+
const groundTruthDiff = getGroundTruthDiff(feature);
|
|
446
|
+
fs.writeFileSync(path.join(logDir, 'feature.json'), JSON.stringify(feature, null, 2));
|
|
447
|
+
fs.writeFileSync(path.join(logDir, 'ground-truth.diff'), groundTruthDiff);
|
|
448
|
+
const outcomes = [];
|
|
449
|
+
const diagnoses = [];
|
|
450
|
+
let totalCost = 0;
|
|
451
|
+
let bestScore = 0;
|
|
452
|
+
for (let round = 0; round < opts.maxRounds; round++) {
|
|
453
|
+
console.log(`\n${'='.repeat(60)}`);
|
|
454
|
+
console.log(`ROUND ${round}`);
|
|
455
|
+
console.log(`${'='.repeat(60)}`);
|
|
456
|
+
// Save docs state before this round
|
|
457
|
+
const docsBefore = getDocsSnapshot(opts.repoPath);
|
|
458
|
+
fs.writeFileSync(path.join(logDir, `docs-before-round-${round}.json`), JSON.stringify(docsBefore, null, 2));
|
|
459
|
+
// Run rebuild + judge
|
|
460
|
+
const { judging, diff, costEstimate } = await runRebuildAndJudge({
|
|
461
|
+
repoPath: opts.repoPath,
|
|
462
|
+
feature,
|
|
463
|
+
groundTruthDiff,
|
|
464
|
+
round,
|
|
465
|
+
codingModel: opts.codingModel,
|
|
466
|
+
judgeModel: opts.judgeModel,
|
|
467
|
+
initCommand: opts.initCommand,
|
|
468
|
+
});
|
|
469
|
+
totalCost += costEstimate;
|
|
470
|
+
const score = judging.overallScore;
|
|
471
|
+
if (score > bestScore)
|
|
472
|
+
bestScore = score;
|
|
473
|
+
console.log(`\n Score: ${score}/10 (best: ${bestScore}/10)`);
|
|
474
|
+
console.log(` Strengths: ${judging.strengths.join('; ') || 'none'}`);
|
|
475
|
+
console.log(` Weaknesses: ${judging.weaknesses.join('; ') || 'none'}`);
|
|
476
|
+
// Save round results
|
|
477
|
+
const roundDir = path.join(logDir, `round-${round}`);
|
|
478
|
+
fs.mkdirSync(roundDir, { recursive: true });
|
|
479
|
+
fs.writeFileSync(path.join(roundDir, 'judging.json'), JSON.stringify(judging, null, 2));
|
|
480
|
+
fs.writeFileSync(path.join(roundDir, 'diff.txt'), diff);
|
|
481
|
+
fs.writeFileSync(path.join(roundDir, 'score.txt'), score.toString());
|
|
482
|
+
// Check for perfection
|
|
483
|
+
if (score >= 10) {
|
|
484
|
+
console.log(`\n PERFECT SCORE achieved in round ${round}!`);
|
|
485
|
+
outcomes.push({ round, score, judging, diff, diagnosis: '', docsChanged: false, costEstimate });
|
|
486
|
+
break;
|
|
487
|
+
}
|
|
488
|
+
// Analyze failure
|
|
489
|
+
const analyzerRepoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-analyzer-'));
|
|
490
|
+
const analyzerRepo = path.join(analyzerRepoDir, 'repo');
|
|
491
|
+
try {
|
|
492
|
+
execSync(`git clone --no-checkout "${opts.repoPath}" "${analyzerRepo}"`, { stdio: 'ignore' });
|
|
493
|
+
const headSha = execSync('git rev-parse HEAD', { cwd: opts.repoPath, encoding: 'utf-8' }).trim();
|
|
494
|
+
execSync(`git checkout ${headSha}`, { cwd: analyzerRepo, stdio: 'ignore' });
|
|
495
|
+
}
|
|
496
|
+
catch { /* ignore clone errors */ }
|
|
497
|
+
const analysis = await runAnalyzer(analyzerRepo, {
|
|
498
|
+
taskPrompt: feature.prompt,
|
|
499
|
+
agentDiff: diff,
|
|
500
|
+
groundTruthDiff,
|
|
501
|
+
judging,
|
|
502
|
+
round,
|
|
503
|
+
previousDiagnoses: diagnoses,
|
|
504
|
+
currentDocs: docsBefore,
|
|
505
|
+
}, opts.analyzerModel);
|
|
506
|
+
try {
|
|
507
|
+
fs.rmSync(analyzerRepoDir, { recursive: true, force: true });
|
|
508
|
+
}
|
|
509
|
+
catch { /* ignore */ }
|
|
510
|
+
diagnoses.push(analysis.diagnosis);
|
|
511
|
+
console.log(`\n Diagnosis: ${analysis.diagnosis}`);
|
|
512
|
+
console.log(` Suggestions: ${analysis.docSuggestions.length}`);
|
|
513
|
+
fs.writeFileSync(path.join(roundDir, 'diagnosis.json'), JSON.stringify(analysis, null, 2));
|
|
514
|
+
// Combine analyzer suggestions with judge suggestions
|
|
515
|
+
const allSuggestions = [
|
|
516
|
+
...analysis.docSuggestions,
|
|
517
|
+
...(judging.docSuggestions || []),
|
|
518
|
+
];
|
|
519
|
+
// Apply doc improvements
|
|
520
|
+
let docsChanged = false;
|
|
521
|
+
if (allSuggestions.length > 0) {
|
|
522
|
+
console.log(`\n Applying ${allSuggestions.length} doc suggestions...`);
|
|
523
|
+
await runDocsWriter(opts.repoPath, allSuggestions, opts.docsModel);
|
|
524
|
+
const docsAfter = getDocsSnapshot(opts.repoPath);
|
|
525
|
+
const docsDiff = computeDocsDiffText(docsBefore, docsAfter);
|
|
526
|
+
docsChanged = docsDiff.trim().length > 0;
|
|
527
|
+
fs.writeFileSync(path.join(roundDir, 'docs-diff.txt'), docsDiff);
|
|
528
|
+
fs.writeFileSync(path.join(roundDir, 'docs-after.json'), JSON.stringify(docsAfter, null, 2));
|
|
529
|
+
if (docsChanged) {
|
|
530
|
+
console.log(` Docs updated.`);
|
|
531
|
+
}
|
|
532
|
+
else {
|
|
533
|
+
console.log(` Docs writer ran but made no changes.`);
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
outcomes.push({ round, score, judging, diff, diagnosis: analysis.diagnosis, docsChanged, costEstimate });
|
|
537
|
+
// If we've been stuck at the same score for 3 rounds, skip ahead in strategy
|
|
538
|
+
if (outcomes.length >= 3) {
|
|
539
|
+
const lastThree = outcomes.slice(-3);
|
|
540
|
+
const allSameScore = lastThree.every(o => o.score === lastThree[0].score);
|
|
541
|
+
if (allSameScore && !docsChanged) {
|
|
542
|
+
console.log(`\n Stuck at ${score}/10 for 3 rounds. Consider trying a different approach.`);
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
// Write final report
|
|
547
|
+
const endTime = new Date().toISOString();
|
|
548
|
+
const finalDocs = getDocsSnapshot(opts.repoPath);
|
|
549
|
+
const report = generateReport(opts, outcomes, totalCost, startTime, endTime, finalDocs);
|
|
550
|
+
fs.writeFileSync(path.join(logDir, 'report.md'), report);
|
|
551
|
+
console.log(`\n${'='.repeat(60)}`);
|
|
552
|
+
console.log('PERFECT FEATURE RUN COMPLETE');
|
|
553
|
+
console.log(`${'='.repeat(60)}`);
|
|
554
|
+
console.log(` Feature: ${opts.featureId}`);
|
|
555
|
+
console.log(` Rounds: ${outcomes.length}`);
|
|
556
|
+
console.log(` Score progression: ${outcomes.map(o => o.score.toFixed(1)).join(' → ')}`);
|
|
557
|
+
console.log(` Best score: ${bestScore}/10`);
|
|
558
|
+
console.log(` Total cost: $${totalCost.toFixed(2)}`);
|
|
559
|
+
console.log(` Log dir: ${logDir}`);
|
|
560
|
+
console.log(` Report: ${path.join(logDir, 'report.md')}`);
|
|
561
|
+
}
|
|
562
|
+
// ---------------------------------------------------------------------------
|
|
563
|
+
// Report generation
|
|
564
|
+
// ---------------------------------------------------------------------------
|
|
565
|
+
/**
 * Render the final markdown report for a perfect-feature run.
 *
 * @param {object} opts - Run options; reads featureId, repoPath, codingModel, judgeModel.
 * @param {Array<object>} outcomes - Per-round results: { round, score, judging, diagnosis, docsChanged, costEstimate }.
 * @param {number} totalCost - Total estimated cost (USD) across all rounds.
 * @param {string} startTime - ISO timestamp at run start.
 * @param {string} endTime - ISO timestamp at run end.
 * @param {Record<string, string>} finalDocs - Final documentation snapshot, keyed by file path.
 * @returns {string} The complete report as a markdown string.
 */
function generateReport(opts, outcomes, totalCost, startTime, endTime, finalDocs) {
    const L = [];
    L.push('# Perfect Feature Report', '');
    // Overview table
    L.push('## Overview', '');
    L.push(`| | |`);
    L.push(`|---|---|`);
    L.push(`| **Feature** | ${opts.featureId} |`);
    L.push(`| **Repo** | \`${opts.repoPath}\` |`);
    L.push(`| **Start** | ${startTime} |`);
    L.push(`| **End** | ${endTime} |`);
    L.push(`| **Rounds** | ${outcomes.length} |`);
    // Guard the zero-round case: Math.max() with no arguments is -Infinity.
    const bestScore = outcomes.length > 0 ? Math.max(...outcomes.map(o => o.score)) : 0;
    L.push(`| **Best score** | ${bestScore}/10 |`);
    L.push(`| **Total cost** | $${totalCost.toFixed(2)} |`);
    L.push(`| **Coding model** | ${opts.codingModel} |`);
    L.push(`| **Judge model** | ${opts.judgeModel} |`);
    L.push('');
    // Score progression bar chart (2 chars per score point, max 20 wide)
    L.push('## Score Progression', '');
    L.push('```');
    for (const o of outcomes) {
        const bar = '█'.repeat(Math.round(o.score * 2));
        L.push(`Round ${o.round.toString().padStart(2)} ${o.score.toFixed(1).padStart(5)}/10 ${bar}`);
    }
    L.push('```', '');
    // Per-round detail
    for (const o of outcomes) {
        L.push(`## Round ${o.round} — ${o.score.toFixed(1)}/10`, '');
        L.push(`| Completion | Code Quality | E2E | Overall |`);
        L.push(`|---|---|---|---|`);
        L.push(`| ${o.judging.completionScore} | ${o.judging.codeQualityScore} | ${o.judging.e2eScore} | ${o.judging.overallScore} |`);
        L.push('');
        L.push(`**Analysis:** ${o.judging.analysis}`, '');
        // The judge may omit these arrays entirely (cf. the `docSuggestions || []`
        // fallback used elsewhere in this file), so default to empty.
        const strengths = o.judging.strengths ?? [];
        if (strengths.length > 0) {
            L.push('**Strengths:**');
            for (const s of strengths) {
                L.push(`- ${s}`);
            }
            L.push('');
        }
        const weaknesses = o.judging.weaknesses ?? [];
        if (weaknesses.length > 0) {
            L.push('**Weaknesses:**');
            for (const w of weaknesses) {
                L.push(`- ${w}`);
            }
            L.push('');
        }
        if (o.diagnosis) {
            L.push(`**Diagnosis:** ${o.diagnosis}`, '');
        }
        L.push(`**Docs changed:** ${o.docsChanged ? 'Yes' : 'No'}`);
        L.push(`**Cost:** $${o.costEstimate.toFixed(2)}`, '');
    }
    // Final docs, embedded verbatim in fenced markdown blocks
    const docKeys = Object.keys(finalDocs).sort();
    if (docKeys.length > 0) {
        L.push('## Final Documentation', '');
        for (const key of docKeys) {
            L.push(`### ${key}`, '');
            L.push('```markdown');
            L.push(finalDocs[key]);
            L.push('```', '');
        }
    }
    return L.join('\n');
}
|
|
628
|
+
// ---------------------------------------------------------------------------
|
|
629
|
+
// CLI
|
|
630
|
+
// ---------------------------------------------------------------------------
|
|
631
|
+
// CLI entry point: parse flags and launch the run.
// NOTE(review): `import.meta.main` is a Bun/Deno (and only very recent Node)
// feature — confirm the targeted runtime supports it; under older Node this
// guard is always falsy and the CLI never fires.
if (import.meta.main) {
    const args = process.argv.slice(2);
    /**
     * Read the value following `--name` from argv.
     * Returns `defaultValue` when the flag is absent; throws when the flag is
     * required (no default) and missing.
     */
    const getArg = (name, defaultValue) => {
        const idx = args.indexOf(`--${name}`);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        if (defaultValue !== undefined)
            return defaultValue;
        throw new Error(`Missing required argument: --${name}`);
    };
    const hasArg = (name) => args.includes(`--${name}`);
    const repoPath = getArg('repo');
    const featuresPath = getArg('features');
    const featureId = getArg('feature-id');
    // Explicit radix, and fail fast instead of silently looping with NaN rounds.
    const maxRounds = Number.parseInt(getArg('max-rounds', '10'), 10);
    if (!Number.isFinite(maxRounds) || maxRounds <= 0) {
        throw new Error('--max-rounds must be a positive integer');
    }
    const codingModel = getArg('coding-model', 'sonnet');
    const judgeModel = getArg('judge-model', 'opus');
    const analyzerModel = getArg('analyzer-model', 'opus');
    const docsModel = getArg('docs-model', 'opus');
    const initCommand = hasArg('init-command') ? getArg('init-command') : undefined;
    perfectFeature({
        repoPath,
        featuresPath,
        featureId,
        maxRounds,
        codingModel,
        judgeModel,
        analyzerModel,
        docsModel,
        initCommand,
    }).catch((error) => {
        console.error('Perfect feature run failed:', error);
        process.exit(1);
    });
}
|
|
666
|
+
//# sourceMappingURL=perfect-feature.js.map
|