evalbuff 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -0
- package/dist/carve-features.d.ts +42 -0
- package/dist/carve-features.d.ts.map +1 -0
- package/dist/carve-features.js +305 -0
- package/dist/carve-features.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +42 -0
- package/dist/cli.js.map +1 -0
- package/dist/docs-refactor.d.ts +4 -0
- package/dist/docs-refactor.d.ts.map +1 -0
- package/dist/docs-refactor.js +122 -0
- package/dist/docs-refactor.js.map +1 -0
- package/dist/docs-writer.d.ts +4 -0
- package/dist/docs-writer.d.ts.map +1 -0
- package/dist/docs-writer.js +122 -0
- package/dist/docs-writer.js.map +1 -0
- package/dist/eval-helpers.d.ts +19 -0
- package/dist/eval-helpers.d.ts.map +1 -0
- package/dist/eval-helpers.js +327 -0
- package/dist/eval-helpers.js.map +1 -0
- package/dist/eval-runner.d.ts +42 -0
- package/dist/eval-runner.d.ts.map +1 -0
- package/dist/eval-runner.js +193 -0
- package/dist/eval-runner.js.map +1 -0
- package/dist/judge.d.ts +22 -0
- package/dist/judge.d.ts.map +1 -0
- package/dist/judge.js +284 -0
- package/dist/judge.js.map +1 -0
- package/dist/perfect-feature.d.ts +2 -0
- package/dist/perfect-feature.d.ts.map +1 -0
- package/dist/perfect-feature.js +666 -0
- package/dist/perfect-feature.js.map +1 -0
- package/dist/report.d.ts +31 -0
- package/dist/report.d.ts.map +1 -0
- package/dist/report.js +249 -0
- package/dist/report.js.map +1 -0
- package/dist/run-evalbuff.d.ts +12 -0
- package/dist/run-evalbuff.d.ts.map +1 -0
- package/dist/run-evalbuff.js +383 -0
- package/dist/run-evalbuff.js.map +1 -0
- package/dist/runners/claude.d.ts +10 -0
- package/dist/runners/claude.d.ts.map +1 -0
- package/dist/runners/claude.js +80 -0
- package/dist/runners/claude.js.map +1 -0
- package/dist/runners/codebuff.d.ts +24 -0
- package/dist/runners/codebuff.d.ts.map +1 -0
- package/dist/runners/codebuff.js +88 -0
- package/dist/runners/codebuff.js.map +1 -0
- package/dist/runners/codex.d.ts +8 -0
- package/dist/runners/codex.d.ts.map +1 -0
- package/dist/runners/codex.js +131 -0
- package/dist/runners/codex.js.map +1 -0
- package/dist/runners/index.d.ts +5 -0
- package/dist/runners/index.d.ts.map +1 -0
- package/dist/runners/index.js +4 -0
- package/dist/runners/index.js.map +1 -0
- package/dist/runners/runner.d.ts +11 -0
- package/dist/runners/runner.d.ts.map +1 -0
- package/dist/runners/runner.js +2 -0
- package/dist/runners/runner.js.map +1 -0
- package/dist/test-repo-utils.d.ts +21 -0
- package/dist/test-repo-utils.d.ts.map +1 -0
- package/dist/test-repo-utils.js +109 -0
- package/dist/test-repo-utils.js.map +1 -0
- package/dist/trace-compressor.d.ts +130 -0
- package/dist/trace-compressor.d.ts.map +1 -0
- package/dist/trace-compressor.js +680 -0
- package/dist/trace-compressor.js.map +1 -0
- package/dist/tui/data.d.ts +84 -0
- package/dist/tui/data.d.ts.map +1 -0
- package/dist/tui/data.js +80 -0
- package/dist/tui/data.js.map +1 -0
- package/dist/tui/events.d.ts +86 -0
- package/dist/tui/events.d.ts.map +1 -0
- package/dist/tui/events.js +52 -0
- package/dist/tui/events.js.map +1 -0
- package/dist/vendor/error.d.ts +18 -0
- package/dist/vendor/error.d.ts.map +1 -0
- package/dist/vendor/error.js +64 -0
- package/dist/vendor/error.js.map +1 -0
- package/dist/vendor/print-mode.d.ts +75 -0
- package/dist/vendor/print-mode.d.ts.map +1 -0
- package/dist/vendor/print-mode.js +2 -0
- package/dist/vendor/print-mode.js.map +1 -0
- package/package.json +46 -0
package/README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Evalbuff
|
|
2
|
+
|
|
3
|
+
**Improve your coding agent's performance through automated practice.**
|
|
4
|
+
|
|
5
|
+
Evalbuff runs your coding agent on practice tasks carved from your codebase, watches it fail, writes docs to fix the pattern, and keeps only the changes that measurably help. The result is a `docs/` directory of markdown files that encode the missing knowledge your agent needs to produce correct changes.
|
|
6
|
+
|
|
7
|
+
## Why it works
|
|
8
|
+
|
|
9
|
+
Your coding agent is missing context. It doesn't understand your product. It edits the wrong package. It doesn't know how to verify changes end-to-end.
|
|
10
|
+
|
|
11
|
+
All of this is solvable with the right context — missing domain knowledge, subtle conventions, step-by-step verification workflows. And all of that context can be recorded in plain markdown files.
|
|
12
|
+
|
|
13
|
+
### Hierarchical docs > skills
|
|
14
|
+
|
|
15
|
+
[OpenAI](https://openai.com/index/harness-engineering/) and [Vercel](https://vercel.com/blog/agents-md-outperforms-skills-in-our-agent-evals) have independently converged on a pattern for increasing agent performance:
|
|
16
|
+
|
|
17
|
+
- A `docs/` directory with nested markdown files
|
|
18
|
+
- A table of contents in `AGENTS.md` (or `CLAUDE.md`) with descriptions so the agent reads the right docs
|
|
19
|
+
|
|
20
|
+
Evalbuff automates building and maintaining this docs directory — and validates every change against real evals.
|
|
21
|
+
|
|
22
|
+
### Not just docs, evalmaxxing docs
|
|
23
|
+
|
|
24
|
+
The goal isn't to produce docs that explain your project. The goal is to include whatever knowledge or instructions **increase the performance of your coding agent on evals** — domain knowledge missing from raw code, processes for end-to-end verification, and guardrails that prevent common mistakes.
|
|
25
|
+
|
|
26
|
+
## How it works
|
|
27
|
+
|
|
28
|
+
Evalbuff creates practice tasks by **carving** — surgically removing a feature from your codebase (deleting the relevant code while keeping everything else intact) and then challenging an agent to rebuild it from scratch. The original implementation serves as ground truth for judging the result.
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
1. Identify features in the repo that can be cleanly carved out
|
|
32
|
+
2. Carve a random subset of n features (delete the code, keep the rest)
|
|
33
|
+
3. Baseline: have agents rebuild each carved feature in parallel, judge the results
|
|
34
|
+
against the original implementation, collect scores + doc suggestions
|
|
35
|
+
4. Loop N times:
|
|
36
|
+
a. Docs refactor agent reads judge suggestions and edits docs holistically
|
|
37
|
+
b. Re-eval: rebuild in parallel, judge, get new scores + doc suggestions
|
|
38
|
+
c. Keep only doc changes that improve scores
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
Try it now! Simply run the `run-evalbuff.ts` script with the path to your repo:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
bun run src/run-evalbuff.ts \
|
|
47
|
+
--repo /path/to/repo \
|
|
48
|
+
[--n 20] \
|
|
49
|
+
[--parallelism 3] \
|
|
50
|
+
[--loops 3] \
|
|
51
|
+
[--init-command "npm install"]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
| Flag | Description |
|
|
55
|
+
|------|-------------|
|
|
56
|
+
| `--repo` | Path to the repo to optimize docs for |
|
|
57
|
+
| `--n` | Number of features to carve per eval round (default: 20) |
|
|
58
|
+
| `--parallelism` | How many agent runs to execute in parallel (default: 3) |
|
|
59
|
+
| `--loops` | Number of doc-improvement iterations (default: 3) |
|
|
60
|
+
| `--init-command` | Setup command to run in the repo before each agent run (e.g. `npm install`) |
|
|
61
|
+
| `--coding-model` | Model for the coding agent (default: sonnet) |
|
|
62
|
+
| `--docs-model` | Model for the docs writer agent (default: opus) |
|
|
63
|
+
|
|
64
|
+
## Testing
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
bun run test
|
|
68
|
+
bun run test:all
|
|
69
|
+
bun run test:e2e
|
|
70
|
+
bun run typecheck
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Artifacts
|
|
74
|
+
|
|
75
|
+
Run artifacts are written under:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
$TMPDIR/evalbuff-run-<timestamp>/
|
|
79
|
+
```
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
 * A feature identified by the planning agent as safe to carve out of the
 * target repo and use as an evaluation task.
 */
export interface CarveCandidate {
    // Short kebab-case identifier (used in branch/worktree names).
    id: string;
    // Human-readable feature name.
    name: string;
    // Natural-language prompt a developer would use to ask for this feature.
    prompt: string;
    // What the feature does and why it exists.
    description: string;
    // Files that ARE the feature (deleted or modified to remove it).
    files: string[];
    // Other files that import or reference the feature.
    relevantFiles: string[];
    complexity: 'small' | 'medium' | 'large';
}
/** Output of the planning phase: candidate list plus the planner's rationale. */
export interface CarvePlan {
    candidates: CarveCandidate[];
    reasoning: string;
}
/** A single file change needed to carve a feature out (or to re-apply the carve). */
export interface FileOperation {
    // Repo-relative path.
    path: string;
    action: 'delete' | 'modify';
    // Full post-carve file content; present only when action is 'modify'.
    newContent?: string;
}
/** A successfully carved feature, including the ground truth needed for judging. */
export interface CarvedFeature {
    id: string;
    prompt: string;
    description: string;
    complexity: 'small' | 'medium' | 'large';
    /** Files as they exist before carving (the "ground truth" to rebuild) */
    originalFiles: Record<string, string>;
    /** Operations to perform to carve the feature out */
    operations: FileOperation[];
    /** Unified diff of the carving (from git diff) */
    diff: string;
}
/** Top-level result of a carving run, serialized to JSON on disk. */
export interface CarveResult {
    repoPath: string;
    // ISO-8601 timestamp of when the carve run was generated.
    generationDate: string;
    features: CarvedFeature[];
}
/** Phase 1: ask an agent to analyze `repoPath` and propose carve candidates. */
export declare function planFeatures(repoPath: string): Promise<CarvePlan>;
/** Phase 2: carve one candidate in an isolated worktree; null on failure/no-op. */
export declare function carveFeature(repoPath: string, candidate: CarveCandidate): Promise<CarvedFeature | null>;
/** Orchestrator: plan, select up to `count` candidates, carve each, save JSON. */
export declare function carveFeatures(repoPath: string, options?: {
    count?: number;
    outputPath?: string;
}): Promise<CarveResult>;
|
|
42
|
+
//# sourceMappingURL=carve-features.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"carve-features.d.ts","sourceRoot":"","sources":["../src/carve-features.ts"],"names":[],"mappings":"AAgBA,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAA;IACV,IAAI,EAAE,MAAM,CAAA;IACZ,MAAM,EAAE,MAAM,CAAA;IACd,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,aAAa,EAAE,MAAM,EAAE,CAAA;IACvB,UAAU,EAAE,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAA;CACzC;AAED,MAAM,WAAW,SAAS;IACxB,UAAU,EAAE,cAAc,EAAE,CAAA;IAC5B,SAAS,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAA;IACZ,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAA;IAC3B,UAAU,CAAC,EAAE,MAAM,CAAA;CACpB;AAED,MAAM,WAAW,aAAa;IAC5B,EAAE,EAAE,MAAM,CAAA;IACV,MAAM,EAAE,MAAM,CAAA;IACd,WAAW,EAAE,MAAM,CAAA;IACnB,UAAU,EAAE,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAA;IACxC,yEAAyE;IACzE,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACrC,qDAAqD;IACrD,UAAU,EAAE,aAAa,EAAE,CAAA;IAC3B,kDAAkD;IAClD,IAAI,EAAE,MAAM,CAAA;CACb;AAED,MAAM,WAAW,WAAW;IAC1B,QAAQ,EAAE,MAAM,CAAA;IAChB,cAAc,EAAE,MAAM,CAAA;IACtB,QAAQ,EAAE,aAAa,EAAE,CAAA;CAC1B;AAQD,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CA6FvE;AAID,wBAAsB,YAAY,CAChC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,cAAc,GACxB,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,CA+G/B;AAoCD,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE;IACP,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,UAAU,CAAC,EAAE,MAAM,CAAA;CACf,GACL,OAAO,CAAC,WAAW,CAAC,CAsDtB"}
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Feature Carver for evalbuff v2.
|
|
3
|
+
*
|
|
4
|
+
* Uses Codex agents to:
|
|
5
|
+
* 1. Analyze a codebase to identify discrete, self-contained features
|
|
6
|
+
* 2. Carve each feature out in an isolated git worktree, running typecheck/tests to verify
|
|
7
|
+
* 3. Capture the real git diff as the ground truth
|
|
8
|
+
*/
|
|
9
|
+
import { execSync } from 'child_process';
|
|
10
|
+
import fs from 'fs';
|
|
11
|
+
import path from 'path';
|
|
12
|
+
import { Codex } from '@openai/codex-sdk';
|
|
13
|
+
// --- Constants ---
|
|
14
|
+
const RESULT_FILE = 'evalbuff-carve-result.json';
|
|
15
|
+
// --- Phase 1: Identify features to carve (Codex agent) ---
|
|
16
|
+
/**
 * Phase 1: run a Codex agent over the repo (read-only sandbox) and have it
 * propose 15-25 discrete, self-contained carve candidates.
 *
 * The agent is instructed to write its answer to RESULT_FILE inside the repo.
 * If that file is missing after the run, we fall back to scraping a JSON
 * object containing "candidates" out of the agent's final response text.
 *
 * @param {string} repoPath - Absolute path to the target repository.
 * @returns {Promise<CarvePlan>} The parsed plan ({ reasoning, candidates }).
 * @throws If neither the result file nor the response text yields a plan,
 *         or if the produced JSON is malformed (JSON.parse throws).
 */
export async function planFeatures(repoPath) {
    const codex = new Codex({
        apiKey: process.env.OPENAI_API_KEY,
    });
    // Read-only sandbox: the planning phase must never modify the target repo.
    const thread = codex.startThread({
        model: 'gpt-5.4',
        workingDirectory: repoPath,
        approvalPolicy: 'never',
        sandboxMode: 'read-only',
        webSearchMode: 'live',
        modelReasoningEffort: 'high',
    });
    console.log('Planning features to carve...');
    const prompt = `You are an expert software architect. Analyze this codebase to identify 15-25 discrete, self-contained features that can be cleanly "carved out" (deleted) and used as coding evaluation tasks.

Explore the codebase thoroughly — read the file tree, key config files, entry points, and source files to understand the architecture.

## What makes a GOOD carve candidate

- A React component + its usage sites + unit tests + docs
- An API endpoint (route + handler + types + unit tests + docs)
- A CLI subcommand or flag
- A utility module used in a few places
- A feature behind a config/flag including tests and docs
- A test suite for a specific module
- A middleware or plugin
- An integration with an external service

Each feature should:
1. Be self-contained — removing it leaves the rest of the codebase functional
2. Be describable in 1-2 sentences — a developer could ask for it naturally
3. Be non-trivial but bounded — not a one-liner, but not "rewrite the whole app"
4. Not overlap with other candidates

## What makes a BAD candidate

- Core infrastructure that everything depends on (routing, auth framework, database connection)
- A single function that's called in 50 places
- Trivially small changes (rename, config tweak)
- Auto-generated or boilerplate code

## Output

After your analysis, write a file called \`${RESULT_FILE}\` with this JSON structure:

\`\`\`json
{
  "reasoning": "Your analysis of the codebase and approach to selecting features",
  "candidates": [
    {
      "id": "short-kebab-id",
      "name": "Human readable name",
      "prompt": "Natural prompt a developer would use to ask for this feature, 1-2 sentences",
      "description": "What this feature does and why it exists",
      "files": ["path/to/file1.ts", "path/to/file2.tsx"],
      "relevantFiles": ["path/to/importer.ts"],
      "complexity": "small|medium|large"
    }
  ]
}
\`\`\`

- **files**: The files that ARE the feature (to be deleted or modified to remove it). Be thorough — missing a file means the carve won't be clean.
- **relevantFiles**: Other files that import or reference the feature.

You MUST write the result file as your last action.`;
    const result = await thread.run(prompt);
    // Read the result file
    const resultPath = path.join(repoPath, RESULT_FILE);
    if (!fs.existsSync(resultPath)) {
        // Try to extract from the agent's final response
        const jsonMatch = result.finalResponse.match(/\{[\s\S]*"candidates"[\s\S]*\}/);
        if (jsonMatch) {
            const parsed = JSON.parse(jsonMatch[0]);
            console.log(`Identified ${parsed.candidates.length} carve candidates (from response)`);
            return parsed;
        }
        throw new Error('Codex agent did not produce a result file');
    }
    try {
        const raw = fs.readFileSync(resultPath, 'utf-8');
        const parsed = JSON.parse(raw);
        console.log(`Identified ${parsed.candidates.length} carve candidates`);
        return parsed;
    }
    finally {
        // Always remove the scratch result file so it never pollutes the repo,
        // even when JSON.parse above throws.
        fs.rmSync(resultPath, { force: true });
    }
}
|
|
105
|
+
// --- Phase 2: Carve a feature in an isolated worktree ---
|
|
106
|
+
/**
 * Phase 2: carve one candidate feature out of the repo.
 *
 * Runs inside an isolated `git worktree` on a throwaway branch
 * (`evalbuff-carve-<id>-<timestamp>`) so the source checkout is never
 * mutated. A Codex agent (workspace-write sandbox) performs the removal and
 * is asked to verify typecheck/tests; the resulting staged git diff is
 * captured as the ground truth for judging.
 *
 * @param {string} repoPath  - Absolute path to the source repository.
 * @param {CarveCandidate} candidate - The feature to remove.
 * @returns {Promise<CarvedFeature|null>} The carved feature, or null when the
 *          agent made no changes or any step threw. The worktree and branch
 *          are cleaned up in all cases (best-effort).
 */
export async function carveFeature(repoPath, candidate) {
    console.log(` Carving feature: ${candidate.id}...`);
    // Save original files before carving
    const originalFiles = {};
    for (const filePath of candidate.files) {
        const fullPath = path.join(repoPath, filePath);
        if (fs.existsSync(fullPath)) {
            originalFiles[filePath] = fs.readFileSync(fullPath, 'utf-8');
        }
    }
    // Create a git worktree for isolated carving
    const worktreePath = `${repoPath}-carve-${candidate.id}`;
    // Date.now() suffix keeps branch names unique across repeated runs.
    const branchName = `evalbuff-carve-${candidate.id}-${Date.now()}`;
    try {
        execSync(`git worktree add -b "${branchName}" "${worktreePath}" HEAD`, {
            cwd: repoPath,
            stdio: 'ignore',
        });
        // Run the Codex agent in the worktree to carve the feature
        const codex = new Codex({
            apiKey: process.env.OPENAI_API_KEY,
        });
        const thread = codex.startThread({
            model: 'gpt-5.4',
            workingDirectory: worktreePath,
            approvalPolicy: 'never',
            sandboxMode: 'workspace-write',
            webSearchMode: 'live',
            modelReasoningEffort: 'high',
        });
        const prompt = `You are a precise code surgeon. Your job is to cleanly remove the following feature from this codebase.

## Feature to Remove
**Name:** ${candidate.name}
**Description:** ${candidate.description}

**Feature files (to delete or modify):** ${candidate.files.join(', ')}
**Other relevant files to check for references:** ${candidate.relevantFiles?.join(', ') || '(none)'}

## Rules

1. **Delete completely** — remove ALL code related to the feature: components, handlers, types, tests, docs, imports, route registrations, etc.
2. **Don't break the rest** — the remaining code must still compile and pass tests. Fix imports, remove dead references, etc.
3. **Minimal collateral** — only remove what's necessary. Don't "improve" or refactor surrounding code.
4. **Be thorough** — search for references in other files. If file A imports something from the feature, update file A's imports.
5. **Verify your work** — after making changes, run the typecheck command (check package.json for the right command, typically \`tsc --noEmit\` or \`npx tsc --noEmit\`). Fix any errors that result from your changes. Also run the test suite if one exists.

## Process

1. Read the feature files and understand what to remove
2. Search for all references/imports of the feature across the codebase
3. Delete feature-only files, edit shared files to remove feature code
4. Run typecheck and fix any compilation errors
5. Run tests if available and fix any failures caused by the removal (remove tests for the deleted feature, fix tests that referenced it)

Do NOT create any result files — just make the edits directly.`;
        await thread.run(prompt);
        // Capture the diff
        execSync('git add -A', { cwd: worktreePath, stdio: 'ignore' });
        // maxBuffer raised to 10 MiB: large carves can exceed execSync's default.
        const diff = execSync('git diff --cached HEAD', {
            cwd: worktreePath,
            encoding: 'utf-8',
            maxBuffer: 10 * 1024 * 1024,
        });
        if (!diff.trim()) {
            console.warn(` No changes made for ${candidate.id}`);
            return null;
        }
        // Build operations from the actual git diff
        const operations = buildOperationsFromDiff(worktreePath, repoPath, candidate.files);
        console.log(` Carved ${candidate.id}: ${operations.length} file operations, ${diff.split('\n').length} diff lines`);
        return {
            id: candidate.id,
            prompt: candidate.prompt,
            description: candidate.description,
            complexity: candidate.complexity,
            originalFiles,
            operations,
            diff,
        };
    }
    catch (error) {
        // Swallow the error (a failed carve should not abort the whole run);
        // log a truncated message and report failure via null.
        const msg = error instanceof Error ? error.message : String(error);
        console.error(` Failed to carve ${candidate.id}: ${msg.slice(0, 200)}`);
        return null;
    }
    finally {
        // Clean up worktree and branch
        try {
            execSync(`git worktree remove --force "${worktreePath}"`, {
                cwd: repoPath,
                stdio: 'ignore',
            });
        }
        catch { /* ignore */ }
        try {
            execSync(`git branch -D "${branchName}"`, {
                cwd: repoPath,
                stdio: 'ignore',
            });
        }
        catch { /* ignore */ }
    }
}
|
|
210
|
+
/**
|
|
211
|
+
* Build FileOperation[] by comparing worktree state against the original repo.
|
|
212
|
+
*/
|
|
213
|
+
/**
 * Build FileOperation[] by comparing worktree state against the original repo.
 *
 * Reads `git diff --cached --name-status HEAD` in the worktree and converts
 * each changed path into a delete/modify operation.
 *
 * Fix: rename/copy statuses (`R<score>\told\tnew`, `C<score>\told\tnew`) were
 * previously dropped entirely (they match neither 'D' nor 'M'/'A'), silently
 * losing both sides of a rename from the carve. They are now expanded into a
 * delete of the old path (renames only) plus a modify of the new path.
 *
 * @param {string} worktreePath - Worktree containing the staged carve.
 * @param {string} repoPath - Source repo path (unused; kept for call compatibility).
 * @param {string[]} featureFiles - Candidate file list (unused; kept for call compatibility).
 * @returns {FileOperation[]} Operations needed to reproduce the carve.
 */
function buildOperationsFromDiff(worktreePath, repoPath, featureFiles) {
    const operations = [];
    // Get list of changed files from git
    const statusOutput = execSync('git diff --cached --name-status HEAD', {
        cwd: worktreePath,
        encoding: 'utf-8',
    });
    for (const line of statusOutput.trim().split('\n')) {
        if (!line.trim())
            continue;
        const [status, ...pathParts] = line.split('\t');
        if (status === 'D') {
            // Join preserves (unlikely) tabs embedded in a single path field.
            operations.push({ path: pathParts.join('\t'), action: 'delete' });
        }
        else if (status === 'M' || status === 'A') {
            const filePath = pathParts.join('\t');
            const newContent = fs.readFileSync(path.join(worktreePath, filePath), 'utf-8');
            operations.push({ path: filePath, action: 'modify', newContent });
        }
        else if (status.startsWith('R') || status.startsWith('C')) {
            // Rename/copy: two tab-separated paths follow the scored status.
            const [oldPath, newPath] = pathParts;
            if (status.startsWith('R')) {
                // A rename removes the old path; a copy leaves it in place.
                operations.push({ path: oldPath, action: 'delete' });
            }
            const newContent = fs.readFileSync(path.join(worktreePath, newPath), 'utf-8');
            operations.push({ path: newPath, action: 'modify', newContent });
        }
        // Other statuses (T, U, X) are intentionally ignored, as before.
    }
    return operations;
}
|
|
235
|
+
// --- Main orchestrator ---
|
|
236
|
+
/**
 * Main orchestrator: plan candidates, select up to `count` of them
 * (preferring medium complexity), carve each sequentially, and save the
 * resulting CarveResult as JSON.
 *
 * Fix: the complexity comparator previously indexed the order map with
 * whatever string the planning agent produced; an unexpected value yielded
 * `undefined - undefined = NaN`, making the sort order undefined. Unknown
 * complexities now rank last via a `?? 3` fallback, and the lookup table is
 * hoisted out of the comparator.
 *
 * @param {string} repoPath - Absolute path to the target repository.
 * @param {{count?: number, outputPath?: string}} [options] - count defaults
 *        to 10; outputPath defaults to `carve-<YYYY-MM-DD>.json` in the repo.
 * @returns {Promise<CarveResult>} The saved result (failed carves are skipped).
 */
export async function carveFeatures(repoPath, options = {}) {
    const { count = 10, outputPath } = options;
    console.log(`\nCarving features from: ${repoPath}`);
    console.log(`Target: ${count} features\n`);
    // Phase 1: Plan
    const plan = await planFeatures(repoPath);
    console.log(`\nPlanning complete. Reasoning:\n${plan.reasoning}\n`);
    console.log('Candidates:');
    for (const c of plan.candidates) {
        console.log(`  ${c.id} (${c.complexity}): ${c.name}`);
        console.log(`    Prompt: ${c.prompt}`);
        console.log(`    Files: ${c.files.join(', ')}`);
    }
    // Select top N candidates (prefer medium complexity; unknown values last).
    const complexityOrder = { medium: 0, small: 1, large: 2 };
    const rank = (c) => complexityOrder[c.complexity] ?? 3;
    // Copy before sorting so plan.candidates is left untouched.
    const ranked = [...plan.candidates].sort((a, b) => rank(a) - rank(b));
    const selected = ranked.slice(0, count);
    console.log(`\nSelected ${selected.length} features for carving:\n`);
    // Phase 2: Carve each feature sequentially (each carve uses its own worktree).
    const features = [];
    for (const candidate of selected) {
        try {
            const carved = await carveFeature(repoPath, candidate);
            if (carved) {
                features.push(carved);
                console.log(`  Done: ${carved.id} — ${carved.operations.length} file operations`);
            }
        }
        catch (error) {
            // One failed carve must not abort the remaining candidates.
            console.error(`  Failed: ${candidate.id}:`, error);
        }
    }
    const result = {
        repoPath,
        generationDate: new Date().toISOString(),
        features,
    };
    // Save output
    const outPath = outputPath ||
        path.join(repoPath, `carve-${new Date().toISOString().slice(0, 10)}.json`);
    fs.writeFileSync(outPath, JSON.stringify(result, null, 2));
    console.log(`\nSaved ${features.length} carved features to: ${outPath}`);
    return result;
}
|
|
282
|
+
// --- CLI ---
|
|
283
|
+
// Direct-invocation entry point (Bun's import.meta.main guard): parse flags
// and run the carver. Not executed when this module is imported.
if (import.meta.main) {
    const args = process.argv.slice(2);
    // Return the value following `--name`, or the default; throw when a
    // required flag is absent.
    const getArg = (name, defaultValue) => {
        const idx = args.indexOf(`--${name}`);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        if (defaultValue !== undefined)
            return defaultValue;
        throw new Error(`Missing required argument: --${name}`);
    };
    const repoPath = getArg('repo');
    // Fix: parseInt previously had no radix and no NaN guard, so
    // `--count abc` produced NaN, which made `ranked.slice(0, NaN)` silently
    // select zero features. Fail fast with a clear error instead.
    const rawCount = getArg('count', '10');
    const count = Number.parseInt(rawCount, 10);
    if (!Number.isInteger(count) || count <= 0) {
        throw new Error(`Invalid value for --count: "${rawCount}" (expected a positive integer)`);
    }
    const outputPath = args.indexOf('--output') >= 0 ? getArg('output') : undefined;
    carveFeatures(repoPath, { count, outputPath })
        .then((result) => {
        console.log(`\nDone! Carved ${result.features.length} features.`);
    })
        .catch((error) => {
        console.error('Carving failed:', error);
        process.exit(1);
    });
}
|
|
305
|
+
//# sourceMappingURL=carve-features.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"carve-features.js","sourceRoot":"","sources":["../src/carve-features.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AACH,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAA;AACxC,OAAO,EAAE,MAAM,IAAI,CAAA;AACnB,OAAO,IAAI,MAAM,MAAM,CAAA;AAEvB,OAAO,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAA;AA4CzC,oBAAoB;AAEpB,MAAM,WAAW,GAAG,4BAA4B,CAAA;AAEhD,4DAA4D;AAE5D,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,KAAK,GAAG,IAAI,KAAK,CAAC;QACtB,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;KACnC,CAAC,CAAA;IAEF,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC;QAC/B,KAAK,EAAE,SAAS;QAChB,gBAAgB,EAAE,QAAQ;QAC1B,cAAc,EAAE,OAAO;QACvB,WAAW,EAAE,WAAW;QACxB,aAAa,EAAE,MAAM;QACrB,oBAAoB,EAAE,MAAM;KAC7B,CAAC,CAAA;IAEF,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAA;IAE5C,MAAM,MAAM,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;6CA8B4B,WAAW;;;;;;;;;;;;;;;;;;;;;;oDAsBJ,CAAA;IAElD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAEvC,uBAAuB;IACvB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IACnD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,iDAAiD;QACjD,MAAM,SAAS,GAAG,MAAM,CAAC,aAAa,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAA;QAC9E,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAc,CAAA;YACpD,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,UAAU,CAAC,MAAM,mCAAmC,CAAC,CAAA;YACtF,OAAO,MAAM,CAAA;QACf,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAA;IAC9D,CAAC;IAED,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAA;QAChD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAc,CAAA;QAC3C,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,UAAU,CAAC,MAAM,mBAAmB,CAAC,CAAA;QACtE,OAAO,MAAM,CAAA;IACf,CAAC;YAAS,CAAC;QACT,EAAE,CAAC,MAAM,CAAC,UAAU,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAA;IACxC,CAAC;AACH,CAAC;AAED,2DAA2D;AAE3D,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAgB,EAChB,SAAyB;IAEzB,OAAO,CAAC,GAAG,CAAC,sBAAsB,SAAS,CAAC,EAAE,KAAK,CAAC,CAAA;IAEpD,qCAAqC;IACrC,MAAM,aAAa,GAA2B,EAAE,CAAA;IAChD,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAA;QAC9C,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAA
C,EAAE,CAAC;YAC5B,aAAa,CAAC,QAAQ,CAAC,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;QAC9D,CAAC;IACH,CAAC;IAED,6CAA6C;IAC7C,MAAM,YAAY,GAAG,GAAG,QAAQ,UAAU,SAAS,CAAC,EAAE,EAAE,CAAA;IACxD,MAAM,UAAU,GAAG,kBAAkB,SAAS,CAAC,EAAE,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE,CAAA;IAEjE,IAAI,CAAC;QACH,QAAQ,CAAC,wBAAwB,UAAU,MAAM,YAAY,QAAQ,EAAE;YACrE,GAAG,EAAE,QAAQ;YACb,KAAK,EAAE,QAAQ;SAChB,CAAC,CAAA;QAEF,2DAA2D;QAC3D,MAAM,KAAK,GAAG,IAAI,KAAK,CAAC;YACtB,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc;SACnC,CAAC,CAAA;QAEF,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC;YAC/B,KAAK,EAAE,SAAS;YAChB,gBAAgB,EAAE,YAAY;YAC9B,cAAc,EAAE,OAAO;YACvB,WAAW,EAAE,iBAAiB;YAC9B,aAAa,EAAE,MAAM;YACrB,oBAAoB,EAAE,MAAM;SAC7B,CAAC,CAAA;QAEF,MAAM,MAAM,GAAG;;;YAGP,SAAS,CAAC,IAAI;mBACP,SAAS,CAAC,WAAW;;2CAEG,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;oDACjB,SAAS,CAAC,aAAa,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,QAAQ;;;;;;;;;;;;;;;;;;+DAkBpC,CAAA;QAE3D,MAAM,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;QAExB,mBAAmB;QACnB,QAAQ,CAAC,YAAY,EAAE,EAAE,GAAG,EAAE,YAAY,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAA;QAC9D,MAAM,IAAI,GAAG,QAAQ,CAAC,wBAAwB,EAAE;YAC9C,GAAG,EAAE,YAAY;YACjB,QAAQ,EAAE,OAAO;YACjB,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;SAC5B,CAAC,CAAA;QAEF,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YACjB,OAAO,CAAC,IAAI,CAAC,yBAAyB,SAAS,CAAC,EAAE,EAAE,CAAC,CAAA;YACrD,OAAO,IAAI,CAAA;QACb,CAAC;QAED,4CAA4C;QAC5C,MAAM,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,QAAQ,EAAE,SAAS,CAAC,KAAK,CAAC,CAAA;QAEnF,OAAO,CAAC,GAAG,CAAC,YAAY,SAAS,CAAC,EAAE,KAAK,UAAU,CAAC,MAAM,qBAAqB,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,aAAa,CAAC,CAAA;QAEpH,OAAO;YACL,EAAE,EAAE,SAAS,CAAC,EAAE;YAChB,MAAM,EAAE,SAAS,CAAC,MAAM;YACxB,WAAW,EAAE,SAAS,CAAC,WAAW;YAClC,UAAU,EAAE,SAAS,CAAC,UAAU;YAChC,aAAa;YACb,UAAU;YACV,IAAI;SACL,CAAA;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAClE,OAAO,CAAC,KAAK,CAAC,qBAAqB,SAAS,CAAC,EAAE,KAAK,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAA;QACxE,OAAO,IAAI,CAAA;IACb,CAAC;YAAS,CAAC;QACT,+BAA+B;QAC/B,IAAI,CAAC
;YACH,QAAQ,CAAC,gCAAgC,YAAY,GAAG,EAAE;gBACxD,GAAG,EAAE,QAAQ;gBACb,KAAK,EAAE,QAAQ;aAChB,CAAC,CAAA;QACJ,CAAC;QAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;QACxB,IAAI,CAAC;YACH,QAAQ,CAAC,kBAAkB,UAAU,GAAG,EAAE;gBACxC,GAAG,EAAE,QAAQ;gBACb,KAAK,EAAE,QAAQ;aAChB,CAAC,CAAA;QACJ,CAAC;QAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAC1B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,uBAAuB,CAC9B,YAAoB,EACpB,QAAgB,EAChB,YAAsB;IAEtB,MAAM,UAAU,GAAoB,EAAE,CAAA;IAEtC,qCAAqC;IACrC,MAAM,YAAY,GAAG,QAAQ,CAAC,sCAAsC,EAAE;QACpE,GAAG,EAAE,YAAY;QACjB,QAAQ,EAAE,OAAO;KAClB,CAAC,CAAA;IAEF,KAAK,MAAM,IAAI,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACnD,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YAAE,SAAQ;QAC1B,MAAM,CAAC,MAAM,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;QAC/C,MAAM,QAAQ,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAErC,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;YACnB,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAA;QACvD,CAAC;aAAM,IAAI,MAAM,KAAK,GAAG,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;YAC5C,MAAM,UAAU,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAA;YAC9E,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAC,CAAA;QACnE,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED,4BAA4B;AAE5B,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB,EAChB,UAGI,EAAE;IAEN,MAAM,EAAE,KAAK,GAAG,EAAE,EAAE,UAAU,EAAE,GAAG,OAAO,CAAA;IAE1C,OAAO,CAAC,GAAG,CAAC,4BAA4B,QAAQ,EAAE,CAAC,CAAA;IACnD,OAAO,CAAC,GAAG,CAAC,WAAW,KAAK,aAAa,CAAC,CAAA;IAE1C,gBAAgB;IAChB,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,QAAQ,CAAC,CAAA;IAEzC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,CAAC,SAAS,IAAI,CAAC,CAAA;IACnE,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAA;IAC1B,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;QAChC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,UAAU,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QACrD,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,MAAM,EAAE,CAAC,CAAA;QACtC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACjD,CAAC;IAED,qDAAqD;IACrD,MAAM,MAAM,GAAG,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC
,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAChD,MAAM,eAAe,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;QACzD,OAAO,eAAe,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,eAAe,CAAC,CAAC,CAAC,UAAU,CAAC,CAAA;IACtE,CAAC,CAAC,CAAA;IACF,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAA;IAEvC,OAAO,CAAC,GAAG,CAAC,cAAc,QAAQ,CAAC,MAAM,0BAA0B,CAAC,CAAA;IAEpE,8BAA8B;IAC9B,MAAM,QAAQ,GAAoB,EAAE,CAAA;IACpC,KAAK,MAAM,SAAS,IAAI,QAAQ,EAAE,CAAC;QACjC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAA;YACtD,IAAI,MAAM,EAAE,CAAC;gBACX,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;gBACrB,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,EAAE,MAAM,MAAM,CAAC,UAAU,CAAC,MAAM,kBAAkB,CAAC,CAAA;YACnF,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,aAAa,SAAS,CAAC,EAAE,GAAG,EAAE,KAAK,CAAC,CAAA;QACpD,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAgB;QAC1B,QAAQ;QACR,cAAc,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACxC,QAAQ;KACT,CAAA;IAED,cAAc;IACd,MAAM,OAAO,GACX,UAAU;QACV,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAA;IAC5E,EAAE,CAAC,aAAa,CAAC,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;IAC1D,OAAO,CAAC,GAAG,CAAC,WAAW,QAAQ,CAAC,MAAM,wBAAwB,OAAO,EAAE,CAAC,CAAA;IAExE,OAAO,MAAM,CAAA;AACf,CAAC;AAED,cAAc;AAEd,IAAI,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;IACrB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;IAElC,MAAM,MAAM,GAAG,CAAC,IAAY,EAAE,YAAqB,EAAU,EAAE;QAC7D,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC,CAAA;QACrC,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,CAAA;QAC3D,IAAI,YAAY,KAAK,SAAS;YAAE,OAAO,YAAY,CAAA;QACnD,MAAM,IAAI,KAAK,CAAC,gCAAgC,IAAI,EAAE,CAAC,CAAA;IACzD,CAAC,CAAA;IAED,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAA;IAC/B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAA;IAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;IAE/E,aAAa,CAAC,QAAQ,EA
AE,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC;SAC3C,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE;QACf,OAAO,CAAC,GAAG,CAAC,kBAAkB,MAAM,CAAC,QAAQ,CAAC,MAAM,YAAY,CAAC,CAAA;IACnE,CAAC,CAAC;SACD,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;QACf,OAAO,CAAC,KAAK,CAAC,iBAAiB,EAAE,KAAK,CAAC,CAAA;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC,CAAC,CAAA;AACN,CAAC"}
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Evalbuff CLI entry point.
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* evalbuff --repo /path/to/repo [--n 20] [--parallelism 10] [--loops 3]
|
|
7
|
+
* [--init-command "npm install"] [--coding-model sonnet] [--docs-model opus]
|
|
8
|
+
* [--cached-features /path/to/features.json]
|
|
9
|
+
*/
|
|
10
|
+
import { runEvalbuff } from './run-evalbuff';
|
|
11
|
+
const args = process.argv.slice(2);
|
|
12
|
+
const getArg = (name, defaultValue) => {
|
|
13
|
+
const idx = args.indexOf(`--${name}`);
|
|
14
|
+
if (idx >= 0 && idx + 1 < args.length)
|
|
15
|
+
return args[idx + 1];
|
|
16
|
+
if (defaultValue !== undefined)
|
|
17
|
+
return defaultValue;
|
|
18
|
+
throw new Error(`Missing required argument: --${name}`);
|
|
19
|
+
};
|
|
20
|
+
const hasArg = (name) => args.includes(`--${name}`);
|
|
21
|
+
const repoPath = getArg('repo');
|
|
22
|
+
const n = parseInt(getArg('n', '20'));
|
|
23
|
+
const parallelism = parseInt(getArg('parallelism', '10'));
|
|
24
|
+
const loops = parseInt(getArg('loops', '3'));
|
|
25
|
+
const initCommand = hasArg('init-command') ? getArg('init-command') : undefined;
|
|
26
|
+
const codingModel = getArg('coding-model', 'sonnet');
|
|
27
|
+
const docsModel = getArg('docs-model', 'opus');
|
|
28
|
+
const cachedFeatures = hasArg('cached-features') ? getArg('cached-features') : undefined;
|
|
29
|
+
runEvalbuff({
|
|
30
|
+
repoPath,
|
|
31
|
+
n,
|
|
32
|
+
parallelism,
|
|
33
|
+
loops,
|
|
34
|
+
initCommand,
|
|
35
|
+
codingModel,
|
|
36
|
+
docsModel,
|
|
37
|
+
cachedFeatures,
|
|
38
|
+
}).catch((error) => {
|
|
39
|
+
console.error('Evalbuff run failed:', error);
|
|
40
|
+
process.exit(1);
|
|
41
|
+
});
|
|
42
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;;;;;;GAOG;AACH,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAA;AAE5C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;AAElC,MAAM,MAAM,GAAG,CAAC,IAAY,EAAE,YAAqB,EAAU,EAAE;IAC7D,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC,CAAA;IACrC,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,GAAG,CAAC,GAAG,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,CAAA;IAC3D,IAAI,YAAY,KAAK,SAAS;QAAE,OAAO,YAAY,CAAA;IACnD,MAAM,IAAI,KAAK,CAAC,gCAAgC,IAAI,EAAE,CAAC,CAAA;AACzD,CAAC,CAAA;AACD,MAAM,MAAM,GAAG,CAAC,IAAY,EAAW,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,IAAI,EAAE,CAAC,CAAA;AAEpE,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAA;AAC/B,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAA;AACrC,MAAM,WAAW,GAAG,QAAQ,CAAC,MAAM,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC,CAAA;AACzD,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC,CAAA;AAC5C,MAAM,WAAW,GAAG,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;AAC/E,MAAM,WAAW,GAAG,MAAM,CAAC,cAAc,EAAE,QAAQ,CAAC,CAAA;AACpD,MAAM,SAAS,GAAG,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAA;AAC9C,MAAM,cAAc,GAAG,MAAM,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,SAAS,CAAA;AAExF,WAAW,CAAC;IACV,QAAQ;IACR,CAAC;IACD,WAAW;IACX,KAAK;IACL,WAAW;IACX,WAAW;IACX,SAAS;IACT,cAAc;CACf,CAAC,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACjB,OAAO,CAAC,KAAK,CAAC,sBAAsB,EAAE,KAAK,CAAC,CAAA;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;AACjB,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { TaskResult } from './eval-runner';
|
|
2
|
+
export declare function collectDocSuggestions(tasks: TaskResult[]): string;
|
|
3
|
+
export declare function runDocsRefactorAgent(repoPath: string, judgeSuggestions: string, model: string): Promise<void>;
|
|
4
|
+
//# sourceMappingURL=docs-refactor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-refactor.d.ts","sourceRoot":"","sources":["../src/docs-refactor.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,eAAe,CAAA;AAE/C,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,MAAM,CAcjE;AAED,wBAAsB,oBAAoB,CACxC,QAAQ,EAAE,MAAM,EAChB,gBAAgB,EAAE,MAAM,EACxB,KAAK,EAAE,MAAM,GACZ,OAAO,CAAC,IAAI,CAAC,CAwGf"}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { execSync } from 'child_process';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import os from 'os';
|
|
4
|
+
import path from 'path';
|
|
5
|
+
import { ClaudeRunner } from './runners/claude';
|
|
6
|
+
import { syncDocsIntoRepo } from './eval-helpers';
|
|
7
|
+
export function collectDocSuggestions(tasks) {
|
|
8
|
+
const sections = [];
|
|
9
|
+
for (const task of tasks) {
|
|
10
|
+
const suggestions = task.judging.docSuggestions;
|
|
11
|
+
if (!suggestions || suggestions.length === 0)
|
|
12
|
+
continue;
|
|
13
|
+
sections.push(`### ${task.featureId} (score: ${task.score.toFixed(1)}/10)\n` +
|
|
14
|
+
suggestions.map((s) => `- ${s}`).join('\n'));
|
|
15
|
+
}
|
|
16
|
+
return sections.join('\n\n');
|
|
17
|
+
}
|
|
18
|
+
export async function runDocsRefactorAgent(repoPath, judgeSuggestions, model) {
|
|
19
|
+
console.log(`\n [DocsRefactor] Running holistic docs refactor...`);
|
|
20
|
+
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-docs-'));
|
|
21
|
+
const repoDir = path.join(tempDir, 'repo');
|
|
22
|
+
const prompt = `Read ALL existing documentation (docs/, AGENTS.md, CLAUDE.md), consider the judge suggestions below, and make the documentation as useful as possible for coding agents.
|
|
23
|
+
|
|
24
|
+
## Goal
|
|
25
|
+
|
|
26
|
+
The purpose of these docs is to help a coding agent successfully build NEW features it has never seen before. The docs should teach the agent how the project works — its architecture, patterns, conventions, and rules — so it can confidently build anything, not just reconstruct specific existing features.
|
|
27
|
+
|
|
28
|
+
## Judge Suggestions
|
|
29
|
+
|
|
30
|
+
Multiple judge agents reviewed coding agent attempts and identified documentation gaps. Here are their suggestions:
|
|
31
|
+
|
|
32
|
+
${judgeSuggestions || '(No suggestions were made)'}
|
|
33
|
+
|
|
34
|
+
## What to do
|
|
35
|
+
|
|
36
|
+
1. **Extract general patterns** — each judge suggestion reflects a specific failure, but your job is to identify the underlying pattern or convention that would prevent a whole class of similar failures. Ask: "What general rule would help an agent get this right for ANY feature?"
|
|
37
|
+
2. **Do NOT reference specific features** — never mention a specific feature, component, or endpoint by name as an example of what to build. Instead, document the pattern it follows. For example, instead of "the UserProfile component fetches data in useEffect", write "components in this project fetch data using useEffect on mount, following the pattern in src/hooks/".
|
|
38
|
+
3. **Document architecture and data flow** — describe how the project is structured, how data flows through it, and where new code should be placed. These are the things an agent building something new needs most.
|
|
39
|
+
4. **Edit existing docs** — when a suggestion maps to an existing doc, make fine-grained edits rather than rewriting from scratch.
|
|
40
|
+
5. **Create new docs** — when a suggestion identifies a missing pattern or convention, create a concise new doc for it.
|
|
41
|
+
6. **Merge overlapping docs** — if multiple suggestions or existing docs cover similar topics, combine them.
|
|
42
|
+
7. **Remove redundancy** — consolidate duplicate advice. Dense, actionable information beats verbose explanations.
|
|
43
|
+
8. **Fix contradictions** — if docs disagree, pick the correct advice and remove the wrong one.
|
|
44
|
+
9. **Prune stale docs** — remove docs that reference files/patterns that no longer exist in the codebase.
|
|
45
|
+
|
|
46
|
+
Rules:
|
|
47
|
+
- ONLY modify files in docs/, AGENTS.md, or CLAUDE.md. Do NOT modify source code.
|
|
48
|
+
- **Do NOT edit AGENTS.md beyond adding new docs to its index.** The only allowed changes to AGENTS.md are: (a) adding/removing entries in the doc index when you create or delete files under docs/, and (b) correcting existing information that is factually wrong. Do NOT add new paragraphs, prose, sections, or explanatory text above or below existing content. Put all new guidance in docs/ files and link to them from the index.
|
|
49
|
+
- It's OK to delete doc files that are redundant or low-value.
|
|
50
|
+
- The goal is a minimal, high-signal set of docs that a coding agent can use to build ANY feature, including ones that don't exist yet.
|
|
51
|
+
- Less is more — 5 great docs are better than 15 mediocre ones.
|
|
52
|
+
- Document patterns, conventions, and architectural rules — not specific feature implementations.
|
|
53
|
+
- Be specific about file paths, directory structure, and conventions — but generic about what gets built.
|
|
54
|
+
|
|
55
|
+
## Docs Must Match Source Code
|
|
56
|
+
|
|
57
|
+
Docs that describe nonexistent code are WORSE than no docs at all — they actively mislead coding agents and cause them to fail.
|
|
58
|
+
|
|
59
|
+
Before writing any doc that references a helper, function, type, or script:
|
|
60
|
+
1. **grep for the exact symbol name** to confirm it exists. If it doesn't exist, DO NOT document it.
|
|
61
|
+
2. **Never document aspirational/future behavior.** Only document what the code does RIGHT NOW.
|
|
62
|
+
3. **If a judge suggestion references a helper that doesn't exist**, document the PATTERN the agent should follow instead — not a fictional API.
|
|
63
|
+
|
|
64
|
+
Wrong: "Use \`captureGitDiff()\` from src/eval-helpers.ts to capture diffs" (if it doesn't exist)
|
|
65
|
+
Right: "Diff capture should use an explicit base SHA recorded before the agent runs" (describes the pattern)
|
|
66
|
+
|
|
67
|
+
## Final Step: Spawn a Critique Sub-Agent
|
|
68
|
+
|
|
69
|
+
Before you finish, you MUST spawn a critique sub-agent via the Task tool (subagent_type: "general-purpose") to review the docs you just wrote or modified. Then apply every valid fix it identifies.
|
|
70
|
+
|
|
71
|
+
Use this exact prompt for the sub-agent:
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
You are a documentation critic. Review every file under docs/, plus AGENTS.md and CLAUDE.md, and report violations of the rules below. For each violation, give the file path, the offending text or line range, and a concrete fix (exact replacement text, the section to remove, or the split to perform).
|
|
75
|
+
|
|
76
|
+
Rules (enforce strictly):
|
|
77
|
+
|
|
78
|
+
1. **No overfitting to a single task.** Docs must describe general patterns, conventions, and architecture that apply to building ANY feature — not one specific task. Flag:
|
|
79
|
+
- Feature-specific function, type, component, endpoint, table, or CLI-subcommand names that only matter for one task and are not shared infrastructure.
|
|
80
|
+
- Examples phrased around one feature ("the UserProfile component fetches data via useEffect") instead of the general pattern ("components in src/components/ fetch data in useEffect on mount").
|
|
81
|
+
- Any symbol reference that does not represent a shared utility, pattern, or architectural boundary used by multiple features.
|
|
82
|
+
The fix is to rewrite the passage as a general rule about the pattern, directory, or convention — or delete it if it does not generalize.
|
|
83
|
+
|
|
84
|
+
2. **No code excerpts unless documenting a common utility or shared pattern.** A code block is only allowed when it shows:
|
|
85
|
+
- The signature or usage of a shared helper multiple features rely on, OR
|
|
86
|
+
- A canonical pattern every agent should copy (error handling, a standard import shape, etc.).
|
|
87
|
+
Flag any code block that shows task-specific implementation details. The fix is to delete the block or replace it with a one-line prose description of the pattern.
|
|
88
|
+
|
|
89
|
+
3. **Individual markdown files must stay focused and reasonably short.** If any single file exceeds roughly 300 lines, OR covers multiple unrelated topics, recommend splitting it into smaller topic-scoped files and specify the split (new filenames + which sections move where). Prefer many small focused docs over one large doc.
|
|
90
|
+
|
|
91
|
+
4. **Docs must match source code.** Before flagging a missing symbol, grep the repo to confirm it does not exist. Flag references to helpers, functions, types, files, or scripts that are not present in the code.
|
|
92
|
+
|
|
93
|
+
Return a numbered list of violations with fixes. If a file is clean, say so. Do not edit any files yourself — only report.
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
After the sub-agent returns, apply every valid fix it identified by editing the doc files directly. If it recommended splitting a long doc, perform the split. Re-read each affected file after fixing to confirm the result. Only then finish.`;
|
|
97
|
+
try {
|
|
98
|
+
execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { stdio: 'ignore' });
|
|
99
|
+
const headSha = execSync('git rev-parse HEAD', {
|
|
100
|
+
cwd: repoPath,
|
|
101
|
+
encoding: 'utf-8',
|
|
102
|
+
}).trim();
|
|
103
|
+
execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' });
|
|
104
|
+
syncDocsIntoRepo(repoPath, repoDir);
|
|
105
|
+
const runner = new ClaudeRunner(repoDir, {}, model, 'high');
|
|
106
|
+
await runner.run(prompt);
|
|
107
|
+
syncDocsIntoRepo(repoDir, repoPath);
|
|
108
|
+
}
|
|
109
|
+
catch (err) {
|
|
110
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
111
|
+
console.warn(` [DocsRefactor] Failed: ${msg.slice(0, 200)}`);
|
|
112
|
+
}
|
|
113
|
+
finally {
|
|
114
|
+
try {
|
|
115
|
+
fs.rmSync(tempDir, { recursive: true, force: true });
|
|
116
|
+
}
|
|
117
|
+
catch {
|
|
118
|
+
// ignore cleanup failures
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
//# sourceMappingURL=docs-refactor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-refactor.js","sourceRoot":"","sources":["../src/docs-refactor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAA;AACxC,OAAO,EAAE,MAAM,IAAI,CAAA;AACnB,OAAO,EAAE,MAAM,IAAI,CAAA;AACnB,OAAO,IAAI,MAAM,MAAM,CAAA;AAEvB,OAAO,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAA;AAC/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAA;AAIjD,MAAM,UAAU,qBAAqB,CAAC,KAAmB;IACvD,MAAM,QAAQ,GAAa,EAAE,CAAA;IAE7B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,WAAW,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,CAAA;QAC/C,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAEtD,QAAQ,CAAC,IAAI,CACX,OAAO,IAAI,CAAC,SAAS,YAAY,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ;YAC9D,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAC5C,CAAA;IACH,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;AAC9B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,QAAgB,EAChB,gBAAwB,EACxB,KAAa;IAEb,OAAO,CAAC,GAAG,CAAC,sDAAsD,CAAC,CAAA;IACnE,MAAM,OAAO,GAAG,EAAE,CAAC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,gBAAgB,CAAC,CAAC,CAAA;IACxE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;IAE1C,MAAM,MAAM,GAAG;;;;;;;;;;EAUf,gBAAgB,IAAI,4BAA4B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gPAgE8L,CAAA;IAE9O,IAAI,CAAC;QACH,QAAQ,CAAC,4BAA4B,QAAQ,MAAM,OAAO,GAAG,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAA;QACnF,MAAM,OAAO,GAAG,QAAQ,CAAC,oBAAoB,EAAE;YAC7C,GAAG,EAAE,QAAQ;YACb,QAAQ,EAAE,OAAO;SAClB,CAAC,CAAC,IAAI,EAAE,CAAA;QACT,QAAQ,CAAC,gBAAgB,OAAO,EAAE,EAAE,EAAE,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAA;QAEtE,gBAAgB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;QAEnC,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,MAAM,CAAC,CAAA;QAC3D,MAAM,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;QACxB,gBAAgB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAA;IACrC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC5D,OAAO,CAAC,IAAI,CAAC,4BAA4B,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAA;IAC/D,CAAC;YAAS
,CAAC;QACT,IAAI,CAAC;YACH,EAAE,CAAC,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAA;QACtD,CAAC;QAAC,MAAM,CAAC;YACP,0BAA0B;QAC5B,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { TaskResult } from './eval-runner';
|
|
2
|
+
export declare function collectDocSuggestions(tasks: TaskResult[]): string;
|
|
3
|
+
export declare function runDocsWriterAgent(repoPath: string, judgeSuggestions: string, model: string): Promise<void>;
|
|
4
|
+
//# sourceMappingURL=docs-writer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"docs-writer.d.ts","sourceRoot":"","sources":["../src/docs-writer.ts"],"names":[],"mappings":"AAQA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,eAAe,CAAA;AAE/C,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,MAAM,CAcjE;AAED,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,EAChB,gBAAgB,EAAE,MAAM,EACxB,KAAK,EAAE,MAAM,GACZ,OAAO,CAAC,IAAI,CAAC,CAwGf"}
|