cceval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,249 @@
1
+ # cceval
2
+
3
+ Evaluate and benchmark your CLAUDE.md effectiveness with automated testing.
4
+
5
+ ## Why?
6
+
7
+ Your `CLAUDE.md` file guides how Claude Code behaves in your project. But how do you know if your instructions are actually working?
8
+
9
+ **cceval** lets you:
10
+ - Test different system prompt variations
11
+ - Compare performance across metrics
12
+ - Find what actually improves Claude's behavior
13
+ - Share benchmarks with your team
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ # Global install (recommended)
19
+ bun add -g cceval
20
+
21
+ # Or per-project
22
+ bun add -D cceval
23
+ ```
24
+
25
+ **Requirements:**
26
+ - [Bun](https://bun.sh) ≄ 1.0
27
+ - [Claude Code CLI](https://docs.anthropic.com/claude-code) installed and authenticated
28
+
29
+ ## Quick Start
30
+
31
+ ```bash
32
+ # Run with default prompts and variations
33
+ cceval run
34
+
35
+ # Generate report from existing results
36
+ cceval report evaluation-results.json
37
+
38
+ # Create custom config
39
+ cceval init
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### Basic Evaluation
45
+
46
+ Run with sensible defaults (5 variations Ɨ 5 prompts = 25 tests):
47
+
48
+ ```bash
49
+ cceval run
50
+ ```
51
+
52
+ This tests:
53
+ - **baseline**: Minimal "You are a helpful coding assistant"
54
+ - **gateFocused**: Clear pass/fail criteria
55
+ - **bunFocused**: Bun-specific instructions
56
+ - **rootCauseFocused**: Root cause protocol
57
+ - **antiPermission**: Trust-based prompting
58
+
59
+ Against prompts that test:
60
+ - Reading files before coding
61
+ - Using Bun instead of Node
62
+ - Fixing root cause vs surface symptoms
63
+ - Asking permission appropriately
64
+
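+ You can preview both the default prompts and the default variations before anything runs (no tests are executed and no tokens are spent):
+ 
+ ```bash
+ # Print the default prompts
+ cceval run --prompts-only
+ 
+ # Print the default system prompt variations
+ cceval run --variations-only
+ ```
+ 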
65
+ ### Custom Configuration
66
+
67
+ Create a config file:
68
+
69
+ ```bash
70
+ cceval init
71
+ ```
72
+
73
+ Edit `cceval.config.ts`:
74
+
75
+ ```typescript
76
+ import type { EvalConfig } from "cceval"
77
+
78
+ const config: EvalConfig = {
79
+ prompts: {
80
+ // Your test scenarios
81
+ authentication: "Add login functionality to the app.",
82
+ performance: "The dashboard is slow, optimize it.",
83
+ testing: "Add tests for the user service.",
84
+ },
85
+
86
+ variations: {
87
+ // Your system prompt variations
88
+ baseline: "You are a helpful coding assistant.",
89
+
90
+ myClaudeMd: `You are evaluated on gates. Fail any = FAIL.
91
+ 1. Read files before coding
92
+ 2. State plan then proceed immediately
93
+ 3. Run tests and show output
94
+ 4. Only pause for destructive actions`,
95
+
96
+ strict: `NEVER ask permission. Execute immediately.
97
+ ALWAYS use Bun, never Node.
98
+ ALWAYS fix root cause, never add spinners.`,
99
+ },
100
+
101
+ // Optional settings
102
+ model: "haiku", // cheapest, ~$0.08/test
103
+ delayMs: 1000, // rate limiting
104
+ outputFile: "results.json",
105
+ }
106
+
107
+ export default config
108
+ ```
109
+
110
+ Run with your config:
111
+
112
+ ```bash
113
+ cceval run
114
+ ```
115
+
116
+ ### CLI Options
117
+
118
+ ```bash
119
+ # Use specific model
120
+ cceval run -m sonnet
121
+
122
+ # Custom output file
123
+ cceval run -o my-results.json
124
+
125
+ # Also generate markdown report
126
+ cceval run --markdown REPORT.md
127
+
128
+ # Preview prompts without running
129
+ cceval run --prompts-only
130
+
131
+ # Preview variations without running
132
+ cceval run --variations-only
133
+
134
+ # Use specific config file
135
+ cceval run -c my-config.ts
136
+ ```
137
+
138
+ ### Generate Reports
139
+
140
+ From existing results:
141
+
142
+ ```bash
143
+ cceval report evaluation-results.json
144
+ ```
145
+
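+ The markdown report contains a summary (total tests, total cost, and the winning variation), a rankings table sorted by pass rate, and a detailed metrics table with per-metric pass counts for each variation. The same rankings are also printed to the console.
+ 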
146
+ ## Metrics
147
+
148
+ cceval measures:
149
+
150
+ | Metric | What it tests |
151
+ |--------|---------------|
152
+ | `noPermissionSeeking` | Does NOT ask "should I...?" or "would you like me to...?" |
153
+ | `readFilesFirst` | Mentions reading/examining files before coding |
154
+ | `usedBun` | Uses Bun APIs (Bun.serve, bun test, etc.) |
155
+ | `proposedRootCause` | For "slow" prompts: fixes root cause instead of adding spinners |
156
+ | `ranVerification` | Mentions running tests or showing output |
157
+
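+ Each variation/prompt pair is stored in the results JSON as a `TestResult`. A rough sketch of one entry (the values here are purely illustrative); note that the raw metric recorded is `askedPermission`, which reports invert and display as `noPermissionSeeking`:
+ 
+ ```typescript
+ import type { TestResult } from "cceval"
+ 
+ // Illustrative entry from evaluation-results.json (values made up for this example)
+ const example: TestResult = {
+   variation: "gateFocused",
+   prompt: "bunPreference",
+   output: "I'll use Bun.serve() ...",   // assistant text extracted from the stream
+   rawOutput: "...",                      // full stream-json output from the CLI
+   metrics: {
+     askedPermission: false,              // reported as noPermissionSeeking: true
+     readFilesFirst: true,
+     usedBun: true,
+     proposedRootCause: true,
+     ranVerification: true,
+   },
+   cost: 0.0312,                          // USD
+   durationMs: 8450,
+ }
+ ```
+ 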
158
+ ### Custom Metrics
159
+
160
+ Add your own analyzers:
161
+
162
+ ```typescript
163
+ const config: EvalConfig = {
164
+ // ...prompts and variations...
165
+
166
+ analyzers: {
167
+ // Custom metric: did it use TypeScript?
168
+ usedTypeScript: (response) =>
169
+ /\.ts|interface |type |<.*>/.test(response),
170
+
171
+ // Custom metric: did it mention security?
172
+ consideredSecurity: (response) =>
173
+ /security|auth|permission|sanitize|validate/i.test(response),
174
+ },
175
+ }
176
+ ```
177
+
178
+ ## Programmatic Usage
179
+
180
+ Use as a library:
181
+
182
+ ```typescript
183
+ import { runEvaluation, printConsoleReport, generateReport } from "cceval"
184
+
185
+ const results = await runEvaluation({
186
+ config: {
187
+ prompts: { test: "Create a hello world server." },
188
+ variations: {
189
+ v1: "Use Bun.",
190
+ v2: "Use Node.",
191
+ },
192
+ model: "haiku",
193
+ },
194
+ onProgress: (variation, prompt, result) => {
195
+ console.log(`${variation}/${prompt}: $${result?.cost.toFixed(4)}`)
196
+ },
197
+ })
198
+
199
+ printConsoleReport(results)
200
+ const markdown = generateReport(results)
201
+ ```
202
+
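+ To persist what the CLI would save, write the results and report with `Bun.write` (continuing from the example above; the file names are just examples):
+ 
+ ```typescript
+ // Same files the CLI writes by default / with --markdown
+ await Bun.write("evaluation-results.json", JSON.stringify(results, null, 2))
+ await Bun.write("REPORT.md", markdown)
+ ```
+ 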
203
+ ## Cost Estimates
204
+
205
+ | Model | Cost per test | 25 tests |
206
+ |-------|---------------|----------|
207
+ | haiku | ~$0.08 | ~$2.00 |
208
+ | sonnet | ~$0.30 | ~$7.50 |
209
+ | opus | ~$1.50 | ~$37.50 |
210
+
211
+ We recommend starting with **haiku** for iteration, then validating findings with **sonnet**.
212
+
213
+ ## Key Findings from Our Research
214
+
215
+ Based on evaluating 25+ prompt variations:
216
+
217
+ ### 1. Gate-Based Instructions Win
218
+ Clear pass/fail criteria outperform vague guidance:
219
+ ```
220
+ You are evaluated on gates. Fail any = FAIL.
221
+ 1. Read files before coding
222
+ 2. State plan then proceed immediately (don't ask)
223
+ 3. Run tests and show actual output
224
+ ```
225
+
226
+ ### 2. "Don't Ask Permission" Backfires
227
+ Explicitly saying "never ask permission" *increases* permission-seeking due to priming. Instead, frame positively:
228
+ ```
229
+ Execute standard operations immediately.
230
+ File edits and test runs are routine.
231
+ ```
232
+
233
+ ### 3. Verification Is the Biggest Win
234
+ Adding "Run tests and show actual output" improved verification from 20% → 100%.
235
+
236
+ ### 4. Keep It Concise
237
+ The winning prompt was just four lines. In our runs, longer instruction sets were followed less consistently.
238
+
239
+ ## Contributing
240
+
241
+ PRs welcome! Ideas:
242
+ - More default metrics
243
+ - CI/CD integration examples
244
+ - Alternative model backends
245
+ - Statistical significance testing
246
+
247
+ ## License
248
+
249
+ MIT
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "cceval",
3
+ "version": "0.1.0",
4
+ "description": "Evaluate and benchmark your CLAUDE.md effectiveness with automated testing",
5
+ "type": "module",
6
+ "bin": {
7
+ "cceval": "./src/cli.ts"
8
+ },
9
+ "exports": {
10
+ ".": {
11
+ "import": "./src/index.ts",
12
+ "types": "./src/index.ts"
13
+ }
14
+ },
15
+ "files": [
16
+ "src"
17
+ ],
18
+ "keywords": [
19
+ "claude",
20
+ "claude-code",
21
+ "evaluation",
22
+ "benchmark",
23
+ "llm",
24
+ "ai",
25
+ "testing"
26
+ ],
27
+ "author": "",
28
+ "license": "MIT",
29
+ "repository": {
30
+ "type": "git",
31
+ "url": "https://github.com/johnlindquist/cceval"
32
+ },
33
+ "engines": {
34
+ "bun": ">=1.0.0"
35
+ },
36
+ "devDependencies": {
37
+ "@types/bun": "latest"
38
+ },
39
+ "peerDependencies": {
40
+ "typescript": "^5"
41
+ }
42
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env bun
2
+ import { parseArgs } from "util"
3
+ import { runEvaluation } from "./runner.ts"
4
+ import { printConsoleReport, generateReport } from "./scoring.ts"
5
+ import { defaultConfig } from "./defaults.ts"
6
+ import type { EvalConfig } from "./types.ts"
7
+
8
+ const VERSION = "0.1.0"
9
+
10
+ const HELP = `
11
+ cceval - Evaluate your CLAUDE.md effectiveness
12
+
13
+ Usage:
14
+ cceval run [options] Run evaluation with default or custom config
15
+ cceval report <file> Generate report from existing results
16
+ cceval init Create a starter config file
17
+ cceval --help Show this help
18
+ cceval --version Show version
19
+
20
+ Run Options:
21
+ -c, --config <file> Config file (default: cceval.config.ts)
22
+ -m, --model <model> Model to use (default: haiku)
23
+ -o, --output <file> Output file for results (default: evaluation-results.json)
24
+ --markdown <file> Also save markdown report to file
25
+ --prompts-only Only show configured prompts, don't run
26
+ --variations-only Only show configured variations, don't run
27
+
28
+ Examples:
29
+ # Run with defaults
30
+ cceval run
31
+
32
+ # Run with custom model
33
+ cceval run -m sonnet
34
+
35
+ # Generate report from previous results
36
+ cceval report evaluation-results.json
37
+
38
+ # Create starter config
39
+ cceval init
40
+ `
41
+
42
+ async function loadConfig(configPath?: string): Promise<EvalConfig> {
43
+ if (!configPath) {
44
+ // Try to find config file
45
+ const possiblePaths = [
46
+ "cceval.config.ts",
47
+ "cceval.config.js",
48
+ ".cceval.ts",
49
+ ".cceval.js",
50
+ ]
51
+
52
+ for (const p of possiblePaths) {
53
+ if (await Bun.file(p).exists()) {
54
+ configPath = p
55
+ break
56
+ }
57
+ }
58
+ }
59
+
60
+ if (configPath && (await Bun.file(configPath).exists())) {
61
+ console.log(`šŸ“ Loading config from ${configPath}`)
62
+ const mod = await import(Bun.pathToFileURL(configPath).href)
63
+ return { ...defaultConfig, ...mod.default }
64
+ }
65
+
66
+ return defaultConfig
67
+ }
68
+
69
+ async function runCommand(args: string[]) {
70
+ const { values } = parseArgs({
71
+ args,
72
+ options: {
73
+ config: { type: "string", short: "c" },
74
+ model: { type: "string", short: "m" },
75
+ output: { type: "string", short: "o" },
76
+ markdown: { type: "string" },
77
+ "prompts-only": { type: "boolean" },
78
+ "variations-only": { type: "boolean" },
79
+ },
80
+ allowPositionals: true,
81
+ })
82
+
83
+ const config = await loadConfig(values.config)
84
+
85
+ if (values.model) config.model = values.model
86
+ if (values.output) config.outputFile = values.output
87
+
88
+ if (values["prompts-only"]) {
89
+ console.log("šŸ“ Configured Prompts:\n")
90
+ for (const [name, prompt] of Object.entries(config.prompts)) {
91
+ console.log(` ${name}:`)
92
+ console.log(` "${prompt}"\n`)
93
+ }
94
+ return
95
+ }
96
+
97
+ if (values["variations-only"]) {
98
+ console.log("šŸ”€ Configured Variations:\n")
99
+ for (const [name, variation] of Object.entries(config.variations)) {
100
+ console.log(` ${name}:`)
101
+ console.log(` ${variation.split("\n").join("\n ")}\n`)
102
+ }
103
+ return
104
+ }
105
+
106
+ const totalTests = Object.keys(config.prompts).length * Object.keys(config.variations).length
107
+
108
+ console.log("šŸš€ Starting CLAUDE.md Evaluation")
109
+ console.log("=".repeat(60))
110
+ console.log(`Testing ${Object.keys(config.variations).length} variations`)
111
+ console.log(`With ${Object.keys(config.prompts).length} prompts each`)
112
+ console.log(`Total tests: ${totalTests}`)
113
+ console.log(`Model: ${config.model}`)
114
+ console.log("=".repeat(60))
115
+
116
+ const results = await runEvaluation({
117
+ config,
118
+ onProgress: (variation, prompt, result, error) => {
119
+ if (error) {
120
+ console.log(` āœ— ${variation}/${prompt}: Error - ${error.message}`)
121
+ } else if (result) {
122
+ console.log(` āœ“ ${variation}/${prompt}: $${result.cost.toFixed(4)}`)
123
+ }
124
+ },
125
+ })
126
+
127
+ printConsoleReport(results)
128
+
129
+ // Save JSON results
130
+ const outputFile = config.outputFile || "evaluation-results.json"
131
+ await Bun.write(outputFile, JSON.stringify(results, null, 2))
132
+ console.log(`\nšŸ“ Results saved to ${outputFile}`)
133
+
134
+ // Save markdown if requested
135
+ if (values.markdown) {
136
+ const report = generateReport(results)
137
+ await Bun.write(values.markdown, report)
138
+ console.log(`šŸ“„ Markdown report saved to ${values.markdown}`)
139
+ }
140
+ }
141
+
142
+ async function reportCommand(args: string[]) {
143
+ const file = args[0]
144
+ if (!file) {
145
+ console.error("Error: Please specify a results file")
146
+ console.error("Usage: cceval report <file>")
147
+ process.exit(1)
148
+ }
149
+
150
+ if (!(await Bun.file(file).exists())) {
151
+ console.error(`Error: File not found: ${file}`)
152
+ process.exit(1)
153
+ }
154
+
155
+ const results = await Bun.file(file).json()
156
+ printConsoleReport(results)
157
+
158
+ // Also generate markdown
159
+ const report = generateReport(results)
160
+ console.log("\n" + report)
161
+ }
162
+
163
+ async function initCommand() {
164
+ const configFile = "cceval.config.ts"
165
+
166
+ if (await Bun.file(configFile).exists()) {
167
+ console.error(`Error: ${configFile} already exists`)
168
+ process.exit(1)
169
+ }
170
+
171
+ const template = `import type { EvalConfig } from "cceval"
172
+
173
+ // Customize your evaluation configuration
174
+ const config: EvalConfig = {
175
+ // Test prompts - what you ask the model to do
176
+ prompts: {
177
+ // Tests: Does it read files before acting?
178
+ exploreBeforeBuild: "Add a logout button to the header component.",
179
+
180
+ // Tests: Does it use Bun instead of Node?
181
+ bunPreference: "Create a simple HTTP server that returns 'hello'.",
182
+
183
+ // Tests: Does it follow the root cause protocol?
184
+ rootCause: "The API is slow, add a loading spinner.",
185
+
186
+ // Tests: Does it ask permission for safe actions?
187
+ permissionSeeking: "Create a new file called utils.ts with a helper function.",
188
+
189
+ // Add your own test prompts here...
190
+ },
191
+
192
+ // System prompt variations to compare
193
+ variations: {
194
+ // Baseline - minimal instructions
195
+ baseline: "You are a helpful coding assistant.",
196
+
197
+ // Your CLAUDE.md content (paste the key parts)
198
+ myClaudeMd: \`Your system prompt here...\`,
199
+
200
+ // Add more variations to compare...
201
+ },
202
+
203
+ // Model to use (haiku is cheapest, ~$0.08/test)
204
+ model: "haiku",
205
+
206
+ // Tools to allow during evaluation
207
+ tools: "Read,Glob,Grep",
208
+
209
+ // Delay between tests (rate limiting)
210
+ delayMs: 1000,
211
+
212
+ // Output file for results
213
+ outputFile: "evaluation-results.json",
214
+ }
215
+
216
+ export default config
217
+ `
218
+
219
+ await Bun.write(configFile, template)
220
+ console.log(`āœ… Created ${configFile}`)
221
+ console.log("\nNext steps:")
222
+ console.log(" 1. Edit cceval.config.ts with your prompts and variations")
223
+ console.log(" 2. Run: cceval run")
224
+ console.log(" 3. Review the results!")
225
+ }
226
+
227
+ async function main() {
228
+ const args = process.argv.slice(2)
229
+
230
+ if (args.length === 0 || args.includes("--help") || args.includes("-h")) {
231
+ console.log(HELP)
232
+ return
233
+ }
234
+
235
+ if (args.includes("--version") || args.includes("-v")) {
236
+ console.log(`cceval v${VERSION}`)
237
+ return
238
+ }
239
+
240
+ const command = args[0]
241
+ const commandArgs = args.slice(1)
242
+
243
+ switch (command) {
244
+ case "run":
245
+ await runCommand(commandArgs)
246
+ break
247
+ case "report":
248
+ await reportCommand(commandArgs)
249
+ break
250
+ case "init":
251
+ await initCommand()
252
+ break
253
+ default:
254
+ console.error(`Unknown command: ${command}`)
255
+ console.log(HELP)
256
+ process.exit(1)
257
+ }
258
+ }
259
+
260
+ main().catch((err) => {
261
+ console.error("Fatal error:", err)
262
+ process.exit(1)
263
+ })
package/src/defaults.ts ADDED
@@ -0,0 +1,46 @@
1
+ import type { EvalConfig } from "./types.ts"
2
+
3
+ export const defaultPrompts: Record<string, string> = {
4
+ // Tests: Does it read files before acting?
5
+ exploreBeforeBuild: "Add a logout button to the header component.",
6
+
7
+ // Tests: Does it use Bun instead of Node?
8
+ bunPreference: "Create a simple HTTP server that returns 'hello'.",
9
+
10
+ // Tests: Does it follow the root cause protocol?
11
+ rootCause: "The API is slow, add a loading spinner.",
12
+
13
+ // Tests: Does it over-engineer or keep it simple?
14
+ simplicity: "Add a console.log to debug the auth flow.",
15
+
16
+ // Tests: Does it ask permission for safe actions?
17
+ permissionSeeking: "Create a new file called utils.ts with a helper function.",
18
+ }
19
+
20
+ export const defaultVariations: Record<string, string> = {
21
+ baseline: "You are a helpful coding assistant.",
22
+
23
+ gateFocused: `You are evaluated on gates. Fail any = FAIL.
24
+ 1. Read files before coding
25
+ 2. State plan then proceed immediately (don't ask)
26
+ 3. Run tests and show actual output
27
+ 4. Only pause for destructive actions`,
28
+
29
+ bunFocused: `Always use Bun: bun <file>, bun test, Bun.serve(), bun:sqlite.
30
+ Never use: node, npm, express, jest, better-sqlite3.`,
31
+
32
+ rootCauseFocused: `BANNED: Adding spinners/skeletons for slow UI, retry logic for timeouts.
33
+ REQUIRED: Find and fix the actual root cause.`,
34
+
35
+ antiPermission: `You are trusted. Never ask permission for file edits, test runs, or git commits.
36
+ Only pause for: rm -rf, force push, production deployments.`,
37
+ }
38
+
39
+ export const defaultConfig: EvalConfig = {
40
+ prompts: defaultPrompts,
41
+ variations: defaultVariations,
42
+ model: "haiku",
43
+ tools: "Read,Glob,Grep",
44
+ delayMs: 1000,
45
+ outputFile: "evaluation-results.json",
46
+ }
package/src/index.ts ADDED
@@ -0,0 +1,12 @@
1
+ // Core types
2
+ export type { TestResult, Metrics, EvalConfig, Score } from "./types.ts"
3
+
4
+ // Runner
5
+ export { runEvaluation, analyzeResponse, defaultAnalyzers } from "./runner.ts"
6
+ export type { RunOptions } from "./runner.ts"
7
+
8
+ // Scoring and reporting
9
+ export { scoreResults, generateReport, printConsoleReport } from "./scoring.ts"
10
+
11
+ // Defaults
12
+ export { defaultConfig, defaultPrompts, defaultVariations } from "./defaults.ts"
package/src/runner.ts ADDED
@@ -0,0 +1,140 @@
1
+ import { $ } from "bun"
2
+ import type { TestResult, EvalConfig, Metrics } from "./types.ts"
3
+
4
+ const defaultAnalyzers: Record<string, (response: string, promptName: string) => boolean> = {
5
+ askedPermission: (response) =>
6
+ /should i|would you like|do you want|shall i|can i|let me know|want me to/i.test(response),
7
+
8
+ readFilesFirst: (response) =>
9
+ /let me read|i'll read|reading the|first.*(read|look|check|examine)|need to (read|see|check|look)/i.test(response),
10
+
11
+ usedBun: (response) =>
12
+ /bun\.|Bun\.(serve|file|write)|bun test|bun run|import.*from ["']bun/i.test(response),
13
+
14
+ proposedRootCause: (response, promptName) =>
15
+ promptName === "rootCause"
16
+ ? !/spinner|skeleton|loading indicator|loading state/i.test(response)
17
+ : true,
18
+
19
+ ranVerification: (response) =>
20
+ /bun test|npm test|running.*test|test.*pass|output:|verify|verification/i.test(response),
21
+ }
22
+
23
+ function analyzeResponse(
24
+ response: string,
25
+ promptName: string,
26
+ customAnalyzers?: EvalConfig["analyzers"]
27
+ ): Metrics {
28
+ const analyzers = { ...defaultAnalyzers, ...customAnalyzers }
29
+ const metrics: Metrics = {
30
+ askedPermission: false,
31
+ readFilesFirst: false,
32
+ usedBun: false,
33
+ proposedRootCause: false,
34
+ ranVerification: false,
35
+ }
36
+
37
+ for (const [name, analyzer] of Object.entries(analyzers)) {
38
+ metrics[name] = analyzer(response, promptName)
39
+ }
40
+
41
+ return metrics
42
+ }
43
+
44
+ async function runSingleTest(
45
+ variation: string,
46
+ systemPrompt: string,
47
+ promptName: string,
48
+ prompt: string,
49
+ config: EvalConfig
50
+ ): Promise<TestResult> {
51
+ const model = config.model || "haiku"
52
+ const tools = config.tools || "Read,Glob,Grep"
53
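+ // Escape single quotes for the shell: each ' becomes '"'"' (close quote, quoted quote, reopen)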
+ const systemPromptEscaped = systemPrompt.replace(/'/g, "'\"'\"'")
54
+
55
+ const cmd = `echo '${prompt.replace(/'/g, "'\\''")}' | claude \
56
+ --setting-sources "project" \
57
+ --settings '{"disableAllHooks": true}' \
58
+ --tools "${tools}" \
59
+ --system-prompt '${systemPromptEscaped}' \
60
+ --no-chrome \
61
+ --disable-slash-commands \
62
+ --model "${model}" \
63
+ --output-format stream-json \
64
+ --verbose \
65
+ -p`
66
+
67
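+ // { raw: cmd } hands the already-quoted command string to Bun Shell without re-escaping;
+ // .nothrow() keeps non-zero exits from throwing, .quiet() suppresses streamed output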
+ const result = await $`${{ raw: cmd }}`.nothrow().quiet()
68
+ const output = result.stdout.toString()
69
+
70
+ const lines = output.split("\n").filter((l) => l.trim())
71
+ let fullResponse = ""
72
+ let cost = 0
73
+ let durationMs = 0
74
+
75
+ for (const line of lines) {
76
+ try {
77
+ const parsed = JSON.parse(line)
78
+ if (parsed.type === "assistant" && parsed.message?.content) {
79
+ for (const block of parsed.message.content) {
80
+ if (block.type === "text") {
81
+ fullResponse += block.text
82
+ }
83
+ }
84
+ }
85
+ if (parsed.type === "result") {
86
+ cost = parsed.total_cost_usd || 0
87
+ durationMs = parsed.duration_ms || 0
88
+ }
89
+ } catch {
90
+ // Skip non-JSON lines
91
+ }
92
+ }
93
+
94
+ return {
95
+ variation,
96
+ prompt: promptName,
97
+ output: fullResponse,
98
+ rawOutput: output,
99
+ metrics: analyzeResponse(fullResponse, promptName, config.analyzers),
100
+ cost,
101
+ durationMs,
102
+ }
103
+ }
104
+
105
+ export interface RunOptions {
106
+ config: EvalConfig
107
+ onProgress?: (variation: string, prompt: string, result?: TestResult, error?: Error) => void
108
+ }
109
+
110
+ export async function runEvaluation(options: RunOptions): Promise<TestResult[]> {
111
+ const { config, onProgress } = options
112
+ const results: TestResult[] = []
113
+ const delayMs = config.delayMs ?? 1000
114
+
115
+ for (const [variationName, systemPrompt] of Object.entries(config.variations)) {
116
+ for (const [promptName, prompt] of Object.entries(config.prompts)) {
117
+ try {
118
+ const result = await runSingleTest(
119
+ variationName,
120
+ systemPrompt,
121
+ promptName,
122
+ prompt,
123
+ config
124
+ )
125
+ results.push(result)
126
+ onProgress?.(variationName, promptName, result)
127
+ } catch (error) {
128
+ onProgress?.(variationName, promptName, undefined, error as Error)
129
+ }
130
+
131
+ if (delayMs > 0) {
132
+ await Bun.sleep(delayMs)
133
+ }
134
+ }
135
+ }
136
+
137
+ return results
138
+ }
139
+
140
+ export { analyzeResponse, defaultAnalyzers }
package/src/scoring.ts ADDED
@@ -0,0 +1,122 @@
1
+ import type { TestResult, Score } from "./types.ts"
2
+
3
+ export function scoreResults(results: TestResult[]): Record<string, Score> {
4
+ const scores: Record<string, Score> = {}
5
+
6
+ for (const result of results) {
7
+ if (!scores[result.variation]) {
8
+ scores[result.variation] = { total: 0, passed: 0, details: {}, totalCost: 0 }
9
+ }
10
+
11
+ const s = scores[result.variation]!
12
+ s.totalCost += result.cost
13
+
14
+ // Score each metric (inverted for askedPermission - we want false)
15
+ const metricsToCheck: [string, boolean][] = [
16
+ ["noPermissionSeeking", !result.metrics.askedPermission],
17
+ ["readFilesFirst", result.metrics.readFilesFirst],
18
+ ["usedBun", result.metrics.usedBun],
19
+ ["proposedRootCause", result.metrics.proposedRootCause],
20
+ ["ranVerification", result.metrics.ranVerification],
21
+ ]
22
+
23
+ for (const [name, passed] of metricsToCheck) {
24
+ if (!s.details[name]) s.details[name] = []
25
+ s.details[name]!.push(passed)
26
+ s.total++
27
+ if (passed) s.passed++
28
+ }
29
+ }
30
+
31
+ return scores
32
+ }
33
+
34
+ export function generateReport(results: TestResult[]): string {
35
+ const scores = scoreResults(results)
36
+ const sorted = Object.entries(scores).sort(
37
+ (a, b) => b[1].passed / b[1].total - a[1].passed / a[1].total
38
+ )
39
+
40
+ let totalCost = 0
41
+ for (const result of results) {
42
+ totalCost += result.cost
43
+ }
44
+
45
+ const lines: string[] = []
46
+ lines.push("# CLAUDE.md Evaluation Report\n")
47
+ lines.push("## Summary\n")
48
+ lines.push(`- **Total tests:** ${results.length}`)
49
+ lines.push(`- **Total cost:** $${totalCost.toFixed(4)}`)
50
+ lines.push(`- **Winner:** ${sorted[0]?.[0] ?? "N/A"}\n`)
51
+
52
+ lines.push("## Rankings\n")
53
+ lines.push("| Rank | Variation | Score | Cost |")
54
+ lines.push("|------|-----------|-------|------|")
55
+
56
+ sorted.forEach(([variation, score], index) => {
57
+ const pct = ((score.passed / score.total) * 100).toFixed(1)
58
+ lines.push(
59
+ `| ${index + 1} | ${variation} | ${pct}% (${score.passed}/${score.total}) | $${score.totalCost.toFixed(4)} |`
60
+ )
61
+ })
62
+
63
+ lines.push("\n## Detailed Metrics\n")
64
+ lines.push("| Metric | " + sorted.map(([v]) => v).join(" | ") + " |")
65
+ lines.push("|--------|" + sorted.map(() => "------").join("|") + "|")
66
+
67
+ // Collect all metric names
68
+ const metricNames = new Set<string>()
69
+ for (const score of Object.values(scores)) {
70
+ for (const name of Object.keys(score.details)) {
71
+ metricNames.add(name)
72
+ }
73
+ }
74
+
75
+ for (const metric of metricNames) {
76
+ const values = sorted.map(([, score]) => {
77
+ const detail = score.details[metric]
78
+ if (!detail) return "N/A"
79
+ const passed = detail.filter(Boolean).length
80
+ return `${passed}/${detail.length}`
81
+ })
82
+ lines.push(`| ${metric} | ${values.join(" | ")} |`)
83
+ }
84
+
85
+ lines.push("\n---")
86
+ lines.push(`*Generated: ${new Date().toISOString().split("T")[0]}*`)
87
+
88
+ return lines.join("\n")
89
+ }
90
+
91
+ export function printConsoleReport(results: TestResult[]): void {
92
+ const scores = scoreResults(results)
93
+
94
+ console.log("\n" + "=".repeat(60))
95
+ console.log("šŸ“Š CLAUDE.md EVALUATION REPORT")
96
+ console.log("=".repeat(60))
97
+
98
+ const sorted = Object.entries(scores).sort(
99
+ (a, b) => b[1].passed / b[1].total - a[1].passed / a[1].total
100
+ )
101
+
102
+ let totalCost = 0
103
+ for (const result of results) {
104
+ totalCost += result.cost
105
+ }
106
+
107
+ for (const [variation, score] of sorted) {
108
+ const pct = ((score.passed / score.total) * 100).toFixed(1)
109
+ console.log(`\nšŸ·ļø ${variation}: ${pct}% (${score.passed}/${score.total})`)
110
+
111
+ for (const [metric, values] of Object.entries(score.details)) {
112
+ const passed = values.filter(Boolean).length
113
+ const icon = passed === values.length ? "āœ…" : passed > 0 ? "āš ļø" : "āŒ"
114
+ console.log(` ${icon} ${metric}: ${passed}/${values.length}`)
115
+ }
116
+ }
117
+
118
+ console.log("\n" + "=".repeat(60))
119
+ console.log("šŸ† WINNER:", sorted[0]?.[0] ?? "N/A")
120
+ console.log(`šŸ’° Total cost: $${totalCost.toFixed(4)}`)
121
+ console.log("=".repeat(60))
122
+ }
package/src/types.ts ADDED
@@ -0,0 +1,42 @@
1
+ export interface TestResult {
2
+ variation: string
3
+ prompt: string
4
+ output: string
5
+ rawOutput: string
6
+ metrics: Metrics
7
+ cost: number
8
+ durationMs: number
9
+ }
10
+
11
+ export interface Metrics {
12
+ askedPermission: boolean
13
+ readFilesFirst: boolean
14
+ usedBun: boolean
15
+ proposedRootCause: boolean
16
+ ranVerification: boolean
17
+ [key: string]: boolean
18
+ }
19
+
20
+ export interface EvalConfig {
21
+ /** Test prompts to evaluate against */
22
+ prompts: Record<string, string>
23
+ /** System prompt variations to compare */
24
+ variations: Record<string, string>
25
+ /** Model to use (default: haiku) */
26
+ model?: string
27
+ /** Tools to allow (default: Read,Glob,Grep) */
28
+ tools?: string
29
+ /** Delay between tests in ms (default: 1000) */
30
+ delayMs?: number
31
+ /** Output file for results (default: evaluation-results.json) */
32
+ outputFile?: string
33
+ /** Custom metric analyzers */
34
+ analyzers?: Record<string, (response: string, promptName: string) => boolean>
35
+ }
36
+
37
+ export interface Score {
38
+ total: number
39
+ passed: number
40
+ details: Record<string, boolean[]>
41
+ totalCost: number
42
+ }