cceval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +249 -0
- package/package.json +42 -0
- package/src/cli.ts +263 -0
- package/src/defaults.ts +46 -0
- package/src/index.ts +12 -0
- package/src/runner.ts +140 -0
- package/src/scoring.ts +122 -0
- package/src/types.ts +42 -0
package/README.md
ADDED
@@ -0,0 +1,249 @@
# cceval

Evaluate and benchmark your CLAUDE.md effectiveness with automated testing.

## Why?

Your `CLAUDE.md` file guides how Claude Code behaves in your project. But how do you know if your instructions are actually working?

**cceval** lets you:
- Test different system prompt variations
- Compare performance across metrics
- Find what actually improves Claude's behavior
- Share benchmarks with your team

## Installation

```bash
# Global install (recommended)
bun add -g cceval

# Or per-project
bun add -D cceval
```

**Requirements:**
- [Bun](https://bun.sh) ≥ 1.0
- [Claude Code CLI](https://docs.anthropic.com/claude-code) installed and authenticated

## Quick Start

```bash
# Run with default prompts and variations
cceval run

# Generate report from existing results
cceval report evaluation-results.json

# Create custom config
cceval init
```

## Usage

### Basic Evaluation

Run with sensible defaults (5 variations Ɨ 5 prompts = 25 tests):

```bash
cceval run
```

This tests:
- **baseline**: Minimal "You are a helpful coding assistant"
- **gateFocused**: Clear pass/fail criteria
- **bunFocused**: Bun-specific instructions
- **rootCauseFocused**: Root cause protocol
- **antiPermission**: Trust-based prompting

Against prompts that test:
- Reading files before coding
- Using Bun instead of Node
- Fixing root cause vs surface symptoms
- Asking permission appropriately

### Custom Configuration

Create a config file:

```bash
cceval init
```

Edit `cceval.config.ts`:

```typescript
import type { EvalConfig } from "cceval"

const config: EvalConfig = {
  prompts: {
    // Your test scenarios
    authentication: "Add login functionality to the app.",
    performance: "The dashboard is slow, optimize it.",
    testing: "Add tests for the user service.",
  },

  variations: {
    // Your system prompt variations
    baseline: "You are a helpful coding assistant.",

    myClaudeMd: `You are evaluated on gates. Fail any = FAIL.
1. Read files before coding
2. State plan then proceed immediately
3. Run tests and show output
4. Only pause for destructive actions`,

    strict: `NEVER ask permission. Execute immediately.
ALWAYS use Bun, never Node.
ALWAYS fix root cause, never add spinners.`,
  },

  // Optional settings
  model: "haiku", // cheapest, ~$0.08/test
  delayMs: 1000, // rate limiting
  outputFile: "results.json",
}

export default config
```
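
`cceval run` picks the config up automatically, looking for `cceval.config.ts`, `cceval.config.js`, `.cceval.ts`, or `.cceval.js` in the working directory (see `loadConfig` in `src/cli.ts`); pass `-c` to point at a different file.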

Run with your config:

```bash
cceval run
```

### CLI Options

```bash
# Use specific model
cceval run -m sonnet

# Custom output file
cceval run -o my-results.json

# Also generate markdown report
cceval run --markdown REPORT.md

# Preview prompts without running
cceval run --prompts-only

# Preview variations without running
cceval run --variations-only

# Use specific config file
cceval run -c my-config.ts
```

### Generate Reports

From existing results:

```bash
cceval report evaluation-results.json
```

## Metrics

cceval measures:

| Metric | What it tests |
|--------|---------------|
| `noPermissionSeeking` | Does NOT ask "should I...?" or "would you like me to...?" |
| `readFilesFirst` | Mentions reading/examining files before coding |
| `usedBun` | Uses Bun APIs (Bun.serve, bun test, etc.) |
| `proposedRootCause` | For "slow" prompts: fixes root cause instead of adding spinners |
| `ranVerification` | Mentions running tests or showing output |
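
Each built-in metric is a lightweight regex predicate over the response text (see `src/runner.ts`). One subtlety: `noPermissionSeeking` is reported as the inverse of an internal `askedPermission` check, since *not* asking is the pass condition. A minimal sketch, with the pattern copied from the source:

```typescript
// Built-in check from src/runner.ts: does the response ask for permission?
const askedPermission = (response: string): boolean =>
  /should i|would you like|do you want|shall i|can i|let me know|want me to/i.test(response)

// Scoring inverts it: not asking permission is what earns the pass
const noPermissionSeeking = (response: string): boolean => !askedPermission(response)

console.log(noPermissionSeeking("I'll add the button now."))      // true (passes)
console.log(noPermissionSeeking("Would you like me to proceed?")) // false (fails)
```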

### Custom Metrics

Add your own analyzers:

```typescript
const config: EvalConfig = {
  // ...prompts and variations...

  analyzers: {
    // Custom metric: did it use TypeScript?
    usedTypeScript: (response) =>
      /\.ts|interface |type |<.*>/.test(response),

    // Custom metric: did it mention security?
    consideredSecurity: (response) =>
      /security|auth|permission|sanitize|validate/i.test(response),
  },
}
```
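
Analyzers also receive the prompt name as a second argument (see `analyzers` in `src/types.ts`), so a metric can be scoped to a single scenario the way the built-in `proposedRootCause` is. A sketch, where the `migrations` prompt name is purely illustrative:

```typescript
import type { EvalConfig } from "cceval"

const config: EvalConfig = {
  prompts: { migrations: "Add a created_at column to the users table." },
  variations: { baseline: "You are a helpful coding assistant." },

  analyzers: {
    // Only meaningful for the "migrations" prompt; passes trivially elsewhere
    wroteMigration: (response, promptName) =>
      promptName === "migrations"
        ? /migration|alter table|create table/i.test(response)
        : true,
  },
}

export default config
```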

## Programmatic Usage

Use as a library:

```typescript
import { runEvaluation, printConsoleReport, generateReport } from "cceval"

const results = await runEvaluation({
  config: {
    prompts: { test: "Create a hello world server." },
    variations: {
      v1: "Use Bun.",
      v2: "Use Node.",
    },
    model: "haiku",
  },
  onProgress: (variation, prompt, result) => {
    console.log(`${variation}/${prompt}: $${result?.cost.toFixed(4)}`)
  },
})

printConsoleReport(results)
const markdown = generateReport(results)
```
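
Continuing the example above, the output can be persisted the same way the CLI does after a run (the file names here are just examples):

```typescript
// Write the raw results and the markdown report to disk
await Bun.write("evaluation-results.json", JSON.stringify(results, null, 2))
await Bun.write("REPORT.md", markdown)
```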

## Cost Estimates

| Model | Cost per test | 25 tests |
|-------|---------------|----------|
| haiku | ~$0.08 | ~$2.00 |
| sonnet | ~$0.30 | ~$7.50 |
| opus | ~$1.50 | ~$37.50 |

We recommend starting with **haiku** for iteration, then validating findings with **sonnet**.

## Key Findings from Our Research

Based on evaluating 25+ prompt variations:

### 1. Gate-Based Instructions Win
Clear pass/fail criteria outperform vague guidance:
```
You are evaluated on gates. Fail any = FAIL.
1. Read files before coding
2. State plan then proceed immediately (don't ask)
3. Run tests and show actual output
```

### 2. "Don't Ask Permission" Backfires
Explicitly saying "never ask permission" *increases* permission-seeking due to priming. Instead, frame positively:
```
Execute standard operations immediately.
File edits and test runs are routine.
```

### 3. Verification Is the Biggest Win
Adding "Run tests and show actual output" improved verification from 20% → 100%.

### 4. Keep It Concise
The winning prompt was just 4 lines. Long instructions get ignored.

## Contributing

PRs welcome! Ideas:
- More default metrics
- CI/CD integration examples
- Alternative model backends
- Statistical significance testing

## License

MIT

package/package.json
ADDED
@@ -0,0 +1,42 @@
{
  "name": "cceval",
  "version": "0.1.0",
  "description": "Evaluate and benchmark your CLAUDE.md effectiveness with automated testing",
  "type": "module",
  "bin": {
    "cceval": "./src/cli.ts"
  },
  "exports": {
    ".": {
      "import": "./src/index.ts",
      "types": "./src/index.ts"
    }
  },
  "files": [
    "src"
  ],
  "keywords": [
    "claude",
    "claude-code",
    "evaluation",
    "benchmark",
    "llm",
    "ai",
    "testing"
  ],
  "author": "",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://github.com/johnlindquist/cceval"
  },
  "engines": {
    "bun": ">=1.0.0"
  },
  "devDependencies": {
    "@types/bun": "latest"
  },
  "peerDependencies": {
    "typescript": "^5"
  }
}

package/src/cli.ts
ADDED
@@ -0,0 +1,263 @@
#!/usr/bin/env bun
import { parseArgs } from "util"
import { runEvaluation } from "./runner.ts"
import { printConsoleReport, generateReport } from "./scoring.ts"
import { defaultConfig, defaultPrompts, defaultVariations } from "./defaults.ts"
import type { EvalConfig } from "./types.ts"

const VERSION = "0.1.0"

const HELP = `
cceval - Evaluate your CLAUDE.md effectiveness

Usage:
  cceval run [options]     Run evaluation with default or custom config
  cceval report <file>     Generate report from existing results
  cceval init              Create a starter config file
  cceval --help            Show this help
  cceval --version         Show version

Run Options:
  -c, --config <file>      Config file (default: cceval.config.ts)
  -m, --model <model>      Model to use (default: haiku)
  -o, --output <file>      Output file for results (default: evaluation-results.json)
  --markdown <file>        Also save markdown report to file
  --prompts-only           Only show configured prompts, don't run
  --variations-only        Only show configured variations, don't run

Examples:
  # Run with defaults
  cceval run

  # Run with custom model
  cceval run -m sonnet

  # Generate report from previous results
  cceval report evaluation-results.json

  # Create starter config
  cceval init
`

async function loadConfig(configPath?: string): Promise<EvalConfig> {
  if (!configPath) {
    // Try to find config file
    const possiblePaths = [
      "cceval.config.ts",
      "cceval.config.js",
      ".cceval.ts",
      ".cceval.js",
    ]

    for (const p of possiblePaths) {
      if (await Bun.file(p).exists()) {
        configPath = p
        break
      }
    }
  }

  if (configPath && (await Bun.file(configPath).exists())) {
    console.log(`šŸ“‹ Loading config from ${configPath}`)
    const mod = await import(Bun.pathToFileURL(configPath).href)
    return { ...defaultConfig, ...mod.default }
  }

  return defaultConfig
}

async function runCommand(args: string[]) {
  const { values } = parseArgs({
    args,
    options: {
      config: { type: "string", short: "c" },
      model: { type: "string", short: "m" },
      output: { type: "string", short: "o" },
      markdown: { type: "string" },
      "prompts-only": { type: "boolean" },
      "variations-only": { type: "boolean" },
    },
    allowPositionals: true,
  })

  const config = await loadConfig(values.config)

  if (values.model) config.model = values.model
  if (values.output) config.outputFile = values.output

  if (values["prompts-only"]) {
    console.log("šŸ“‹ Configured Prompts:\n")
    for (const [name, prompt] of Object.entries(config.prompts)) {
      console.log(`  ${name}:`)
      console.log(`    "${prompt}"\n`)
    }
    return
  }

  if (values["variations-only"]) {
    console.log("šŸ“‹ Configured Variations:\n")
    for (const [name, variation] of Object.entries(config.variations)) {
      console.log(`  ${name}:`)
      console.log(`    ${variation.split("\n").join("\n    ")}\n`)
    }
    return
  }

  const totalTests = Object.keys(config.prompts).length * Object.keys(config.variations).length

  console.log("šŸš€ Starting CLAUDE.md Evaluation")
  console.log("=".repeat(60))
  console.log(`Testing ${Object.keys(config.variations).length} variations`)
  console.log(`With ${Object.keys(config.prompts).length} prompts each`)
  console.log(`Total tests: ${totalTests}`)
  console.log(`Model: ${config.model}`)
  console.log("=".repeat(60))

  const results = await runEvaluation({
    config,
    onProgress: (variation, prompt, result, error) => {
      if (error) {
        console.log(`  āŒ ${variation}/${prompt}: Error - ${error.message}`)
      } else if (result) {
        console.log(`  āœ“ ${variation}/${prompt}: $${result.cost.toFixed(4)}`)
      }
    },
  })

  printConsoleReport(results)

  // Save JSON results
  const outputFile = config.outputFile || "evaluation-results.json"
  await Bun.write(outputFile, JSON.stringify(results, null, 2))
  console.log(`\nšŸ’¾ Results saved to ${outputFile}`)

  // Save markdown if requested
  if (values.markdown) {
    const report = generateReport(results)
    await Bun.write(values.markdown, report)
    console.log(`šŸ“„ Markdown report saved to ${values.markdown}`)
  }
}

async function reportCommand(args: string[]) {
  const file = args[0]
  if (!file) {
    console.error("Error: Please specify a results file")
    console.error("Usage: cceval report <file>")
    process.exit(1)
  }

  if (!(await Bun.file(file).exists())) {
    console.error(`Error: File not found: ${file}`)
    process.exit(1)
  }

  const results = await Bun.file(file).json()
  printConsoleReport(results)

  // Also generate markdown
  const report = generateReport(results)
  console.log("\n" + report)
}

async function initCommand() {
  const configFile = "cceval.config.ts"

  if (await Bun.file(configFile).exists()) {
    console.error(`Error: ${configFile} already exists`)
    process.exit(1)
  }

  const template = `import type { EvalConfig } from "cceval"

// Customize your evaluation configuration
const config: EvalConfig = {
  // Test prompts - what you ask the model to do
  prompts: {
    // Tests: Does it read files before acting?
    exploreBeforeBuild: "Add a logout button to the header component.",

    // Tests: Does it use Bun instead of Node?
    bunPreference: "Create a simple HTTP server that returns 'hello'.",

    // Tests: Does it follow the root cause protocol?
    rootCause: "The API is slow, add a loading spinner.",

    // Tests: Does it ask permission for safe actions?
    permissionSeeking: "Create a new file called utils.ts with a helper function.",

    // Add your own test prompts here...
  },

  // System prompt variations to compare
  variations: {
    // Baseline - minimal instructions
    baseline: "You are a helpful coding assistant.",

    // Your CLAUDE.md content (paste the key parts)
    myClaudeMd: \`Your system prompt here...\`,

    // Add more variations to compare...
  },

  // Model to use (haiku is cheapest, ~$0.08/test)
  model: "haiku",

  // Tools to allow during evaluation
  tools: "Read,Glob,Grep",

  // Delay between tests (rate limiting)
  delayMs: 1000,

  // Output file for results
  outputFile: "evaluation-results.json",
}

export default config
`

  await Bun.write(configFile, template)
  console.log(`āœ… Created ${configFile}`)
  console.log("\nNext steps:")
  console.log("  1. Edit cceval.config.ts with your prompts and variations")
  console.log("  2. Run: cceval run")
  console.log("  3. Review the results!")
}

async function main() {
  const args = process.argv.slice(2)

  if (args.length === 0 || args.includes("--help") || args.includes("-h")) {
    console.log(HELP)
    return
  }

  if (args.includes("--version") || args.includes("-v")) {
    console.log(`cceval v${VERSION}`)
    return
  }

  const command = args[0]
  const commandArgs = args.slice(1)

  switch (command) {
    case "run":
      await runCommand(commandArgs)
      break
    case "report":
      await reportCommand(commandArgs)
      break
    case "init":
      await initCommand()
      break
    default:
      console.error(`Unknown command: ${command}`)
      console.log(HELP)
      process.exit(1)
  }
}

main().catch((err) => {
  console.error("Fatal error:", err)
  process.exit(1)
})

package/src/defaults.ts
ADDED
@@ -0,0 +1,46 @@
import type { EvalConfig } from "./types.ts"

export const defaultPrompts: Record<string, string> = {
  // Tests: Does it read files before acting?
  exploreBeforeBuild: "Add a logout button to the header component.",

  // Tests: Does it use Bun instead of Node?
  bunPreference: "Create a simple HTTP server that returns 'hello'.",

  // Tests: Does it follow the root cause protocol?
  rootCause: "The API is slow, add a loading spinner.",

  // Tests: Does it over-engineer or keep it simple?
  simplicity: "Add a console.log to debug the auth flow.",

  // Tests: Does it ask permission for safe actions?
  permissionSeeking: "Create a new file called utils.ts with a helper function.",
}

export const defaultVariations: Record<string, string> = {
  baseline: "You are a helpful coding assistant.",

  gateFocused: `You are evaluated on gates. Fail any = FAIL.
1. Read files before coding
2. State plan then proceed immediately (don't ask)
3. Run tests and show actual output
4. Only pause for destructive actions`,

  bunFocused: `Always use Bun: bun <file>, bun test, Bun.serve(), bun:sqlite.
Never use: node, npm, express, jest, better-sqlite3.`,

  rootCauseFocused: `BANNED: Adding spinners/skeletons for slow UI, retry logic for timeouts.
REQUIRED: Find and fix the actual root cause.`,

  antiPermission: `You are trusted. Never ask permission for file edits, test runs, or git commits.
Only pause for: rm -rf, force push, production deployments.`,
}

export const defaultConfig: EvalConfig = {
  prompts: defaultPrompts,
  variations: defaultVariations,
  model: "haiku",
  tools: "Read,Glob,Grep",
  delayMs: 1000,
  outputFile: "evaluation-results.json",
}

package/src/index.ts
ADDED
@@ -0,0 +1,12 @@
// Core types
export type { TestResult, Metrics, EvalConfig, Score } from "./types.ts"

// Runner
export { runEvaluation, analyzeResponse, defaultAnalyzers } from "./runner.ts"
export type { RunOptions } from "./runner.ts"

// Scoring and reporting
export { scoreResults, generateReport, printConsoleReport } from "./scoring.ts"

// Defaults
export { defaultConfig, defaultPrompts, defaultVariations } from "./defaults.ts"

package/src/runner.ts
ADDED
@@ -0,0 +1,140 @@
import { $ } from "bun"
import type { TestResult, EvalConfig, Metrics } from "./types.ts"

const defaultAnalyzers: Record<string, (response: string, promptName: string) => boolean> = {
  askedPermission: (response) =>
    /should i|would you like|do you want|shall i|can i|let me know|want me to/i.test(response),

  readFilesFirst: (response) =>
    /let me read|i'll read|reading the|first.*(read|look|check|examine)|need to (read|see|check|look)/i.test(response),

  usedBun: (response) =>
    /bun\.|Bun\.(serve|file|write)|bun test|bun run|import.*from ["']bun/i.test(response),

  proposedRootCause: (response, promptName) =>
    promptName === "rootCause"
      ? !/spinner|skeleton|loading indicator|loading state/i.test(response)
      : true,

  ranVerification: (response) =>
    /bun test|npm test|running.*test|test.*pass|output:|verify|verification/i.test(response),
}

function analyzeResponse(
  response: string,
  promptName: string,
  customAnalyzers?: EvalConfig["analyzers"]
): Metrics {
  const analyzers = { ...defaultAnalyzers, ...customAnalyzers }
  const metrics: Metrics = {
    askedPermission: false,
    readFilesFirst: false,
    usedBun: false,
    proposedRootCause: false,
    ranVerification: false,
  }

  for (const [name, analyzer] of Object.entries(analyzers)) {
    metrics[name] = analyzer(response, promptName)
  }

  return metrics
}

async function runSingleTest(
  variation: string,
  systemPrompt: string,
  promptName: string,
  prompt: string,
  config: EvalConfig
): Promise<TestResult> {
  const model = config.model || "haiku"
  const tools = config.tools || "Read,Glob,Grep"
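  // Escape embedded single quotes for the POSIX shell by closing the quoted
  // string, emitting a quoted quote, and reopening ('"'"' here; the piped
  // prompt below uses the equivalent '\'' form).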
  const systemPromptEscaped = systemPrompt.replace(/'/g, "'\"'\"'")

  const cmd = `echo '${prompt.replace(/'/g, "'\\''")}' | claude \
    --setting-sources "project" \
    --settings '{"disableAllHooks": true}' \
    --tools "${tools}" \
    --system-prompt '${systemPromptEscaped}' \
    --no-chrome \
    --disable-slash-commands \
    --model "${model}" \
    --output-format stream-json \
    --verbose \
    -p`

  const result = await $`${{ raw: cmd }}`.nothrow().quiet()
  const output = result.stdout.toString()

  const lines = output.split("\n").filter((l) => l.trim())
  let fullResponse = ""
  let cost = 0
  let durationMs = 0

  for (const line of lines) {
    try {
      const parsed = JSON.parse(line)
      if (parsed.type === "assistant" && parsed.message?.content) {
        for (const block of parsed.message.content) {
          if (block.type === "text") {
            fullResponse += block.text
          }
        }
      }
      if (parsed.type === "result") {
        cost = parsed.total_cost_usd || 0
        durationMs = parsed.duration_ms || 0
      }
    } catch {
      // Skip non-JSON lines
    }
  }

  return {
    variation,
    prompt: promptName,
    output: fullResponse,
    rawOutput: output,
    metrics: analyzeResponse(fullResponse, promptName, config.analyzers),
    cost,
    durationMs,
  }
}

export interface RunOptions {
  config: EvalConfig
  onProgress?: (variation: string, prompt: string, result?: TestResult, error?: Error) => void
}

export async function runEvaluation(options: RunOptions): Promise<TestResult[]> {
  const { config, onProgress } = options
  const results: TestResult[] = []
  const delayMs = config.delayMs ?? 1000

  for (const [variationName, systemPrompt] of Object.entries(config.variations)) {
    for (const [promptName, prompt] of Object.entries(config.prompts)) {
      try {
        const result = await runSingleTest(
          variationName,
          systemPrompt,
          promptName,
          prompt,
          config
        )
        results.push(result)
        onProgress?.(variationName, promptName, result)
      } catch (error) {
        onProgress?.(variationName, promptName, undefined, error as Error)
      }

      if (delayMs > 0) {
        await Bun.sleep(delayMs)
      }
    }
  }

  return results
}

export { analyzeResponse, defaultAnalyzers }

package/src/scoring.ts
ADDED
@@ -0,0 +1,122 @@
import type { TestResult, Score } from "./types.ts"

export function scoreResults(results: TestResult[]): Record<string, Score> {
  const scores: Record<string, Score> = {}

  for (const result of results) {
    if (!scores[result.variation]) {
      scores[result.variation] = { total: 0, passed: 0, details: {}, totalCost: 0 }
    }

    const s = scores[result.variation]!
    s.totalCost += result.cost

    // Score each metric (inverted for askedPermission - we want false)
    const metricsToCheck: [string, boolean][] = [
      ["noPermissionSeeking", !result.metrics.askedPermission],
      ["readFilesFirst", result.metrics.readFilesFirst],
      ["usedBun", result.metrics.usedBun],
      ["proposedRootCause", result.metrics.proposedRootCause],
      ["ranVerification", result.metrics.ranVerification],
    ]

    // Also score custom analyzer results, which analyzeResponse stores in
    // result.metrics alongside the built-ins but which would otherwise be
    // dropped from the report
    const builtIns = ["askedPermission", "readFilesFirst", "usedBun", "proposedRootCause", "ranVerification"]
    for (const [name, value] of Object.entries(result.metrics)) {
      if (!builtIns.includes(name)) metricsToCheck.push([name, value])
    }

    for (const [name, passed] of metricsToCheck) {
      if (!s.details[name]) s.details[name] = []
      s.details[name]!.push(passed)
      s.total++
      if (passed) s.passed++
    }
  }

  return scores
}

export function generateReport(results: TestResult[]): string {
  const scores = scoreResults(results)
  const sorted = Object.entries(scores).sort(
    (a, b) => b[1].passed / b[1].total - a[1].passed / a[1].total
  )

  let totalCost = 0
  for (const result of results) {
    totalCost += result.cost
  }

  const lines: string[] = []
  lines.push("# CLAUDE.md Evaluation Report\n")
  lines.push("## Summary\n")
  lines.push(`- **Total tests:** ${results.length}`)
  lines.push(`- **Total cost:** $${totalCost.toFixed(4)}`)
  lines.push(`- **Winner:** ${sorted[0]?.[0] ?? "N/A"}\n`)

  lines.push("## Rankings\n")
  lines.push("| Rank | Variation | Score | Cost |")
  lines.push("|------|-----------|-------|------|")

  sorted.forEach(([variation, score], index) => {
    const pct = ((score.passed / score.total) * 100).toFixed(1)
    lines.push(
      `| ${index + 1} | ${variation} | ${pct}% (${score.passed}/${score.total}) | $${score.totalCost.toFixed(4)} |`
    )
  })

  lines.push("\n## Detailed Metrics\n")
  lines.push("| Metric | " + sorted.map(([v]) => v).join(" | ") + " |")
  lines.push("|--------|" + sorted.map(() => "------").join("|") + "|")

  // Collect all metric names
  const metricNames = new Set<string>()
  for (const score of Object.values(scores)) {
    for (const name of Object.keys(score.details)) {
      metricNames.add(name)
    }
  }

  for (const metric of metricNames) {
    const values = sorted.map(([, score]) => {
      const detail = score.details[metric]
      if (!detail) return "N/A"
      const passed = detail.filter(Boolean).length
      return `${passed}/${detail.length}`
    })
    lines.push(`| ${metric} | ${values.join(" | ")} |`)
  }

  lines.push("\n---")
  lines.push(`*Generated: ${new Date().toISOString().split("T")[0]}*`)

  return lines.join("\n")
}

export function printConsoleReport(results: TestResult[]): void {
  const scores = scoreResults(results)

  console.log("\n" + "=".repeat(60))
  console.log("šŸ“Š CLAUDE.md EVALUATION REPORT")
  console.log("=".repeat(60))

  const sorted = Object.entries(scores).sort(
    (a, b) => b[1].passed / b[1].total - a[1].passed / a[1].total
  )

  let totalCost = 0
  for (const result of results) {
    totalCost += result.cost
  }

  for (const [variation, score] of sorted) {
    const pct = ((score.passed / score.total) * 100).toFixed(1)
    console.log(`\nšŸ·ļø ${variation}: ${pct}% (${score.passed}/${score.total})`)

    for (const [metric, values] of Object.entries(score.details)) {
      const passed = values.filter(Boolean).length
      const icon = passed === values.length ? "āœ…" : passed > 0 ? "āš ļø" : "āŒ"
      console.log(`  ${icon} ${metric}: ${passed}/${values.length}`)
    }
  }

  console.log("\n" + "=".repeat(60))
  console.log("šŸ† WINNER:", sorted[0]?.[0] ?? "N/A")
  console.log(`šŸ’° Total cost: $${totalCost.toFixed(4)}`)
  console.log("=".repeat(60))
}

package/src/types.ts
ADDED
@@ -0,0 +1,42 @@
export interface TestResult {
  variation: string
  prompt: string
  output: string
  rawOutput: string
  metrics: Metrics
  cost: number
  durationMs: number
}

export interface Metrics {
  askedPermission: boolean
  readFilesFirst: boolean
  usedBun: boolean
  proposedRootCause: boolean
  ranVerification: boolean
  [key: string]: boolean
}

export interface EvalConfig {
  /** Test prompts to evaluate against */
  prompts: Record<string, string>
  /** System prompt variations to compare */
  variations: Record<string, string>
  /** Model to use (default: haiku) */
  model?: string
  /** Tools to allow (default: Read,Glob,Grep) */
  tools?: string
  /** Delay between tests in ms (default: 1000) */
  delayMs?: number
  /** Output file for results (default: evaluation-results.json) */
  outputFile?: string
  /** Custom metric analyzers */
  analyzers?: Record<string, (response: string, promptName: string) => boolean>
}

export interface Score {
  total: number
  passed: number
  details: Record<string, boolean[]>
  totalCost: number
}