cceval 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -122
- package/package.json +1 -1
- package/src/cli.ts +179 -13
- package/src/generator.ts +281 -0
- package/src/index.ts +10 -0
package/README.md
CHANGED
|
@@ -7,10 +7,10 @@ Evaluate and benchmark your CLAUDE.md effectiveness with automated testing.
|
|
|
7
7
|
Your `CLAUDE.md` file guides how Claude Code behaves in your project. But how do you know if your instructions are actually working?
|
|
8
8
|
|
|
9
9
|
**cceval** lets you:
|
|
10
|
-
-
|
|
11
|
-
-
|
|
10
|
+
- Auto-generate variations of your CLAUDE.md
|
|
11
|
+
- Test them against realistic prompts
|
|
12
12
|
- Find what actually improves Claude's behavior
|
|
13
|
-
-
|
|
13
|
+
- Iterate quickly on your instructions
|
|
14
14
|
|
|
15
15
|
## Installation
|
|
16
16
|
|
|
@@ -23,123 +23,161 @@ bun add -D cceval
|
|
|
23
23
|
```
|
|
24
24
|
|
|
25
25
|
**Requirements:**
|
|
26
|
-
- [Bun](https://bun.sh)
|
|
26
|
+
- [Bun](https://bun.sh) >= 1.0
|
|
27
27
|
- [Claude Code CLI](https://docs.anthropic.com/claude-code) installed and authenticated
|
|
28
28
|
|
|
29
|
-
|
|
29
|
+
**Optional (for faster generation):**
|
|
30
|
+
- `ANTHROPIC_API_KEY` environment variable: if set, cceval generates variations through direct Anthropic API calls instead of the Claude CLI (faster and more reliable)
|
|
30
31
|
|
|
31
|
-
|
|
32
|
-
# Run with default prompts and variations
|
|
33
|
-
cceval run
|
|
32
|
+
## Quick Start (Turnkey)
|
|
34
33
|
|
|
35
|
-
|
|
36
|
-
cceval report evaluation-results.json
|
|
34
|
+
Just run `cceval` in any project with a CLAUDE.md:
|
|
37
35
|
|
|
38
|
-
|
|
39
|
-
|
|
36
|
+
```bash
|
|
37
|
+
cd your-project
|
|
38
|
+
cceval
|
|
40
39
|
```
|
|
41
40
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
41
|
+
That's it! cceval will:
|
|
42
|
+
1. Find your CLAUDE.md
|
|
43
|
+
2. Use Claude to generate 5 variations (condensed, explicit, prioritized, minimal, structured), which are tested alongside a generic `baseline` prompt and your `original` CLAUDE.md (7 in total)
|
|
44
|
+
3. Test each variation against 5 realistic prompts
|
|
45
|
+
4. Show you which variation performs best
|
|
45
46
|
|
|
46
|
-
|
|
47
|
+
### Example Output
|
|
47
48
|
|
|
48
|
-
```
|
|
49
|
-
cceval
|
|
49
|
+
```
|
|
50
|
+
🚀 cceval - Turnkey CLAUDE.md Evaluation
|
|
51
|
+
============================================================
|
|
52
|
+
|
|
53
|
+
📄 Source: ./CLAUDE.md
|
|
54
|
+
🤖 Model: haiku
|
|
55
|
+
|
|
56
|
+
🔄 Generating 7 variations...
|
|
57
|
+
|
|
58
|
+
⏳ Generating "condensed"... ✓
|
|
59
|
+
⏳ Generating "explicit"... ✓
|
|
60
|
+
⏳ Generating "prioritized"... ✓
|
|
61
|
+
⏳ Generating "minimal"... ✓
|
|
62
|
+
⏳ Generating "structured"... ✓
|
|
63
|
+
|
|
64
|
+
📦 Cached variations to .cceval-variations.json
|
|
65
|
+
|
|
66
|
+
============================================================
|
|
67
|
+
📊 Starting Evaluation
|
|
68
|
+
============================================================
|
|
69
|
+
Testing 7 variations:
|
|
70
|
+
• baseline
|
|
71
|
+
• original
|
|
72
|
+
• condensed
|
|
73
|
+
• explicit
|
|
74
|
+
• prioritized
|
|
75
|
+
• minimal
|
|
76
|
+
• structured
|
|
77
|
+
|
|
78
|
+
With 5 test prompts each
|
|
79
|
+
Total tests: 35
|
|
80
|
+
============================================================
|
|
81
|
+
|
|
82
|
+
✓ baseline/exploreBeforeBuild: $0.0012
|
|
83
|
+
✓ baseline/bunPreference: $0.0015
|
|
84
|
+
...
|
|
85
|
+
|
|
86
|
+
============================================================
|
|
87
|
+
📊 CLAUDE.md EVALUATION REPORT
|
|
88
|
+
============================================================
|
|
89
|
+
|
|
90
|
+
🏷️ explicit: 72.0% (18/25)
|
|
91
|
+
✅ noPermissionSeeking: 5/5
|
|
92
|
+
✅ readFilesFirst: 5/5
|
|
93
|
+
⚠️ usedBun: 3/5
|
|
94
|
+
...
|
|
95
|
+
|
|
96
|
+
🏆 WINNER: explicit
|
|
97
|
+
💰 Total cost: $0.42
|
|
98
|
+
============================================================
|
|
50
99
|
```
|
|
51
100
|
|
|
52
|
-
|
|
53
|
-
- **baseline**: Minimal "You are a helpful assistant"
|
|
54
|
-
- **gateFocused**: Clear pass/fail criteria
|
|
55
|
-
- **bunFocused**: Bun-specific instructions
|
|
56
|
-
- **rootCauseFocused**: Root cause protocol
|
|
57
|
-
- **antiPermission**: Trust-based prompting
|
|
101
|
+
## How It Works
|
|
58
102
|
|
|
59
|
-
|
|
60
|
-
- Reading files before coding
|
|
61
|
-
- Using Bun instead of Node
|
|
62
|
-
- Fixing root cause vs surface symptoms
|
|
63
|
-
- Asking permission appropriately
|
|
103
|
+
### Variation Strategies
|
|
64
104
|
|
|
65
|
-
|
|
105
|
+
cceval generates these variations from your CLAUDE.md:
|
|
66
106
|
|
|
67
|
-
|
|
107
|
+
| Strategy | Description |
|
|
108
|
+
|----------|-------------|
|
|
109
|
+
| `baseline` | Minimal "You are a helpful coding assistant" |
|
|
110
|
+
| `original` | Your actual CLAUDE.md as-is |
|
|
111
|
+
| `condensed` | Shorter version keeping only critical rules |
|
|
112
|
+
| `explicit` | More explicit version with clear imperatives (MUST, NEVER, ALWAYS) |
|
|
113
|
+
| `prioritized` | Reordered with most important rules first |
|
|
114
|
+
| `minimal` | Just the 3-5 most critical rules |
|
|
115
|
+
| `structured` | Well-organized with clear sections and headers |
|
|
68
116
|
|
|
69
|
-
|
|
70
|
-
cceval init
|
|
71
|
-
```
|
|
117
|
+
### Test Prompts
|
|
72
118
|
|
|
73
|
-
|
|
119
|
+
Default prompts test key behaviors:
|
|
74
120
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
performance: "The dashboard is slow, optimize it.",
|
|
83
|
-
testing: "Add tests for the user service.",
|
|
84
|
-
},
|
|
121
|
+
| Prompt | What it tests |
|
|
122
|
+
|--------|---------------|
|
|
123
|
+
| `exploreBeforeBuild` | Does it read files before coding? |
|
|
124
|
+
| `bunPreference` | Does it use Bun instead of Node? |
|
|
125
|
+
| `rootCause` | Does it fix the root cause instead of adding spinners? |
|
|
126
|
+
| `simplicity` | Does it avoid over-engineering? |
|
|
127
|
+
| `permissionSeeking` | Does it execute without asking permission? |
|
|
85
128
|
|
|
86
|
-
|
|
87
|
-
// Your system prompt variations
|
|
88
|
-
baseline: "You are a helpful coding assistant.",
|
|
129
|
+
## CLI Reference
|
|
89
130
|
|
|
90
|
-
|
|
91
|
-
1. Read files before coding
|
|
92
|
-
2. State plan then proceed immediately
|
|
93
|
-
3. Run tests and show output
|
|
94
|
-
4. Only pause for destructive actions`,
|
|
131
|
+
### Basic Usage
|
|
95
132
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
},
|
|
133
|
+
```bash
|
|
134
|
+
# Turnkey: auto-detect, generate, test
|
|
135
|
+
cceval
|
|
100
136
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
delayMs: 1000, // rate limiting
|
|
104
|
-
outputFile: "results.json",
|
|
105
|
-
}
|
|
137
|
+
# Same as above, explicit
|
|
138
|
+
cceval auto
|
|
106
139
|
|
|
107
|
-
|
|
108
|
-
|
|
140
|
+
# Specify a different CLAUDE.md
|
|
141
|
+
cceval auto -p ./docs/CLAUDE.md
|
|
109
142
|
|
|
110
|
-
|
|
143
|
+
# Use a smarter model for better variations
|
|
144
|
+
cceval auto -m sonnet
|
|
111
145
|
|
|
112
|
-
|
|
113
|
-
cceval
|
|
146
|
+
# Skip regeneration, use cached variations
|
|
147
|
+
cceval auto --skip-generate
|
|
148
|
+
|
|
149
|
+
# See what strategies are available
|
|
150
|
+
cceval auto --strategies-only
|
|
114
151
|
```
|
|
115
152
|
|
|
116
|
-
###
|
|
153
|
+
### Output Options
|
|
117
154
|
|
|
118
155
|
```bash
|
|
119
|
-
# Use specific model
|
|
120
|
-
cceval run -m sonnet
|
|
121
|
-
|
|
122
156
|
# Custom output file
|
|
123
|
-
cceval
|
|
157
|
+
cceval -o my-results.json
|
|
124
158
|
|
|
125
|
-
#
|
|
126
|
-
cceval
|
|
159
|
+
# Generate markdown report too
|
|
160
|
+
cceval --markdown REPORT.md
|
|
161
|
+
```
|
|
127
162
|
|
|
128
|
-
|
|
129
|
-
cceval run --prompts-only
|
|
163
|
+
### Advanced: Custom Config
|
|
130
164
|
|
|
131
|
-
|
|
132
|
-
cceval run --variations-only
|
|
165
|
+
For full control, use a config file:
|
|
133
166
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
167
|
+
```bash
|
|
168
|
+
# Create starter config
|
|
169
|
+
cceval init
|
|
137
170
|
|
|
138
|
-
|
|
171
|
+
# Edit cceval.config.ts with your prompts/variations
|
|
139
172
|
|
|
140
|
-
|
|
173
|
+
# Run with config
|
|
174
|
+
cceval run
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Reports
|
|
141
178
|
|
|
142
179
|
```bash
|
|
180
|
+
# Generate report from previous results
|
|
143
181
|
cceval report evaluation-results.json
|
|
144
182
|
```
|
|
145
183
|
|
|
@@ -155,61 +193,47 @@ cceval measures:
|
|
|
155
193
|
| `proposedRootCause` | For "slow" prompts: fixes root cause instead of adding spinners |
|
|
156
194
|
| `ranVerification` | Mentions running tests or showing output |
|
|
157
195
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
Add your own analyzers:
|
|
161
|
-
|
|
162
|
-
```typescript
|
|
163
|
-
const config: EvalConfig = {
|
|
164
|
-
// ...prompts and variations...
|
|
196
|
+
## Cost Estimates
|
|
165
197
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
198
|
+
| Model | Cost per test | 35 tests (auto) | 25 tests (manual) |
|
|
199
|
+
|-------|---------------|-----------------|-------------------|
|
|
200
|
+
| haiku | ~$0.08 | ~$2.80 | ~$2.00 |
|
|
201
|
+
| sonnet | ~$0.30 | ~$10.50 | ~$7.50 |
|
|
202
|
+
| opus | ~$1.50 | ~$52.50 | ~$37.50 |
|
|
170
203
|
|
|
171
|
-
|
|
172
|
-
consideredSecurity: (response) =>
|
|
173
|
-
/security|auth|permission|sanitize|validate/i.test(response),
|
|
174
|
-
},
|
|
175
|
-
}
|
|
176
|
-
```
|
|
204
|
+
We recommend iterating with **haiku** (it's fast and cheap), then validating your findings with **sonnet**.
|
|
177
205
|
|
|
178
206
|
## Programmatic Usage
|
|
179
207
|
|
|
180
|
-
Use as a library:
|
|
181
|
-
|
|
182
208
|
```typescript
|
|
183
|
-
import {
|
|
209
|
+
import {
|
|
210
|
+
generateVariations,
|
|
211
|
+
variationsToConfig,
|
|
212
|
+
runEvaluation,
|
|
213
|
+
printConsoleReport
|
|
214
|
+
} from "cceval"
|
|
215
|
+
|
|
216
|
+
// Generate variations from a CLAUDE.md
|
|
217
|
+
const generated = await generateVariations({
|
|
218
|
+
claudeMdPath: "./CLAUDE.md",
|
|
219
|
+
model: "haiku",
|
|
220
|
+
})
|
|
184
221
|
|
|
222
|
+
// Convert to config
|
|
223
|
+
const variations = variationsToConfig(generated)
|
|
224
|
+
|
|
225
|
+
// Run evaluation
|
|
185
226
|
const results = await runEvaluation({
|
|
186
227
|
config: {
|
|
187
228
|
prompts: { test: "Create a hello world server." },
|
|
188
|
-
variations
|
|
189
|
-
v1: "Use Bun.",
|
|
190
|
-
v2: "Use Node.",
|
|
191
|
-
},
|
|
229
|
+
variations,
|
|
192
230
|
model: "haiku",
|
|
193
231
|
},
|
|
194
|
-
onProgress: (variation, prompt, result) => {
|
|
195
|
-
console.log(`${variation}/${prompt}: $${result?.cost.toFixed(4)}`)
|
|
196
|
-
},
|
|
197
232
|
})
|
|
198
233
|
|
|
199
234
|
printConsoleReport(results)
|
|
200
|
-
const markdown = generateReport(results)
|
|
201
235
|
```
|
|
202
236
|
|
|
203
|
-
## Cost Estimates
|
|
204
|
-
|
|
205
|
-
| Model | Cost per test | 25 tests |
|
|
206
|
-
|-------|---------------|----------|
|
|
207
|
-
| haiku | ~$0.08 | ~$2.00 |
|
|
208
|
-
| sonnet | ~$0.30 | ~$7.50 |
|
|
209
|
-
| opus | ~$1.50 | ~$37.50 |
|
|
210
|
-
|
|
211
|
-
We recommend starting with **haiku** for iteration, then validating findings with **sonnet**.
|
|
212
|
-
|
|
213
237
|
## Key Findings from Our Research
|
|
214
238
|
|
|
215
239
|
Based on evaluating 25+ prompt variations:
|
|
@@ -231,11 +255,30 @@ File edits and test runs are routine.
|
|
|
231
255
|
```
|
|
232
256
|
|
|
233
257
|
### 3. Verification Is the Biggest Win
|
|
234
|
-
Adding "Run tests and show actual output" improved verification from 20%
|
|
258
|
+
Adding "Run tests and show actual output" improved verification from 20% to 100%.
|
|
235
259
|
|
|
236
260
|
### 4. Keep It Concise
|
|
237
261
|
The winning prompt was just 4 lines. Long instructions get ignored.
|
|
238
262
|
|
|
263
|
+
## Workflow
|
|
264
|
+
|
|
265
|
+
Recommended workflow for optimizing your CLAUDE.md:
|
|
266
|
+
|
|
267
|
+
1. **Baseline**: Run `cceval` to see how your current CLAUDE.md performs
|
|
268
|
+
2. **Analyze**: Look at which variation scored best and why
|
|
269
|
+
3. **Apply**: Update your CLAUDE.md based on the winning strategy
|
|
270
|
+
4. **Iterate**: Run `cceval` again to verify improvement
|
|
271
|
+
|
|
272
|
+
## Files Generated
|
|
273
|
+
|
|
274
|
+
| File | Description |
|
|
275
|
+
|------|-------------|
|
|
276
|
+
| `.cceval-variations.json` | Cached generated variations (re-run faster with `--skip-generate`) |
|
|
277
|
+
| `evaluation-results.json` | Full test results (JSON) |
|
|
278
|
+
| `REPORT.md` | Markdown report (if `--markdown` specified) |
|
|
279
|
+
|
|
280
|
+
Add `.cceval-variations.json` and `evaluation-results.json` to `.gitignore`.
|
|
281
|
+
|
|
239
282
|
## Contributing
|
|
240
283
|
|
|
241
284
|
PRs welcome! Ideas:
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -3,20 +3,36 @@ import { parseArgs } from "util"
|
|
|
3
3
|
import { runEvaluation } from "./runner.ts"
|
|
4
4
|
import { printConsoleReport, generateReport } from "./scoring.ts"
|
|
5
5
|
import { defaultConfig, defaultPrompts, defaultVariations } from "./defaults.ts"
|
|
6
|
+
import {
|
|
7
|
+
generateVariations,
|
|
8
|
+
variationsToConfig,
|
|
9
|
+
findClaudeMd,
|
|
10
|
+
defaultStrategies,
|
|
11
|
+
} from "./generator.ts"
|
|
6
12
|
import type { EvalConfig } from "./types.ts"
|
|
7
13
|
|
|
8
|
-
const VERSION = "0.
|
|
14
|
+
const VERSION = "0.2.0"
|
|
9
15
|
|
|
10
16
|
const HELP = `
|
|
11
17
|
cceval - Evaluate your CLAUDE.md effectiveness
|
|
12
18
|
|
|
13
19
|
Usage:
|
|
14
|
-
cceval
|
|
20
|
+
cceval Auto-detect CLAUDE.md, generate variations, and test
|
|
21
|
+
cceval auto [options] Same as above (explicit)
|
|
22
|
+
cceval run [options] Run with custom config file
|
|
15
23
|
cceval report <file> Generate report from existing results
|
|
16
24
|
cceval init Create a starter config file
|
|
17
25
|
cceval --help Show this help
|
|
18
26
|
cceval --version Show version
|
|
19
27
|
|
|
28
|
+
Auto Options:
|
|
29
|
+
-p, --path <file> Path to CLAUDE.md (default: auto-detect)
|
|
30
|
+
-m, --model <model> Model for generation & testing (default: haiku)
|
|
31
|
+
-o, --output <file> Output file for results (default: evaluation-results.json)
|
|
32
|
+
--markdown <file> Also save markdown report to file
|
|
33
|
+
--strategies-only Only show variation strategies, don't run
|
|
34
|
+
--skip-generate Skip generation, use cached variations
|
|
35
|
+
|
|
20
36
|
Run Options:
|
|
21
37
|
-c, --config <file> Config file (default: cceval.config.ts)
|
|
22
38
|
-m, --model <model> Model to use (default: haiku)
|
|
@@ -26,17 +42,20 @@ Run Options:
|
|
|
26
42
|
--variations-only Only show configured variations, don't run
|
|
27
43
|
|
|
28
44
|
Examples:
|
|
29
|
-
#
|
|
30
|
-
cceval
|
|
45
|
+
# Turnkey: auto-detect CLAUDE.md, generate variations, test
|
|
46
|
+
cceval
|
|
47
|
+
|
|
48
|
+
# Specify a different CLAUDE.md path
|
|
49
|
+
cceval auto -p ./docs/CLAUDE.md
|
|
31
50
|
|
|
32
|
-
#
|
|
33
|
-
cceval
|
|
51
|
+
# Use a smarter model for generation
|
|
52
|
+
cceval auto -m sonnet
|
|
53
|
+
|
|
54
|
+
# Run with custom config file
|
|
55
|
+
cceval run -c my-config.ts
|
|
34
56
|
|
|
35
57
|
# Generate report from previous results
|
|
36
58
|
cceval report evaluation-results.json
|
|
37
|
-
|
|
38
|
-
# Create starter config
|
|
39
|
-
cceval init
|
|
40
59
|
`
|
|
41
60
|
|
|
42
61
|
async function loadConfig(configPath?: string): Promise<EvalConfig> {
|
|
@@ -160,6 +179,138 @@ async function reportCommand(args: string[]) {
|
|
|
160
179
|
console.log("\n" + report)
|
|
161
180
|
}
|
|
162
181
|
|
|
182
|
+
async function autoCommand(args: string[]) {
|
|
183
|
+
const { values } = parseArgs({
|
|
184
|
+
args,
|
|
185
|
+
options: {
|
|
186
|
+
path: { type: "string", short: "p" },
|
|
187
|
+
model: { type: "string", short: "m" },
|
|
188
|
+
output: { type: "string", short: "o" },
|
|
189
|
+
markdown: { type: "string" },
|
|
190
|
+
"strategies-only": { type: "boolean" },
|
|
191
|
+
"skip-generate": { type: "boolean" },
|
|
192
|
+
},
|
|
193
|
+
allowPositionals: true,
|
|
194
|
+
})
|
|
195
|
+
|
|
196
|
+
const model = values.model || "haiku"
|
|
197
|
+
const outputFile = values.output || "evaluation-results.json"
|
|
198
|
+
const cachedVariationsFile = ".cceval-variations.json"
|
|
199
|
+
|
|
200
|
+
// Show strategies only
|
|
201
|
+
if (values["strategies-only"]) {
|
|
202
|
+
console.log("🔀 Variation Strategies:\n")
|
|
203
|
+
for (const strategy of defaultStrategies) {
|
|
204
|
+
console.log(` ${strategy.name}:`)
|
|
205
|
+
console.log(` ${strategy.description}\n`)
|
|
206
|
+
}
|
|
207
|
+
return
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// Check if CLAUDE.md exists
|
|
211
|
+
const claudeMdPath = values.path || (await findClaudeMd())
|
|
212
|
+
if (!claudeMdPath) {
|
|
213
|
+
console.error("❌ No CLAUDE.md found in current directory.")
|
|
214
|
+
console.error("\nTo use cceval, you need a CLAUDE.md file. Options:")
|
|
215
|
+
console.error(" 1. Create a CLAUDE.md in your project root")
|
|
216
|
+
console.error(" 2. Specify a path: cceval auto -p ./path/to/CLAUDE.md")
|
|
217
|
+
console.error(" 3. Use manual config: cceval init && cceval run")
|
|
218
|
+
process.exit(1)
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
let variations: Record<string, string>
|
|
222
|
+
|
|
223
|
+
// Try to use cached variations
|
|
224
|
+
if (values["skip-generate"] && (await Bun.file(cachedVariationsFile).exists())) {
|
|
225
|
+
console.log("📦 Using cached variations from .cceval-variations.json")
|
|
226
|
+
const cached = await Bun.file(cachedVariationsFile).json()
|
|
227
|
+
variations = cached.variations
|
|
228
|
+
} else {
|
|
229
|
+
// Generate variations
|
|
230
|
+
console.log("🚀 cceval - Turnkey CLAUDE.md Evaluation")
|
|
231
|
+
console.log("=".repeat(60))
|
|
232
|
+
console.log(`\n📄 Source: ${claudeMdPath}`)
|
|
233
|
+
console.log(`🤖 Model: ${model}`)
|
|
234
|
+
console.log(`\n🔄 Generating ${defaultStrategies.length + 2} variations...\n`)
|
|
235
|
+
|
|
236
|
+
const generated = await generateVariations({
|
|
237
|
+
claudeMdPath,
|
|
238
|
+
model,
|
|
239
|
+
onProgress: (strategy, status) => {
|
|
240
|
+
if (status === "start") {
|
|
241
|
+
process.stdout.write(` ⏳ Generating "${strategy}"...`)
|
|
242
|
+
} else if (status === "done") {
|
|
243
|
+
console.log(" ✓")
|
|
244
|
+
} else {
|
|
245
|
+
console.log(" ✗")
|
|
246
|
+
}
|
|
247
|
+
},
|
|
248
|
+
})
|
|
249
|
+
|
|
250
|
+
variations = variationsToConfig(generated)
|
|
251
|
+
|
|
252
|
+
// Cache the variations
|
|
253
|
+
await Bun.write(
|
|
254
|
+
cachedVariationsFile,
|
|
255
|
+
JSON.stringify({ sourceFile: claudeMdPath, variations }, null, 2)
|
|
256
|
+
)
|
|
257
|
+
console.log(`\n📦 Cached variations to ${cachedVariationsFile}`)
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Build config with generated variations
|
|
261
|
+
const config: EvalConfig = {
|
|
262
|
+
prompts: defaultPrompts,
|
|
263
|
+
variations,
|
|
264
|
+
model,
|
|
265
|
+
tools: "Read,Glob,Grep",
|
|
266
|
+
delayMs: 1000,
|
|
267
|
+
outputFile,
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
const totalTests =
|
|
271
|
+
Object.keys(config.prompts).length * Object.keys(config.variations).length
|
|
272
|
+
|
|
273
|
+
console.log("\n" + "=".repeat(60))
|
|
274
|
+
console.log("📊 Starting Evaluation")
|
|
275
|
+
console.log("=".repeat(60))
|
|
276
|
+
console.log(`Testing ${Object.keys(config.variations).length} variations:`)
|
|
277
|
+
for (const name of Object.keys(config.variations)) {
|
|
278
|
+
console.log(` • ${name}`)
|
|
279
|
+
}
|
|
280
|
+
console.log(`\nWith ${Object.keys(config.prompts).length} test prompts each`)
|
|
281
|
+
console.log(`Total tests: ${totalTests}`)
|
|
282
|
+
console.log("=".repeat(60) + "\n")
|
|
283
|
+
|
|
284
|
+
const results = await runEvaluation({
|
|
285
|
+
config,
|
|
286
|
+
onProgress: (variation, prompt, result, error) => {
|
|
287
|
+
if (error) {
|
|
288
|
+
console.log(` ✗ ${variation}/${prompt}: Error - ${error.message}`)
|
|
289
|
+
} else if (result) {
|
|
290
|
+
console.log(` ✓ ${variation}/${prompt}: $${result.cost.toFixed(4)}`)
|
|
291
|
+
}
|
|
292
|
+
},
|
|
293
|
+
})
|
|
294
|
+
|
|
295
|
+
printConsoleReport(results)
|
|
296
|
+
|
|
297
|
+
// Save JSON results
|
|
298
|
+
await Bun.write(outputFile, JSON.stringify(results, null, 2))
|
|
299
|
+
console.log(`\n📁 Results saved to ${outputFile}`)
|
|
300
|
+
|
|
301
|
+
// Save markdown if requested
|
|
302
|
+
if (values.markdown) {
|
|
303
|
+
const report = generateReport(results)
|
|
304
|
+
await Bun.write(values.markdown, report)
|
|
305
|
+
console.log(`📄 Markdown report saved to ${values.markdown}`)
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
console.log("\n💡 Tips:")
|
|
309
|
+
console.log(" • Re-run faster with: cceval auto --skip-generate")
|
|
310
|
+
console.log(" • Generate new variations: cceval auto (without --skip-generate)")
|
|
311
|
+
console.log(" • See full report: cceval report " + outputFile)
|
|
312
|
+
}
|
|
313
|
+
|
|
163
314
|
async function initCommand() {
|
|
164
315
|
const configFile = "cceval.config.ts"
|
|
165
316
|
|
|
@@ -227,7 +378,7 @@ export default config
|
|
|
227
378
|
async function main() {
|
|
228
379
|
const args = process.argv.slice(2)
|
|
229
380
|
|
|
230
|
-
if (args.
|
|
381
|
+
if (args.includes("--help") || args.includes("-h")) {
|
|
231
382
|
console.log(HELP)
|
|
232
383
|
return
|
|
233
384
|
}
|
|
@@ -237,10 +388,19 @@ async function main() {
|
|
|
237
388
|
return
|
|
238
389
|
}
|
|
239
390
|
|
|
391
|
+
// No args = turnkey auto mode
|
|
392
|
+
if (args.length === 0) {
|
|
393
|
+
await autoCommand([])
|
|
394
|
+
return
|
|
395
|
+
}
|
|
396
|
+
|
|
240
397
|
const command = args[0]
|
|
241
398
|
const commandArgs = args.slice(1)
|
|
242
399
|
|
|
243
400
|
switch (command) {
|
|
401
|
+
case "auto":
|
|
402
|
+
await autoCommand(commandArgs)
|
|
403
|
+
break
|
|
244
404
|
case "run":
|
|
245
405
|
await runCommand(commandArgs)
|
|
246
406
|
break
|
|
@@ -251,9 +411,15 @@ async function main() {
|
|
|
251
411
|
await initCommand()
|
|
252
412
|
break
|
|
253
413
|
default:
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
414
|
+
// If not a known command, treat as auto with args
|
|
415
|
+
// This handles cases like: cceval -p ./CLAUDE.md
|
|
416
|
+
if (command?.startsWith("-")) {
|
|
417
|
+
await autoCommand(args)
|
|
418
|
+
} else {
|
|
419
|
+
console.error(`Unknown command: ${command}`)
|
|
420
|
+
console.log(HELP)
|
|
421
|
+
process.exit(1)
|
|
422
|
+
}
|
|
257
423
|
}
|
|
258
424
|
}
|
|
259
425
|
|
package/src/generator.ts
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
import { $ } from "bun"
|
|
2
|
+
|
|
3
|
+
export interface VariationStrategy {
|
|
4
|
+
name: string
|
|
5
|
+
description: string
|
|
6
|
+
prompt: string
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
export const defaultStrategies: VariationStrategy[] = [
|
|
10
|
+
{
|
|
11
|
+
name: "condensed",
|
|
12
|
+
description: "Shorter version keeping only critical rules",
|
|
13
|
+
prompt: `Create a condensed version of this CLAUDE.md that:
|
|
14
|
+
- Keeps only the most critical rules and instructions
|
|
15
|
+
- Removes redundant or overlapping guidance
|
|
16
|
+
- Uses shorter, more direct language
|
|
17
|
+
- Aims for 30-50% of the original length
|
|
18
|
+
- Preserves the most impactful behaviors
|
|
19
|
+
|
|
20
|
+
Return ONLY the condensed content, no explanations.`,
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
name: "explicit",
|
|
24
|
+
description: "More explicit version with clear imperatives",
|
|
25
|
+
prompt: `Create a more explicit version of this CLAUDE.md that:
|
|
26
|
+
- Converts suggestions into direct commands (MUST, NEVER, ALWAYS)
|
|
27
|
+
- Makes implicit rules explicit
|
|
28
|
+
- Uses bullet points and clear formatting
|
|
29
|
+
- Removes ambiguous language
|
|
30
|
+
- Keeps the same rules but makes them unmistakably clear
|
|
31
|
+
|
|
32
|
+
Return ONLY the explicit content, no explanations.`,
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
name: "prioritized",
|
|
36
|
+
description: "Reordered with most important rules first",
|
|
37
|
+
prompt: `Reorganize this CLAUDE.md to:
|
|
38
|
+
- Put the most important/impactful rules at the very top
|
|
39
|
+
- Group related rules together
|
|
40
|
+
- Add a "Critical Rules" section at the start with the top 5 rules
|
|
41
|
+
- Keep all original content but in a better order
|
|
42
|
+
- Rules that affect code quality should come before style rules
|
|
43
|
+
|
|
44
|
+
Return ONLY the reorganized content, no explanations.`,
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
name: "minimal",
|
|
48
|
+
description: "Just the 3-5 most critical rules",
|
|
49
|
+
prompt: `Extract ONLY the 3-5 most critical rules from this CLAUDE.md:
|
|
50
|
+
- Choose rules that have the highest impact on code quality
|
|
51
|
+
- Choose rules that are most unique to this project
|
|
52
|
+
- Skip generic advice that any AI would follow
|
|
53
|
+
- Format as a very short, focused instruction set
|
|
54
|
+
|
|
55
|
+
Return ONLY the minimal rules, no explanations.`,
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
name: "structured",
|
|
59
|
+
description: "Well-organized with clear sections and headers",
|
|
60
|
+
prompt: `Restructure this CLAUDE.md into a well-organized document:
|
|
61
|
+
- Add clear section headers (## Syntax)
|
|
62
|
+
- Group related rules under appropriate headers
|
|
63
|
+
- Use consistent formatting throughout
|
|
64
|
+
- Add brief section summaries where helpful
|
|
65
|
+
- Keep all original rules but organize them logically
|
|
66
|
+
|
|
67
|
+
Return ONLY the restructured content, no explanations.`,
|
|
68
|
+
},
|
|
69
|
+
]
|
|
70
|
+
|
|
71
|
+
export interface GenerateOptions {
|
|
72
|
+
claudeMdPath?: string
|
|
73
|
+
strategies?: VariationStrategy[]
|
|
74
|
+
model?: string
|
|
75
|
+
onProgress?: (strategy: string, status: "start" | "done" | "error", result?: string) => void
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
export async function findClaudeMd(startPath?: string): Promise<string | null> {
|
|
79
|
+
const searchPaths = startPath
|
|
80
|
+
? [startPath]
|
|
81
|
+
: ["./CLAUDE.md", "./.claude/CLAUDE.md", "../CLAUDE.md"]
|
|
82
|
+
|
|
83
|
+
for (const p of searchPaths) {
|
|
84
|
+
if (await Bun.file(p).exists()) {
|
|
85
|
+
return p
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
return null
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
export async function readClaudeMd(path: string): Promise<string> {
|
|
92
|
+
const file = Bun.file(path)
|
|
93
|
+
if (!(await file.exists())) {
|
|
94
|
+
throw new Error(`CLAUDE.md not found at ${path}`)
|
|
95
|
+
}
|
|
96
|
+
return await file.text()
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// Map cceval model names to Anthropic model IDs
|
|
100
|
+
function getAnthropicModel(model: string): string {
|
|
101
|
+
const modelMap: Record<string, string> = {
|
|
102
|
+
haiku: "claude-haiku-4-5-20251001",
|
|
103
|
+
sonnet: "claude-sonnet-4-20250514",
|
|
104
|
+
opus: "claude-opus-4-5-20251101",
|
|
105
|
+
}
|
|
106
|
+
return modelMap[model] || model
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
async function generateVariationWithAPI(
|
|
110
|
+
prompt: string,
|
|
111
|
+
model: string
|
|
112
|
+
): Promise<string> {
|
|
113
|
+
const apiKey = process.env.ANTHROPIC_API_KEY
|
|
114
|
+
if (!apiKey) {
|
|
115
|
+
throw new Error("ANTHROPIC_API_KEY not set")
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const anthropicModel = getAnthropicModel(model)
|
|
119
|
+
|
|
120
|
+
const response = await fetch("https://api.anthropic.com/v1/messages", {
|
|
121
|
+
method: "POST",
|
|
122
|
+
headers: {
|
|
123
|
+
"Content-Type": "application/json",
|
|
124
|
+
"x-api-key": apiKey,
|
|
125
|
+
"anthropic-version": "2023-06-01",
|
|
126
|
+
},
|
|
127
|
+
body: JSON.stringify({
|
|
128
|
+
model: anthropicModel,
|
|
129
|
+
max_tokens: 4096,
|
|
130
|
+
messages: [{ role: "user", content: prompt }],
|
|
131
|
+
}),
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
if (!response.ok) {
|
|
135
|
+
const error = await response.text()
|
|
136
|
+
throw new Error(`Anthropic API error: ${response.status} - ${error}`)
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
const data = (await response.json()) as {
|
|
140
|
+
content?: Array<{ type: string; text?: string }>
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
let result = ""
|
|
144
|
+
for (const block of data.content || []) {
|
|
145
|
+
if (block.type === "text" && block.text) {
|
|
146
|
+
result += block.text
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
return result.trim()
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
async function generateVariationWithCLI(
|
|
154
|
+
prompt: string,
|
|
155
|
+
model: string
|
|
156
|
+
): Promise<string> {
|
|
157
|
+
// Write prompt to a temp file to avoid shell escaping issues
|
|
158
|
+
const tempFile = `/tmp/cceval-prompt-${Date.now()}.txt`
|
|
159
|
+
await Bun.write(tempFile, prompt)
|
|
160
|
+
|
|
161
|
+
try {
|
|
162
|
+
// Use --tools "" to disable all tools, preventing file modifications
|
|
163
|
+
// Use --settings to disable hooks
|
|
164
|
+
const cmd = `cat "${tempFile}" | claude \
|
|
165
|
+
--model "${model}" \
|
|
166
|
+
--no-chrome \
|
|
167
|
+
--disable-slash-commands \
|
|
168
|
+
--tools "" \
|
|
169
|
+
--settings '{"disableAllHooks": true}' \
|
|
170
|
+
--output-format stream-json \
|
|
171
|
+
--verbose \
|
|
172
|
+
-p`
|
|
173
|
+
|
|
174
|
+
const result = await $`${{ raw: cmd }}`.nothrow().quiet()
|
|
175
|
+
const output = result.stdout.toString()
|
|
176
|
+
|
|
177
|
+
// Parse the stream-json output
|
|
178
|
+
const lines = output.split("\n").filter((l) => l.trim())
|
|
179
|
+
let fullResponse = ""
|
|
180
|
+
|
|
181
|
+
for (const line of lines) {
|
|
182
|
+
try {
|
|
183
|
+
const parsed = JSON.parse(line)
|
|
184
|
+
if (parsed.type === "assistant" && parsed.message?.content) {
|
|
185
|
+
for (const block of parsed.message.content) {
|
|
186
|
+
if (block.type === "text") {
|
|
187
|
+
fullResponse += block.text
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
} catch {
|
|
192
|
+
// Skip non-JSON lines
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
return fullResponse.trim()
|
|
197
|
+
} finally {
|
|
198
|
+
// Clean up temp file
|
|
199
|
+
await Bun.file(tempFile).exists() && (await $`rm ${tempFile}`.nothrow())
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
async function generateVariation(
|
|
204
|
+
originalContent: string,
|
|
205
|
+
strategy: VariationStrategy,
|
|
206
|
+
model: string
|
|
207
|
+
): Promise<string> {
|
|
208
|
+
const prompt = `Here is a CLAUDE.md file that configures an AI coding assistant:
|
|
209
|
+
|
|
210
|
+
<claude_md>
|
|
211
|
+
${originalContent}
|
|
212
|
+
</claude_md>
|
|
213
|
+
|
|
214
|
+
${strategy.prompt}`
|
|
215
|
+
|
|
216
|
+
// Try direct API first (faster, more reliable), fall back to CLI
|
|
217
|
+
if (process.env.ANTHROPIC_API_KEY) {
|
|
218
|
+
return generateVariationWithAPI(prompt, model)
|
|
219
|
+
} else {
|
|
220
|
+
return generateVariationWithCLI(prompt, model)
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
export interface GeneratedVariations {
|
|
225
|
+
original: string
|
|
226
|
+
baseline: string
|
|
227
|
+
variations: Record<string, string>
|
|
228
|
+
sourceFile: string
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
export async function generateVariations(
|
|
232
|
+
options: GenerateOptions = {}
|
|
233
|
+
): Promise<GeneratedVariations> {
|
|
234
|
+
const { strategies = defaultStrategies, model = "haiku", onProgress } = options
|
|
235
|
+
|
|
236
|
+
// Find and read CLAUDE.md
|
|
237
|
+
const claudeMdPath = options.claudeMdPath || (await findClaudeMd())
|
|
238
|
+
if (!claudeMdPath) {
|
|
239
|
+
throw new Error(
|
|
240
|
+
"No CLAUDE.md found. Please create one or specify --path <file>"
|
|
241
|
+
)
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
const originalContent = await readClaudeMd(claudeMdPath)
|
|
245
|
+
console.log(`📄 Found CLAUDE.md at ${claudeMdPath} (${originalContent.length} chars)`)
|
|
246
|
+
|
|
247
|
+
const variations: Record<string, string> = {}
|
|
248
|
+
|
|
249
|
+
// Generate each variation
|
|
250
|
+
for (const strategy of strategies) {
|
|
251
|
+
onProgress?.(strategy.name, "start")
|
|
252
|
+
try {
|
|
253
|
+
const variation = await generateVariation(originalContent, strategy, model)
|
|
254
|
+
variations[strategy.name] = variation
|
|
255
|
+
onProgress?.(strategy.name, "done", variation)
|
|
256
|
+
} catch (error) {
|
|
257
|
+
onProgress?.(strategy.name, "error")
|
|
258
|
+
console.error(` ✗ Failed to generate ${strategy.name}: ${error}`)
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
// Small delay to avoid rate limiting
|
|
262
|
+
await Bun.sleep(500)
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
return {
|
|
266
|
+
original: originalContent,
|
|
267
|
+
baseline: "You are a helpful coding assistant.",
|
|
268
|
+
variations,
|
|
269
|
+
sourceFile: claudeMdPath,
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
export function variationsToConfig(
|
|
274
|
+
generated: GeneratedVariations
|
|
275
|
+
): Record<string, string> {
|
|
276
|
+
return {
|
|
277
|
+
baseline: generated.baseline,
|
|
278
|
+
original: generated.original,
|
|
279
|
+
...generated.variations,
|
|
280
|
+
}
|
|
281
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -10,3 +10,13 @@ export { scoreResults, generateReport, printConsoleReport } from "./scoring.ts"
|
|
|
10
10
|
|
|
11
11
|
// Defaults
|
|
12
12
|
export { defaultConfig, defaultPrompts, defaultVariations } from "./defaults.ts"
|
|
13
|
+
|
|
14
|
+
// Generator (for turnkey variation generation)
|
|
15
|
+
export {
|
|
16
|
+
generateVariations,
|
|
17
|
+
variationsToConfig,
|
|
18
|
+
findClaudeMd,
|
|
19
|
+
readClaudeMd,
|
|
20
|
+
defaultStrategies,
|
|
21
|
+
} from "./generator.ts"
|
|
22
|
+
export type { VariationStrategy, GenerateOptions, GeneratedVariations } from "./generator.ts"
|