agentv 3.9.2 → 3.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OIVGGWJ3.js → chunk-GWHHM6X2.js} +25 -14
- package/dist/chunk-GWHHM6X2.js.map +1 -0
- package/dist/{chunk-6ZAFWUBT.js → chunk-JLFFYTZA.js} +4 -4
- package/dist/{chunk-JGMJL2LV.js → chunk-TXCVDTEE.js} +8 -7
- package/dist/{chunk-JGMJL2LV.js.map → chunk-TXCVDTEE.js.map} +1 -1
- package/dist/cli.js +3 -3
- package/dist/{dist-PUPHGVKL.js → dist-FPC7J7KQ.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-BD56NB23.js → interactive-N463HRIL.js} +3 -3
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +84 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +144 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +67 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +101 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +458 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +36 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +118 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +12753 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +77 -0
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +50 -0
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +78 -0
- package/dist/templates/.agentv/.env.example +25 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +177 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +316 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +137 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +27 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +115 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +278 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +333 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +79 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +121 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +298 -0
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +78 -0
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +5 -0
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +4 -0
- package/package.json +3 -3
- package/dist/chunk-OIVGGWJ3.js.map +0 -1
- /package/dist/{chunk-6ZAFWUBT.js.map → chunk-JLFFYTZA.js.map} +0 -0
- /package/dist/{dist-PUPHGVKL.js.map → dist-FPC7J7KQ.js.map} +0 -0
- /package/dist/{interactive-BD56NB23.js.map → interactive-N463HRIL.js.map} +0 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# Batch CLI Evaluation Guide
|
|
2
|
+
|
|
3
|
+
Guide for evaluating batch CLI output where a single runner processes all evalcases at once and outputs JSONL.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Batch CLI evaluation is used when:
|
|
8
|
+
- An external tool processes multiple inputs in a single invocation (e.g., AML screening, bulk classification)
|
|
9
|
+
- The runner reads the eval YAML directly to extract all evalcases
|
|
10
|
+
- Output is JSONL with records keyed by evalcase `id`
|
|
11
|
+
- Each evalcase has its own evaluator to validate its corresponding output record
|
|
12
|
+
|
|
13
|
+
## Execution Flow
|
|
14
|
+
|
|
15
|
+
1. **AgentV** invokes the batch runner once, passing `--eval <yaml-path>` and `--output <jsonl-path>`
|
|
16
|
+
2. **Batch runner** reads the eval YAML, extracts all evalcases, processes them, writes JSONL output keyed by `id`
|
|
17
|
+
3. **AgentV** parses JSONL, routes each record to its matching evalcase by `id`
|
|
18
|
+
4. **Per-case evaluator** validates the output for each evalcase independently
|
|
19
|
+
|
|
20
|
+
## Eval File Structure
|
|
21
|
+
|
|
22
|
+
```yaml
|
|
23
|
+
description: Batch CLI demo using structured input_messages
|
|
24
|
+
execution:
|
|
25
|
+
target: batch_cli
|
|
26
|
+
|
|
27
|
+
evalcases:
|
|
28
|
+
- id: case-001
|
|
29
|
+
expected_outcome: |-
|
|
30
|
+
Batch runner returns JSON with decision=CLEAR.
|
|
31
|
+
|
|
32
|
+
expected_messages:
|
|
33
|
+
- role: assistant
|
|
34
|
+
content:
|
|
35
|
+
decision: CLEAR # Structured expected output
|
|
36
|
+
|
|
37
|
+
input_messages:
|
|
38
|
+
- role: system
|
|
39
|
+
content: You are a batch processor.
|
|
40
|
+
- role: user
|
|
41
|
+
content: # Structured input (runner extracts this)
|
|
42
|
+
request:
|
|
43
|
+
type: screening_check
|
|
44
|
+
jurisdiction: AU
|
|
45
|
+
row:
|
|
46
|
+
id: case-001
|
|
47
|
+
name: Example A
|
|
48
|
+
amount: 5000
|
|
49
|
+
|
|
50
|
+
execution:
|
|
51
|
+
evaluators:
|
|
52
|
+
- name: decision-check
|
|
53
|
+
type: code_judge
|
|
54
|
+
script: bun run ./scripts/check-output.ts
|
|
55
|
+
cwd: .
|
|
56
|
+
|
|
57
|
+
- id: case-002
|
|
58
|
+
expected_outcome: |-
|
|
59
|
+
Batch runner returns JSON with decision=REVIEW.
|
|
60
|
+
|
|
61
|
+
expected_messages:
|
|
62
|
+
- role: assistant
|
|
63
|
+
content:
|
|
64
|
+
decision: REVIEW
|
|
65
|
+
|
|
66
|
+
input_messages:
|
|
67
|
+
- role: system
|
|
68
|
+
content: You are a batch processor.
|
|
69
|
+
- role: user
|
|
70
|
+
content:
|
|
71
|
+
request:
|
|
72
|
+
type: screening_check
|
|
73
|
+
jurisdiction: AU
|
|
74
|
+
row:
|
|
75
|
+
id: case-002
|
|
76
|
+
name: Example B
|
|
77
|
+
amount: 25000
|
|
78
|
+
|
|
79
|
+
execution:
|
|
80
|
+
evaluators:
|
|
81
|
+
- name: decision-check
|
|
82
|
+
type: code_judge
|
|
83
|
+
script: bun run ./scripts/check-output.ts
|
|
84
|
+
cwd: .
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Batch Runner Implementation
|
|
88
|
+
|
|
89
|
+
The batch runner reads the eval YAML directly and processes all evalcases in one invocation.
|
|
90
|
+
|
|
91
|
+
### Runner Contract
|
|
92
|
+
|
|
93
|
+
**Input:** The runner receives the eval file path via `--eval` flag:
|
|
94
|
+
```bash
|
|
95
|
+
bun run batch-runner.ts --eval ./my-eval.yaml --output ./results.jsonl
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Output:** JSONL file where each line is a JSON object with:
|
|
99
|
+
```json
|
|
100
|
+
{"id": "case-001", "text": "{\"decision\": \"CLEAR\", ...}"}
|
|
101
|
+
{"id": "case-002", "text": "{\"decision\": \"REVIEW\", ...}"}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
The `id` field must match the evalcase `id` for AgentV to route output to the correct evaluator.
|
|
105
|
+
|
|
106
|
+
### Output with Tool Trajectory Support
|
|
107
|
+
|
|
108
|
+
To enable `tool_trajectory` evaluation, include `output_messages` with `tool_calls`:
|
|
109
|
+
|
|
110
|
+
```json
|
|
111
|
+
{
|
|
112
|
+
"id": "case-001",
|
|
113
|
+
"text": "{\"decision\": \"CLEAR\", ...}",
|
|
114
|
+
"output_messages": [
|
|
115
|
+
{
|
|
116
|
+
"role": "assistant",
|
|
117
|
+
"tool_calls": [
|
|
118
|
+
{
|
|
119
|
+
"tool": "screening_check",
|
|
120
|
+
"input": { "origin_country": "NZ", "amount": 5000 },
|
|
121
|
+
"output": { "decision": "CLEAR", "reasons": [] }
|
|
122
|
+
}
|
|
123
|
+
]
|
|
124
|
+
},
|
|
125
|
+
{
|
|
126
|
+
"role": "assistant",
|
|
127
|
+
"content": { "decision": "CLEAR" }
|
|
128
|
+
}
|
|
129
|
+
]
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
AgentV extracts tool calls directly from `output_messages[].tool_calls[]` for `tool_trajectory` evaluators. This is the recommended format for batch runners that make tool calls.
|
|
134
|
+
|
|
135
|
+
### Example Runner (TypeScript)
|
|
136
|
+
|
|
137
|
+
```typescript
|
|
138
|
+
import fs from 'node:fs/promises';
|
|
139
|
+
import { parse } from 'yaml';
|
|
140
|
+
|
|
141
|
+
type EvalCase = {
|
|
142
|
+
id: string;
|
|
143
|
+
input_messages: Array<{ role: string; content: unknown }>;
|
|
144
|
+
};
|
|
145
|
+
|
|
146
|
+
async function main() {
|
|
147
|
+
const args = process.argv.slice(2);
|
|
148
|
+
const evalPath = getFlag(args, '--eval');
|
|
149
|
+
const outPath = getFlag(args, '--output');
|
|
150
|
+
|
|
151
|
+
// Read and parse eval YAML
|
|
152
|
+
const yamlText = await fs.readFile(evalPath, 'utf8');
|
|
153
|
+
const parsed = parse(yamlText);
|
|
154
|
+
const evalcases = parsed.evalcases as EvalCase[];
|
|
155
|
+
|
|
156
|
+
// Process each evalcase
|
|
157
|
+
const results: Array<{ id: string; text: string }> = [];
|
|
158
|
+
for (const evalcase of evalcases) {
|
|
159
|
+
const userContent = findUserContent(evalcase.input_messages);
|
|
160
|
+
const decision = processInput(userContent); // Your logic here
|
|
161
|
+
|
|
162
|
+
results.push({
|
|
163
|
+
id: evalcase.id,
|
|
164
|
+
text: JSON.stringify({ decision, ...otherFields }),
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Write JSONL output
|
|
169
|
+
const jsonl = results.map((r) => JSON.stringify(r)).join('\n') + '\n';
|
|
170
|
+
await fs.writeFile(outPath, jsonl, 'utf8');
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
function getFlag(args: string[], name: string): string {
|
|
174
|
+
const idx = args.indexOf(name);
|
|
175
|
+
return args[idx + 1];
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
function findUserContent(messages: Array<{ role: string; content: unknown }>) {
|
|
179
|
+
return messages.find((m) => m.role === 'user')?.content;
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Evaluator Implementation
|
|
184
|
+
|
|
185
|
+
Each evalcase has its own evaluator that validates the output. The evaluator receives the standard code_judge input.
|
|
186
|
+
|
|
187
|
+
### Evaluator Contract
|
|
188
|
+
|
|
189
|
+
**Input (stdin):** Standard AgentV code_judge format:
|
|
190
|
+
```json
|
|
191
|
+
{
|
|
192
|
+
"candidate_answer": "{\"id\":\"case-001\",\"decision\":\"CLEAR\",...}",
|
|
193
|
+
"expected_messages": [{"role": "assistant", "content": {"decision": "CLEAR"}}],
|
|
194
|
+
"input_messages": [...],
|
|
195
|
+
...
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**Output (stdout):** Standard evaluator result:
|
|
200
|
+
```json
|
|
201
|
+
{
|
|
202
|
+
"score": 1.0,
|
|
203
|
+
"hits": ["decision matches: CLEAR"],
|
|
204
|
+
"misses": [],
|
|
205
|
+
"reasoning": "Batch runner decision matches expected."
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Example Evaluator (TypeScript)
|
|
210
|
+
|
|
211
|
+
```typescript
|
|
212
|
+
import fs from 'node:fs';
|
|
213
|
+
|
|
214
|
+
type EvalInput = {
|
|
215
|
+
candidate_answer?: string;
|
|
216
|
+
expected_messages?: Array<{ role: string; content: unknown }>;
|
|
217
|
+
};
|
|
218
|
+
|
|
219
|
+
function main() {
|
|
220
|
+
const stdin = fs.readFileSync(0, 'utf8');
|
|
221
|
+
const input = JSON.parse(stdin) as EvalInput;
|
|
222
|
+
|
|
223
|
+
// Extract expected value from expected_messages
|
|
224
|
+
const expectedDecision = findExpectedDecision(input.expected_messages);
|
|
225
|
+
|
|
226
|
+
// Parse candidate answer (output from batch runner)
|
|
227
|
+
let candidateDecision: string | undefined;
|
|
228
|
+
try {
|
|
229
|
+
const parsed = JSON.parse(input.candidate_answer ?? '');
|
|
230
|
+
candidateDecision = parsed.decision;
|
|
231
|
+
} catch {
|
|
232
|
+
candidateDecision = undefined;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// Compare
|
|
236
|
+
const hits: string[] = [];
|
|
237
|
+
const misses: string[] = [];
|
|
238
|
+
|
|
239
|
+
if (expectedDecision === candidateDecision) {
|
|
240
|
+
hits.push(`decision matches: ${expectedDecision}`);
|
|
241
|
+
} else {
|
|
242
|
+
misses.push(`mismatch: expected=${expectedDecision} actual=${candidateDecision}`);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
const score = misses.length === 0 ? 1 : 0;
|
|
246
|
+
|
|
247
|
+
process.stdout.write(JSON.stringify({
|
|
248
|
+
score,
|
|
249
|
+
hits,
|
|
250
|
+
misses,
|
|
251
|
+
reasoning: score === 1
|
|
252
|
+
? 'Batch runner output matches expected.'
|
|
253
|
+
: 'Batch runner output did not match expected.',
|
|
254
|
+
}));
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
function findExpectedDecision(messages?: Array<{ role: string; content: unknown }>) {
|
|
258
|
+
if (!messages) return undefined;
|
|
259
|
+
for (const msg of messages) {
|
|
260
|
+
if (typeof msg.content === 'object' && msg.content !== null) {
|
|
261
|
+
return (msg.content as Record<string, unknown>).decision as string;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
return undefined;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
main();
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
## Structured Content in expected_messages
|
|
271
|
+
|
|
272
|
+
For batch evaluation, use structured objects in `expected_messages.content` to define expected output fields:
|
|
273
|
+
|
|
274
|
+
```yaml
|
|
275
|
+
expected_messages:
|
|
276
|
+
- role: assistant
|
|
277
|
+
content:
|
|
278
|
+
decision: CLEAR
|
|
279
|
+
confidence: high
|
|
280
|
+
reasons: []
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
The evaluator then extracts these fields and compares against the parsed candidate output.
|
|
284
|
+
|
|
285
|
+
## Best Practices
|
|
286
|
+
|
|
287
|
+
1. **Use unique evalcase IDs** - The batch runner and AgentV use `id` to route outputs
|
|
288
|
+
2. **Structured input_messages** - Put structured data in `user.content` for the runner to extract
|
|
289
|
+
3. **Structured expected_messages** - Define expected output as objects for easy validation
|
|
290
|
+
4. **Deterministic runners** - Batch runners should produce consistent output for testing
|
|
291
|
+
5. **Healthcheck support** - Add `--healthcheck` flag for runner validation:
|
|
292
|
+
```typescript
|
|
293
|
+
if (args.includes('--healthcheck')) {
|
|
294
|
+
console.log('batch-runner: healthy');
|
|
295
|
+
return;
|
|
296
|
+
}
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
## Target Configuration
|
|
300
|
+
|
|
301
|
+
Configure the batch CLI provider in your target:
|
|
302
|
+
|
|
303
|
+
```yaml
|
|
304
|
+
# In agentv-targets.yaml or eval file
|
|
305
|
+
targets:
|
|
306
|
+
batch_cli:
|
|
307
|
+
provider: cli
|
|
308
|
+
commandTemplate: bun run ./scripts/batch-runner.ts --eval {EVAL_FILE} --output {OUTPUT_FILE}
|
|
309
|
+
provider_batching: true
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
Key settings:
|
|
313
|
+
- `provider: cli` - Use CLI provider
|
|
314
|
+
- `provider_batching: true` - Run once for all evalcases
|
|
315
|
+
- `{EVAL_FILE}` - Placeholder for eval file path
|
|
316
|
+
- `{OUTPUT_FILE}` - Placeholder for JSONL output path
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Compare Command
|
|
2
|
+
|
|
3
|
+
Compare evaluation results between two runs to measure performance differences.
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
agentv compare <baseline.jsonl> <candidate.jsonl> [options]
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Arguments
|
|
12
|
+
|
|
13
|
+
| Argument | Description |
|
|
14
|
+
|----------|-------------|
|
|
15
|
+
| `result1` | Path to baseline JSONL result file |
|
|
16
|
+
| `result2` | Path to candidate JSONL result file |
|
|
17
|
+
| `--threshold`, `-t` | Score delta threshold for win/loss classification (default: 0.1) |
|
|
18
|
+
| `--format`, `-f` | Output format: `table` (default) or `json` |
|
|
19
|
+
| `--json` | Shorthand for `--format=json` |
|
|
20
|
+
|
|
21
|
+
## How It Works
|
|
22
|
+
|
|
23
|
+
1. **Load Results**: Reads both JSONL files containing evaluation results
|
|
24
|
+
2. **Match by eval_id**: Pairs results with matching `eval_id` fields
|
|
25
|
+
3. **Compute Deltas**: Calculates `delta = score2 - score1` for each pair
|
|
26
|
+
4. **Classify Outcomes**:
|
|
27
|
+
- `win`: delta >= threshold (candidate better)
|
|
28
|
+
- `loss`: delta <= -threshold (baseline better)
|
|
29
|
+
- `tie`: |delta| < threshold (no significant difference)
|
|
30
|
+
5. **Output Summary**: Human-readable table (default) or JSON
|
|
31
|
+
|
|
32
|
+
## Output Format
|
|
33
|
+
|
|
34
|
+
### Table Format (default)
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
Comparing: baseline.jsonl → candidate.jsonl
|
|
38
|
+
|
|
39
|
+
Eval ID Baseline Candidate Delta Result
|
|
40
|
+
───────────── ──────── ───────── ──────── ────────
|
|
41
|
+
safety-check 0.70 0.90 +0.20 ✓ win
|
|
42
|
+
accuracy-test 0.85 0.80 -0.05 = tie
|
|
43
|
+
latency-eval 0.90 0.75 -0.15 ✗ loss
|
|
44
|
+
|
|
45
|
+
Summary: 1 win, 1 loss, 1 tie | Mean Δ: +0.000 | Status: neutral
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Colors are used to highlight wins (green), losses (red), and ties (gray). Colors are automatically disabled when output is piped or `NO_COLOR` is set.
|
|
49
|
+
|
|
50
|
+
### JSON Format (`--json`)
|
|
51
|
+
|
|
52
|
+
Output uses snake_case for Python ecosystem compatibility:
|
|
53
|
+
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"matched": [
|
|
57
|
+
{
|
|
58
|
+
"eval_id": "case-1",
|
|
59
|
+
"score1": 0.7,
|
|
60
|
+
"score2": 0.9,
|
|
61
|
+
"delta": 0.2,
|
|
62
|
+
"outcome": "win"
|
|
63
|
+
}
|
|
64
|
+
],
|
|
65
|
+
"unmatched": {
|
|
66
|
+
"file1": 0,
|
|
67
|
+
"file2": 0
|
|
68
|
+
},
|
|
69
|
+
"summary": {
|
|
70
|
+
"total": 1,
|
|
71
|
+
"matched": 1,
|
|
72
|
+
"wins": 1,
|
|
73
|
+
"losses": 0,
|
|
74
|
+
"ties": 0,
|
|
75
|
+
"mean_delta": 0.2
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Exit Codes
|
|
81
|
+
|
|
82
|
+
| Code | Meaning |
|
|
83
|
+
|------|---------|
|
|
84
|
+
| `0` | Candidate is equal or better (mean_delta >= 0) |
|
|
85
|
+
| `1` | Baseline is better (regression detected) |
|
|
86
|
+
|
|
87
|
+
## Workflow Examples
|
|
88
|
+
|
|
89
|
+
### Model Comparison
|
|
90
|
+
|
|
91
|
+
Compare different model versions:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Run baseline evaluation
|
|
95
|
+
agentv eval evals/*.yaml --target gpt-4 --out baseline.jsonl
|
|
96
|
+
|
|
97
|
+
# Run candidate evaluation
|
|
98
|
+
agentv eval evals/*.yaml --target gpt-4o --out candidate.jsonl
|
|
99
|
+
|
|
100
|
+
# Compare results
|
|
101
|
+
agentv compare baseline.jsonl candidate.jsonl
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Prompt Optimization
|
|
105
|
+
|
|
106
|
+
Compare before/after prompt changes:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Run with original prompt
|
|
110
|
+
agentv eval evals/*.yaml --out before.jsonl
|
|
111
|
+
|
|
112
|
+
# Modify prompt, then run again
|
|
113
|
+
agentv eval evals/*.yaml --out after.jsonl
|
|
114
|
+
|
|
115
|
+
# Compare with strict threshold
|
|
116
|
+
agentv compare before.jsonl after.jsonl --threshold 0.05
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### CI Quality Gate
|
|
120
|
+
|
|
121
|
+
Fail CI if candidate regresses:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
#!/bin/bash
|
|
125
|
+
agentv compare baseline.jsonl candidate.jsonl
|
|
126
|
+
if [ $? -eq 1 ]; then
|
|
127
|
+
echo "Regression detected! Candidate performs worse than baseline."
|
|
128
|
+
exit 1
|
|
129
|
+
fi
|
|
130
|
+
echo "Candidate is equal or better than baseline."
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Tips
|
|
134
|
+
|
|
135
|
+
- **Threshold Selection**: Default 0.1 means 10% difference required. Use stricter thresholds (0.05) for critical evaluations.
|
|
136
|
+
- **Unmatched Results**: Check `unmatched` counts to identify eval cases that only exist in one file.
|
|
137
|
+
- **Multiple Comparisons**: Compare against multiple baselines by running the command multiple times.
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# Composite Evaluator Guide
|
|
2
|
+
|
|
3
|
+
Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
|
|
4
|
+
|
|
5
|
+
## Basic Structure
|
|
6
|
+
|
|
7
|
+
```yaml
|
|
8
|
+
execution:
|
|
9
|
+
evaluators:
|
|
10
|
+
- name: my_composite
|
|
11
|
+
type: composite
|
|
12
|
+
evaluators:
|
|
13
|
+
- name: evaluator_1
|
|
14
|
+
type: llm_judge
|
|
15
|
+
prompt: ./prompts/check1.md
|
|
16
|
+
- name: evaluator_2
|
|
17
|
+
type: code_judge
|
|
18
|
+
script: uv run check2.py
|
|
19
|
+
aggregator:
|
|
20
|
+
type: weighted_average
|
|
21
|
+
weights:
|
|
22
|
+
evaluator_1: 0.6
|
|
23
|
+
evaluator_2: 0.4
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Aggregator Types
|
|
27
|
+
|
|
28
|
+
### 1. Weighted Average (Default)
|
|
29
|
+
|
|
30
|
+
Combines scores using weighted arithmetic mean:
|
|
31
|
+
|
|
32
|
+
```yaml
|
|
33
|
+
aggregator:
|
|
34
|
+
type: weighted_average
|
|
35
|
+
weights:
|
|
36
|
+
safety: 0.3 # 30% weight
|
|
37
|
+
quality: 0.7 # 70% weight
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
If weights are omitted, all evaluators have equal weight (1.0).
|
|
41
|
+
|
|
42
|
+
**Score calculation:**
|
|
43
|
+
```
|
|
44
|
+
final_score = Σ(score_i × weight_i) / Σ(weight_i)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 2. Code Judge Aggregator
|
|
48
|
+
|
|
49
|
+
Run custom code to decide final score based on all evaluator results:
|
|
50
|
+
|
|
51
|
+
```yaml
|
|
52
|
+
aggregator:
|
|
53
|
+
type: code_judge
|
|
54
|
+
path: node ./scripts/safety-gate.js
|
|
55
|
+
cwd: ./evaluators # optional working directory
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**Input (stdin):**
|
|
59
|
+
```json
|
|
60
|
+
{
|
|
61
|
+
"results": {
|
|
62
|
+
"safety": { "score": 0.9, "hits": [...], "misses": [...] },
|
|
63
|
+
"quality": { "score": 0.85, "hits": [...], "misses": [...] }
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Output (stdout):**
|
|
69
|
+
```json
|
|
70
|
+
{
|
|
71
|
+
"score": 0.87,
|
|
72
|
+
"verdict": "pass",
|
|
73
|
+
"hits": ["Combined check passed"],
|
|
74
|
+
"misses": [],
|
|
75
|
+
"reasoning": "Safety gate passed, quality acceptable"
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 3. LLM Judge Aggregator
|
|
80
|
+
|
|
81
|
+
Use an LLM to resolve conflicts or make nuanced decisions:
|
|
82
|
+
|
|
83
|
+
```yaml
|
|
84
|
+
aggregator:
|
|
85
|
+
type: llm_judge
|
|
86
|
+
prompt: ./prompts/conflict-resolution.md
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
|
|
90
|
+
|
|
91
|
+
## Example Patterns
|
|
92
|
+
|
|
93
|
+
### Safety Gate Pattern
|
|
94
|
+
|
|
95
|
+
Block outputs that fail safety even if quality is high:
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
evalcases:
|
|
99
|
+
- id: safety-gated-response
|
|
100
|
+
expected_outcome: Safe and accurate response
|
|
101
|
+
|
|
102
|
+
input_messages:
|
|
103
|
+
- role: user
|
|
104
|
+
content: Explain quantum computing
|
|
105
|
+
|
|
106
|
+
execution:
|
|
107
|
+
evaluators:
|
|
108
|
+
- name: safety_gate
|
|
109
|
+
type: composite
|
|
110
|
+
evaluators:
|
|
111
|
+
- name: safety
|
|
112
|
+
type: llm_judge
|
|
113
|
+
prompt: ./prompts/safety-check.md
|
|
114
|
+
- name: quality
|
|
115
|
+
type: llm_judge
|
|
116
|
+
prompt: ./prompts/quality-check.md
|
|
117
|
+
aggregator:
|
|
118
|
+
type: code_judge
|
|
119
|
+
path: ./scripts/safety-gate.js
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Multi-Criteria Weighted Evaluation
|
|
123
|
+
|
|
124
|
+
```yaml
|
|
125
|
+
- name: release_readiness
|
|
126
|
+
type: composite
|
|
127
|
+
evaluators:
|
|
128
|
+
- name: correctness
|
|
129
|
+
type: llm_judge
|
|
130
|
+
prompt: ./prompts/correctness.md
|
|
131
|
+
- name: style
|
|
132
|
+
type: code_judge
|
|
133
|
+
script: uv run style_checker.py
|
|
134
|
+
- name: security
|
|
135
|
+
type: llm_judge
|
|
136
|
+
prompt: ./prompts/security.md
|
|
137
|
+
aggregator:
|
|
138
|
+
type: weighted_average
|
|
139
|
+
weights:
|
|
140
|
+
correctness: 0.5
|
|
141
|
+
style: 0.2
|
|
142
|
+
security: 0.3
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Nested Composites
|
|
146
|
+
|
|
147
|
+
Composites can contain other composites for complex hierarchies:
|
|
148
|
+
|
|
149
|
+
```yaml
|
|
150
|
+
- name: comprehensive_eval
|
|
151
|
+
type: composite
|
|
152
|
+
evaluators:
|
|
153
|
+
- name: content_quality
|
|
154
|
+
type: composite
|
|
155
|
+
evaluators:
|
|
156
|
+
- name: accuracy
|
|
157
|
+
type: llm_judge
|
|
158
|
+
prompt: ./prompts/accuracy.md
|
|
159
|
+
- name: clarity
|
|
160
|
+
type: llm_judge
|
|
161
|
+
prompt: ./prompts/clarity.md
|
|
162
|
+
aggregator:
|
|
163
|
+
type: weighted_average
|
|
164
|
+
weights:
|
|
165
|
+
accuracy: 0.6
|
|
166
|
+
clarity: 0.4
|
|
167
|
+
- name: safety
|
|
168
|
+
type: llm_judge
|
|
169
|
+
prompt: ./prompts/safety.md
|
|
170
|
+
aggregator:
|
|
171
|
+
type: weighted_average
|
|
172
|
+
weights:
|
|
173
|
+
content_quality: 0.7
|
|
174
|
+
safety: 0.3
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Result Structure
|
|
178
|
+
|
|
179
|
+
Composite evaluators return nested `evaluator_results`:
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"score": 0.85,
|
|
184
|
+
"verdict": "pass",
|
|
185
|
+
"hits": ["[safety] No harmful content", "[quality] Clear explanation"],
|
|
186
|
+
"misses": ["[quality] Could use more examples"],
|
|
187
|
+
"reasoning": "safety: Passed all checks; quality: Good but could improve",
|
|
188
|
+
"evaluator_results": [
|
|
189
|
+
{
|
|
190
|
+
"name": "safety",
|
|
191
|
+
"type": "llm_judge",
|
|
192
|
+
"score": 0.95,
|
|
193
|
+
"verdict": "pass",
|
|
194
|
+
"hits": ["No harmful content"],
|
|
195
|
+
"misses": []
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"name": "quality",
|
|
199
|
+
"type": "llm_judge",
|
|
200
|
+
"score": 0.8,
|
|
201
|
+
"verdict": "pass",
|
|
202
|
+
"hits": ["Clear explanation"],
|
|
203
|
+
"misses": ["Could use more examples"]
|
|
204
|
+
}
|
|
205
|
+
]
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Best Practices
|
|
210
|
+
|
|
211
|
+
1. **Name evaluators clearly** - Names appear in results and debugging output
|
|
212
|
+
2. **Use safety gates for critical checks** - Don't let high quality override safety failures
|
|
213
|
+
3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
|
|
214
|
+
4. **Keep nesting shallow** - Deep nesting makes debugging harder
|
|
215
|
+
5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Config Schema",
|
|
4
|
+
"description": "Schema for .agentv/config.yaml configuration files",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"$schema": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Schema identifier",
|
|
10
|
+
"enum": ["agentv-config-v2"]
|
|
11
|
+
},
|
|
12
|
+
"guideline_patterns": {
|
|
13
|
+
"type": "array",
|
|
14
|
+
"description": "Glob patterns for identifying guideline files (instructions, prompts). Files matching these patterns are treated as guidelines, while non-matching files are treated as regular file content.",
|
|
15
|
+
"items": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"description": "Glob pattern (e.g., '**/*.instructions.md', '**/prompts/**')"
|
|
18
|
+
},
|
|
19
|
+
"examples": [
|
|
20
|
+
["**/*.instructions.md", "**/instructions/**", "**/*.prompt.md", "**/prompts/**"],
|
|
21
|
+
["**/*.guide.md", "**/guidelines/**", "docs/AGENTS.md"]
|
|
22
|
+
]
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"required": ["$schema"],
|
|
26
|
+
"additionalProperties": false
|
|
27
|
+
}
|