agentv 1.6.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -46
- package/dist/{chunk-HU4B6ODF.js → chunk-5AJ7DFUO.js} +2807 -717
- package/dist/chunk-5AJ7DFUO.js.map +1 -0
- package/dist/cli.js +4 -2
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +4 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +12 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +25 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +132 -18
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +10 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +121 -0
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +77 -77
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/package.json +4 -2
- package/dist/chunk-HU4B6ODF.js.map +0 -1
package/dist/cli.js
CHANGED
@@ -1,11 +1,13 @@
 #!/usr/bin/env node
 import {
   runCli
-} from "./chunk-HU4B6ODF.js";
+} from "./chunk-5AJ7DFUO.js";
 import "./chunk-UE4GLFVL.js";
 
 // src/cli.ts
-runCli().catch((error) => {
+runCli().then(() => {
+  process.exit(0);
+}).catch((error) => {
   console.error(error);
   process.exit(1);
 });
package/dist/cli.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli().catch((error) => {\n
+{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli()\n .then(() => {\n process.exit(0);\n })\n .catch((error) => {\n console.error(error);\n process.exit(1);\n });\n"],"mappings":";;;;;;;AAGA,OAAO,EACJ,KAAK,MAAM;AACV,UAAQ,KAAK,CAAC;AAChB,CAAC,EACA,MAAM,CAAC,UAAU;AAChB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
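For reference, the `sourcesContent` embedded in the new map decodes to the following `src/cli.ts` (indentation restored; the string in the map collapses it). The explicit `process.exit(0)` on success presumably ensures the CLI exits promptly even if handles remain open:

```ts
#!/usr/bin/env node
import { runCli } from './index.js';

runCli()
  .then(() => {
    process.exit(0);
  })
  .catch((error) => {
    console.error(error);
    process.exit(1);
  });
```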
package/dist/index.js
CHANGED

package/dist/templates/.agentv/.env.template
CHANGED
@@ -1,23 +1,23 @@
-# Copy this file to .env and fill in your credentials
-
-# Azure OpenAI Configuration
-AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
-AZURE_OPENAI_API_KEY=your-openai-api-key-here
-AZURE_DEPLOYMENT_NAME=gpt-5-chat
-AZURE_OPENAI_API_VERSION=2024-12-01-preview
-
-# Google Gemini
-GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
-GEMINI_MODEL_NAME=gemini-2.5-flash
-
-# Anthropic
-ANTHROPIC_API_KEY=your-anthropic-api-key-here
-
-# VS Code Workspace Paths for Execution Targets
-# Note: Using forward slashes is recommended for paths in .env files
-# to avoid issues with escape characters.
-PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
-
-# CLI provider sample (used by the local_cli target)
-CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
-LOCAL_AGENT_TOKEN=dummytoken
+# Copy this file to .env and fill in your credentials
+
+# Azure OpenAI Configuration
+AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
+AZURE_OPENAI_API_KEY=your-openai-api-key-here
+AZURE_DEPLOYMENT_NAME=gpt-5-chat
+AZURE_OPENAI_API_VERSION=2024-12-01-preview
+
+# Google Gemini
+GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
+GEMINI_MODEL_NAME=gemini-2.5-flash
+
+# Anthropic
+ANTHROPIC_API_KEY=your-anthropic-api-key-here
+
+# VS Code Workspace Paths for Execution Targets
+# Note: Using forward slashes is recommended for paths in .env files
+# to avoid issues with escape characters.
+PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
+
+# CLI provider sample (used by the local_cli target)
+CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
+LOCAL_AGENT_TOKEN=dummytoken
package/dist/templates/.agentv/config.yaml
CHANGED
@@ -1,15 +1,15 @@
-$schema: agentv-config-v2
-
-# Customize which files are treated as guidelines vs regular file content
-
-# Custom guideline patterns:
-guideline_patterns:
-  - "**/*.instructions.md"
-  - "**/*.prompt.md"
-  - "**/SKILL.md"
-
-# Notes:
-# - Patterns use standard glob syntax (via micromatch library)
-# - Paths are normalized to forward slashes for cross-platform compatibility
-# - Only files matching these patterns are loaded as guidelines
-# - All other files referenced in eval cases are treated as regular file content
+$schema: agentv-config-v2
+
+# Customize which files are treated as guidelines vs regular file content
+
+# Custom guideline patterns:
+guideline_patterns:
+  - "**/*.instructions.md"
+  - "**/*.prompt.md"
+  - "**/SKILL.md"
+
+# Notes:
+# - Patterns use standard glob syntax (via micromatch library)
+# - Paths are normalized to forward slashes for cross-platform compatibility
+# - Only files matching these patterns are loaded as guidelines
+# - All other files referenced in eval cases are treated as regular file content
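As a quick illustration of the matching rules described in the template's notes, here is a sketch using the `micromatch` library those comments reference; the file paths are hypothetical:

```ts
import micromatch from "micromatch";

const patterns = ["**/*.instructions.md", "**/*.prompt.md", "**/SKILL.md"];

// Hypothetical paths: the first two match and would be loaded as
// guidelines; report.md would be treated as regular file content.
console.log(micromatch.isMatch("docs/style.instructions.md", patterns)); // true
console.log(micromatch.isMatch("skills/foo/SKILL.md", patterns));        // true
console.log(micromatch.isMatch("docs/report.md", patterns));             // false
```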
package/dist/templates/.agentv/targets.yaml
CHANGED
@@ -31,6 +31,22 @@ targets:
     log_dir: ${{ CODEX_LOG_DIR }} # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex)
     log_format: json # Optional: 'summary' (default) or 'json' for raw event logs
 
+  # Claude Code - Anthropic's autonomous coding CLI
+  - name: claude-code
+    provider: claude-code
+    judge_target: azure_base
+    # Uses the Claude Code CLI (defaults to `claude` on PATH)
+    # executable: ${{ CLAUDE_CODE_CLI_PATH }} # Optional: override executable path
+    # model: claude-sonnet-4-20250514 # Optional: override model
+    # args: # Optional additional CLI arguments
+    #   - --allowedTools
+    #   - Read,Write,Edit,Bash
+    timeout_seconds: 180
+    # cwd: ${{ CLAUDE_CODE_WORKSPACE_DIR }} # Optional: working directory (defaults to process.cwd())
+    # log_dir: ${{ CLAUDE_CODE_LOG_DIR }} # Optional: where stream logs are stored (defaults to ./.agentv/logs/claude-code)
+    log_format: json # Optional: 'summary' (default) or 'json' for raw event logs
+    # system_prompt: optional override (default instructs agent to include code in response)
+
   - name: vscode_projectx
     provider: vscode
     workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md
CHANGED
@@ -14,6 +14,7 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age
 - Rubrics: `references/rubric-evaluator.md` - Structured criteria-based evaluation
 - Composite Evaluators: `references/composite-evaluator.md` - Combine multiple evaluators
 - Tool Trajectory: `references/tool-trajectory-evaluator.md` - Validate agent tool usage
+- Structured Data + Metrics: `references/structured-data-evaluators.md` - `field_accuracy`, `latency`, `cost`
 - Custom Evaluators: `references/custom-evaluators.md` - Code and LLM judge templates
 - Batch CLI: `references/batch-cli-evaluator.md` - Evaluate batch runner output (JSONL)
 - Compare: `references/compare-command.md` - Compare evaluation results between runs
@@ -49,7 +50,9 @@ execution:
 - Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_files` (file paths), `input_files` (file paths, excludes guidelines), `input_messages`
 - Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
 
-**
+**TypeScript evaluators:** Keep `.ts` source files and run them via Node-compatible loaders such as `npx --yes tsx` so global `agentv` installs stay portable. See `references/custom-evaluators.md` for complete templates and command examples.
+
+**Template:** See `references/custom-evaluators.md` for Python and TypeScript templates
 
 ### LLM Judges
 Language models evaluate response quality:
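To make that stdin/stdout contract concrete, here is a minimal sketch of a TypeScript code evaluator that could be run with `npx --yes tsx evaluator.ts` as the skill suggests. The `input_messages` element shape and the substring check are assumptions for illustration, not part of the documented contract:

```ts
// Minimal sketch of a code evaluator honoring the documented stdin/stdout
// contract: JSON in on stdin, JSON result out on stdout.
import process from "node:process";

interface EvaluatorInput {
  question: string;
  expected_outcome: string;
  reference_answer: string;
  candidate_answer: string;
  guideline_files: string[];
  input_files: string[];
  input_messages: Array<{ role: string; content: string }>; // shape assumed
}

let raw = "";
process.stdin.setEncoding("utf8");
process.stdin.on("data", (chunk) => (raw += chunk));
process.stdin.on("end", () => {
  const input: EvaluatorInput = JSON.parse(raw);
  // Illustrative check only: does the candidate mention the expected outcome?
  const hit = input.candidate_answer
    .toLowerCase()
    .includes(input.expected_outcome.toLowerCase());
  process.stdout.write(
    JSON.stringify({
      score: hit ? 1.0 : 0.0, // 0.0-1.0 per the contract
      hits: hit ? ["mentions the expected outcome"] : [],
      misses: hit ? [] : ["expected outcome not found in candidate answer"],
      reasoning: hit
        ? "Candidate answer contains the expected outcome."
        : "Candidate answer does not contain the expected outcome.",
    }),
  );
});
```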
package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md
CHANGED
@@ -17,6 +17,18 @@ Batch CLI evaluation is used when:
 3. **AgentV** parses JSONL, routes each record to its matching evalcase by `id`
 4. **Per-case evaluator** validates the output for each evalcase independently
 
+## Error Handling
+
+### Missing IDs / Per-item Failures
+
+In batch mode, a runner can succeed overall while still failing to produce an output record for a specific evalcase `id`.
+
+- When an output record is missing, AgentV treats this as a per-item error.
+- The evaluation result will include an `error` field describing the issue.
+- The CLI progress display will show that evalcase as failed (❌) while still continuing to evaluate other cases.
+
+This behavior is intentional so one bad/missing record does not discard the entire batch.
+
 ## Eval File Structure
 
 ```yaml
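For example, suppose an eval file defines evalcases `case-1` through `case-3` but the runner's JSONL output contains only two records (field names other than the documented `id` are hypothetical):

```jsonl
{"id": "case-1", "output": "..."}
{"id": "case-2", "output": "..."}
```

Here `case-3` would be reported with an `error` field and shown as failed (❌), while `case-1` and `case-2` are evaluated normally.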
package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md
CHANGED
@@ -5,7 +5,7 @@ Compare evaluation results between two runs to measure performance differences.
 ## Usage
 
 ```bash
-agentv compare <baseline.jsonl> <candidate.jsonl> [--threshold <value>]
+agentv compare <baseline.jsonl> <candidate.jsonl> [options]
 ```
 
 ## Arguments
@@ -15,6 +15,8 @@ agentv compare <baseline.jsonl> <candidate.jsonl> [--threshold <value>]
 | `result1` | Path to baseline JSONL result file |
 | `result2` | Path to candidate JSONL result file |
 | `--threshold`, `-t` | Score delta threshold for win/loss classification (default: 0.1) |
+| `--format`, `-f` | Output format: `table` (default) or `json` |
+| `--json` | Shorthand for `--format=json` |
 
 ## How It Works
 
@@ -25,10 +27,30 @@ agentv compare <baseline.jsonl> <candidate.jsonl> [--threshold <value>]
 - `win`: delta >= threshold (candidate better)
 - `loss`: delta <= -threshold (baseline better)
 - `tie`: |delta| < threshold (no significant difference)
-5. **Output Summary**:
+5. **Output Summary**: Human-readable table (default) or JSON
 
 ## Output Format
 
+### Table Format (default)
+
+```
+Comparing: baseline.jsonl → candidate.jsonl
+
+Eval ID        Baseline  Candidate  Delta     Result
+─────────────  ────────  ─────────  ────────  ────────
+safety-check   0.70      0.90       +0.20     ✓ win
+accuracy-test  0.85      0.80       -0.05     = tie
+latency-eval   0.90      0.75       -0.15     ✗ loss
+
+Summary: 1 win, 1 loss, 1 tie | Mean Δ: +0.000 | Status: neutral
+```
+
+Colors are used to highlight wins (green), losses (red), and ties (gray). Colors are automatically disabled when output is piped or `NO_COLOR` is set.
+
+### JSON Format (`--json`)
+
+Output uses snake_case for Python ecosystem compatibility:
+
 ```json
 {
   "matched": [
@@ -50,7 +72,7 @@ agentv compare <baseline.jsonl> <candidate.jsonl> [--threshold <value>]
     "wins": 1,
     "losses": 0,
     "ties": 0,
-    "
+    "mean_delta": 0.2
   }
 }
 ```
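Reading the sample table above against the default threshold of 0.1: `safety-check` moves from 0.70 to 0.90, so its delta of +0.20 clears the threshold and counts as a win; `accuracy-test`'s -0.05 stays inside ±0.1 and is a tie; `latency-eval`'s -0.15 breaches the negative threshold and is a loss. The three deltas (+0.20, -0.05, -0.15) sum to zero, which is why the summary line reports a mean Δ of +0.000 and a neutral status.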
package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md
CHANGED
@@ -1,215 +1,215 @@
-# Composite Evaluator Guide
-
-Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
-
-## Basic Structure
-
-```yaml
-execution:
-  evaluators:
-    - name: my_composite
-      type: composite
-      evaluators:
-        - name: evaluator_1
-          type: llm_judge
-          prompt: ./prompts/check1.md
-        - name: evaluator_2
-          type: code_judge
-          script: uv run check2.py
-      aggregator:
-        type: weighted_average
-        weights:
-          evaluator_1: 0.6
-          evaluator_2: 0.4
-```
-
-## Aggregator Types
-
-### 1. Weighted Average (Default)
-
-Combines scores using weighted arithmetic mean:
-
-```yaml
-aggregator:
-  type: weighted_average
-  weights:
-    safety: 0.3 # 30% weight
-    quality: 0.7 # 70% weight
-```
-
-If weights are omitted, all evaluators have equal weight (1.0).
-
-**Score calculation:**
-```
-final_score = Σ(score_i × weight_i) / Σ(weight_i)
-```
-
-### 2. Code Judge Aggregator
-
-Run custom code to decide final score based on all evaluator results:
-
-```yaml
-aggregator:
-  type: code_judge
-  path: node ./scripts/safety-gate.js
-  cwd: ./evaluators # optional working directory
-```
-
-**Input (stdin):**
-```json
-{
-  "results": {
-    "safety": { "score": 0.9, "hits": [...], "misses": [...] },
-    "quality": { "score": 0.85, "hits": [...], "misses": [...] }
-  }
-}
-```
-
-**Output (stdout):**
-```json
-{
-  "score": 0.87,
-  "verdict": "pass",
-  "hits": ["Combined check passed"],
-  "misses": [],
-  "reasoning": "Safety gate passed, quality acceptable"
-}
-```
-
-### 3. LLM Judge Aggregator
-
-Use an LLM to resolve conflicts or make nuanced decisions:
-
-```yaml
-aggregator:
-  type: llm_judge
-  prompt: ./prompts/conflict-resolution.md
-```
-
-The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
-
-## Example Patterns
-
-### Safety Gate Pattern
-
-Block outputs that fail safety even if quality is high:
-
-```yaml
-evalcases:
-  - id: safety-gated-response
-    expected_outcome: Safe and accurate response
-
-    input_messages:
-      - role: user
-        content: Explain quantum computing
-
-    execution:
-      evaluators:
-        - name: safety_gate
-          type: composite
-          evaluators:
-            - name: safety
-              type: llm_judge
-              prompt: ./prompts/safety-check.md
-            - name: quality
-              type: llm_judge
-              prompt: ./prompts/quality-check.md
-          aggregator:
-            type: code_judge
-            path: ./scripts/safety-gate.js
-```
-
-### Multi-Criteria Weighted Evaluation
-
-```yaml
-- name: release_readiness
-  type: composite
-  evaluators:
-    - name: correctness
-      type: llm_judge
-      prompt: ./prompts/correctness.md
-    - name: style
-      type: code_judge
-      script: uv run style_checker.py
-    - name: security
-      type: llm_judge
-      prompt: ./prompts/security.md
-  aggregator:
-    type: weighted_average
-    weights:
-      correctness: 0.5
-      style: 0.2
-      security: 0.3
-```
-
-### Nested Composites
-
-Composites can contain other composites for complex hierarchies:
-
-```yaml
-- name: comprehensive_eval
-  type: composite
-  evaluators:
-    - name: content_quality
-      type: composite
-      evaluators:
-        - name: accuracy
-          type: llm_judge
-          prompt: ./prompts/accuracy.md
-        - name: clarity
-          type: llm_judge
-          prompt: ./prompts/clarity.md
-      aggregator:
-        type: weighted_average
-        weights:
-          accuracy: 0.6
-          clarity: 0.4
-    - name: safety
-      type: llm_judge
-      prompt: ./prompts/safety.md
-  aggregator:
-    type: weighted_average
-    weights:
-      content_quality: 0.7
-      safety: 0.3
-```
-
-## Result Structure
-
-Composite evaluators return nested `evaluator_results`:
-
-```json
-{
-  "score": 0.85,
-  "verdict": "pass",
-  "hits": ["[safety] No harmful content", "[quality] Clear explanation"],
-  "misses": ["[quality] Could use more examples"],
-  "reasoning": "safety: Passed all checks; quality: Good but could improve",
-  "evaluator_results": [
-    {
-      "name": "safety",
-      "type": "llm_judge",
-      "score": 0.95,
-      "verdict": "pass",
-      "hits": ["No harmful content"],
-      "misses": []
-    },
-    {
-      "name": "quality",
-      "type": "llm_judge",
-      "score": 0.8,
-      "verdict": "pass",
-      "hits": ["Clear explanation"],
-      "misses": ["Could use more examples"]
-    }
-  ]
-}
-```
-
-## Best Practices
-
-1. **Name evaluators clearly** - Names appear in results and debugging output
-2. **Use safety gates for critical checks** - Don't let high quality override safety failures
-3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
-4. **Keep nesting shallow** - Deep nesting makes debugging harder
-5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
+# Composite Evaluator Guide
+
+Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
+
+## Basic Structure
+
+```yaml
+execution:
+  evaluators:
+    - name: my_composite
+      type: composite
+      evaluators:
+        - name: evaluator_1
+          type: llm_judge
+          prompt: ./prompts/check1.md
+        - name: evaluator_2
+          type: code_judge
+          script: uv run check2.py
+      aggregator:
+        type: weighted_average
+        weights:
+          evaluator_1: 0.6
+          evaluator_2: 0.4
+```
+
+## Aggregator Types
+
+### 1. Weighted Average (Default)
+
+Combines scores using weighted arithmetic mean:
+
+```yaml
+aggregator:
+  type: weighted_average
+  weights:
+    safety: 0.3 # 30% weight
+    quality: 0.7 # 70% weight
+```
+
+If weights are omitted, all evaluators have equal weight (1.0).
+
+**Score calculation:**
+```
+final_score = Σ(score_i × weight_i) / Σ(weight_i)
+```
+
+### 2. Code Judge Aggregator
+
+Run custom code to decide final score based on all evaluator results:
+
+```yaml
+aggregator:
+  type: code_judge
+  path: node ./scripts/safety-gate.js
+  cwd: ./evaluators # optional working directory
+```
+
+**Input (stdin):**
+```json
+{
+  "results": {
+    "safety": { "score": 0.9, "hits": [...], "misses": [...] },
+    "quality": { "score": 0.85, "hits": [...], "misses": [...] }
+  }
+}
+```
+
+**Output (stdout):**
+```json
+{
+  "score": 0.87,
+  "verdict": "pass",
+  "hits": ["Combined check passed"],
+  "misses": [],
+  "reasoning": "Safety gate passed, quality acceptable"
+}
+```
+
+### 3. LLM Judge Aggregator
+
+Use an LLM to resolve conflicts or make nuanced decisions:
+
+```yaml
+aggregator:
+  type: llm_judge
+  prompt: ./prompts/conflict-resolution.md
+```
+
+The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
+
+## Example Patterns
+
+### Safety Gate Pattern
+
+Block outputs that fail safety even if quality is high:
+
+```yaml
+evalcases:
+  - id: safety-gated-response
+    expected_outcome: Safe and accurate response
+
+    input_messages:
+      - role: user
+        content: Explain quantum computing
+
+    execution:
+      evaluators:
+        - name: safety_gate
+          type: composite
+          evaluators:
+            - name: safety
+              type: llm_judge
+              prompt: ./prompts/safety-check.md
+            - name: quality
+              type: llm_judge
+              prompt: ./prompts/quality-check.md
+          aggregator:
+            type: code_judge
+            path: ./scripts/safety-gate.js
+```
+
+### Multi-Criteria Weighted Evaluation
+
+```yaml
+- name: release_readiness
+  type: composite
+  evaluators:
+    - name: correctness
+      type: llm_judge
+      prompt: ./prompts/correctness.md
+    - name: style
+      type: code_judge
+      script: uv run style_checker.py
+    - name: security
+      type: llm_judge
+      prompt: ./prompts/security.md
+  aggregator:
+    type: weighted_average
+    weights:
+      correctness: 0.5
+      style: 0.2
+      security: 0.3
+```
+
+### Nested Composites
+
+Composites can contain other composites for complex hierarchies:
+
+```yaml
+- name: comprehensive_eval
+  type: composite
+  evaluators:
+    - name: content_quality
+      type: composite
+      evaluators:
+        - name: accuracy
+          type: llm_judge
+          prompt: ./prompts/accuracy.md
+        - name: clarity
+          type: llm_judge
+          prompt: ./prompts/clarity.md
+      aggregator:
+        type: weighted_average
+        weights:
+          accuracy: 0.6
+          clarity: 0.4
+    - name: safety
+      type: llm_judge
+      prompt: ./prompts/safety.md
+  aggregator:
+    type: weighted_average
+    weights:
+      content_quality: 0.7
+      safety: 0.3
+```
+
+## Result Structure
+
+Composite evaluators return nested `evaluator_results`:
+
+```json
+{
+  "score": 0.85,
+  "verdict": "pass",
+  "hits": ["[safety] No harmful content", "[quality] Clear explanation"],
+  "misses": ["[quality] Could use more examples"],
+  "reasoning": "safety: Passed all checks; quality: Good but could improve",
+  "evaluator_results": [
+    {
+      "name": "safety",
+      "type": "llm_judge",
+      "score": 0.95,
+      "verdict": "pass",
+      "hits": ["No harmful content"],
+      "misses": []
+    },
+    {
+      "name": "quality",
+      "type": "llm_judge",
+      "score": 0.8,
+      "verdict": "pass",
+      "hits": ["Clear explanation"],
+      "misses": ["Could use more examples"]
+    }
+  ]
+}
+```
+
+## Best Practices
+
+1. **Name evaluators clearly** - Names appear in results and debugging output
+2. **Use safety gates for critical checks** - Don't let high quality override safety failures
+3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
+4. **Keep nesting shallow** - Deep nesting makes debugging harder
+5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests