agentv 0.23.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -11
- package/dist/{chunk-4T62HFF4.js → chunk-6ZM7WVSC.js} +900 -250
- package/dist/chunk-6ZM7WVSC.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +10 -10
- package/dist/templates/.agentv/targets.yaml +8 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +75 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +139 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +237 -0
- package/package.json +1 -1
- package/dist/chunk-4T62HFF4.js.map +0 -1
package/dist/cli.js
CHANGED
package/dist/index.js
CHANGED
package/dist/templates/.agentv/.env.template

@@ -1,16 +1,16 @@
-# Example environment configuration for AgentV
 # Copy this file to .env and fill in your credentials
 
-# Model Provider Selection (Optional - can be configured via targets.yaml)
-PROVIDER=azure
-
 # Azure OpenAI Configuration
-# These are the default environment variable names used in the provided targets.yaml
 AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
-AZURE_OPENAI_API_KEY=your-api-key-here
-AZURE_DEPLOYMENT_NAME=gpt-
+AZURE_OPENAI_API_KEY=your-openai-api-key-here
+AZURE_DEPLOYMENT_NAME=gpt-5-chat
+AZURE_OPENAI_API_VERSION=2024-12-01-preview
+
+# Google Gemini
+GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
+GEMINI_MODEL_NAME=gemini-2.5-flash
 
-# Anthropic
+# Anthropic
 ANTHROPIC_API_KEY=your-anthropic-api-key-here
 
 # VS Code Workspace Paths for Execution Targets
@@ -19,5 +19,5 @@ ANTHROPIC_API_KEY=your-anthropic-api-key-here
 PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
 
 # CLI provider sample (used by the local_cli target)
-
-LOCAL_AGENT_TOKEN=
+CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
+LOCAL_AGENT_TOKEN=dummytoken
package/dist/templates/.agentv/targets.yaml

@@ -10,6 +10,7 @@ targets:
     endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
     api_key: ${{ AZURE_OPENAI_API_KEY }}
     model: ${{ AZURE_DEPLOYMENT_NAME }}
+    # version: ${{ AZURE_OPENAI_API_VERSION }}  # Optional: uncomment to override default (2024-12-01-preview)
 
   - name: vscode
     provider: vscode
@@ -49,13 +50,19 @@ targets:
     endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
     api_key: ${{ AZURE_OPENAI_API_KEY }}
     model: ${{ AZURE_DEPLOYMENT_NAME }}
+    version: ${{ AZURE_OPENAI_API_VERSION }}
+
+  - name: gemini_base
+    provider: gemini
+    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
+    model: ${{ GEMINI_MODEL_NAME }}
 
   - name: local_cli
     provider: cli
     judge_target: azure_base
     # Passes the fully rendered prompt and any attached files to a local Python script
     # NOTE: Do not add quotes around {PROMPT} or {FILES} - they are already shell-escaped
-    command_template: uv run ./mock_cli.py --prompt {PROMPT} {FILES}
+    command_template: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE}
     # Format for each file in {FILES}. {path} and {basename} are automatically shell-escaped, so no quotes needed
     files_format: --file {path}
     # Optional working directory resolved from .env
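For illustration, a minimal Python sketch of a CLI script compatible with the updated `command_template` above. The flag names `--prompt`, `--file`, and `--output` come from `command_template` and `files_format`; the script body is assumed, and the packaged `mock_cli.py` may differ:

```python
#!/usr/bin/env python3
"""Illustrative stand-in for a CLI provider script (not the packaged mock_cli.py)."""
import argparse
from pathlib import Path


def main() -> None:
    parser = argparse.ArgumentParser(description="Minimal CLI agent sketch")
    parser.add_argument("--prompt", required=True, help="Fully rendered prompt text")
    parser.add_argument("--file", action="append", default=[], help="Attached file path (repeatable)")
    parser.add_argument("--output", required=True, help="File to write the agent's answer to")
    args = parser.parse_args()

    # A real agent would do the actual work here; this sketch just echoes its inputs.
    answer = (
        f"Received a {len(args.prompt)}-character prompt and {len(args.file)} attached file(s): "
        + ", ".join(Path(p).name for p in args.file)
    )
    Path(args.output).write_text(answer, encoding="utf-8")


if __name__ == "__main__":
    main()
```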
package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md

@@ -10,10 +10,17 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age
 - Format: YAML with structured content arrays
 - Examples: `references/example-evals.md`
 
+## Feature Reference
+- Rubrics: `references/rubric-evaluator.md` - Structured criteria-based evaluation
+- Composite Evaluators: `references/composite-evaluator.md` - Combine multiple evaluators
+- Tool Trajectory: `references/tool-trajectory-evaluator.md` - Validate agent tool usage
+- Custom Evaluators: `references/custom-evaluators.md` - Code and LLM judge templates
+
 ## Structure Requirements
-- Root level: `description` (optional), `execution` (optional), `evalcases` (required)
-- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required)
-- Optional fields: `conversation_id`, `
+- Root level: `description` (optional), `target` (optional), `execution` (optional), `evalcases` (required)
+- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required)
+- Optional fields: `expected_messages`, `conversation_id`, `rubrics`, `execution`
+- `expected_messages` is optional - omit for outcome-only evaluation where the LLM judge evaluates based on `expected_outcome` criteria alone
 - Message fields: `role` (required), `content` (required)
 - Message roles: `system`, `user`, `assistant`, `tool`
 - Content types: `text` (inline), `file` (relative or absolute path)
@@ -31,13 +38,13 @@ Scripts that validate output programmatically:
 execution:
   evaluators:
     - name: json_format_validator
-      type:
+      type: code_judge
       script: uv run validate_output.py
       cwd: ../../evaluators/scripts
 ```
 
 **Contract:**
-- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `
+- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_files` (file paths), `input_files` (file paths, excludes guidelines), `input_messages`
 - Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
 
 **Template:** See `references/custom-evaluators.md` for Python code evaluator template
@@ -61,12 +68,74 @@ Evaluators run sequentially:
 execution:
   evaluators:
     - name: format_check  # Runs first
-      type:
+      type: code_judge
       script: uv run validate_json.py
     - name: content_check  # Runs second
       type: llm_judge
 ```
 
+### Rubric Evaluator
+Inline rubrics for structured criteria-based evaluation:
+
+```yaml
+evalcases:
+  - id: explanation-task
+    expected_outcome: Clear explanation of quicksort
+    input_messages:
+      - role: user
+        content: Explain quicksort
+    rubrics:
+      - Mentions divide-and-conquer approach
+      - Explains the partition step
+      - id: complexity
+        description: States time complexity correctly
+        weight: 2.0
+        required: true
+```
+
+See `references/rubric-evaluator.md` for detailed rubric configuration.
+
+### Composite Evaluator
+Combine multiple evaluators with aggregation:
+
+```yaml
+execution:
+  evaluators:
+    - name: release_gate
+      type: composite
+      evaluators:
+        - name: safety
+          type: llm_judge
+          prompt: ./prompts/safety.md
+        - name: quality
+          type: llm_judge
+          prompt: ./prompts/quality.md
+      aggregator:
+        type: weighted_average
+        weights:
+          safety: 0.3
+          quality: 0.7
+```
+
+See `references/composite-evaluator.md` for aggregation types and patterns.
+
+### Tool Trajectory Evaluator
+Validate agent tool usage from trace data:
+
+```yaml
+execution:
+  evaluators:
+    - name: workflow-check
+      type: tool_trajectory
+      mode: in_order  # or: any_order, exact
+      expected:
+        - tool: fetchData
+        - tool: processData
+        - tool: saveResults
+```
+
+See `references/tool-trajectory-evaluator.md` for modes and configuration.
+
 ## Example
 ```yaml
 $schema: agentv-eval-v2
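The `code_judge` contract documented in SKILL.md (JSON on stdin; JSON with `score`, `hits`, `misses`, `reasoning` on stdout) could be satisfied by a script along these lines. This is a hedged sketch, not the template shipped in `references/custom-evaluators.md`; the JSON-validity check is only an example criterion:

```python
#!/usr/bin/env python3
"""Sketch of a code_judge evaluator: checks that the candidate answer is valid JSON."""
import json
import sys


def main() -> None:
    # Fields per the documented contract: question, expected_outcome, reference_answer,
    # candidate_answer, guideline_files, input_files, input_messages.
    payload = json.load(sys.stdin)
    candidate = payload.get("candidate_answer", "")

    hits, misses = [], []
    try:
        json.loads(candidate)
        hits.append("candidate_answer parses as JSON")
    except (json.JSONDecodeError, TypeError):
        misses.append("candidate_answer is not valid JSON")

    print(json.dumps({
        "score": 1.0 if not misses else 0.0,
        "hits": hits,
        "misses": misses,
        "reasoning": "; ".join(hits + misses),
    }))


if __name__ == "__main__":
    main()
```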
package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md

@@ -0,0 +1,215 @@
+# Composite Evaluator Guide
+
+Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
+
+## Basic Structure
+
+```yaml
+execution:
+  evaluators:
+    - name: my_composite
+      type: composite
+      evaluators:
+        - name: evaluator_1
+          type: llm_judge
+          prompt: ./prompts/check1.md
+        - name: evaluator_2
+          type: code_judge
+          script: uv run check2.py
+      aggregator:
+        type: weighted_average
+        weights:
+          evaluator_1: 0.6
+          evaluator_2: 0.4
+```
+
+## Aggregator Types
+
+### 1. Weighted Average (Default)
+
+Combines scores using weighted arithmetic mean:
+
+```yaml
+aggregator:
+  type: weighted_average
+  weights:
+    safety: 0.3   # 30% weight
+    quality: 0.7  # 70% weight
+```
+
+If weights are omitted, all evaluators have equal weight (1.0).
+
+**Score calculation:**
+```
+final_score = Σ(score_i × weight_i) / Σ(weight_i)
+```
+
+### 2. Code Judge Aggregator
+
+Run custom code to decide final score based on all evaluator results:
+
+```yaml
+aggregator:
+  type: code_judge
+  path: node ./scripts/safety-gate.js
+  cwd: ./evaluators  # optional working directory
+```
+
+**Input (stdin):**
+```json
+{
+  "results": {
+    "safety": { "score": 0.9, "hits": [...], "misses": [...] },
+    "quality": { "score": 0.85, "hits": [...], "misses": [...] }
+  }
+}
+```
+
+**Output (stdout):**
+```json
+{
+  "score": 0.87,
+  "verdict": "pass",
+  "hits": ["Combined check passed"],
+  "misses": [],
+  "reasoning": "Safety gate passed, quality acceptable"
+}
+```
+
+### 3. LLM Judge Aggregator
+
+Use an LLM to resolve conflicts or make nuanced decisions:
+
+```yaml
+aggregator:
+  type: llm_judge
+  prompt: ./prompts/conflict-resolution.md
+```
+
+The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
+
+## Example Patterns
+
+### Safety Gate Pattern
+
+Block outputs that fail safety even if quality is high:
+
+```yaml
+evalcases:
+  - id: safety-gated-response
+    expected_outcome: Safe and accurate response
+
+    input_messages:
+      - role: user
+        content: Explain quantum computing
+
+    execution:
+      evaluators:
+        - name: safety_gate
+          type: composite
+          evaluators:
+            - name: safety
+              type: llm_judge
+              prompt: ./prompts/safety-check.md
+            - name: quality
+              type: llm_judge
+              prompt: ./prompts/quality-check.md
+          aggregator:
+            type: code_judge
+            path: ./scripts/safety-gate.js
+```
+
+### Multi-Criteria Weighted Evaluation
+
+```yaml
+- name: release_readiness
+  type: composite
+  evaluators:
+    - name: correctness
+      type: llm_judge
+      prompt: ./prompts/correctness.md
+    - name: style
+      type: code_judge
+      script: uv run style_checker.py
+    - name: security
+      type: llm_judge
+      prompt: ./prompts/security.md
+  aggregator:
+    type: weighted_average
+    weights:
+      correctness: 0.5
+      style: 0.2
+      security: 0.3
+```
+
+### Nested Composites
+
+Composites can contain other composites for complex hierarchies:
+
+```yaml
+- name: comprehensive_eval
+  type: composite
+  evaluators:
+    - name: content_quality
+      type: composite
+      evaluators:
+        - name: accuracy
+          type: llm_judge
+          prompt: ./prompts/accuracy.md
+        - name: clarity
+          type: llm_judge
+          prompt: ./prompts/clarity.md
+      aggregator:
+        type: weighted_average
+        weights:
+          accuracy: 0.6
+          clarity: 0.4
+    - name: safety
+      type: llm_judge
+      prompt: ./prompts/safety.md
+  aggregator:
+    type: weighted_average
+    weights:
+      content_quality: 0.7
+      safety: 0.3
+```
+
+## Result Structure
+
+Composite evaluators return nested `evaluator_results`:
+
+```json
+{
+  "score": 0.85,
+  "verdict": "pass",
+  "hits": ["[safety] No harmful content", "[quality] Clear explanation"],
+  "misses": ["[quality] Could use more examples"],
+  "reasoning": "safety: Passed all checks; quality: Good but could improve",
+  "evaluator_results": [
+    {
+      "name": "safety",
+      "type": "llm_judge",
+      "score": 0.95,
+      "verdict": "pass",
+      "hits": ["No harmful content"],
+      "misses": []
+    },
+    {
+      "name": "quality",
+      "type": "llm_judge",
+      "score": 0.8,
+      "verdict": "pass",
+      "hits": ["Clear explanation"],
+      "misses": ["Could use more examples"]
+    }
+  ]
+}
+```
+
+## Best Practices
+
+1. **Name evaluators clearly** - Names appear in results and debugging output
+2. **Use safety gates for critical checks** - Don't let high quality override safety failures
+3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
+4. **Keep nesting shallow** - Deep nesting makes debugging harder
+5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
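As an illustration of the code-judge aggregator contract above, the gate the guide calls `safety-gate.js` could be approximated in Python as follows; the 0.8 threshold and the 0.3/0.7 blend are assumptions borrowed from the guide's examples, not values it mandates:

```python
#!/usr/bin/env python3
"""Sketch of a safety-gate aggregator: a low safety score fails the case outright."""
import json
import sys


def main() -> None:
    # Input per the contract: {"results": {"safety": {...}, "quality": {...}}}
    results = json.load(sys.stdin)["results"]
    safety = results["safety"]["score"]
    quality = results["quality"]["score"]

    if safety < 0.8:  # assumed gate threshold
        output = {
            "score": 0.0,
            "verdict": "fail",
            "hits": [],
            "misses": ["Safety score below gate threshold"],
            "reasoning": "Safety gate failed; quality ignored",
        }
    else:
        score = 0.3 * safety + 0.7 * quality  # same weights as the weighted_average example
        output = {
            "score": round(score, 2),
            "verdict": "pass",
            "hits": ["Combined check passed"],
            "misses": [],
            "reasoning": "Safety gate passed, quality acceptable",
        }

    print(json.dumps(output))


if __name__ == "__main__":
    main()
```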
package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md

@@ -0,0 +1,139 @@
+# Rubric Evaluator Guide
+
+Rubrics provide structured evaluation through lists of criteria that define what makes a good response. Rubrics are checked by an LLM judge and scored based on weights and requirements.
+
+## Basic Usage
+
+### Simple String Rubrics
+
+Define rubrics as simple strings - each becomes a required criterion with weight 1.0:
+
+```yaml
+$schema: agentv-eval-v2
+
+evalcases:
+  - id: quicksort-explanation
+    expected_outcome: Explain how quicksort works
+
+    input_messages:
+      - role: user
+        content: Explain how the quicksort algorithm works
+
+    rubrics:
+      - Mentions divide-and-conquer approach
+      - Explains the partition step
+      - States time complexity correctly
+```
+
+### Detailed Rubric Objects
+
+Use objects for fine-grained control over weights and requirements:
+
+```yaml
+evalcases:
+  - id: technical-guide
+    expected_outcome: Write a comprehensive HTTP status codes guide
+
+    input_messages:
+      - role: user
+        content: Write a guide explaining HTTP status codes
+
+    rubrics:
+      - id: structure
+        description: Has clear headings and organization
+        weight: 1.0
+        required: true
+
+      - id: success-codes
+        description: Covers 2xx success codes with examples
+        weight: 2.0
+        required: true
+
+      - id: client-errors
+        description: Explains 4xx client error codes
+        weight: 2.0
+        required: true
+
+      - id: server-errors
+        description: Explains 5xx server error codes
+        weight: 1.5
+        required: false
+
+      - id: practical-examples
+        description: Includes practical use case examples
+        weight: 1.0
+        required: false
+```
+
+## Rubric Object Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `id` | string | auto-generated | Unique identifier for the rubric |
+| `description` | string | required | The criterion being evaluated |
+| `weight` | number | 1.0 | Relative importance (higher = more impact on score) |
+| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' |
+
+## Scoring and Verdicts
+
+**Score Calculation:**
+```
+score = (sum of satisfied weights) / (total weights)
+```
+
+**Verdict Rules:**
+- `pass`: Score ≥ 0.8 AND all required rubrics satisfied
+- `borderline`: Score ≥ 0.6 AND all required rubrics satisfied
+- `fail`: Score < 0.6 OR any required rubric failed
+
+## Combining Rubrics with Other Evaluators
+
+Rubrics can be combined with code evaluators for comprehensive validation:
+
+```yaml
+evalcases:
+  - id: email-validator
+    expected_outcome: Python function to validate email addresses
+
+    input_messages:
+      - role: user
+        content: Write a Python function to validate email addresses
+
+    # Semantic evaluation via rubrics
+    rubrics:
+      - Uses regular expressions for validation
+      - Includes type hints
+      - Has docstring documentation
+      - Handles edge cases (None, empty string)
+
+    execution:
+      evaluators:
+        # Rubric evaluator is auto-added from inline rubrics field
+
+        # Additional code evaluator for syntax checking
+        - name: python_syntax
+          type: code_judge
+          script: uv run python -m py_compile
+```
+
+## Generate Rubrics from Expected Outcome
+
+Use the CLI to auto-generate rubrics from `expected_outcome`:
+
+```bash
+# Generate rubrics for eval cases that don't have them
+agentv generate rubrics evals/my-eval.yaml
+
+# Use a specific LLM target for generation
+agentv generate rubrics evals/my-eval.yaml --target azure_base
+```
+
+This analyzes each `expected_outcome` and creates appropriate rubric items.
+
+## Best Practices
+
+1. **Use required sparingly** - Only mark rubrics as `required: true` for critical criteria
+2. **Balance weights** - Use higher weights (2.0+) for core requirements, lower (0.5) for nice-to-haves
+3. **Be specific** - "Includes error handling" is better than "Good code quality"
+4. **Keep rubrics atomic** - Each rubric should test one thing
+5. **Consider partial credit** - Non-required rubrics allow partial scores