agentv 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -10
- package/dist/{chunk-4T62HFF4.js → chunk-ZVSFP6NK.js} +822 -233
- package/dist/chunk-ZVSFP6NK.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +10 -10
- package/dist/templates/.agentv/targets.yaml +8 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +75 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +139 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +237 -0
- package/package.json +1 -1
- package/dist/chunk-4T62HFF4.js.map +0 -1
- package/dist/templates/agentv/.env.template +0 -23
package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json

@@ -1,217 +1,217 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "AgentV Eval Schema",
  "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
  "type": "object",
  "properties": {
    "$schema": {
      "type": "string",
      "description": "Schema identifier",
      "enum": ["agentv-eval-v2"]
    },
    "description": {
      "type": "string",
      "description": "Description of what this eval suite covers"
    },
    "target": {
      "type": "string",
      "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
    },
    "execution": {
      "type": "object",
      "description": "Default execution configuration for all eval cases (can be overridden per case)",
      "properties": {
        "target": {
          "type": "string",
          "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
        },
        "evaluators": {
          "type": "array",
          "description": "Default evaluators for all eval cases (code-based and LLM judges)",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string",
                "description": "Evaluator name/identifier"
              },
              "type": {
                "type": "string",
                "enum": ["code", "llm_judge"],
                "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
              },
              "script": {
                "type": "string",
                "description": "Path to evaluator script (for type: code)"
              },
              "prompt": {
                "type": "string",
                "description": "Path to judge prompt file (for type: llm_judge)"
              }
            },
            "required": ["name", "type"],
            "additionalProperties": true
          }
        }
      },
      "additionalProperties": true
    },
    "evalcases": {
      "type": "array",
      "description": "Array of evaluation cases",
      "minItems": 1,
      "items": {
        "type": "object",
        "properties": {
          "id": {
            "type": "string",
            "description": "Unique identifier for the eval case"
          },
          "conversation_id": {
            "type": "string",
            "description": "Optional conversation identifier for threading multiple eval cases together"
          },
          "expected_outcome": {
            "type": "string",
            "description": "Description of what the AI should accomplish in this eval"
          },
          "note": {
            "type": "string",
            "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
          },
          "input_messages": {
            "type": "array",
            "description": "Input messages for the conversation",
            "minItems": 1,
            "items": {
              "type": "object",
              "properties": {
                "role": {
                  "type": "string",
                  "enum": ["system", "user", "assistant", "tool"],
                  "description": "Message role"
                },
                "content": {
                  "oneOf": [
                    {
                      "type": "string",
                      "description": "Simple text content"
                    },
                    {
                      "type": "array",
                      "description": "Mixed content items (text and file references)",
                      "items": {
                        "type": "object",
                        "properties": {
                          "type": {
                            "type": "string",
                            "enum": ["text", "file"],
                            "description": "Content type: 'text' for inline content, 'file' for file references"
                          },
                          "value": {
                            "type": "string",
                            "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
                          }
                        },
                        "required": ["type", "value"],
                        "additionalProperties": false
                      }
                    }
                  ]
                }
              },
              "required": ["role", "content"],
              "additionalProperties": false
            }
          },
          "expected_messages": {
            "type": "array",
            "description": "Expected response messages",
            "minItems": 1,
            "items": {
              "type": "object",
              "properties": {
                "role": {
                  "type": "string",
                  "enum": ["system", "user", "assistant", "tool"],
                  "description": "Message role"
                },
                "content": {
                  "oneOf": [
                    {
                      "type": "string",
                      "description": "Simple text content"
                    },
                    {
                      "type": "array",
                      "description": "Mixed content items",
                      "items": {
                        "type": "object",
                        "properties": {
                          "type": {
                            "type": "string",
                            "enum": ["text", "file"]
                          },
                          "value": {
                            "type": "string"
                          }
                        },
                        "required": ["type", "value"],
                        "additionalProperties": false
                      }
                    }
                  ]
                }
              },
              "required": ["role", "content"],
              "additionalProperties": false
            }
          },
          "execution": {
            "type": "object",
            "description": "Per-case execution configuration",
            "properties": {
              "target": {
                "type": "string",
                "description": "Override target for this specific eval case"
              },
              "evaluators": {
                "type": "array",
                "description": "Multiple evaluators (code-based and LLM judges)",
                "items": {
                  "type": "object",
                  "properties": {
                    "name": {
                      "type": "string",
                      "description": "Evaluator name/identifier"
                    },
                    "type": {
                      "type": "string",
                      "enum": ["code", "llm_judge"],
                      "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
                    },
                    "script": {
                      "type": "string",
                      "description": "Path to evaluator script (for type: code)"
                    },
                    "prompt": {
                      "type": "string",
                      "description": "Path to judge prompt file (for type: llm_judge)"
                    }
                  },
                  "required": ["name", "type"],
                  "additionalProperties": true
                }
              }
            },
            "additionalProperties": true
          }
        },
        "required": ["id", "expected_outcome", "input_messages", "expected_messages"],
        "additionalProperties": false
      }
    }
  },
  "required": ["evalcases"],
  "additionalProperties": false
}
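Put together, a minimal eval file that should validate against this schema looks like the sketch below. The ids, target name, prompt, and file paths are illustrative only; they are not taken from the package.

```yaml
# Hypothetical example conforming to the eval schema above.
$schema: agentv-eval-v2
description: Smoke tests for a summarizer agent

execution:
  target: default
  evaluators:
    - name: style_judge
      type: llm_judge
      prompt: ./prompts/style-judge.md  # assumed path

evalcases:
  - id: summarize-short-article
    expected_outcome: Produce a three-sentence summary of the article
    input_messages:
      - role: user
        content:
          - type: text
            value: Summarize the attached article in three sentences.
          - type: file
            value: ../fixtures/article.md  # resolved from the eval file directory
    expected_messages:
      - role: assistant
        content: A concise three-sentence summary of the article.
```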
package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md

@@ -0,0 +1,139 @@
# Rubric Evaluator Guide

Rubrics provide structured evaluation through lists of criteria that define what makes a good response. Rubrics are checked by an LLM judge and scored based on weights and requirements.

## Basic Usage

### Simple String Rubrics

Define rubrics as simple strings - each becomes a required criterion with weight 1.0:

```yaml
$schema: agentv-eval-v2

evalcases:
  - id: quicksort-explanation
    expected_outcome: Explain how quicksort works

    input_messages:
      - role: user
        content: Explain how the quicksort algorithm works

    rubrics:
      - Mentions divide-and-conquer approach
      - Explains the partition step
      - States time complexity correctly
```

### Detailed Rubric Objects

Use objects for fine-grained control over weights and requirements:

```yaml
evalcases:
  - id: technical-guide
    expected_outcome: Write a comprehensive HTTP status codes guide

    input_messages:
      - role: user
        content: Write a guide explaining HTTP status codes

    rubrics:
      - id: structure
        description: Has clear headings and organization
        weight: 1.0
        required: true

      - id: success-codes
        description: Covers 2xx success codes with examples
        weight: 2.0
        required: true

      - id: client-errors
        description: Explains 4xx client error codes
        weight: 2.0
        required: true

      - id: server-errors
        description: Explains 5xx server error codes
        weight: 1.5
        required: false

      - id: practical-examples
        description: Includes practical use case examples
        weight: 1.0
        required: false
```

## Rubric Object Fields

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `id` | string | auto-generated | Unique identifier for the rubric |
| `description` | string | required | The criterion being evaluated |
| `weight` | number | 1.0 | Relative importance (higher = more impact on score) |
| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' |

## Scoring and Verdicts

**Score Calculation:**
```
score = (sum of satisfied weights) / (total weights)
```

**Verdict Rules:**
- `pass`: Score ≥ 0.8 AND all required rubrics satisfied
- `borderline`: Score ≥ 0.6 AND all required rubrics satisfied
- `fail`: Score < 0.6 OR any required rubric failed

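Read as code, the rules above amount to the following sketch. This is an illustration of the documented scoring behavior, not the agentv implementation; the `Rubric` shape and field names are assumptions.

```python
from dataclasses import dataclass

@dataclass
class Rubric:
    description: str
    weight: float = 1.0      # defaults match the field table above
    required: bool = True
    satisfied: bool = False  # would be filled in by the LLM judge

def score_rubrics(rubrics: list[Rubric]) -> tuple[float, str]:
    """Compute the weighted score and verdict per the rules above."""
    total = sum(r.weight for r in rubrics)
    satisfied = sum(r.weight for r in rubrics if r.satisfied)
    score = satisfied / total if total else 0.0

    # Any failed required rubric forces a fail regardless of score.
    required_ok = all(r.satisfied for r in rubrics if r.required)
    if required_ok and score >= 0.8:
        verdict = "pass"
    elif required_ok and score >= 0.6:
        verdict = "borderline"
    else:
        verdict = "fail"
    return score, verdict
```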
## Combining Rubrics with Other Evaluators

Rubrics can be combined with code evaluators for comprehensive validation:

```yaml
evalcases:
  - id: email-validator
    expected_outcome: Python function to validate email addresses

    input_messages:
      - role: user
        content: Write a Python function to validate email addresses

    # Semantic evaluation via rubrics
    rubrics:
      - Uses regular expressions for validation
      - Includes type hints
      - Has docstring documentation
      - Handles edge cases (None, empty string)

    execution:
      evaluators:
        # Rubric evaluator is auto-added from inline rubrics field

        # Additional code evaluator for syntax checking
        - name: python_syntax
          type: code_judge
          script: uv run python -m py_compile
```

## Generate Rubrics from Expected Outcome

Use the CLI to auto-generate rubrics from `expected_outcome`:

```bash
# Generate rubrics for eval cases that don't have them
agentv generate rubrics evals/my-eval.yaml

# Use a specific LLM target for generation
agentv generate rubrics evals/my-eval.yaml --target azure_base
```

This analyzes each `expected_outcome` and creates appropriate rubric items.

## Best Practices

1. **Use required sparingly** - Only mark rubrics as `required: true` for critical criteria
2. **Balance weights** - Use higher weights (2.0+) for core requirements, lower (0.5) for nice-to-haves
3. **Be specific** - "Includes error handling" is better than "Good code quality"
4. **Keep rubrics atomic** - Each rubric should test one thing
5. **Consider partial credit** - Non-required rubrics allow partial scores (see the worked example below)
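As a worked example of partial credit, take the detailed rubric set above: total weight is 1.0 + 2.0 + 2.0 + 1.5 + 1.0 = 7.5. If a response satisfies everything except the optional `server-errors` rubric (weight 1.5), the score is 6.0 / 7.5 = 0.8, and since all required rubrics passed, the verdict is `pass`.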