agentv 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,217 +1,217 @@
- {
-   "$schema": "http://json-schema.org/draft-07/schema#",
-   "title": "AgentV Eval Schema",
-   "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
-   "type": "object",
-   "properties": {
-     "$schema": {
-       "type": "string",
-       "description": "Schema identifier",
-       "enum": ["agentv-eval-v2"]
-     },
-     "description": {
-       "type": "string",
-       "description": "Description of what this eval suite covers"
-     },
-     "target": {
-       "type": "string",
-       "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
-     },
-     "execution": {
-       "type": "object",
-       "description": "Default execution configuration for all eval cases (can be overridden per case)",
-       "properties": {
-         "target": {
-           "type": "string",
-           "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
-         },
-         "evaluators": {
-           "type": "array",
-           "description": "Default evaluators for all eval cases (code-based and LLM judges)",
-           "items": {
-             "type": "object",
-             "properties": {
-               "name": {
-                 "type": "string",
-                 "description": "Evaluator name/identifier"
-               },
-               "type": {
-                 "type": "string",
-                 "enum": ["code", "llm_judge"],
-                 "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
-               },
-               "script": {
-                 "type": "string",
-                 "description": "Path to evaluator script (for type: code)"
-               },
-               "prompt": {
-                 "type": "string",
-                 "description": "Path to judge prompt file (for type: llm_judge)"
-               }
-             },
-             "required": ["name", "type"],
-             "additionalProperties": true
-           }
-         }
-       },
-       "additionalProperties": true
-     },
-     "evalcases": {
-       "type": "array",
-       "description": "Array of evaluation cases",
-       "minItems": 1,
-       "items": {
-         "type": "object",
-         "properties": {
-           "id": {
-             "type": "string",
-             "description": "Unique identifier for the eval case"
-           },
-           "conversation_id": {
-             "type": "string",
-             "description": "Optional conversation identifier for threading multiple eval cases together"
-           },
-           "expected_outcome": {
-             "type": "string",
-             "description": "Description of what the AI should accomplish in this eval"
-           },
-           "note": {
-             "type": "string",
-             "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
-           },
-           "input_messages": {
-             "type": "array",
-             "description": "Input messages for the conversation",
-             "minItems": 1,
-             "items": {
-               "type": "object",
-               "properties": {
-                 "role": {
-                   "type": "string",
-                   "enum": ["system", "user", "assistant", "tool"],
-                   "description": "Message role"
-                 },
-                 "content": {
-                   "oneOf": [
-                     {
-                       "type": "string",
-                       "description": "Simple text content"
-                     },
-                     {
-                       "type": "array",
-                       "description": "Mixed content items (text and file references)",
-                       "items": {
-                         "type": "object",
-                         "properties": {
-                           "type": {
-                             "type": "string",
-                             "enum": ["text", "file"],
-                             "description": "Content type: 'text' for inline content, 'file' for file references"
-                           },
-                           "value": {
-                             "type": "string",
-                             "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
-                           }
-                         },
-                         "required": ["type", "value"],
-                         "additionalProperties": false
-                       }
-                     }
-                   ]
-                 }
-               },
-               "required": ["role", "content"],
-               "additionalProperties": false
-             }
-           },
-           "expected_messages": {
-             "type": "array",
-             "description": "Expected response messages",
-             "minItems": 1,
-             "items": {
-               "type": "object",
-               "properties": {
-                 "role": {
-                   "type": "string",
-                   "enum": ["system", "user", "assistant", "tool"],
-                   "description": "Message role"
-                 },
-                 "content": {
-                   "oneOf": [
-                     {
-                       "type": "string",
-                       "description": "Simple text content"
-                     },
-                     {
-                       "type": "array",
-                       "description": "Mixed content items",
-                       "items": {
-                         "type": "object",
-                         "properties": {
-                           "type": {
-                             "type": "string",
-                             "enum": ["text", "file"]
-                           },
-                           "value": {
-                             "type": "string"
-                           }
-                         },
-                         "required": ["type", "value"],
-                         "additionalProperties": false
-                       }
-                     }
-                   ]
-                 }
-               },
-               "required": ["role", "content"],
-               "additionalProperties": false
-             }
-           },
-           "execution": {
-             "type": "object",
-             "description": "Per-case execution configuration",
-             "properties": {
-               "target": {
-                 "type": "string",
-                 "description": "Override target for this specific eval case"
-               },
-               "evaluators": {
-                 "type": "array",
-                 "description": "Multiple evaluators (code-based and LLM judges)",
-                 "items": {
-                   "type": "object",
-                   "properties": {
-                     "name": {
-                       "type": "string",
-                       "description": "Evaluator name/identifier"
-                     },
-                     "type": {
-                       "type": "string",
-                       "enum": ["code", "llm_judge"],
-                       "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
-                     },
-                     "script": {
-                       "type": "string",
-                       "description": "Path to evaluator script (for type: code)"
-                     },
-                     "prompt": {
-                       "type": "string",
-                       "description": "Path to judge prompt file (for type: llm_judge)"
-                     }
-                   },
-                   "required": ["name", "type"],
-                   "additionalProperties": true
-                 }
-               }
-             },
-             "additionalProperties": true
-           }
-         },
-         "required": ["id", "expected_outcome", "input_messages", "expected_messages"],
-         "additionalProperties": false
-       }
-     }
-   },
-   "required": ["evalcases"],
-   "additionalProperties": false
- }
+ {
+   "$schema": "http://json-schema.org/draft-07/schema#",
+   "title": "AgentV Eval Schema",
+   "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
+   "type": "object",
+   "properties": {
+     "$schema": {
+       "type": "string",
+       "description": "Schema identifier",
+       "enum": ["agentv-eval-v2"]
+     },
+     "description": {
+       "type": "string",
+       "description": "Description of what this eval suite covers"
+     },
+     "target": {
+       "type": "string",
+       "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
+     },
+     "execution": {
+       "type": "object",
+       "description": "Default execution configuration for all eval cases (can be overridden per case)",
+       "properties": {
+         "target": {
+           "type": "string",
+           "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
+         },
+         "evaluators": {
+           "type": "array",
+           "description": "Default evaluators for all eval cases (code-based and LLM judges)",
+           "items": {
+             "type": "object",
+             "properties": {
+               "name": {
+                 "type": "string",
+                 "description": "Evaluator name/identifier"
+               },
+               "type": {
+                 "type": "string",
+                 "enum": ["code", "llm_judge"],
+                 "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
+               },
+               "script": {
+                 "type": "string",
+                 "description": "Path to evaluator script (for type: code)"
+               },
+               "prompt": {
+                 "type": "string",
+                 "description": "Path to judge prompt file (for type: llm_judge)"
+               }
+             },
+             "required": ["name", "type"],
+             "additionalProperties": true
+           }
+         }
+       },
+       "additionalProperties": true
+     },
+     "evalcases": {
+       "type": "array",
+       "description": "Array of evaluation cases",
+       "minItems": 1,
+       "items": {
+         "type": "object",
+         "properties": {
+           "id": {
+             "type": "string",
+             "description": "Unique identifier for the eval case"
+           },
+           "conversation_id": {
+             "type": "string",
+             "description": "Optional conversation identifier for threading multiple eval cases together"
+           },
+           "expected_outcome": {
+             "type": "string",
+             "description": "Description of what the AI should accomplish in this eval"
+           },
+           "note": {
+             "type": "string",
+             "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
+           },
+           "input_messages": {
+             "type": "array",
+             "description": "Input messages for the conversation",
+             "minItems": 1,
+             "items": {
+               "type": "object",
+               "properties": {
+                 "role": {
+                   "type": "string",
+                   "enum": ["system", "user", "assistant", "tool"],
+                   "description": "Message role"
+                 },
+                 "content": {
+                   "oneOf": [
+                     {
+                       "type": "string",
+                       "description": "Simple text content"
+                     },
+                     {
+                       "type": "array",
+                       "description": "Mixed content items (text and file references)",
+                       "items": {
+                         "type": "object",
+                         "properties": {
+                           "type": {
+                             "type": "string",
+                             "enum": ["text", "file"],
+                             "description": "Content type: 'text' for inline content, 'file' for file references"
+                           },
+                           "value": {
+                             "type": "string",
+                             "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
+                           }
+                         },
+                         "required": ["type", "value"],
+                         "additionalProperties": false
+                       }
+                     }
+                   ]
+                 }
+               },
+               "required": ["role", "content"],
+               "additionalProperties": false
+             }
+           },
+           "expected_messages": {
+             "type": "array",
+             "description": "Expected response messages",
+             "minItems": 1,
+             "items": {
+               "type": "object",
+               "properties": {
+                 "role": {
+                   "type": "string",
+                   "enum": ["system", "user", "assistant", "tool"],
+                   "description": "Message role"
+                 },
+                 "content": {
+                   "oneOf": [
+                     {
+                       "type": "string",
+                       "description": "Simple text content"
+                     },
+                     {
+                       "type": "array",
+                       "description": "Mixed content items",
+                       "items": {
+                         "type": "object",
+                         "properties": {
+                           "type": {
+                             "type": "string",
+                             "enum": ["text", "file"]
+                           },
+                           "value": {
+                             "type": "string"
+                           }
+                         },
+                         "required": ["type", "value"],
+                         "additionalProperties": false
+                       }
+                     }
+                   ]
+                 }
+               },
+               "required": ["role", "content"],
+               "additionalProperties": false
+             }
+           },
+           "execution": {
+             "type": "object",
+             "description": "Per-case execution configuration",
+             "properties": {
+               "target": {
+                 "type": "string",
+                 "description": "Override target for this specific eval case"
+               },
+               "evaluators": {
+                 "type": "array",
+                 "description": "Multiple evaluators (code-based and LLM judges)",
+                 "items": {
+                   "type": "object",
+                   "properties": {
+                     "name": {
+                       "type": "string",
+                       "description": "Evaluator name/identifier"
+                     },
+                     "type": {
+                       "type": "string",
+                       "enum": ["code", "llm_judge"],
+                       "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
+                     },
+                     "script": {
+                       "type": "string",
+                       "description": "Path to evaluator script (for type: code)"
+                     },
+                     "prompt": {
+                       "type": "string",
+                       "description": "Path to judge prompt file (for type: llm_judge)"
+                     }
+                   },
+                   "required": ["name", "type"],
+                   "additionalProperties": true
+                 }
+               }
+             },
+             "additionalProperties": true
+           }
+         },
+         "required": ["id", "expected_outcome", "input_messages", "expected_messages"],
+         "additionalProperties": false
+       }
+     }
+   },
+   "required": ["evalcases"],
+   "additionalProperties": false
+ }
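
For illustration, a YAML eval file can be checked against this schema with off-the-shelf tooling. A minimal sketch using PyYAML and the jsonschema package (file names are placeholders; agentv's own validation pipeline may differ):

```python
# Validate a YAML eval file against the schema above (saved here as
# eval-schema.json). Illustrative only; not part of agentv itself.
import json

import yaml  # PyYAML
from jsonschema import Draft7Validator

with open("eval-schema.json") as f:
    schema = json.load(f)

with open("evals/my-eval.yaml") as f:  # hypothetical eval file path
    eval_file = yaml.safe_load(f)

for error in Draft7Validator(schema).iter_errors(eval_file):
    print(f"{list(error.absolute_path)}: {error.message}")
```
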
@@ -0,0 +1,139 @@
+ # Rubric Evaluator Guide
+
+ Rubrics provide structured evaluation through lists of criteria that define what makes a good response. Rubrics are checked by an LLM judge and scored based on weights and requirements.
+
+ ## Basic Usage
+
+ ### Simple String Rubrics
+
+ Define rubrics as simple strings - each becomes a required criterion with weight 1.0:
+
+ ```yaml
+ $schema: agentv-eval-v2
+
+ evalcases:
+   - id: quicksort-explanation
+     expected_outcome: Explain how quicksort works
+
+     input_messages:
+       - role: user
+         content: Explain how the quicksort algorithm works
+
+     rubrics:
+       - Mentions divide-and-conquer approach
+       - Explains the partition step
+       - States time complexity correctly
+ ```
+
+ ### Detailed Rubric Objects
+
+ Use objects for fine-grained control over weights and requirements:
+
+ ```yaml
+ evalcases:
+   - id: technical-guide
+     expected_outcome: Write a comprehensive HTTP status codes guide
+
+     input_messages:
+       - role: user
+         content: Write a guide explaining HTTP status codes
+
+     rubrics:
+       - id: structure
+         description: Has clear headings and organization
+         weight: 1.0
+         required: true
+
+       - id: success-codes
+         description: Covers 2xx success codes with examples
+         weight: 2.0
+         required: true
+
+       - id: client-errors
+         description: Explains 4xx client error codes
+         weight: 2.0
+         required: true
+
+       - id: server-errors
+         description: Explains 5xx server error codes
+         weight: 1.5
+         required: false
+
+       - id: practical-examples
+         description: Includes practical use case examples
+         weight: 1.0
+         required: false
+ ```
+
+ ## Rubric Object Fields
+
+ | Field | Type | Default | Description |
+ |-------|------|---------|-------------|
+ | `id` | string | auto-generated | Unique identifier for the rubric |
+ | `description` | string | (required) | The criterion being evaluated |
+ | `weight` | number | 1.0 | Relative importance (higher = more impact on the score) |
+ | `required` | boolean | true | If true, failing this rubric forces the verdict to 'fail' |
+
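+ The shorthand and object forms are interchangeable: a bare string behaves as if it were expanded into a full object using the defaults above. A minimal sketch of that expansion (hypothetical `Rubric` helper, not agentv's internal code):
+
+ ```python
+ from dataclasses import dataclass
+
+ @dataclass
+ class Rubric:
+     id: str
+     description: str
+     weight: float = 1.0
+     required: bool = True
+
+ def normalize(rubrics: list) -> list[Rubric]:
+     """Expand shorthand string rubrics into full rubric objects."""
+     normalized = []
+     for i, item in enumerate(rubrics):
+         if isinstance(item, str):
+             # A bare string becomes a required criterion with weight 1.0.
+             # The id scheme here is a placeholder assumption.
+             normalized.append(Rubric(id=f"rubric_{i}", description=item))
+         else:
+             normalized.append(Rubric(
+                 id=item.get("id", f"rubric_{i}"),
+                 description=item["description"],
+                 weight=item.get("weight", 1.0),
+                 required=item.get("required", True),
+             ))
+     return normalized
+ ```
+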
+ ## Scoring and Verdicts
+
+ **Score Calculation:**
+ ```
+ score = (sum of satisfied weights) / (total weights)
+ ```
+
+ **Verdict Rules:**
+ - `pass`: Score ≥ 0.8 AND all required rubrics satisfied
+ - `borderline`: 0.6 ≤ Score < 0.8 AND all required rubrics satisfied
+ - `fail`: Score < 0.6 OR any required rubric failed
+
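+ Worked through the HTTP guide example above: the total weight is 1.0 + 2.0 + 2.0 + 1.5 + 1.0 = 7.5. A response that misses only the optional `server-errors` rubric scores 6.0 / 7.5 = 0.8 and passes; one that also misses `practical-examples` scores 5.0 / 7.5 ≈ 0.67 and is borderline. A sketch of these rules in code (reusing the hypothetical `Rubric` from above; agentv's implementation may differ):
+
+ ```python
+ def score_and_verdict(rubrics: list[Rubric],
+                       satisfied: dict[str, bool]) -> tuple[float, str]:
+     """Apply the scoring formula and verdict rules above.
+
+     `satisfied` maps rubric id -> whether the LLM judge found it met.
+     """
+     total = sum(r.weight for r in rubrics)
+     earned = sum(r.weight for r in rubrics if satisfied.get(r.id, False))
+     score = earned / total if total else 0.0
+     # A failed required rubric forces a fail regardless of the score
+     if any(r.required and not satisfied.get(r.id, False) for r in rubrics):
+         return score, "fail"
+     if score >= 0.8:
+         return score, "pass"
+     if score >= 0.6:
+         return score, "borderline"
+     return score, "fail"
+ ```
+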
+ ## Combining Rubrics with Other Evaluators
+
+ Rubrics can be combined with code evaluators for comprehensive validation:
+
+ ```yaml
+ evalcases:
+   - id: email-validator
+     expected_outcome: Python function to validate email addresses
+
+     input_messages:
+       - role: user
+         content: Write a Python function to validate email addresses
+
+     # Semantic evaluation via rubrics
+     rubrics:
+       - Uses regular expressions for validation
+       - Includes type hints
+       - Has docstring documentation
+       - Handles edge cases (None, empty string)
+
+     execution:
+       evaluators:
+         # Rubric evaluator is auto-added from inline rubrics field
+
+         # Additional code evaluator for syntax checking
+         - name: python_syntax
+           type: code_judge
+           script: uv run python -m py_compile
+ ```
+
+ ## Generate Rubrics from Expected Outcome
+
+ Use the CLI to auto-generate rubrics from `expected_outcome`:
+
+ ```bash
+ # Generate rubrics for eval cases that don't have them
+ agentv generate rubrics evals/my-eval.yaml
+
+ # Use a specific LLM target for generation
+ agentv generate rubrics evals/my-eval.yaml --target azure_base
+ ```
+
+ This analyzes each `expected_outcome` and creates appropriate rubric items.
+
+ ## Best Practices
+
+ 1. **Use required sparingly** - Only mark rubrics as `required: true` for critical criteria
+ 2. **Balance weights** - Use higher weights (2.0+) for core requirements and lower weights (e.g., 0.5) for nice-to-haves
+ 3. **Be specific** - "Includes error handling" is better than "Good code quality"
+ 4. **Keep rubrics atomic** - Each rubric should test one thing
+ 5. **Consider partial credit** - Non-required rubrics allow partial scores