agentv 2.5.4 → 2.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -17
- package/dist/{chunk-I4EMT5Q2.js → chunk-APKXUJF3.js} +1250 -462
- package/dist/chunk-APKXUJF3.js.map +1 -0
- package/dist/{chunk-LTPZBEJU.js → chunk-BKMQNEUD.js} +9 -3
- package/dist/{chunk-LTPZBEJU.js.map → chunk-BKMQNEUD.js.map} +1 -1
- package/dist/{chunk-A7TQUSVG.js → chunk-LJVS3JAK.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +123 -244
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +56 -271
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +55 -180
- package/dist/{token-DVVSDOYP.js → token-D3IYDJQZ.js} +3 -3
- package/dist/{token-util-YEKFTEJA.js → token-util-FWFPR2BV.js} +3 -3
- package/package.json +5 -2
- package/dist/chunk-I4EMT5Q2.js.map +0 -1
- /package/dist/{chunk-A7TQUSVG.js.map → chunk-LJVS3JAK.js.map} +0 -0
- /package/dist/{token-DVVSDOYP.js.map → token-D3IYDJQZ.js.map} +0 -0
- /package/dist/{token-util-YEKFTEJA.js.map → token-util-FWFPR2BV.js.map} +0 -0

package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md

@@ -1,112 +1,83 @@
- # Custom Evaluators
+ # Custom Evaluators

-
+ ## Wire Format

-
-
- Code evaluators receive input via stdin and write output to stdout, both as JSON.
-
- ### Input Format (via stdin)
-
- Wire format uses snake_case for cross-language compatibility:
+ ### Input (stdin JSON)

  ```json
  {
-   "question": "string
-   "expected_outcome": "
-   "reference_answer": "
-   "candidate_answer": "
-   "guideline_files": ["
-   "input_files": ["
+   "question": "string",
+   "expected_outcome": "string",
+   "reference_answer": "string",
+   "candidate_answer": "string",
+   "guideline_files": ["path"],
+   "input_files": ["path"],
    "input_messages": [{"role": "user", "content": "..."}],
-   "expected_messages": [
-
-       "role": "assistant",
-       "tool_calls": [
-         {
-           "tool": "vector_search",
-           "input": { "query": "..." },
-           "output": { "results": ["doc1", "doc2"] }
-         }
-       ]
-     }
-   ],
-   "output_messages": [
-     {
-       "role": "assistant",
-       "content": "...",
-       "tool_calls": [...]
-     }
-   ],
+   "expected_messages": [{"role": "assistant", "content": "..."}],
+   "output_messages": [{"role": "assistant", "content": "..."}],
    "trace_summary": {
      "event_count": 5,
-     "tool_names": ["fetch"
-     "tool_calls_by_name": {
+     "tool_names": ["fetch"],
+     "tool_calls_by_name": {"fetch": 1},
      "error_count": 0,
-     "token_usage": {
+     "token_usage": {"input": 1000, "output": 500},
      "cost_usd": 0.0015,
      "duration_ms": 3500
    }
  }
  ```

-
- - `expected_messages` - Expected agent behavior from YAML, including tool calls with outputs (use for retrieval context in RAG evals)
- - `output_messages` - Actual agent execution trace with tool calls (from live agent runs)
- - `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)
-
- ### Output Format (to stdout)
+ ### Output (stdout JSON)

  ```json
  {
    "score": 0.85,
-   "hits": ["
-   "misses": ["failed check
-   "reasoning": "
+   "hits": ["passed check"],
+   "misses": ["failed check"],
+   "reasoning": "explanation"
  }
  ```

-
-
-
-
-
+ `score` (0.0-1.0) required. `hits`, `misses`, `reasoning` optional.
+
+ ## SDK Functions
+
+ ```typescript
+ import { defineCodeJudge, createTargetClient, definePromptTemplate } from '@agentv/eval';
+ ```
+
+ - `defineCodeJudge(fn)` - Wraps evaluation function with stdin/stdout handling
+ - `createTargetClient()` - Returns LLM proxy client (when `target: {}` configured)
+ - `.invoke({question, systemPrompt})` - Single LLM call
+ - `.invokeBatch(requests)` - Batch LLM calls
+ - `definePromptTemplate(fn)` - Wraps prompt generation function
+ - Context fields: `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary`

- ## Python
+ ## Python Example

  ```python
  #!/usr/bin/env python3
- import json
- import sys
+ import json, sys

  def evaluate(data: dict) -> dict:
      candidate = data.get("candidate_answer", "")
      hits, misses = [], []
-
-     # Your validation logic here
-     keywords = ["async", "await"]
-     for kw in keywords:
+     for kw in ["async", "await"]:
          (hits if kw in candidate else misses).append(f"Keyword '{kw}'")
-
      return {
-         "score": len(hits) / len(
-         "hits": hits,
-         "misses": misses,
-         "reasoning": f"Found {len(hits)}/{len(keywords)} keywords"
+         "score": len(hits) / max(len(hits) + len(misses), 1),
+         "hits": hits, "misses": misses
      }

  if __name__ == "__main__":
      try:
-
-         print(json.dumps(result, indent=2))
+         print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
      except Exception as e:
-         print(json.dumps({"score": 0, "
+         print(json.dumps({"score": 0, "misses": [str(e)]}))
          sys.exit(1)
  ```

- ## TypeScript
-
- The `@agentv/eval` SDK provides a declarative API with automatic stdin/stdout handling.
+ ## TypeScript Example

  ```typescript
  #!/usr/bin/env bun
@@ -115,216 +86,30 @@ import { defineCodeJudge } from '@agentv/eval';
  export default defineCodeJudge(({ candidateAnswer, expectedOutcome }) => {
    const hits: string[] = [];
    const misses: string[] = [];
-
-   // Your validation logic here
    if (candidateAnswer.includes(expectedOutcome)) {
-     hits.push('
+     hits.push('Matches expected outcome');
    } else {
-     misses.push('
+     misses.push('Does not match expected outcome');
    }
-
-   const total = hits.length + misses.length;
    return {
-     score:
-     hits,
-     misses,
-     reasoning: `Passed ${hits.length}/${total} checks`,
+     score: hits.length / Math.max(hits.length + misses.length, 1),
+     hits, misses,
    };
  });
  ```

-
-
- ## Target Access for Code Evaluators
-
- Code judges can access an LLM through a **target proxy** for metrics requiring multiple LLM calls (contextual precision, semantic similarity, etc).
-
- ### Configuration
-
- ```yaml
- evaluators:
-   - name: contextual-precision
-     type: code_judge
-     script: bun scripts/contextual-precision.ts
-     target:
-       max_calls: 10 # Default: 50
- ```
-
- ### Usage
-
- ```typescript
- #!/usr/bin/env bun
- import { createTargetClient, defineCodeJudge } from '@agentv/eval';
-
- export default defineCodeJudge(async ({ question, candidateAnswer }) => {
-   const target = createTargetClient();
-   if (!target) return { score: 0, misses: ['Target not configured'] };
-
-   const response = await target.invoke({
-     question: `Is this relevant to: ${question}? Response: ${candidateAnswer}`,
-     systemPrompt: 'Respond with JSON: { "relevant": true/false }'
-   });
-
-   const result = JSON.parse(response.rawText ?? '{}');
-   return { score: result.relevant ? 1.0 : 0.0 };
- });
- ```
-
- **Batch invocation:** Use `target.invokeBatch(requests)` for multiple calls.
-
- **Environment variables** (set automatically when `target` is configured):
- - `AGENTV_TARGET_PROXY_URL` - Local proxy URL
- - `AGENTV_TARGET_PROXY_TOKEN` - Bearer token for authentication
-
- **See also:** `examples/features/code-judge-with-llm-calls/`
-
- ## LLM Judge Prompt Templates
-
- LLM judges support two types of prompt templates:
-
- ### Text Templates (Markdown)
-
- Simple markdown files with variable substitution. AgentV handles the output format automatically.
-
- ### TypeScript/JavaScript Templates
-
- For dynamic prompt generation with full programming capabilities. Uses the same subprocess pattern as code evaluators.
-
- **YAML Configuration:**
-
- ```yaml
- evaluators:
-   - name: custom-eval
-     type: llm_judge
-     prompt:
-       script: [bun, run, ../prompts/custom-evaluator.ts]
-       config: # Optional, passed to script
-         rubric: "Your rubric here"
-         strictMode: true
- ```
-
- **TypeScript Template:**
-
- ```typescript
- #!/usr/bin/env bun
- import { definePromptTemplate } from '@agentv/eval';
-
- export default definePromptTemplate((ctx) => {
-   const rubric = ctx.config?.rubric as string | undefined;
-
-   return `You are evaluating an AI assistant's response.
-
- ## Question
- ${ctx.question}
-
- ## Candidate Answer
- ${ctx.candidateAnswer}
-
- ${ctx.referenceAnswer ? `## Reference Answer\n${ctx.referenceAnswer}` : ''}
-
- ${rubric ? `## Evaluation Criteria\n${rubric}` : ''}
-
- Evaluate and provide a score from 0 to 1.`;
- });
- ```
-
- **Available context fields:** `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary`
-
- **See also:** `examples/features/prompt-template-sdk/`
-
- ---
-
- ## Template Variable Derivation
-
- Template variables are **derived internally** — users never author them directly. They flow through three layers:
-
- 1. **Authoring layer** (what users write in YAML/JSONL):
-    - `input` or `input_messages` — two syntaxes for the same data. `input: "What is 2+2?"` expands to `[{ role: "user", content: "What is 2+2?" }]`. If both are present, `input_messages` takes precedence.
-    - `expected_output` or `expected_messages` — two syntaxes for the same data. `expected_output: "4"` expands to `[{ role: "assistant", content: "4" }]`. Structured objects and message arrays are also supported. If both are present, `expected_messages` takes precedence.
-
- 2. **Resolved layer** (after parsing):
-    - `input_messages: TestMessage[]` — canonical resolved input
-    - `expected_messages: TestMessage[]` — canonical resolved expected output
-    - At this layer, `input` and `expected_output` no longer exist as separate fields.
-
- 3. **Template variable layer** (derived strings injected into evaluator prompts):
-    - `question` — content of the first `user` role entry in `input_messages`
-    - `expected_outcome` — passed through from the eval case field
-    - `reference_answer` — content of the **last** entry in `expected_messages` (the gold-standard answer for grading, not an exact-match target)
-    - `candidate_answer` — content of the **last** entry in `output_messages` (the provider's actual response being graded)
-    - `input_messages` — full resolved input array, JSON-serialized
-    - `expected_messages` — full resolved expected array, JSON-serialized
-    - `output_messages` — full provider output array, JSON-serialized
-
- **Example flow:**
- ```yaml
- # User writes:
- input: "What is 2+2?"
- expected_output: "The answer is 4"
- ```
- ```
- # Resolved:
- input_messages: [{ role: "user", content: "What is 2+2?" }]
- expected_messages: [{ role: "assistant", content: "The answer is 4" }]
-
- # Derived template variables:
- question: "What is 2+2?"
- reference_answer: "The answer is 4"
- candidate_answer: (extracted from provider output at runtime)
- ```
-
- ## Text Template Variables
-
- **Available variables for markdown templates:**
- - `{{question}}` - Derived from first user message in `input_messages`
- - `{{expected_outcome}}` - What the answer should accomplish (from eval case field)
- - `{{candidate_answer}}` - Derived from last entry in `output_messages` (provider response)
- - `{{reference_answer}}` - Derived from last entry in `expected_messages` (gold standard)
- - `{{input_messages}}` - Full resolved input messages, JSON-serialized
- - `{{expected_messages}}` - Full resolved expected messages, JSON-serialized
- - `{{output_messages}}` - Full provider output messages, JSON-serialized
-
- **Default Template:**
-
- ```
- You are an expert evaluator. Grade the candidate_answer based on how well it achieves the expected_outcome.
-
- Use reference_answer as a gold standard (if provided). The candidate_answer doesn't need to match verbatim, but should capture key points.
+ ## Template Variables

-
+ Derived from eval case fields (users never author these directly):

-
-
+ | Variable | Source |
+ |----------|--------|
+ | `question` | First user message in `input_messages` |
+ | `expected_outcome` | Eval case `expected_outcome` field |
+ | `reference_answer` | Last entry in `expected_messages` |
+ | `candidate_answer` | Last entry in `output_messages` (runtime) |
+ | `input_messages` | Full resolved input array (JSON) |
+ | `expected_messages` | Full resolved expected array (JSON) |
+ | `output_messages` | Full provider output array (JSON) |

-
- {{question}}
-
- [[ ## reference_answer ## ]]
- {{reference_answer}}
-
- [[ ## candidate_answer ## ]]
- {{candidate_answer}}
- ```
-
- ## Best Practices
-
- ### Code Evaluators
- 1. **Focus on `candidate_answer`** - Most evaluators only need this field
- 2. **Be deterministic** - Same input → same output
- 3. **Handle errors gracefully** - Return valid result even on failure
- 4. **Use `hits`/`misses`** - Explain the score clearly
-
- ### LLM Judges
- 1. **Clear criteria** - Define what you're evaluating
- 2. **Specific rubrics** - Provide scoring guidelines
- 3. **Concise prompts** - Keep instructions focused
-
- ## Testing Locally
-
- ```bash
- # Python
- echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | uv run my_validator.py
-
- # TypeScript
- echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | bun run ./check.ts
- ```
+ Markdown templates use `{{variable}}` syntax. TypeScript templates receive context object.
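
A note on the target-proxy API referenced above: 2.5.6 keeps `createTargetClient()`, `.invoke()`, and `.invokeBatch()` in its SDK list but drops the worked example that the 2.5.4 file carried. The sketch below restates that pattern for orientation only; the relevance prompt, the score mapping, and the `response.rawText` handling are assumptions carried over from the removed 2.5.4 example, not code shipped in either version.

```typescript
#!/usr/bin/env bun
// Illustrative sketch of a code judge that calls the LLM target proxy.
// defineCodeJudge, createTargetClient, and invoke() come from the SDK list above;
// the prompt text and JSON handling are assumptions, not package code.
import { createTargetClient, defineCodeJudge } from '@agentv/eval';

export default defineCodeJudge(async ({ question, candidateAnswer }) => {
  // The client is only available when the evaluator is configured with `target: {}`.
  const target = createTargetClient();
  if (!target) return { score: 0, misses: ['Target not configured'] };

  // Single LLM call through the proxy; invokeBatch(requests) covers multiple calls.
  const response = await target.invoke({
    question: `Is this response relevant to "${question}"?\n\n${candidateAnswer}`,
    systemPrompt: 'Respond with JSON: { "relevant": true or false }',
  });

  // `rawText` mirrors the removed 2.5.4 usage example; treat the field name as an assumption.
  const verdict = JSON.parse(response.rawText ?? '{}');
  return {
    score: verdict.relevant ? 1.0 : 0.0,
    hits: verdict.relevant ? ['Judged relevant'] : [],
    misses: verdict.relevant ? [] : ['Judged not relevant'],
  };
});
```

Either way, the script still speaks the wire format documented above: JSON in on stdin, a `{ score, hits, misses, reasoning }` object out on stdout, with `defineCodeJudge` handling the plumbing.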

package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md

@@ -1,204 +1,79 @@
- # Rubric Evaluator
+ # Rubric Evaluator

-
+ ## Field Reference

-
+ | Field | Type | Default | Description |
+ |-------|------|---------|-------------|
+ | `id` | string | auto-generated | Unique identifier |
+ | `expected_outcome` | string | required* | Criterion being evaluated (*optional if `score_ranges` used) |
+ | `weight` | number | 1.0 | Relative importance |
+ | `required` | boolean | true | Failing forces verdict to 'fail' (checklist mode) |
+ | `required_min_score` | integer | - | Minimum 0-10 score to pass (score-range mode) |
+ | `score_ranges` | map or array | - | Score range definitions for analytic scoring |

-
+ `description` is a backward-compatible alias for `expected_outcome`.

-
+ ## Checklist Mode

  ```yaml
-
- -
-
-
-
-
-
-
-
-
-   - Explains the partition step
-   - States time complexity correctly
+ rubrics:
+   - Mentions divide-and-conquer approach
+   - id: complexity
+     expected_outcome: States time complexity correctly
+     weight: 2.0
+     required: true
+   - id: examples
+     expected_outcome: Includes code examples
+     weight: 1.0
+     required: false
  ```

-
+ ## Score-Range Mode

-
+ Shorthand map format (recommended):

  ```yaml
-
- - id:
-
-
-
-
-
-
-
-   - id: structure
-     expected_outcome: Has clear headings and organization
-     weight: 1.0
-     required: true
-
-   - id: success-codes
-     expected_outcome: Covers 2xx success codes with examples
-     weight: 2.0
-     required: true
-
-   - id: client-errors
-     expected_outcome: Explains 4xx client error codes
-     weight: 2.0
-     required: true
-
-   - id: server-errors
-     expected_outcome: Explains 5xx server error codes
-     weight: 1.5
-     required: false
-
-   - id: practical-examples
-     expected_outcome: Includes practical use case examples
-     weight: 1.0
-     required: false
+ rubrics:
+   - id: correctness
+     weight: 2.0
+     required_min_score: 7
+     score_ranges:
+       0: Critical bugs
+       3: Minor bugs
+       6: Correct with minor issues
+       9: Fully correct
  ```

-
+ Map keys are lower bounds (0-10). Each range extends from its key to (next key - 1), with the last extending to 10. Must start at 0.

-
+ Array format is also accepted:

  ```yaml
-
-
-
-
-
- -
-
-
-
-   - id: correctness
-     weight: 2.0
-     required_min_score: 7 # Fail if score < 7
-     score_ranges:
-       - score_range: [0, 2]
-         expected_outcome: Contains critical bugs or errors
-       - score_range: [3, 5]
-         expected_outcome: Has minor bugs or edge case issues
-       - score_range: [6, 8]
-         expected_outcome: Functionally correct with minor issues
-       - score_range: [9, 10]
-         expected_outcome: Fully correct implementation
-
-   - id: style
-     weight: 1.0
-     score_ranges:
-       - score_range: [0, 3]
-         expected_outcome: Poor style, hard to read
-       - score_range: [4, 6]
-         expected_outcome: Acceptable style with issues
-       - score_range: [7, 10]
-         expected_outcome: Clean, idiomatic code
+ score_ranges:
+   - score_range: [0, 2]
+     expected_outcome: Critical bugs
+   - score_range: [3, 5]
+     expected_outcome: Minor bugs
+   - score_range: [6, 8]
+     expected_outcome: Correct with minor issues
+   - score_range: [9, 10]
+     expected_outcome: Fully correct
  ```

-
- - Ranges must be integers within 0-10
- - Ranges must not overlap
- - Ranges must cover all values 0-10 (no gaps)
- - Each range must have a non-empty `expected_outcome`
-
- ## Rubric Object Fields
-
- | Field | Type | Default | Description |
- |-------|------|---------|-------------|
- | `id` | string | auto-generated | Unique identifier for the rubric |
- | `expected_outcome` | string | required* | The criterion being evaluated (*optional if `score_ranges` used) |
- | `weight` | number | 1.0 | Relative importance (higher = more impact on score) |
- | `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' (checklist mode) |
- | `required_min_score` | integer | - | Minimum 0-10 score required to pass (score-range mode) |
- | `score_ranges` | array | - | Score range definitions for analytic rubric scoring |
-
- > **Note:** `description` is supported as a backward-compatible alias for `expected_outcome`.
+ Ranges must be integers 0-10, non-overlapping, covering all values 0-10.

- ## Scoring
-
- ### Checklist Mode (boolean)
- ```
- score = (sum of satisfied weights) / (total weights)
- ```
-
- ### Score-Range Mode (0-10 integers)
- ```
- normalized_score = raw_score / 10 # Convert 0-10 to 0-1
- final_score = weighted_average(normalized_scores)
- ```
+ ## Scoring

- **
- - `pass`: Score ≥ 0.8 AND all gating criteria satisfied
- - `borderline`: Score ≥ 0.6 AND all gating criteria satisfied
- - `fail`: Score < 0.6 OR any gating criterion failed
+ **Checklist:** `score = sum(satisfied weights) / sum(all weights)`

- **
- - Checklist mode: `required: true` means must be satisfied
- - Score-range mode: `required_min_score: N` means score must be ≥ N
-
- ## When to Use Each Mode
-
- | Use Case | Mode | Why |
- |----------|------|-----|
- | Binary pass/fail criteria | Checklist | Simple yes/no evaluation |
- | Quality gradient | Score-range | Captures nuance (poor → excellent) |
- | Critical requirements | Checklist + `required: true` | Hard gating on must-haves |
- | Minimum quality bar | Score-range + `required_min_score` | Flexible threshold gating |
-
- ## Combining Rubrics with Other Evaluators
-
- Rubrics can be combined with code evaluators for comprehensive validation:
-
- ```yaml
- evalcases:
-   - id: email-validator
-     expected_outcome: Python function to validate email addresses
-
-     input_messages:
-       - role: user
-         content: Write a Python function to validate email addresses
-
-     # Semantic evaluation via rubrics
-     rubrics:
-       - Uses regular expressions for validation
-       - Includes type hints
-       - Has docstring documentation
-       - Handles edge cases (None, empty string)
-
-     execution:
-       evaluators:
-         # Rubric evaluator is auto-added from inline rubrics field
-
-         # Additional code evaluator for syntax checking
-         - name: python_syntax
-           type: code_judge
-           script: uv run python -m py_compile
- ```
-
- ## Generate Rubrics from Expected Outcome
-
- Use the CLI to auto-generate rubrics from `expected_outcome`:
-
- ```bash
- # Generate rubrics for eval cases that don't have them
- agentv generate rubrics evals/my-eval.yaml
-
- # Use a specific LLM target for generation
- agentv generate rubrics evals/my-eval.yaml --target azure_base
- ```
+ **Score-range:** `score = weighted_average(raw_score / 10)` per criterion

-
+ ## Verdicts

-
+ | Verdict | Condition |
+ |---------|-----------|
+ | `pass` | score >= 0.8 AND all gating criteria satisfied |
+ | `borderline` | score >= 0.6 AND all gating criteria satisfied |
+ | `fail` | score < 0.6 OR any gating criterion failed |

-
- 2. **Balance weights** - Use higher weights (2.0+) for core requirements, lower (0.5) for nice-to-haves
- 3. **Be specific** - "Includes error handling" is better than "Good code quality"
- 4. **Keep rubrics atomic** - Each rubric should test one thing
- 5. **Consider partial credit** - Non-required rubrics allow partial scores
+ Gating: checklist uses `required: true`, score-range uses `required_min_score: N`.
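
For reference, the scoring and verdict rules condensed in the new rubric-evaluator.md amount to a weighted average plus two thresholds. The sketch below restates that arithmetic; the type and function names are hypothetical and are not part of agentv.

```typescript
// Restates the documented scoring rules; names and types here are hypothetical.
interface ChecklistResult { weight: number; satisfied: boolean; required: boolean }
interface RangeResult { weight: number; rawScore: number; requiredMinScore?: number } // rawScore: 0-10

type Verdict = 'pass' | 'borderline' | 'fail';

// pass: score >= 0.8 and all gating criteria satisfied; borderline: >= 0.6; otherwise fail.
function verdictFor(score: number, gatingOk: boolean): Verdict {
  if (!gatingOk || score < 0.6) return 'fail';
  return score >= 0.8 ? 'pass' : 'borderline';
}

// Checklist mode: score = sum(satisfied weights) / sum(all weights).
function scoreChecklist(rubrics: ChecklistResult[]) {
  const total = rubrics.reduce((s, r) => s + r.weight, 0);
  const satisfied = rubrics.filter((r) => r.satisfied).reduce((s, r) => s + r.weight, 0);
  const score = total > 0 ? satisfied / total : 0;
  const gatingOk = rubrics.every((r) => !r.required || r.satisfied);
  return { score, verdict: verdictFor(score, gatingOk) };
}

// Score-range mode: weighted average of raw 0-10 scores normalized to 0-1.
function scoreRanges(rubrics: RangeResult[]) {
  const total = rubrics.reduce((s, r) => s + r.weight, 0);
  const weighted = rubrics.reduce((s, r) => s + (r.rawScore / 10) * r.weight, 0);
  const score = total > 0 ? weighted / total : 0;
  const gatingOk = rubrics.every(
    (r) => r.requiredMinScore === undefined || r.rawScore >= r.requiredMinScore,
  );
  return { score, verdict: verdictFor(score, gatingOk) };
}

// Worked example: correctness 8/10 at weight 2.0 (required_min_score 7) and style 6/10 at
// weight 1.0 give (0.8 * 2 + 0.6 * 1) / 3 ≈ 0.733, gating passes, so the verdict is 'borderline'.
console.log(scoreRanges([
  { weight: 2.0, rawScore: 8, requiredMinScore: 7 },
  { weight: 1.0, rawScore: 6 },
]));
```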

package/dist/{token-DVVSDOYP.js → token-D3IYDJQZ.js}

@@ -1,11 +1,11 @@
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
  import {
    require_token_util
- } from "./chunk-A7TQUSVG.js";
+ } from "./chunk-LJVS3JAK.js";
  import {
    __commonJS,
    require_token_error
- } from "./chunk-LTPZBEJU.js";
+ } from "./chunk-BKMQNEUD.js";

  // ../../node_modules/.bun/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token.js
  var require_token = __commonJS({
@@ -61,4 +61,4 @@ var require_token = __commonJS({
    }
  });
  export default require_token();
- //# sourceMappingURL=token-DVVSDOYP.js.map
+ //# sourceMappingURL=token-D3IYDJQZ.js.map

package/dist/{token-util-YEKFTEJA.js → token-util-FWFPR2BV.js}

@@ -1,7 +1,7 @@
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
  import {
    require_token_util
- } from "./chunk-A7TQUSVG.js";
- import "./chunk-LTPZBEJU.js";
+ } from "./chunk-LJVS3JAK.js";
+ import "./chunk-BKMQNEUD.js";
  export default require_token_util();
- //# sourceMappingURL=token-util-YEKFTEJA.js.map
+ //# sourceMappingURL=token-util-FWFPR2BV.js.map