agentv 2.5.4 → 2.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,112 +1,83 @@
- # Custom Evaluators Guide
+ # Custom Evaluators
 
- Templates and best practices for code evaluators and LLM judges. For YAML configuration, see `SKILL.md`.
+ ## Wire Format
 
- ## Code Evaluator Contract
-
- Code evaluators receive input via stdin and write output to stdout, both as JSON.
-
- ### Input Format (via stdin)
-
- Wire format uses snake_case for cross-language compatibility:
+ ### Input (stdin JSON)
 
  ```json
  {
- "question": "string describing the task/question",
- "expected_outcome": "expected outcome description",
- "reference_answer": "gold standard answer (optional)",
- "candidate_answer": "generated code/text from the agent",
- "guideline_files": ["path1", "path2"],
- "input_files": ["file1", "file2"],
+ "question": "string",
+ "expected_outcome": "string",
+ "reference_answer": "string",
+ "candidate_answer": "string",
+ "guideline_files": ["path"],
+ "input_files": ["path"],
  "input_messages": [{"role": "user", "content": "..."}],
- "expected_messages": [
- {
- "role": "assistant",
- "tool_calls": [
- {
- "tool": "vector_search",
- "input": { "query": "..." },
- "output": { "results": ["doc1", "doc2"] }
- }
- ]
- }
- ],
- "output_messages": [
- {
- "role": "assistant",
- "content": "...",
- "tool_calls": [...]
- }
- ],
+ "expected_messages": [{"role": "assistant", "content": "..."}],
+ "output_messages": [{"role": "assistant", "content": "..."}],
  "trace_summary": {
  "event_count": 5,
- "tool_names": ["fetch", "search"],
- "tool_calls_by_name": { "search": 2, "fetch": 1 },
+ "tool_names": ["fetch"],
+ "tool_calls_by_name": {"fetch": 1},
  "error_count": 0,
- "token_usage": { "input": 1000, "output": 500 },
+ "token_usage": {"input": 1000, "output": 500},
  "cost_usd": 0.0015,
  "duration_ms": 3500
  }
  }
  ```
 
- **Key fields:**
- - `expected_messages` - Expected agent behavior from YAML, including tool calls with outputs (use for retrieval context in RAG evals)
- - `output_messages` - Actual agent execution trace with tool calls (from live agent runs)
- - `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)
-
- ### Output Format (to stdout)
+ ### Output (stdout JSON)
 
  ```json
  {
  "score": 0.85,
- "hits": ["successful check 1", "successful check 2"],
- "misses": ["failed check 1"],
- "reasoning": "Brief explanation of the score"
+ "hits": ["passed check"],
+ "misses": ["failed check"],
+ "reasoning": "explanation"
  }
  ```
 
- **Field Requirements:**
- - `score`: Float between 0.0 and 1.0 (required)
- - `hits`: Array of strings describing what passed (optional but recommended)
- - `misses`: Array of strings describing what failed (optional but recommended)
- - `reasoning`: String explaining the score (optional but recommended)
+ `score` (0.0-1.0) required. `hits`, `misses`, `reasoning` optional.
+
+ ## SDK Functions
+
+ ```typescript
+ import { defineCodeJudge, createTargetClient, definePromptTemplate } from '@agentv/eval';
+ ```
+
+ - `defineCodeJudge(fn)` - Wraps evaluation function with stdin/stdout handling
+ - `createTargetClient()` - Returns LLM proxy client (when `target: {}` configured)
+ - `.invoke({question, systemPrompt})` - Single LLM call
+ - `.invokeBatch(requests)` - Batch LLM calls
+ - `definePromptTemplate(fn)` - Wraps prompt generation function
+ - Context fields: `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary`
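Editor's note: the `createTargetClient()` bullets above compress the usage example that the 2.5.4 version of this page spelled out (see the removed `### Usage` block further down in this diff). A minimal sketch of that flow, assuming the same `defineCodeJudge`/`createTargetClient` API:

```typescript
#!/usr/bin/env bun
import { createTargetClient, defineCodeJudge } from '@agentv/eval';

// Async code judge that asks the configured LLM target whether the candidate
// answer is relevant to the question, then maps the reply to a 0/1 score.
// Requires a `target: {}` block on the evaluator in YAML.
export default defineCodeJudge(async ({ question, candidateAnswer }) => {
  const target = createTargetClient();
  if (!target) return { score: 0, misses: ['Target not configured'] };

  const response = await target.invoke({
    question: `Is this relevant to: ${question}? Response: ${candidateAnswer}`,
    systemPrompt: 'Respond with JSON: { "relevant": true/false }',
  });

  // rawText holds the model's raw reply; parse defensively.
  const verdict = JSON.parse(response.rawText ?? '{}');
  return { score: verdict.relevant ? 1.0 : 0.0 };
});
```

For several calls per case, `target.invokeBatch(requests)` batches them; the YAML `target.max_calls` setting (default 50, per the removed configuration example below) caps usage.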
 
- ## Python Code Evaluator Template
+ ## Python Example
 
  ```python
  #!/usr/bin/env python3
- import json
- import sys
+ import json, sys
 
  def evaluate(data: dict) -> dict:
  candidate = data.get("candidate_answer", "")
  hits, misses = [], []
-
- # Your validation logic here
- keywords = ["async", "await"]
- for kw in keywords:
+ for kw in ["async", "await"]:
  (hits if kw in candidate else misses).append(f"Keyword '{kw}'")
-
  return {
- "score": len(hits) / len(keywords) if keywords else 1.0,
- "hits": hits,
- "misses": misses,
- "reasoning": f"Found {len(hits)}/{len(keywords)} keywords"
+ "score": len(hits) / max(len(hits) + len(misses), 1),
+ "hits": hits, "misses": misses
  }
 
  if __name__ == "__main__":
  try:
- result = evaluate(json.loads(sys.stdin.read()))
- print(json.dumps(result, indent=2))
+ print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
  except Exception as e:
- print(json.dumps({"score": 0, "hits": [], "misses": [str(e)], "reasoning": "Error"}))
+ print(json.dumps({"score": 0, "misses": [str(e)]}))
  sys.exit(1)
  ```
 
- ## TypeScript Code Evaluator Template
-
- The `@agentv/eval` SDK provides a declarative API with automatic stdin/stdout handling.
+ ## TypeScript Example
 
  ```typescript
  #!/usr/bin/env bun
@@ -115,216 +86,30 @@ import { defineCodeJudge } from '@agentv/eval';
  export default defineCodeJudge(({ candidateAnswer, expectedOutcome }) => {
  const hits: string[] = [];
  const misses: string[] = [];
-
- // Your validation logic here
  if (candidateAnswer.includes(expectedOutcome)) {
- hits.push('Answer matches expected outcome');
+ hits.push('Matches expected outcome');
  } else {
- misses.push('Answer does not match expected outcome');
+ misses.push('Does not match expected outcome');
  }
-
- const total = hits.length + misses.length;
  return {
- score: total === 0 ? 0 : hits.length / total,
- hits,
- misses,
- reasoning: `Passed ${hits.length}/${total} checks`,
+ score: hits.length / Math.max(hits.length + misses.length, 1),
+ hits, misses,
  };
  });
  ```
 
- **SDK exports:** `defineCodeJudge`, `Message`, `ToolCall`, `TraceSummary`, `CodeJudgeInput`, `CodeJudgeResult`
-
- ## Target Access for Code Evaluators
-
- Code judges can access an LLM through a **target proxy** for metrics requiring multiple LLM calls (contextual precision, semantic similarity, etc).
-
- ### Configuration
-
- ```yaml
- evaluators:
- - name: contextual-precision
- type: code_judge
- script: bun scripts/contextual-precision.ts
- target:
- max_calls: 10 # Default: 50
- ```
-
- ### Usage
-
- ```typescript
- #!/usr/bin/env bun
- import { createTargetClient, defineCodeJudge } from '@agentv/eval';
-
- export default defineCodeJudge(async ({ question, candidateAnswer }) => {
- const target = createTargetClient();
- if (!target) return { score: 0, misses: ['Target not configured'] };
-
- const response = await target.invoke({
- question: `Is this relevant to: ${question}? Response: ${candidateAnswer}`,
- systemPrompt: 'Respond with JSON: { "relevant": true/false }'
- });
-
- const result = JSON.parse(response.rawText ?? '{}');
- return { score: result.relevant ? 1.0 : 0.0 };
- });
- ```
-
- **Batch invocation:** Use `target.invokeBatch(requests)` for multiple calls.
-
- **Environment variables** (set automatically when `target` is configured):
- - `AGENTV_TARGET_PROXY_URL` - Local proxy URL
- - `AGENTV_TARGET_PROXY_TOKEN` - Bearer token for authentication
-
- **See also:** `examples/features/code-judge-with-llm-calls/`
-
- ## LLM Judge Prompt Templates
-
- LLM judges support two types of prompt templates:
-
- ### Text Templates (Markdown)
-
- Simple markdown files with variable substitution. AgentV handles the output format automatically.
-
- ### TypeScript/JavaScript Templates
-
- For dynamic prompt generation with full programming capabilities. Uses the same subprocess pattern as code evaluators.
-
- **YAML Configuration:**
-
- ```yaml
- evaluators:
- - name: custom-eval
- type: llm_judge
- prompt:
- script: [bun, run, ../prompts/custom-evaluator.ts]
- config: # Optional, passed to script
- rubric: "Your rubric here"
- strictMode: true
- ```
-
- **TypeScript Template:**
-
- ```typescript
- #!/usr/bin/env bun
- import { definePromptTemplate } from '@agentv/eval';
-
- export default definePromptTemplate((ctx) => {
- const rubric = ctx.config?.rubric as string | undefined;
-
- return `You are evaluating an AI assistant's response.
-
- ## Question
- ${ctx.question}
-
- ## Candidate Answer
- ${ctx.candidateAnswer}
-
- ${ctx.referenceAnswer ? `## Reference Answer\n${ctx.referenceAnswer}` : ''}
-
- ${rubric ? `## Evaluation Criteria\n${rubric}` : ''}
-
- Evaluate and provide a score from 0 to 1.`;
- });
- ```
-
- **Available context fields:** `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary`
-
- **See also:** `examples/features/prompt-template-sdk/`
-
- ---
-
- ## Template Variable Derivation
-
- Template variables are **derived internally** — users never author them directly. They flow through three layers:
-
- 1. **Authoring layer** (what users write in YAML/JSONL):
- - `input` or `input_messages` — two syntaxes for the same data. `input: "What is 2+2?"` expands to `[{ role: "user", content: "What is 2+2?" }]`. If both are present, `input_messages` takes precedence.
- - `expected_output` or `expected_messages` — two syntaxes for the same data. `expected_output: "4"` expands to `[{ role: "assistant", content: "4" }]`. Structured objects and message arrays are also supported. If both are present, `expected_messages` takes precedence.
-
- 2. **Resolved layer** (after parsing):
- - `input_messages: TestMessage[]` — canonical resolved input
- - `expected_messages: TestMessage[]` — canonical resolved expected output
- - At this layer, `input` and `expected_output` no longer exist as separate fields.
-
- 3. **Template variable layer** (derived strings injected into evaluator prompts):
- - `question` — content of the first `user` role entry in `input_messages`
- - `expected_outcome` — passed through from the eval case field
- - `reference_answer` — content of the **last** entry in `expected_messages` (the gold-standard answer for grading, not an exact-match target)
- - `candidate_answer` — content of the **last** entry in `output_messages` (the provider's actual response being graded)
- - `input_messages` — full resolved input array, JSON-serialized
- - `expected_messages` — full resolved expected array, JSON-serialized
- - `output_messages` — full provider output array, JSON-serialized
-
- **Example flow:**
- ```yaml
- # User writes:
- input: "What is 2+2?"
- expected_output: "The answer is 4"
- ```
- ```
- # Resolved:
- input_messages: [{ role: "user", content: "What is 2+2?" }]
- expected_messages: [{ role: "assistant", content: "The answer is 4" }]
-
- # Derived template variables:
- question: "What is 2+2?"
- reference_answer: "The answer is 4"
- candidate_answer: (extracted from provider output at runtime)
- ```
-
- ## Text Template Variables
-
- **Available variables for markdown templates:**
- - `{{question}}` - Derived from first user message in `input_messages`
- - `{{expected_outcome}}` - What the answer should accomplish (from eval case field)
- - `{{candidate_answer}}` - Derived from last entry in `output_messages` (provider response)
- - `{{reference_answer}}` - Derived from last entry in `expected_messages` (gold standard)
- - `{{input_messages}}` - Full resolved input messages, JSON-serialized
- - `{{expected_messages}}` - Full resolved expected messages, JSON-serialized
- - `{{output_messages}}` - Full provider output messages, JSON-serialized
-
- **Default Template:**
-
- ```
- You are an expert evaluator. Grade the candidate_answer based on how well it achieves the expected_outcome.
-
- Use reference_answer as a gold standard (if provided). The candidate_answer doesn't need to match verbatim, but should capture key points.
+ ## Template Variables
 
- Be concise. Provide specific feedback rather than verbose explanations.
+ Derived from eval case fields (users never author these directly):
 
- [[ ## expected_outcome ## ]]
- {{expected_outcome}}
+ | Variable | Source |
+ |----------|--------|
+ | `question` | First user message in `input_messages` |
+ | `expected_outcome` | Eval case `expected_outcome` field |
+ | `reference_answer` | Last entry in `expected_messages` |
+ | `candidate_answer` | Last entry in `output_messages` (runtime) |
+ | `input_messages` | Full resolved input array (JSON) |
+ | `expected_messages` | Full resolved expected array (JSON) |
+ | `output_messages` | Full provider output array (JSON) |
 
- [[ ## question ## ]]
- {{question}}
-
- [[ ## reference_answer ## ]]
- {{reference_answer}}
-
- [[ ## candidate_answer ## ]]
- {{candidate_answer}}
- ```
-
- ## Best Practices
-
- ### Code Evaluators
- 1. **Focus on `candidate_answer`** - Most evaluators only need this field
- 2. **Be deterministic** - Same input → same output
- 3. **Handle errors gracefully** - Return valid result even on failure
- 4. **Use `hits`/`misses`** - Explain the score clearly
-
- ### LLM Judges
- 1. **Clear criteria** - Define what you're evaluating
- 2. **Specific rubrics** - Provide scoring guidelines
- 3. **Concise prompts** - Keep instructions focused
-
- ## Testing Locally
-
- ```bash
- # Python
- echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | uv run my_validator.py
-
- # TypeScript
- echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | bun run ./check.ts
- ```
+ Markdown templates use `{{variable}}` syntax. TypeScript templates receive context object.
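Editor's note: the table above lists what a prompt template can reference; the 2.5.4 page illustrated this with a full `definePromptTemplate` script (removed above). A minimal TypeScript sketch assuming the same context fields:

```typescript
#!/usr/bin/env bun
import { definePromptTemplate } from '@agentv/eval';

// Builds the judge prompt from the derived template variables listed in the
// table above. `config` carries whatever the YAML `prompt.config` block passed
// in; the `rubric` key below is illustrative, not a required field.
export default definePromptTemplate((ctx) => {
  const rubric = ctx.config?.rubric as string | undefined;
  return [
    '## Question',
    ctx.question,
    '## Candidate Answer',
    ctx.candidateAnswer,
    ctx.referenceAnswer ? `## Reference Answer\n${ctx.referenceAnswer}` : '',
    rubric ? `## Evaluation Criteria\n${rubric}` : '',
    'Evaluate and provide a score from 0 to 1.',
  ].filter(Boolean).join('\n\n');
});
```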
@@ -1,204 +1,79 @@
- # Rubric Evaluator Guide
+ # Rubric Evaluator
 
- Rubrics provide structured evaluation through lists of criteria that define what makes a good response. Rubrics are checked by an LLM judge and scored based on weights and requirements.
+ ## Field Reference
 
- ## Basic Usage
+ | Field | Type | Default | Description |
+ |-------|------|---------|-------------|
+ | `id` | string | auto-generated | Unique identifier |
+ | `expected_outcome` | string | required* | Criterion being evaluated (*optional if `score_ranges` used) |
+ | `weight` | number | 1.0 | Relative importance |
+ | `required` | boolean | true | Failing forces verdict to 'fail' (checklist mode) |
+ | `required_min_score` | integer | - | Minimum 0-10 score to pass (score-range mode) |
+ | `score_ranges` | map or array | - | Score range definitions for analytic scoring |
 
- ### Simple String Rubrics
+ `description` is a backward-compatible alias for `expected_outcome`.
 
- Define rubrics as simple strings - each becomes a required criterion with weight 1.0:
+ ## Checklist Mode
 
  ```yaml
- evalcases:
- - id: quicksort-explanation
- expected_outcome: Explain how quicksort works
-
- input_messages:
- - role: user
- content: Explain how the quicksort algorithm works
-
- rubrics:
- - Mentions divide-and-conquer approach
- - Explains the partition step
- - States time complexity correctly
+ rubrics:
+ - Mentions divide-and-conquer approach
+ - id: complexity
+ expected_outcome: States time complexity correctly
+ weight: 2.0
+ required: true
+ - id: examples
+ expected_outcome: Includes code examples
+ weight: 1.0
+ required: false
  ```
 
- ### Detailed Rubric Objects (Checklist Mode)
+ ## Score-Range Mode
 
- Use objects for fine-grained control over weights and requirements:
+ Shorthand map format (recommended):
 
  ```yaml
- evalcases:
- - id: technical-guide
- expected_outcome: Write a comprehensive HTTP status codes guide
-
- input_messages:
- - role: user
- content: Write a guide explaining HTTP status codes
-
- rubrics:
- - id: structure
- expected_outcome: Has clear headings and organization
- weight: 1.0
- required: true
-
- - id: success-codes
- expected_outcome: Covers 2xx success codes with examples
- weight: 2.0
- required: true
-
- - id: client-errors
- expected_outcome: Explains 4xx client error codes
- weight: 2.0
- required: true
-
- - id: server-errors
- expected_outcome: Explains 5xx server error codes
- weight: 1.5
- required: false
-
- - id: practical-examples
- expected_outcome: Includes practical use case examples
- weight: 1.0
- required: false
+ rubrics:
+ - id: correctness
+ weight: 2.0
+ required_min_score: 7
+ score_ranges:
+ 0: Critical bugs
+ 3: Minor bugs
+ 6: Correct with minor issues
+ 9: Fully correct
  ```
 
- ### Score-Range Rubrics (Analytic Mode)
+ Map keys are lower bounds (0-10). Each range extends from its key to (next key - 1), with the last extending to 10. Must start at 0.
 
- For more granular scoring, use `score_ranges` to define 0-10 integer scoring per criterion:
+ Array format is also accepted:
 
  ```yaml
- evalcases:
- - id: code-review
- expected_outcome: Review the code for correctness and style
-
- input_messages:
- - role: user
- content: Review this Python function for issues
-
- rubrics:
- - id: correctness
- weight: 2.0
- required_min_score: 7 # Fail if score < 7
- score_ranges:
- - score_range: [0, 2]
- expected_outcome: Contains critical bugs or errors
- - score_range: [3, 5]
- expected_outcome: Has minor bugs or edge case issues
- - score_range: [6, 8]
- expected_outcome: Functionally correct with minor issues
- - score_range: [9, 10]
- expected_outcome: Fully correct implementation
-
- - id: style
- weight: 1.0
- score_ranges:
- - score_range: [0, 3]
- expected_outcome: Poor style, hard to read
- - score_range: [4, 6]
- expected_outcome: Acceptable style with issues
- - score_range: [7, 10]
- expected_outcome: Clean, idiomatic code
+ score_ranges:
+ - score_range: [0, 2]
+ expected_outcome: Critical bugs
+ - score_range: [3, 5]
+ expected_outcome: Minor bugs
+ - score_range: [6, 8]
+ expected_outcome: Correct with minor issues
+ - score_range: [9, 10]
+ expected_outcome: Fully correct
  ```
 
- **Score-range validation rules:**
- - Ranges must be integers within 0-10
- - Ranges must not overlap
- - Ranges must cover all values 0-10 (no gaps)
- - Each range must have a non-empty `expected_outcome`
-
- ## Rubric Object Fields
-
- | Field | Type | Default | Description |
- |-------|------|---------|-------------|
- | `id` | string | auto-generated | Unique identifier for the rubric |
- | `expected_outcome` | string | required* | The criterion being evaluated (*optional if `score_ranges` used) |
- | `weight` | number | 1.0 | Relative importance (higher = more impact on score) |
- | `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' (checklist mode) |
- | `required_min_score` | integer | - | Minimum 0-10 score required to pass (score-range mode) |
- | `score_ranges` | array | - | Score range definitions for analytic rubric scoring |
-
- > **Note:** `description` is supported as a backward-compatible alias for `expected_outcome`.
+ Ranges must be integers 0-10, non-overlapping, covering all values 0-10.
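Editor's note: how the shorthand map unfolds into explicit ranges can be easy to mis-read. A small illustrative sketch of the rule stated above (my own, not the package's actual parser):

```typescript
type ScoreRange = { score_range: [number, number]; expected_outcome: string };

// Expand a shorthand score_ranges map into explicit [lo, hi] ranges.
// Keys are lower bounds; each range runs up to the next key minus one,
// and the last range runs to 10, mirroring the rule described above.
function expandScoreRanges(map: Record<number, string>): ScoreRange[] {
  const keys = Object.keys(map).map(Number).sort((a, b) => a - b);
  if (keys[0] !== 0) throw new Error('score_ranges map must start at 0');
  return keys.map((lo, i) => ({
    score_range: [lo, i + 1 < keys.length ? keys[i + 1] - 1 : 10] as [number, number],
    expected_outcome: map[lo],
  }));
}

// { 0, 3, 6, 9 } -> [0,2], [3,5], [6,8], [9,10], matching the array form above.
console.log(expandScoreRanges({ 0: 'Critical bugs', 3: 'Minor bugs', 6: 'Correct with minor issues', 9: 'Fully correct' }));
```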
 
- ## Scoring and Verdicts
-
- ### Checklist Mode (boolean)
- ```
- score = (sum of satisfied weights) / (total weights)
- ```
-
- ### Score-Range Mode (0-10 integers)
- ```
- normalized_score = raw_score / 10 # Convert 0-10 to 0-1
- final_score = weighted_average(normalized_scores)
- ```
+ ## Scoring
 
- **Verdict Rules:**
- - `pass`: Score ≥ 0.8 AND all gating criteria satisfied
- - `borderline`: Score ≥ 0.6 AND all gating criteria satisfied
- - `fail`: Score < 0.6 OR any gating criterion failed
+ **Checklist:** `score = sum(satisfied weights) / sum(all weights)`
 
- **Gating:**
- - Checklist mode: `required: true` means must be satisfied
- - Score-range mode: `required_min_score: N` means score must be ≥ N
-
- ## When to Use Each Mode
-
- | Use Case | Mode | Why |
- |----------|------|-----|
- | Binary pass/fail criteria | Checklist | Simple yes/no evaluation |
- | Quality gradient | Score-range | Captures nuance (poor → excellent) |
- | Critical requirements | Checklist + `required: true` | Hard gating on must-haves |
- | Minimum quality bar | Score-range + `required_min_score` | Flexible threshold gating |
-
- ## Combining Rubrics with Other Evaluators
-
- Rubrics can be combined with code evaluators for comprehensive validation:
-
- ```yaml
- evalcases:
- - id: email-validator
- expected_outcome: Python function to validate email addresses
-
- input_messages:
- - role: user
- content: Write a Python function to validate email addresses
-
- # Semantic evaluation via rubrics
- rubrics:
- - Uses regular expressions for validation
- - Includes type hints
- - Has docstring documentation
- - Handles edge cases (None, empty string)
-
- execution:
- evaluators:
- # Rubric evaluator is auto-added from inline rubrics field
-
- # Additional code evaluator for syntax checking
- - name: python_syntax
- type: code_judge
- script: uv run python -m py_compile
- ```
-
- ## Generate Rubrics from Expected Outcome
-
- Use the CLI to auto-generate rubrics from `expected_outcome`:
-
- ```bash
- # Generate rubrics for eval cases that don't have them
- agentv generate rubrics evals/my-eval.yaml
-
- # Use a specific LLM target for generation
- agentv generate rubrics evals/my-eval.yaml --target azure_base
- ```
+ **Score-range:** `score = weighted_average(raw_score / 10)` per criterion
 
- This analyzes each `expected_outcome` and creates appropriate rubric items.
+ ## Verdicts
 
- ## Best Practices
+ | Verdict | Condition |
+ |---------|-----------|
+ | `pass` | score >= 0.8 AND all gating criteria satisfied |
+ | `borderline` | score >= 0.6 AND all gating criteria satisfied |
+ | `fail` | score < 0.6 OR any gating criterion failed |
 
- 1. **Use required sparingly** - Only mark rubrics as `required: true` for critical criteria
- 2. **Balance weights** - Use higher weights (2.0+) for core requirements, lower (0.5) for nice-to-haves
- 3. **Be specific** - "Includes error handling" is better than "Good code quality"
- 4. **Keep rubrics atomic** - Each rubric should test one thing
- 5. **Consider partial credit** - Non-required rubrics allow partial scores
+ Gating: checklist uses `required: true`, score-range uses `required_min_score: N`.
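Editor's note: to make the formulas and gating rule concrete, an illustrative calculation (my own sketch, not package code) using the checklist example above, with weights 1.0, 2.0, 1.0 and the optional criterion missed:

```typescript
// Checklist scoring plus verdict, per the formula and table above.
const rubrics = [
  { weight: 1.0, required: true,  satisfied: true  },  // divide-and-conquer
  { weight: 2.0, required: true,  satisfied: true  },  // complexity
  { weight: 1.0, required: false, satisfied: false },  // examples (optional)
];
const total = rubrics.reduce((s, r) => s + r.weight, 0);                    // 4.0
const satisfied = rubrics.filter(r => r.satisfied)
  .reduce((s, r) => s + r.weight, 0);                                       // 3.0
const score = satisfied / total;                                            // 0.75
const gated = rubrics.some(r => r.required && !r.satisfied);                // false
const verdict = gated ? 'fail' : score >= 0.8 ? 'pass' : score >= 0.6 ? 'borderline' : 'fail';
console.log(score, verdict); // 0.75 'borderline'
```

In score-range mode the same gating applies through `required_min_score`: a criterion scored 6/10 with `required_min_score: 7` normalizes to 0.6 but still forces `fail`.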
@@ -1,11 +1,11 @@
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
  import {
  require_token_util
- } from "./chunk-A7TQUSVG.js";
+ } from "./chunk-LJVS3JAK.js";
  import {
  __commonJS,
  require_token_error
- } from "./chunk-LTPZBEJU.js";
+ } from "./chunk-BKMQNEUD.js";
 
  // ../../node_modules/.bun/@vercel+oidc@3.0.5/node_modules/@vercel/oidc/dist/token.js
  var require_token = __commonJS({
@@ -61,4 +61,4 @@ var require_token = __commonJS({
  }
  });
  export default require_token();
- //# sourceMappingURL=token-DVVSDOYP.js.map
+ //# sourceMappingURL=token-D3IYDJQZ.js.map
@@ -1,7 +1,7 @@
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
  import {
  require_token_util
- } from "./chunk-A7TQUSVG.js";
- import "./chunk-LTPZBEJU.js";
+ } from "./chunk-LJVS3JAK.js";
+ import "./chunk-BKMQNEUD.js";
  export default require_token_util();
- //# sourceMappingURL=token-util-YEKFTEJA.js.map
+ //# sourceMappingURL=token-util-FWFPR2BV.js.map