agentv 2.0.2 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +199 -325
- package/dist/{chunk-5AJ7DFUO.js → chunk-HTTN5OWL.js} +1203 -895
- package/dist/chunk-HTTN5OWL.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +24 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -12
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +90 -209
- package/package.json +2 -2
- package/dist/chunk-5AJ7DFUO.js.map +0 -1
- package/dist/templates/.agentv/{.env.template → .env.example} +0 -0
package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md
CHANGED
@@ -1,6 +1,6 @@
 # Custom Evaluators Guide

-
+Templates and best practices for code evaluators and LLM judges. For YAML configuration, see `SKILL.md`.

 ## Code Evaluator Contract

@@ -19,21 +19,25 @@ Wire format uses snake_case for cross-language compatibility:
   "guideline_files": ["path1", "path2"],
   "input_files": ["file1", "file2"],
   "input_messages": [{"role": "user", "content": "..."}],
-  "
+  "expected_messages": [
     {
       "role": "assistant",
-      "content": "...",
       "tool_calls": [
         {
-          "tool": "
+          "tool": "vector_search",
           "input": { "query": "..." },
-          "output": { "results": [
-            "id": "call_123",
-            "timestamp": "2024-01-15T10:30:00Z"
+          "output": { "results": ["doc1", "doc2"] }
         }
       ]
     }
   ],
+  "output_messages": [
+    {
+      "role": "assistant",
+      "content": "...",
+      "tool_calls": [...]
+    }
+  ],
   "trace_summary": {
     "event_count": 5,
     "tool_names": ["fetch", "search"],
@@ -47,7 +51,8 @@ Wire format uses snake_case for cross-language compatibility:
 ```

 **Key fields:**
-- `
+- `expected_messages` - Expected agent behavior from YAML, including tool calls with outputs (use for retrieval context in RAG evals)
+- `output_messages` - Actual agent execution trace with tool calls (from live agent runs)
 - `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)

 ### Output Format (to stdout)

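The new `expected_messages` field gives code judges access to the tool outputs declared in the eval YAML, e.g. as retrieval context for RAG grounding checks. A minimal sketch of such a judge — assuming the SDK surfaces wire fields in camelCase (as `candidateAnswer` suggests), so `expected_messages` arrives as a hypothetical `expectedMessages` with the `ToolCall` shape shown above:

```typescript
#!/usr/bin/env bun
import { defineCodeJudge } from '@agentv/eval';

// Hypothetical RAG grounding check: every document returned by the
// expected tool calls should be cited in the candidate answer.
export default defineCodeJudge(({ candidateAnswer, expectedMessages }) => {
  // Pull the expected retrieval results out of the tool-call outputs.
  const docs: string[] = (expectedMessages ?? [])
    .flatMap((m: any) => m.toolCalls ?? [])
    .flatMap((tc: any) => tc.output?.results ?? []);

  const hits: string[] = [];
  const misses: string[] = [];
  for (const doc of docs) {
    (candidateAnswer.includes(doc) ? hits : misses).push(`Cites ${doc}`);
  }

  return {
    score: docs.length === 0 ? 1.0 : hits.length / docs.length,
    hits,
    misses,
    reasoning: `Cited ${hits.length}/${docs.length} expected documents`,
  };
});
```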
@@ -71,201 +76,128 @@ Wire format uses snake_case for cross-language compatibility:

 ```python
 #!/usr/bin/env python3
-"""
-Example code evaluator for AgentV
-
-This evaluator checks for specific keywords in the output.
-Replace validation logic as needed.
-"""
-
 import json
 import sys
-
-
-
-
-
-    Evaluate the agent output.
-
-    Args:
-        input_data: Full input context from AgentV
-
-    Returns:
-        Evaluation result with score, hits, misses, reasoning
-    """
-    # Extract only the fields you need
-    # Most evaluators only need 'candidate_answer' - avoid using unnecessary fields
-    candidate_answer = input_data.get("candidate_answer", "")
-
+
+def evaluate(data: dict) -> dict:
+    candidate = data.get("candidate_answer", "")
+    hits, misses = [], []
+
     # Your validation logic here
-
-
-
-
-    required_keywords = ["async", "await"]
-    for keyword in required_keywords:
-        if keyword in candidate_answer:
-            hits.append(f"Contains required keyword: {keyword}")
-        else:
-            misses.append(f"Missing required keyword: {keyword}")
-
-    # Calculate score
-    if not required_keywords:
-        score = 1.0
-    else:
-        score = len(hits) / len(required_keywords)
-
-    # Build result
+    keywords = ["async", "await"]
+    for kw in keywords:
+        (hits if kw in candidate else misses).append(f"Keyword '{kw}'")
+
     return {
-        "score":
+        "score": len(hits) / len(keywords) if keywords else 1.0,
         "hits": hits,
         "misses": misses,
-        "reasoning": f"Found {len(hits)}/{len(
+        "reasoning": f"Found {len(hits)}/{len(keywords)} keywords"
     }

-
-def main():
-    """Main entry point for AgentV code evaluator."""
+if __name__ == "__main__":
     try:
-
-        input_data = json.loads(sys.stdin.read())
-
-        # Run evaluation
-        result = evaluate(input_data)
-
-        # Write result to stdout
+        result = evaluate(json.loads(sys.stdin.read()))
         print(json.dumps(result, indent=2))
-
     except Exception as e:
-
-        error_result = {
-            "score": 0.0,
-            "hits": [],
-            "misses": [f"Evaluator error: {str(e)}"],
-            "reasoning": f"Evaluator error: {str(e)}"
-        }
-        print(json.dumps(error_result, indent=2))
+        print(json.dumps({"score": 0, "hits": [], "misses": [str(e)], "reasoning": "Error"}))
         sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
 ```

-## TypeScript Code Evaluator Template
-
-The `@agentv/eval` SDK provides a declarative API for code evaluators with automatic stdin/stdout handling, validation, and error handling.
+## TypeScript Code Evaluator Template

-
+The `@agentv/eval` SDK provides a declarative API with automatic stdin/stdout handling.

 ```typescript
 #!/usr/bin/env bun
-/**
- * Example TypeScript code evaluator using defineCodeJudge
- *
- * Run with: bun run ./evaluators/example-check.ts
- * or: npx --yes tsx ./evaluators/example-check.ts
- *
- * The SDK handles:
- * - Reading JSON from stdin
- * - Converting snake_case to camelCase
- * - Validating input with Zod
- * - Error handling and output formatting
- */
 import { defineCodeJudge } from '@agentv/eval';

-export default defineCodeJudge(({ candidateAnswer, expectedOutcome
+export default defineCodeJudge(({ candidateAnswer, expectedOutcome }) => {
   const hits: string[] = [];
   const misses: string[] = [];

-  //
+  // Your validation logic here
   if (candidateAnswer.includes(expectedOutcome)) {
     hits.push('Answer matches expected outcome');
   } else {
     misses.push('Answer does not match expected outcome');
   }

-
-  const attachments = [...guidelineFiles, ...inputFiles];
-  for (const filePath of attachments) {
-    const fileName = filePath.split('/').pop() ?? filePath;
-    if (candidateAnswer.includes(fileName)) {
-      hits.push(`Mentions attachment: ${fileName}`);
-    } else {
-      misses.push(`Missing attachment: ${fileName}`);
-    }
-  }
-
-  // Calculate score
-  const totalChecks = hits.length + misses.length;
-  const score = totalChecks === 0 ? 0 : hits.length / totalChecks;
-
+  const total = hits.length + misses.length;
   return {
-    score,
+    score: total === 0 ? 0 : hits.length / total,
     hits,
     misses,
-    reasoning: `Passed ${hits.length}/${
+    reasoning: `Passed ${hits.length}/${total} checks`,
   };
 });
 ```

-**
-- **Zero boilerplate**: No try/catch, stdin parsing, or JSON.stringify needed
-- **Type-safe**: `CodeJudgeInput` interface with all fields typed
-- **camelCase**: Idiomatic TypeScript naming (`candidateAnswer` vs `candidate_answer`)
-- **Validation**: Zod schemas validate input and output at runtime
-- **Error handling**: Exceptions automatically produce valid failure results
+**SDK exports:** `defineCodeJudge`, `Message`, `ToolCall`, `TraceSummary`, `CodeJudgeInput`, `CodeJudgeResult`

-
-- `defineCodeJudge(handler)`: Define a code judge evaluator (recommended)
-- `CodeJudgeInput`: TypeScript type for input payload
-- `CodeJudgeResult`: TypeScript type for result
-- `TraceSummary`, `OutputMessage`: Types for trace data
-- `z`: Re-exported Zod for custom config schemas
+## Target Access for Code Evaluators

-**
+Code judges can access an LLM through a **target proxy** for metrics requiring multiple LLM calls (contextual precision, semantic similarity, etc).
+
+### Configuration
+
+```yaml
+evaluators:
+  - name: contextual-precision
+    type: code_judge
+    script: bun scripts/contextual-precision.ts
+    target:
+      max_calls: 10 # Default: 50
+```
+
+### Usage

 ```typescript
-
+#!/usr/bin/env bun
+import { createTargetClient, defineCodeJudge } from '@agentv/eval';

-export default defineCodeJudge(({
-
-
-}
+export default defineCodeJudge(async ({ question, candidateAnswer }) => {
+  const target = createTargetClient();
+  if (!target) return { score: 0, misses: ['Target not configured'] };

-  const
-
-
-
-
-  };
+  const response = await target.invoke({
+    question: `Is this relevant to: ${question}? Response: ${candidateAnswer}`,
+    systemPrompt: 'Respond with JSON: { "relevant": true/false }'
+  });
+
+  const result = JSON.parse(response.rawText ?? '{}');
+  return { score: result.relevant ? 1.0 : 0.0 };
 });
 ```

-**
+**Batch invocation:** Use `target.invokeBatch(requests)` for multiple calls.
+
+**Environment variables** (set automatically when `target` is configured):
+- `AGENTV_TARGET_PROXY_URL` - Local proxy URL
+- `AGENTV_TARGET_PROXY_TOKEN` - Bearer token for authentication
+
+**See also:** `examples/features/code-judge-with-llm-calls/`

 ## LLM Judge Prompt Template

-LLM judges use markdown prompts
+LLM judges use markdown prompts. AgentV handles the output format automatically.

 **Available Template Variables:**
 - `{{question}}` - The original question/task
 - `{{expected_outcome}}` - What the answer should accomplish
 - `{{candidate_answer}}` - The actual output to evaluate
-- `{{reference_answer}}` - Gold standard answer (optional
-- `{{input_messages}}` - JSON stringified input
-- `{{output_messages}}` - JSON stringified
+- `{{reference_answer}}` - Gold standard answer (optional)
+- `{{input_messages}}` - JSON stringified input messages
+- `{{output_messages}}` - JSON stringified output messages

-**Default
-
-If you don't specify a custom evaluator template, AgentV uses this default:
+**Default Template:**

 ```
-You are an expert evaluator.
+You are an expert evaluator. Grade the candidate_answer based on how well it achieves the expected_outcome.

-Use
+Use reference_answer as a gold standard (if provided). The candidate_answer doesn't need to match verbatim, but should capture key points.

-Be concise
+Be concise. Provide specific feedback rather than verbose explanations.

 [[ ## expected_outcome ## ]]
 {{expected_outcome}}

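The batch path mentioned in the new target section (`target.invokeBatch(requests)`) suits per-chunk metrics such as contextual precision. A sketch under the assumption — not confirmed by this diff; see the linked example directory for the real API — that `invokeBatch` accepts an array of the same request objects `invoke` takes and returns responses in order, and that `output_messages` is exposed to the handler as `outputMessages`:

```typescript
#!/usr/bin/env bun
import { createTargetClient, defineCodeJudge } from '@agentv/eval';

export default defineCodeJudge(async ({ question, outputMessages }) => {
  const target = createTargetClient();
  if (!target) return { score: 0, misses: ['Target not configured'] };

  // Collect the chunks the agent actually retrieved during its run.
  const chunks: string[] = (outputMessages ?? [])
    .flatMap((m: any) => m.toolCalls ?? [])
    .map((tc: any) => JSON.stringify(tc.output ?? {}));
  if (chunks.length === 0) return { score: 0, misses: ['No tool output to judge'] };

  // Assumed invokeBatch contract: same request shape as invoke, responses in order.
  const responses = await target.invokeBatch(
    chunks.map((chunk) => ({
      question: `Is this chunk relevant to "${question}"?\n\n${chunk}`,
      systemPrompt: 'Respond with JSON: { "relevant": true/false }',
    })),
  );

  const relevant = responses.filter(
    (r: any) => JSON.parse(r.rawText ?? '{}').relevant === true,
  ).length;

  return {
    score: relevant / chunks.length,
    reasoning: `${relevant}/${chunks.length} retrieved chunks judged relevant`,
  };
});
```

Note that `max_calls` (default 50 per the config comment) caps how many target calls a judge may make, so long chunk lists may need truncating before batching.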
@@ -280,76 +212,25 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 {{candidate_answer}}
 ```

-You can customize this template in your eval file using the `evaluatorTemplate` field to add domain-specific criteria or scoring guidelines.
-
 ## Best Practices

-###
-
-
-
-
-4. **Handle errors gracefully** - Return a valid result even when evaluation fails
-5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score
-
-### For Prompt-based Evaluators (LLM Judges)
+### Code Evaluators
+1. **Focus on `candidate_answer`** - Most evaluators only need this field
+2. **Be deterministic** - Same input → same output
+3. **Handle errors gracefully** - Return valid result even on failure
+4. **Use `hits`/`misses`** - Explain the score clearly

+### LLM Judges
 1. **Clear criteria** - Define what you're evaluating
-2. **Specific
-3. **
-4. **Examples** - Show what good/bad looks like
-5. **Concise prompts** - Keep instructions focused
-
-## Running Code Evaluators
+2. **Specific rubrics** - Provide scoring guidelines
+3. **Concise prompts** - Keep instructions focused

-
-
-```yaml
-execution:
-  evaluators:
-    - name: my_validator
-      type: code_judge
-      script: uv run my_validator.py
-      cwd: ./evaluators
-```
-
-TypeScript evaluators use the same structure but invoke `tsx` (or another Node-compatible loader) so they work everywhere:
-
-```yaml
-execution:
-  evaluators:
-    - name: csv_guardrail
-      type: code_judge
-      script: npx --yes tsx ./evaluators/check-csv.ts
-      cwd: ./evaluators
-```
-
-### Command Line Testing
-
-Test your evaluator locally:
+## Testing Locally

 ```bash
-#
-echo '{
-  "candidate_answer": "test output here",
-  "question": "test task",
-  "expected_outcome": "expected result"
-}' | uv run my_validator.py
-
-# Should output:
-# {
-#   "score": 0.8,
-#   "hits": ["check 1 passed"],
-#   "misses": ["check 2 failed"],
-#   "reasoning": "..."
-# }
-```
+# Python
+echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | uv run my_validator.py

-
-
-echo '{
-  "candidate_answer": "test output here",
-  "question": "test task",
-  "expected_outcome": "expected result"
-}' | npx --yes tsx ./evaluators/check-csv.ts
+# TypeScript
+echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | bun run ./check.ts
 ```
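Both templates print the same four-field result contract to stdout. A sketch of that shape as implied by the examples in this diff — `score` is the only field every example always returns, and the SDK's exported `CodeJudgeResult` may define optionality differently:

```typescript
// Result contract implied by the templates above.
interface JudgeResult {
  score: number;      // 0.0 to 1.0
  hits?: string[];    // checks that passed
  misses?: string[];  // checks that failed
  reasoning?: string; // short explanation of the score
}

// Handy when checking an evaluator's stdout by hand (see "Testing Locally").
function parseJudgeOutput(raw: string): JudgeResult {
  const result = JSON.parse(raw) as JudgeResult;
  if (typeof result.score !== 'number' || result.score < 0 || result.score > 1) {
    throw new Error(`score out of range: ${result.score}`);
  }
  return result;
}
```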
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "agentv",
-  "version": "2.
+  "version": "2.1.1",
   "description": "CLI entry point for AgentV",
   "type": "module",
   "repository": {
@@ -31,7 +31,7 @@
     "test:watch": "bun test --watch"
   },
   "dependencies": {
-    "@agentv/core": "2.0.
+    "@agentv/core": "2.0.2",
     "@mariozechner/pi-agent": "^0.9.0",
     "@mariozechner/pi-ai": "^0.37.2",
     "cmd-ts": "^0.14.3",