agentv 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +199 -318
- package/dist/{chunk-6SHT2QS6.js → chunk-5BLNVACB.js} +1286 -756
- package/dist/chunk-5BLNVACB.js.map +1 -0
- package/dist/cli.js +4 -2
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +24 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -12
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +100 -217
- package/package.json +4 -2
- package/dist/chunk-6SHT2QS6.js.map +0 -1
- package/dist/templates/.agentv/{.env.template → .env.example} +0 -0
package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md
CHANGED

````diff
@@ -1,6 +1,6 @@
 # Custom Evaluators Guide
 
-
+Templates and best practices for code evaluators and LLM judges. For YAML configuration, see `SKILL.md`.
 
 ## Code Evaluator Contract
 
@@ -19,21 +19,25 @@ Wire format uses snake_case for cross-language compatibility:
   "guideline_files": ["path1", "path2"],
   "input_files": ["file1", "file2"],
   "input_messages": [{"role": "user", "content": "..."}],
-  "
+  "expected_messages": [
     {
       "role": "assistant",
-      "content": "...",
       "tool_calls": [
         {
-          "tool": "
+          "tool": "vector_search",
           "input": { "query": "..." },
-          "output": { "results": [
-          "id": "call_123",
-          "timestamp": "2024-01-15T10:30:00Z"
+          "output": { "results": ["doc1", "doc2"] }
         }
       ]
     }
   ],
+  "output_messages": [
+    {
+      "role": "assistant",
+      "content": "...",
+      "tool_calls": [...]
+    }
+  ],
   "trace_summary": {
     "event_count": 5,
     "tool_names": ["fetch", "search"],
@@ -47,7 +51,8 @@ Wire format uses snake_case for cross-language compatibility:
 ```
 
 **Key fields:**
-- `
+- `expected_messages` - Expected agent behavior from YAML, including tool calls with outputs (use for retrieval context in RAG evals)
+- `output_messages` - Actual agent execution trace with tool calls (from live agent runs)
 - `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)
 
 ### Output Format (to stdout)
````
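A code judge is ultimately just an executable that reads this JSON from stdin and prints a result object to stdout, so no SDK is strictly required. A minimal TypeScript sketch against the snake_case wire format above (the `vector_search` check is illustrative only, not part of the contract):

```typescript
// Hypothetical no-SDK judge: parse the wire-format payload from stdin,
// inspect the expected tool calls, and print a result object to stdout.
let raw = '';
process.stdin.setEncoding('utf8');
process.stdin.on('data', (chunk) => (raw += chunk));
process.stdin.on('end', () => {
  const payload = JSON.parse(raw);
  // Field names follow the payload shown above (snake_case on the wire).
  const tools: string[] = (payload.expected_messages ?? [])
    .flatMap((m: any) => m.tool_calls ?? [])
    .map((t: any) => t.tool);

  const hit = tools.includes('vector_search');
  console.log(JSON.stringify({
    score: hit ? 1.0 : 0.0,
    hits: hit ? ['expected vector_search call present'] : [],
    misses: hit ? [] : ['no vector_search call in expected_messages'],
    reasoning: `Expected tools: ${tools.join(', ') || 'none'}`,
  }, null, 2));
});
```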
````diff
@@ -71,199 +76,128 @@ Wire format uses snake_case for cross-language compatibility:
 
 ```python
 #!/usr/bin/env python3
-"""
-Example code evaluator for AgentV
-
-This evaluator checks for specific keywords in the output.
-Replace validation logic as needed.
-"""
-
 import json
 import sys
-
-
-
-
-
-    Evaluate the agent output.
-
-    Args:
-        input_data: Full input context from AgentV
-
-    Returns:
-        Evaluation result with score, hits, misses, reasoning
-    """
-    # Extract only the fields you need
-    # Most evaluators only need 'candidate_answer' - avoid using unnecessary fields
-    candidate_answer = input_data.get("candidate_answer", "")
-
+
+def evaluate(data: dict) -> dict:
+    candidate = data.get("candidate_answer", "")
+    hits, misses = [], []
+
     # Your validation logic here
-
-
-
-
-    required_keywords = ["async", "await"]
-    for keyword in required_keywords:
-        if keyword in candidate_answer:
-            hits.append(f"Contains required keyword: {keyword}")
-        else:
-            misses.append(f"Missing required keyword: {keyword}")
-
-    # Calculate score
-    if not required_keywords:
-        score = 1.0
-    else:
-        score = len(hits) / len(required_keywords)
-
-    # Build result
+    keywords = ["async", "await"]
+    for kw in keywords:
+        (hits if kw in candidate else misses).append(f"Keyword '{kw}'")
+
     return {
-        "score":
+        "score": len(hits) / len(keywords) if keywords else 1.0,
         "hits": hits,
         "misses": misses,
-        "reasoning": f"Found {len(hits)}/{len(
+        "reasoning": f"Found {len(hits)}/{len(keywords)} keywords"
     }
 
-
-def main():
-    """Main entry point for AgentV code evaluator."""
+if __name__ == "__main__":
     try:
-
-        input_data = json.loads(sys.stdin.read())
-
-        # Run evaluation
-        result = evaluate(input_data)
-
-        # Write result to stdout
+        result = evaluate(json.loads(sys.stdin.read()))
         print(json.dumps(result, indent=2))
-
     except Exception as e:
-
-        error_result = {
-            "score": 0.0,
-            "hits": [],
-            "misses": [f"Evaluator error: {str(e)}"],
-            "reasoning": f"Evaluator error: {str(e)}"
-        }
-        print(json.dumps(error_result, indent=2))
+        print(json.dumps({"score": 0, "hits": [], "misses": [str(e)], "reasoning": "Error"}))
         sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
 ```
 
-## TypeScript Code Evaluator Template
+## TypeScript Code Evaluator Template
 
-The
-
-**Execution:** Keep evaluators as `.ts` files and run via Node loaders like `npx --yes tsx ./evaluators/my-check.ts` so users don't need Bun after `npm install -g agentv`.
-
-**Without SDK:** Skip the import and parse JSON from stdin directly (similar to the Python template above).
+The `@agentv/eval` SDK provides a declarative API with automatic stdin/stdout handling.
 
 ```typescript
-
-
- *
- * Run with: npx --yes tsx ./evaluators/example-check.ts
- *
- * The SDK provides:
- * - Type-safe CodeJudgePayload interface with all fields
- * - camelCase properties (candidateAnswer, expectedOutcome, etc.)
- * - Automatic conversion from snake_case wire format
- */
-
-import { readCodeJudgePayload } from '@agentv/core';
-
-try {
-  // Read and parse stdin with automatic snake_case → camelCase conversion
-  const payload = readCodeJudgePayload();
-
-  // Type-safe camelCase access to all fields
-  const { candidateAnswer, expectedOutcome, inputFiles, guidelineFiles } = payload;
+#!/usr/bin/env bun
+import { defineCodeJudge } from '@agentv/eval';
 
-
+export default defineCodeJudge(({ candidateAnswer, expectedOutcome }) => {
   const hits: string[] = [];
   const misses: string[] = [];
 
-  //
+  // Your validation logic here
   if (candidateAnswer.includes(expectedOutcome)) {
     hits.push('Answer matches expected outcome');
   } else {
     misses.push('Answer does not match expected outcome');
  }
 
-
-
-
-    const fileName = filePath.split('/').pop() ?? filePath;
-    if (candidateAnswer.includes(fileName)) {
-      hits.push(`Mentions attachment: ${fileName}`);
-    } else {
-      misses.push(`Missing attachment: ${fileName}`);
-    }
-  }
-
-  // Calculate score
-  const totalChecks = hits.length + misses.length;
-  const score = totalChecks === 0 ? 0 : hits.length / totalChecks;
-
-  // Build result
-  const result = {
-    score,
+  const total = hits.length + misses.length;
+  return {
+    score: total === 0 ? 0 : hits.length / total,
     hits,
     misses,
-    reasoning: `Passed ${hits.length}/${
+    reasoning: `Passed ${hits.length}/${total} checks`,
   };
+});
+```
 
-
-
-
-
-
-
-
-
-
-
-
-
+**SDK exports:** `defineCodeJudge`, `Message`, `ToolCall`, `TraceSummary`, `CodeJudgeInput`, `CodeJudgeResult`
+
+## Target Access for Code Evaluators
+
+Code judges can access an LLM through a **target proxy** for metrics requiring multiple LLM calls (contextual precision, semantic similarity, etc).
+
+### Configuration
+
+```yaml
+evaluators:
+  - name: contextual-precision
+    type: code_judge
+    script: bun scripts/contextual-precision.ts
+    target:
+      max_calls: 10  # Default: 50
+```
+
+### Usage
+
+```typescript
+#!/usr/bin/env bun
+import { createTargetClient, defineCodeJudge } from '@agentv/eval';
+
+export default defineCodeJudge(async ({ question, candidateAnswer }) => {
+  const target = createTargetClient();
+  if (!target) return { score: 0, misses: ['Target not configured'] };
+
+  const response = await target.invoke({
+    question: `Is this relevant to: ${question}? Response: ${candidateAnswer}`,
+    systemPrompt: 'Respond with JSON: { "relevant": true/false }'
+  });
+
+  const result = JSON.parse(response.rawText ?? '{}');
+  return { score: result.relevant ? 1.0 : 0.0 };
+});
 ```
 
-**
-- **Type-safe**: `CodeJudgePayload` interface with all fields typed
-- **camelCase**: Idiomatic TypeScript naming (`candidateAnswer` vs `candidate_answer`)
-- **Automatic conversion**: Handles snake_case wire format → camelCase objects
-- **Compile-time safety**: Catch typos and missing fields before runtime
+**Batch invocation:** Use `target.invokeBatch(requests)` for multiple calls.
 
-**
-- `
-- `
-- `CodeJudgePayload`: TypeScript interface for type safety
+**Environment variables** (set automatically when `target` is configured):
+- `AGENTV_TARGET_PROXY_URL` - Local proxy URL
+- `AGENTV_TARGET_PROXY_TOKEN` - Bearer token for authentication
 
-**See also:** `examples/features/code-judge-
+**See also:** `examples/features/code-judge-with-llm-calls/`
 
 ## LLM Judge Prompt Template
 
-LLM judges use markdown prompts
+LLM judges use markdown prompts. AgentV handles the output format automatically.
 
 **Available Template Variables:**
 - `{{question}}` - The original question/task
 - `{{expected_outcome}}` - What the answer should accomplish
 - `{{candidate_answer}}` - The actual output to evaluate
-- `{{reference_answer}}` - Gold standard answer (optional
-- `{{input_messages}}` - JSON stringified input
-- `{{output_messages}}` - JSON stringified
-
-**Default Evaluator Template:**
+- `{{reference_answer}}` - Gold standard answer (optional)
+- `{{input_messages}}` - JSON stringified input messages
+- `{{output_messages}}` - JSON stringified output messages
 
-
+**Default Template:**
 
 ```
-You are an expert evaluator.
+You are an expert evaluator. Grade the candidate_answer based on how well it achieves the expected_outcome.
 
-Use
+Use reference_answer as a gold standard (if provided). The candidate_answer doesn't need to match verbatim, but should capture key points.
 
-Be concise
+Be concise. Provide specific feedback rather than verbose explanations.
 
 [[ ## expected_outcome ## ]]
 {{expected_outcome}}
````
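The new target section above documents `target.invokeBatch(requests)` in a single line. A sketch of batch usage, assuming `invokeBatch` accepts an array of the same request objects `invoke` takes and resolves to responses in input order, and that the SDK exposes `expected_messages` as camelCase `expectedMessages` — assumptions this diff does not confirm:

```typescript
#!/usr/bin/env bun
import { createTargetClient, defineCodeJudge } from '@agentv/eval';

// Hypothetical batch usage: one relevance probe per retrieved document.
export default defineCodeJudge(async ({ question, expectedMessages }) => {
  const target = createTargetClient();
  if (!target) return { score: 0, misses: ['Target not configured'] };

  // Collect retrieved documents from the expected tool-call outputs (RAG-style).
  const docs: string[] = (expectedMessages ?? [])
    .flatMap((m: any) => m.toolCalls ?? [])
    .flatMap((t: any) => t.output?.results ?? []);

  // Assumed: requests mirror target.invoke()'s argument; responses keep order.
  const responses = await target.invokeBatch(
    docs.map((doc) => ({
      question: `Is this document relevant to: ${question}?\n\n${doc}`,
      systemPrompt: 'Respond with JSON: { "relevant": true/false }',
    })),
  );

  const relevant = responses.filter((r: any) => JSON.parse(r.rawText ?? '{}').relevant).length;
  return {
    score: docs.length ? relevant / docs.length : 0,
    reasoning: `${relevant}/${docs.length} retrieved documents judged relevant`,
  };
});
```

Note that a batch of N documents presumably consumes N of the configured `max_calls` budget.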
````diff
@@ -278,76 +212,25 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 {{candidate_answer}}
 ```
 
-You can customize this template in your eval file using the `evaluatorTemplate` field to add domain-specific criteria or scoring guidelines.
-
 ## Best Practices
 
-###
-
-
-
-
-4. **Handle errors gracefully** - Return a valid result even when evaluation fails
-5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score
-
-### For Prompt-based Evaluators (LLM Judges)
+### Code Evaluators
+1. **Focus on `candidate_answer`** - Most evaluators only need this field
+2. **Be deterministic** - Same input → same output
+3. **Handle errors gracefully** - Return valid result even on failure
+4. **Use `hits`/`misses`** - Explain the score clearly
 
+### LLM Judges
 1. **Clear criteria** - Define what you're evaluating
-2. **Specific
-3. **
-4. **Examples** - Show what good/bad looks like
-5. **Concise prompts** - Keep instructions focused
-
-## Running Code Evaluators
-
-### In Eval Files
-
-```yaml
-execution:
-  evaluators:
-    - name: my_validator
-      type: code_judge
-      script: uv run my_validator.py
-      cwd: ./evaluators
-```
+2. **Specific rubrics** - Provide scoring guidelines
+3. **Concise prompts** - Keep instructions focused
 
-
-
-```yaml
-execution:
-  evaluators:
-    - name: csv_guardrail
-      type: code_judge
-      script: npx --yes tsx ./evaluators/check-csv.ts
-      cwd: ./evaluators
-```
-
-### Command Line Testing
-
-Test your evaluator locally:
+## Testing Locally
 
 ```bash
-#
-echo '{
-  "candidate_answer": "test output here",
-  "question": "test task",
-  "expected_outcome": "expected result"
-}' | uv run my_validator.py
-
-# Should output:
-# {
-#   "score": 0.8,
-#   "hits": ["check 1 passed"],
-#   "misses": ["check 2 failed"],
-#   "reasoning": "..."
-# }
-```
+# Python
+echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | uv run my_validator.py
 
-
-
-echo '{
-  "candidate_answer": "test output here",
-  "question": "test task",
-  "expected_outcome": "expected result"
-}' | npx --yes tsx ./evaluators/check-csv.ts
+# TypeScript
+echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | bun run ./check.ts
 ```
````
package/package.json
CHANGED

````diff
@@ -1,6 +1,6 @@
 {
   "name": "agentv",
-  "version": "2.0.1",
+  "version": "2.1.0",
   "description": "CLI entry point for AgentV",
   "type": "module",
   "repository": {
@@ -31,7 +31,9 @@
     "test:watch": "bun test --watch"
   },
   "dependencies": {
-    "@agentv/core": "
+    "@agentv/core": "2.0.2",
+    "@mariozechner/pi-agent": "^0.9.0",
+    "@mariozechner/pi-ai": "^0.37.2",
     "cmd-ts": "^0.14.3",
     "dotenv": "^16.4.5",
     "fast-glob": "^3.3.3",
````