agentv 2.0.2 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  # Custom Evaluators Guide
 
- Guide for writing custom code evaluators and LLM judges for AgentV eval files.
+ Templates and best practices for code evaluators and LLM judges. For YAML configuration, see `SKILL.md`.
 
  ## Code Evaluator Contract
 
@@ -19,21 +19,25 @@ Wire format uses snake_case for cross-language compatibility:
    "guideline_files": ["path1", "path2"],
    "input_files": ["file1", "file2"],
    "input_messages": [{"role": "user", "content": "..."}],
-   "output_messages": [
+   "expected_messages": [
      {
        "role": "assistant",
-       "content": "...",
        "tool_calls": [
          {
-           "tool": "search",
+           "tool": "vector_search",
            "input": { "query": "..." },
-           "output": { "results": [...] },
-           "id": "call_123",
-           "timestamp": "2024-01-15T10:30:00Z"
+           "output": { "results": ["doc1", "doc2"] }
          }
        ]
      }
    ],
+   "output_messages": [
+     {
+       "role": "assistant",
+       "content": "...",
+       "tool_calls": [...]
+     }
+   ],
    "trace_summary": {
      "event_count": 5,
      "tool_names": ["fetch", "search"],
@@ -47,7 +51,8 @@ Wire format uses snake_case for cross-language compatibility:
  ```
 
  **Key fields:**
- - `output_messages` - Full agent execution trace with tool calls (use `tool_calls[].input` for arguments)
+ - `expected_messages` - Expected agent behavior from YAML, including tool calls with outputs (use for retrieval context in RAG evals)
+ - `output_messages` - Actual agent execution trace with tool calls (from live agent runs)
  - `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)
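
For RAG-style checks, a code judge can compare the answer against the tool outputs recorded in `expected_messages`. A minimal sketch, assuming the SDK surfaces the field camelCased as `expectedMessages` with `toolCalls[].output` shaped as in the wire format above (the `results` key is taken from that example, not a guaranteed schema):

```typescript
#!/usr/bin/env bun
import { defineCodeJudge } from '@agentv/eval';

export default defineCodeJudge(({ candidateAnswer, expectedMessages }) => {
  // Gather expected retrieval context from the recorded tool-call outputs.
  const docs = (expectedMessages ?? [])
    .flatMap((m) => m.toolCalls ?? [])
    .flatMap((c) => ((c.output as { results?: string[] } | undefined)?.results ?? []));
  const cited = docs.filter((doc) => candidateAnswer.includes(doc));
  return {
    score: docs.length === 0 ? 1.0 : cited.length / docs.length,
    hits: cited.map((doc) => `Cited ${doc}`),
    misses: docs.filter((doc) => !cited.includes(doc)).map((doc) => `Missing ${doc}`),
    reasoning: `Cited ${cited.length}/${docs.length} expected documents`,
  };
});
```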
 
  ### Output Format (to stdout)
@@ -71,201 +76,128 @@ Wire format uses snake_case for cross-language compatibility:
 
  ```python
  #!/usr/bin/env python3
- """
- Example code evaluator for AgentV
-
- This evaluator checks for specific keywords in the output.
- Replace validation logic as needed.
- """
-
  import json
  import sys
- from typing import Any
-
-
- def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
-     """
-     Evaluate the agent output.
-
-     Args:
-         input_data: Full input context from AgentV
-
-     Returns:
-         Evaluation result with score, hits, misses, reasoning
-     """
-     # Extract only the fields you need
-     # Most evaluators only need 'candidate_answer' - avoid using unnecessary fields
-     candidate_answer = input_data.get("candidate_answer", "")
-
+
+ def evaluate(data: dict) -> dict:
+     candidate = data.get("candidate_answer", "")
+     hits, misses = [], []
+
      # Your validation logic here
-     hits = []
-     misses = []
-
-     # Example: Check for keywords
-     required_keywords = ["async", "await"]
-     for keyword in required_keywords:
-         if keyword in candidate_answer:
-             hits.append(f"Contains required keyword: {keyword}")
-         else:
-             misses.append(f"Missing required keyword: {keyword}")
-
-     # Calculate score
-     if not required_keywords:
-         score = 1.0
-     else:
-         score = len(hits) / len(required_keywords)
-
-     # Build result
+     keywords = ["async", "await"]
+     for kw in keywords:
+         (hits if kw in candidate else misses).append(f"Keyword '{kw}'")
+
      return {
-         "score": score,
+         "score": len(hits) / len(keywords) if keywords else 1.0,
          "hits": hits,
          "misses": misses,
-         "reasoning": f"Found {len(hits)}/{len(required_keywords)} required keywords"
+         "reasoning": f"Found {len(hits)}/{len(keywords)} keywords"
      }
 
-
- def main():
-     """Main entry point for AgentV code evaluator."""
+ if __name__ == "__main__":
      try:
-         # Read input from stdin
-         input_data = json.loads(sys.stdin.read())
-
-         # Run evaluation
-         result = evaluate(input_data)
-
-         # Write result to stdout
+         result = evaluate(json.loads(sys.stdin.read()))
          print(json.dumps(result, indent=2))
-
      except Exception as e:
-         # Error handling: return zero score with error message
-         error_result = {
-             "score": 0.0,
-             "hits": [],
-             "misses": [f"Evaluator error: {str(e)}"],
-             "reasoning": f"Evaluator error: {str(e)}"
-         }
-         print(json.dumps(error_result, indent=2))
+         print(json.dumps({"score": 0, "hits": [], "misses": [str(e)], "reasoning": "Error"}))
          sys.exit(1)
-
-
- if __name__ == "__main__":
-     main()
  ```
 
- ## TypeScript Code Evaluator Template (with SDK)
-
- The `@agentv/eval` SDK provides a declarative API for code evaluators with automatic stdin/stdout handling, validation, and error handling.
+ ## TypeScript Code Evaluator Template
 
- **Execution:** Keep evaluators as `.ts` files and run via `bun run` or Node loaders like `npx --yes tsx ./evaluators/my-check.ts`.
+ The `@agentv/eval` SDK provides a declarative API with automatic stdin/stdout handling.
 
  ```typescript
  #!/usr/bin/env bun
- /**
-  * Example TypeScript code evaluator using defineCodeJudge
-  *
-  * Run with: bun run ./evaluators/example-check.ts
-  * or: npx --yes tsx ./evaluators/example-check.ts
-  *
-  * The SDK handles:
-  * - Reading JSON from stdin
-  * - Converting snake_case to camelCase
-  * - Validating input with Zod
-  * - Error handling and output formatting
-  */
  import { defineCodeJudge } from '@agentv/eval';
 
- export default defineCodeJudge(({ candidateAnswer, expectedOutcome, inputFiles, guidelineFiles }) => {
+ export default defineCodeJudge(({ candidateAnswer, expectedOutcome }) => {
    const hits: string[] = [];
    const misses: string[] = [];
 
-   // Example: Check if answer contains expected outcome
+   // Your validation logic here
    if (candidateAnswer.includes(expectedOutcome)) {
      hits.push('Answer matches expected outcome');
    } else {
      misses.push('Answer does not match expected outcome');
    }
 
-   // Example: Check attachment mentions
-   const attachments = [...guidelineFiles, ...inputFiles];
-   for (const filePath of attachments) {
-     const fileName = filePath.split('/').pop() ?? filePath;
-     if (candidateAnswer.includes(fileName)) {
-       hits.push(`Mentions attachment: ${fileName}`);
-     } else {
-       misses.push(`Missing attachment: ${fileName}`);
-     }
-   }
-
-   // Calculate score
-   const totalChecks = hits.length + misses.length;
-   const score = totalChecks === 0 ? 0 : hits.length / totalChecks;
-
+   const total = hits.length + misses.length;
    return {
-     score,
+     score: total === 0 ? 0 : hits.length / total,
      hits,
      misses,
-     reasoning: `Passed ${hits.length}/${totalChecks} checks`,
+     reasoning: `Passed ${hits.length}/${total} checks`,
    };
  });
  ```
 
- **TypeScript SDK Benefits:**
- - **Zero boilerplate**: No try/catch, stdin parsing, or JSON.stringify needed
- - **Type-safe**: `CodeJudgeInput` interface with all fields typed
- - **camelCase**: Idiomatic TypeScript naming (`candidateAnswer` vs `candidate_answer`)
- - **Validation**: Zod schemas validate input and output at runtime
- - **Error handling**: Exceptions automatically produce valid failure results
+ **SDK exports:** `defineCodeJudge`, `Message`, `ToolCall`, `TraceSummary`, `CodeJudgeInput`, `CodeJudgeResult`
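
The exported trace types support cheap efficiency checks. The 2.0.x docs carried an execution-metrics example that still illustrates the typed input; a sketch, assuming `trace_summary` arrives camelCased as `traceSummary`:

```typescript
#!/usr/bin/env bun
import { defineCodeJudge } from '@agentv/eval';

export default defineCodeJudge(({ traceSummary }) => {
  if (!traceSummary) {
    return { score: 0.5, hits: [], misses: [], reasoning: 'No trace available' };
  }
  // trace_summary carries counts only, so this stays cheap and deterministic.
  const efficient = traceSummary.eventCount <= 10;
  return {
    score: efficient ? 1.0 : 0.5,
    hits: efficient ? ['Efficient execution'] : [],
    misses: efficient ? [] : ['Too many tool calls'],
    reasoning: `${traceSummary.eventCount} trace events`,
  };
});
```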
 
- **Available exports from `@agentv/eval`:**
- - `defineCodeJudge(handler)`: Define a code judge evaluator (recommended)
- - `CodeJudgeInput`: TypeScript type for input payload
- - `CodeJudgeResult`: TypeScript type for result
- - `TraceSummary`, `OutputMessage`: Types for trace data
- - `z`: Re-exported Zod for custom config schemas
+ ## Target Access for Code Evaluators
 
- **Using execution metrics:**
+ Code judges can access an LLM through a **target proxy** for metrics requiring multiple LLM calls (contextual precision, semantic similarity, etc.).
+
+ ### Configuration
+
+ ```yaml
+ evaluators:
+   - name: contextual-precision
+     type: code_judge
+     script: bun scripts/contextual-precision.ts
+     target:
+       max_calls: 10  # Default: 50
+ ```
+
+ ### Usage
 
  ```typescript
- import { defineCodeJudge } from '@agentv/eval';
+ #!/usr/bin/env bun
+ import { createTargetClient, defineCodeJudge } from '@agentv/eval';
 
- export default defineCodeJudge(({ traceSummary }) => {
-   if (!traceSummary) {
-     return { score: 0.5, reasoning: 'No trace available' };
-   }
+ export default defineCodeJudge(async ({ question, candidateAnswer }) => {
+   const target = createTargetClient();
+   if (!target) return { score: 0, misses: ['Target not configured'] };
 
-   const efficient = traceSummary.eventCount <= 10;
-   return {
-     score: efficient ? 1.0 : 0.5,
-     hits: efficient ? ['Efficient execution'] : [],
-     misses: efficient ? [] : ['Too many tool calls'],
-   };
+   const response = await target.invoke({
+     question: `Is this relevant to: ${question}? Response: ${candidateAnswer}`,
+     systemPrompt: 'Respond with JSON: { "relevant": true/false }'
+   });
+
+   const result = JSON.parse(response.rawText ?? '{}');
+   return { score: result.relevant ? 1.0 : 0.0 };
  });
  ```
 
- **See also:** `examples/features/code-judge-sdk/` for complete working examples
+ **Batch invocation:** Use `target.invokeBatch(requests)` for multiple calls.
+
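A sketch of batch use, assuming each request mirrors the `invoke` shape above and the responses come back as an array (the claim-splitting check itself is hypothetical):

```typescript
#!/usr/bin/env bun
import { createTargetClient, defineCodeJudge } from '@agentv/eval';

export default defineCodeJudge(async ({ question, candidateAnswer }) => {
  const target = createTargetClient();
  if (!target) return { score: 0, misses: ['Target not configured'] };

  // Verify up to five answer lines as separate claims, sent as one batch.
  const claims = candidateAnswer.split('\n').filter(Boolean).slice(0, 5);
  const responses = await target.invokeBatch(
    claims.map((claim) => ({
      question: `Does this claim help answer "${question}"? Claim: ${claim}`,
      systemPrompt: 'Respond with JSON: { "relevant": true/false }',
    })),
  );
  const relevant = responses.filter((r) => JSON.parse(r.rawText ?? '{}').relevant).length;
  return { score: claims.length ? relevant / claims.length : 0 };
});
```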
+ **Environment variables** (set automatically when `target` is configured):
+ - `AGENTV_TARGET_PROXY_URL` - Local proxy URL
+ - `AGENTV_TARGET_PROXY_TOKEN` - Bearer token for authentication
+
+ **See also:** `examples/features/code-judge-with-llm-calls/`
 
  ## LLM Judge Prompt Template
 
- LLM judges use markdown prompts to guide evaluation. AgentV automatically handles the output format, so focus your prompt on evaluation criteria and guidelines.
+ LLM judges use markdown prompts. AgentV handles the output format automatically.
 
  **Available Template Variables:**
  - `{{question}}` - The original question/task
  - `{{expected_outcome}}` - What the answer should accomplish
  - `{{candidate_answer}}` - The actual output to evaluate
- - `{{reference_answer}}` - Gold standard answer (optional, may be empty)
- - `{{input_messages}}` - JSON stringified input message segments
- - `{{output_messages}}` - JSON stringified expected output segments
+ - `{{reference_answer}}` - Gold standard answer (optional)
+ - `{{input_messages}}` - JSON stringified input messages
+ - `{{output_messages}}` - JSON stringified output messages
 
- **Default Evaluator Template:**
-
- If you don't specify a custom evaluator template, AgentV uses this default:
+ **Default Template:**
 
  ```
- You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
+ You are an expert evaluator. Grade the candidate_answer based on how well it achieves the expected_outcome.
 
- Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+ Use reference_answer as a gold standard (if provided). The candidate_answer doesn't need to match verbatim, but should capture key points.
 
- Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
+ Be concise. Provide specific feedback rather than verbose explanations.
 
  [[ ## expected_outcome ## ]]
  {{expected_outcome}}
@@ -280,76 +212,25 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
  {{candidate_answer}}
  ```
 
- You can customize this template in your eval file using the `evaluatorTemplate` field to add domain-specific criteria or scoring guidelines.
-
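For example, a domain-specific variant (wired up via the `evaluatorTemplate` field named in the 2.0.x text above; the 2.1.x docs no longer mention it) might read:

```
You are a code-review judge. Grade the candidate_answer strictly for factual correctness against the expected_outcome.

Deduct for any claim the expected_outcome does not support. Keep feedback to three bullet points.

[[ ## expected_outcome ## ]]
{{expected_outcome}}

[[ ## candidate_answer ## ]]
{{candidate_answer}}
```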
  ## Best Practices
 
- ### For Code-based Evaluators
-
- 1. **Focus on relevant fields** - Most evaluators only need the `candidate_answer` field
- 2. **Avoid false positives** - Don't check fields like `question` or `reference_answer` unless you specifically need context
- 3. **Be deterministic** - Same input should always produce same output
- 4. **Handle errors gracefully** - Return a valid result even when evaluation fails
- 5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score
-
- ### For Prompt-based Evaluators (LLM Judges)
+ ### Code Evaluators
+ 1. **Focus on `candidate_answer`** - Most evaluators only need this field
+ 2. **Be deterministic** - Same input, same output
+ 3. **Handle errors gracefully** - Return a valid result even on failure
+ 4. **Use `hits`/`misses`** - Explain the score clearly
 
+ ### LLM Judges
  1. **Clear criteria** - Define what you're evaluating
- 2. **Specific guidelines** - Provide scoring rubrics
- 3. **JSON output** - Enforce structured output format
- 4. **Examples** - Show what good/bad looks like
- 5. **Concise prompts** - Keep instructions focused
-
- ## Running Code Evaluators
+ 2. **Specific rubrics** - Provide scoring guidelines
+ 3. **Concise prompts** - Keep instructions focused
 
- ### In Eval Files
-
- ```yaml
- execution:
-   evaluators:
-     - name: my_validator
-       type: code_judge
-       script: uv run my_validator.py
-       cwd: ./evaluators
- ```
-
- TypeScript evaluators use the same structure but invoke `tsx` (or another Node-compatible loader) so they work everywhere:
-
- ```yaml
- execution:
-   evaluators:
-     - name: csv_guardrail
-       type: code_judge
-       script: npx --yes tsx ./evaluators/check-csv.ts
-       cwd: ./evaluators
- ```
-
- ### Command Line Testing
-
- Test your evaluator locally:
+ ## Testing Locally
 
  ```bash
- # Create test input
- echo '{
-   "candidate_answer": "test output here",
-   "question": "test task",
-   "expected_outcome": "expected result"
- }' | uv run my_validator.py
-
- # Should output:
- # {
- #   "score": 0.8,
- #   "hits": ["check 1 passed"],
- #   "misses": ["check 2 failed"],
- #   "reasoning": "..."
- # }
- ```
+ # Python
+ echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | uv run my_validator.py
 
- ```bash
- # TypeScript (uses tsx loader under Node)
- echo '{
-   "candidate_answer": "test output here",
-   "question": "test task",
-   "expected_outcome": "expected result"
- }' | npx --yes tsx ./evaluators/check-csv.ts
+ # TypeScript
+ echo '{"candidate_answer": "test", "question": "task", "expected_outcome": "result"}' | bun run ./check.ts
  ```
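
Either pipe should print the standard result shape. The Python keyword template above, given that input, emits:

```json
{
  "score": 0.0,
  "hits": [],
  "misses": [
    "Keyword 'async'",
    "Keyword 'await'"
  ],
  "reasoning": "Found 0/2 keywords"
}
```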
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "agentv",
-   "version": "2.0.2",
+   "version": "2.1.1",
    "description": "CLI entry point for AgentV",
    "type": "module",
    "repository": {
@@ -31,7 +31,7 @@
      "test:watch": "bun test --watch"
    },
    "dependencies": {
-     "@agentv/core": "2.0.1",
+     "@agentv/core": "2.0.2",
      "@mariozechner/pi-agent": "^0.9.0",
      "@mariozechner/pi-ai": "^0.37.2",
      "cmd-ts": "^0.14.3",