agentv 0.22.0 → 0.23.0
- package/dist/{chunk-QRY42RAP.js → chunk-4T62HFF4.js} +1 -1
- package/dist/chunk-4T62HFF4.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +40 -226
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +5 -5
- package/dist/templates/agentv/.env.template +23 -0
- package/package.json +5 -2
- package/dist/chunk-QRY42RAP.js.map +0 -1
package/dist/cli.js
CHANGED
package/dist/cli.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli().catch((error) => {\n console.error(error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
+{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { runCli } from './index.js';\r\n\r\nrunCli().catch((error) => {\r\n console.error(error);\r\n process.exit(1);\r\n});\r\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
package/dist/index.js
CHANGED
package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md
CHANGED
@@ -37,7 +37,7 @@ execution:
 ```
 
 **Contract:**
-- Input (stdin): JSON with `
+- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_paths`, `input_files`, `input_messages`
 - Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
 
 **Template:** See `references/custom-evaluators.md` for Python code evaluator template
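A minimal evaluator satisfying this contract could look as follows. This is an illustrative sketch rather than code shipped with the package; the non-empty check stands in for real validation logic.

```python
#!/usr/bin/env python3
"""Sketch of a code evaluator following the stdin/stdout contract above."""
import json
import sys


def main() -> None:
    # The harness pipes the evaluation payload in on stdin as JSON.
    payload = json.loads(sys.stdin.read())
    candidate_answer = payload.get("candidate_answer", "")

    # Placeholder check: a real evaluator would inspect candidate_answer in depth.
    hits = ["Produced a non-empty candidate_answer"] if candidate_answer.strip() else []
    misses = [] if hits else ["candidate_answer is empty"]

    # Emit the result object the contract expects on stdout.
    print(json.dumps({
        "score": 1.0 if hits else 0.0,
        "hits": hits,
        "misses": misses,
        "reasoning": "Scored on whether any candidate answer was produced.",
    }))


if __name__ == "__main__":
    main()
```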
package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md
CHANGED
@@ -10,14 +10,13 @@ Code evaluators receive input via stdin and write output to stdout, both as JSON
 
 ```json
 {
-  "
-  "
-  "
-  "
-  "system_message": "system message if any",
+  "question": "string describing the task/question",
+  "expected_outcome": "expected outcome description",
+  "reference_answer": "gold standard answer (optional)",
+  "candidate_answer": "generated code/text from the agent",
   "guideline_paths": ["path1", "path2"],
-  "
-  "
+  "input_files": ["file1", "file2"],
+  "input_messages": [{"role": "user", "content": "..."}]
 }
 ```
 
@@ -65,8 +64,8 @@ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
         Evaluation result with score, hits, misses, reasoning
     """
     # Extract only the fields you need
-    # Most evaluators only need '
-
+    # Most evaluators only need 'candidate_answer' - avoid using unnecessary fields
+    candidate_answer = input_data.get("candidate_answer", "")
 
     # Your validation logic here
     hits = []
@@ -75,7 +74,7 @@ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
     # Example: Check for keywords
     required_keywords = ["async", "await"]
     for keyword in required_keywords:
-        if keyword in
+        if keyword in candidate_answer:
             hits.append(f"Contains required keyword: {keyword}")
         else:
             misses.append(f"Missing required keyword: {keyword}")
@@ -123,157 +122,55 @@ if __name__ == "__main__":
     main()
 ```
 
-##
-
-A common pattern is validating JSON output structure:
-
-```python
-#!/usr/bin/env python3
-"""
-JSON Format Validator for AgentV
-Validates that output is valid JSON with required keys.
-"""
-
-import json
-import sys
-from typing import Any
-
+## LLM Judge Prompt Template
 
-
-    """
-    Validate that output is valid JSON with required keys.
-
-    Args:
-        output: The candidate output to validate
-        required_keys: List of required top-level keys
-
-    Returns:
-        Evaluation result dict
-    """
-    # Try to parse as JSON
-    try:
-        parsed = json.loads(output.strip())
-    except json.JSONDecodeError as e:
-        return {
-            "score": 0.0,
-            "hits": [],
-            "misses": ["Not valid JSON"],
-            "reasoning": f"Output is not valid JSON. Parse error: {str(e)}"
-        }
-
-    # Check if it's a dict
-    if not isinstance(parsed, dict):
-        return {
-            "score": 0.0,
-            "hits": [],
-            "misses": ["JSON is not an object/dict"],
-            "reasoning": f"Output is valid JSON but not an object. Got: {type(parsed).__name__}"
-        }
-
-    # Check for required keys
-    missing_keys = [key for key in required_keys if key not in parsed]
-    present_keys = [key for key in required_keys if key in parsed]
-
-    if missing_keys:
-        return {
-            "score": 0.0,
-            "hits": [f"Has key: {key}" for key in present_keys],
-            "misses": [f"Missing key: {key}" for key in missing_keys],
-            "reasoning": f"Valid JSON but missing required keys: {', '.join(missing_keys)}"
-        }
-
-    # All checks passed
-    return {
-        "score": 1.0,
-        "hits": [f"Valid JSON with all required keys: {', '.join(required_keys)}"],
-        "misses": [],
-        "reasoning": f"Valid JSON with all required keys: {', '.join(required_keys)}"
-    }
+LLM judges use markdown prompts to guide evaluation. AgentV automatically handles the output format, so focus your prompt on evaluation criteria and guidelines.
 
+**Available Template Variables:**
+- `{{question}}` - The original question/task
+- `{{expected_outcome}}` - What the answer should accomplish
+- `{{candidate_answer}}` - The actual output to evaluate
+- `{{reference_answer}}` - Gold standard answer (optional, may be empty)
+- `{{input_messages}}` - JSON stringified input message segments
+- `{{output_messages}}` - JSON stringified expected output segments
 
-
-    """Main entry point."""
-    try:
-        input_data = json.loads(sys.stdin.read())
-        output = input_data.get("output", "")
-
-        # Define required keys (customize as needed)
-        required_keys = ["criticalityRating", "reasoning"]
-
-        result = validate_json_format(output, required_keys)
-        print(json.dumps(result, indent=2))
-
-    except Exception as e:
-        error_result = {
-            "score": 0.0,
-            "hits": [],
-            "misses": [f"Evaluator error: {str(e)}"],
-            "reasoning": f"Evaluator error: {str(e)}"
-        }
-        print(json.dumps(error_result, indent=2))
-        sys.exit(1)
+**Default Evaluator Template:**
 
+If you don't specify a custom evaluator template, AgentV uses this default:
 
-if __name__ == "__main__":
-    main()
 ```
+You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 
-
-
-LLM judges use markdown prompts to guide evaluation:
-
-```markdown
-# Code Quality Judge
-
-Evaluate the candidate code for quality, correctness, and best practices.
-
-## Evaluation Criteria
+Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
 
-
-1. **Correctness** - Does it solve the problem?
-2. **Style** - Does it follow best practices?
-3. **Completeness** - Are edge cases handled?
-4. **Documentation** - Are there helpful comments/docstrings?
+Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
 
-##
+[[ ## expected_outcome ## ]]
+{{expected_outcome}}
 
-
-
-- **0.5-0.6:** Adequate - Works but has quality issues
-- **0.3-0.4:** Poor - Has bugs or major style problems
-- **0.0-0.2:** Unacceptable - Does not work or completely wrong
+[[ ## question ## ]]
+{{question}}
 
-##
+[[ ## reference_answer ## ]]
+{{reference_answer}}
 
-
-
-```json
-{
-  "score": 0.85,
-  "hits": [
-    "Correctly implements the algorithm",
-    "Good error handling"
-  ],
-  "misses": [
-    "Missing type hints",
-    "No docstring"
-  ],
-  "reasoning": "Code is correct and handles errors well, but lacks documentation."
-}
-```
+[[ ## candidate_answer ## ]]
+{{candidate_answer}}
 ```
 
+You can customize this template in your eval file using the `evaluatorTemplate` field to add domain-specific criteria or scoring guidelines.
+
 ## Best Practices
 
-### For Code Evaluators
+### For Code-based Evaluators
 
-1. **Focus on relevant fields** - Most evaluators only need the `
-2. **Avoid false positives** - Don't check fields like `
+1. **Focus on relevant fields** - Most evaluators only need the `candidate_answer` field
+2. **Avoid false positives** - Don't check fields like `question` or `reference_answer` unless you specifically need context
 3. **Be deterministic** - Same input should always produce same output
 4. **Handle errors gracefully** - Return a valid result even when evaluation fails
 5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score
 
-### For LLM Judges
+### For Prompt-based Evaluators (LLM Judges)
 
 1. **Clear criteria** - Define what you're evaluating
 2. **Specific guidelines** - Provide scoring rubrics
@@ -281,37 +178,6 @@ Respond with valid JSON:
 4. **Examples** - Show what good/bad looks like
 5. **Concise prompts** - Keep instructions focused
 
-### Common Pitfalls to Avoid
-
-**❌ Checking unnecessary fields:**
-```python
-# BAD: Checking 'task' or 'expected' when you only need to validate format
-if "async" in input_data.get("task", ""):
-    # This creates false positives
-```
-
-**✅ Focus on output:**
-```python
-# GOOD: Only check the actual output
-output = input_data.get("output", "")
-if "async" in output:
-    # This is what you actually want to validate
-```
-
-**❌ Brittle string matching:**
-```python
-# BAD: Exact match is too strict
-if output == "The answer is 42":
-    score = 1.0
-```
-
-**✅ Flexible validation:**
-```python
-# GOOD: Check for semantic correctness
-if "42" in output and "answer" in output.lower():
-    score = 1.0
-```
-
 ## Running Code Evaluators
 
 ### In Eval Files
@@ -332,8 +198,9 @@ Test your evaluator locally:
 ```bash
 # Create test input
 echo '{
-  "
-  "
+  "candidate_answer": "test output here",
+  "question": "test task",
+  "expected_outcome": "expected result"
 }' | uv run my_validator.py
 
 # Should output:
@@ -344,56 +211,3 @@ echo '{
 # "reasoning": "..."
 # }
 ```
-
-## Advanced Patterns
-
-### Combining Multiple Checks
-
-```python
-def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
-    output = input_data.get("output", "")
-
-    checks = [
-        ("has_async", "async" in output, "Contains async keyword"),
-        ("has_await", "await" in output, "Contains await keyword"),
-        ("has_try", "try:" in output, "Has error handling"),
-    ]
-
-    hits = [msg for _, passed, msg in checks if passed]
-    misses = [msg for _, passed, msg in checks if not passed]
-    score = len(hits) / len(checks)
-
-    return {
-        "score": score,
-        "hits": hits,
-        "misses": misses,
-        "reasoning": f"Passed {len(hits)}/{len(checks)} checks"
-    }
-```
-
-### Weighted Scoring
-
-```python
-def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
-    output = input_data.get("output", "")
-
-    # Define checks with weights
-    checks = [
-        ("correctness", is_correct(output), 0.5),
-        ("style", has_good_style(output), 0.3),
-        ("docs", has_docs(output), 0.2),
-    ]
-
-    hits = [name for name, passed, _ in checks if passed]
-    misses = [name for name, passed, _ in checks if not passed]
-
-    # Weighted score
-    score = sum(weight for _, passed, weight in checks if passed)
-
-    return {
-        "score": score,
-        "hits": hits,
-        "misses": misses,
-        "reasoning": f"Weighted score: {score:.2f}"
-    }
-```