agentv 1.6.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -46
- package/dist/{chunk-HU4B6ODF.js → chunk-5AJ7DFUO.js} +2807 -717
- package/dist/chunk-5AJ7DFUO.js.map +1 -0
- package/dist/cli.js +4 -2
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +4 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +12 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +25 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +132 -18
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +10 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +121 -0
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +77 -77
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/package.json +4 -2
- package/dist/chunk-HU4B6ODF.js.map +0 -1
package/README.md
CHANGED
@@ -59,6 +59,14 @@ bun run build
 bun test
 ```

+5. (Optional) Install example dependencies:
+
+```bash
+bun run examples:install
+```
+
+This step is required if you want to run the examples in the `examples/` directory, as they are self-contained packages with their own dependencies.
+
 You are now ready to start development. The monorepo contains:

 - `packages/core/` - Core evaluation engine
@@ -77,9 +85,8 @@ You are now ready to start development. The monorepo contains:

 ## Quick Start

-You can use the following examples as a starting point
-- [
-- [Showcase](docs/examples/showcase/README.md): A collection of advanced use cases and real-world agent evaluation scenarios.
+You can use the following examples as a starting point:
+- [Examples](examples/README.md): Feature demonstrations and real-world showcase examples

 ### Validating Eval Files

@@ -129,9 +136,6 @@ agentv eval --target vscode_projectx --targets "path/to/targets.yaml" --eval-id
 - `--agent-timeout SECONDS`: Timeout in seconds for agent response polling (default: 120)
 - `--max-retries COUNT`: Maximum number of retries for timeout cases (default: 2)
 - `--cache`: Enable caching of LLM responses (default: disabled)
-- `--dump-prompts`: Save all prompts to `.agentv/prompts/` directory
-- `--dump-traces`: Write trace files to `.agentv/traces/` directory
-- `--include-trace`: Include full trace in result output (verbose)
 - `--workers COUNT`: Parallel workers for eval cases (default: 3; target `workers` setting used when provided)
 - `--verbose`: Verbose output

@@ -297,45 +301,13 @@ Code evaluators receive input via stdin and write output to stdout as JSON.
 - Score range: `0.0` to `1.0` (float)
 - `hits` and `misses` are optional but recommended for debugging

-### Code Evaluator
-
-
-
-
-
-
-def evaluate(input_data):
-    # Extract only the fields you need
-    candidate_answer = input_data.get("candidate_answer", "")
-
-    # Your validation logic here
-    score = 0.0  # to 1.0
-    hits = ["successful check 1", "successful check 2"]
-    misses = ["failed check 1"]
-    reasoning = "Explanation of score"
-
-    return {
-        "score": score,
-        "hits": hits,
-        "misses": misses,
-        "reasoning": reasoning
-    }
-
-if __name__ == "__main__":
-    try:
-        input_data = json.loads(sys.stdin.read())
-        result = evaluate(input_data)
-        print(json.dumps(result, indent=2))
-    except Exception as e:
-        error_result = {
-            "score": 0.0,
-            "hits": [],
-            "misses": [f"Evaluator error: {str(e)}"],
-            "reasoning": f"Evaluator error: {str(e)}"
-        }
-        print(json.dumps(error_result, indent=2))
-        sys.exit(1)
-```
+### Code Evaluator Templates
+
+Custom evaluators can be written in any language. For complete templates and examples:
+
+- **Python template**: See `apps/cli/src/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md`
+- **TypeScript template (with SDK)**: See `apps/cli/src/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md`
+- **Working examples**: See [examples/features/code-judge-sdk](examples/features/code-judge-sdk)

 ### LLM Judge Template Structure

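Since the full Python template above is removed from the README in favor of pointers to the skill references, the following is a minimal sketch of the same stdin/stdout contract in TypeScript, the language of the new `code-judge-sdk` example. It deliberately avoids the agentv SDK, whose API is not shown in this diff, and the keyword check is an illustrative assumption rather than the project's template; only the JSON contract (score 0.0–1.0, optional `hits`/`misses`, `reasoning`) comes from the documentation above.

```typescript
// Hedged sketch of a custom code evaluator (no agentv SDK): read the eval input
// as JSON from stdin, score it, and print a JSON result to stdout, following the
// contract described above (score 0.0-1.0, optional hits/misses, reasoning).

interface EvalResult {
  score: number;      // 0.0 to 1.0
  hits?: string[];    // optional but recommended for debugging
  misses?: string[];  // optional but recommended for debugging
  reasoning: string;
}

async function readStdin(): Promise<string> {
  const chunks: Buffer[] = [];
  for await (const chunk of process.stdin) chunks.push(chunk as Buffer);
  return Buffer.concat(chunks).toString("utf8");
}

// The keyword check below is purely illustrative; a real evaluator implements
// whatever validation logic the eval case needs.
function evaluate(input: { candidate_answer?: string }): EvalResult {
  const answer = (input.candidate_answer ?? "").toLowerCase();
  const keywords = ["install", "configure"]; // hypothetical expectations
  const hits = keywords.filter((k) => answer.includes(k));
  const misses = keywords.filter((k) => !answer.includes(k));
  return {
    score: hits.length / keywords.length,
    hits: hits.map((k) => `mentions "${k}"`),
    misses: misses.map((k) => `missing "${k}"`),
    reasoning: `Matched ${hits.length}/${keywords.length} expected keywords`,
  };
}

async function main(): Promise<void> {
  try {
    const result = evaluate(JSON.parse(await readStdin()));
    console.log(JSON.stringify(result, null, 2));
  } catch (err) {
    const message = `Evaluator error: ${err instanceof Error ? err.message : String(err)}`;
    console.log(JSON.stringify({ score: 0, hits: [], misses: [message], reasoning: message }, null, 2));
    process.exit(1);
  }
}

main();
```

A script like this can be run with `bun evaluator.ts < input.json` or via ts-node; the documented contract only requires JSON in on stdin and a JSON result on stdout, so any runtime works.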
@@ -408,7 +380,7 @@ agentv generate rubrics evals/my-eval.yaml --target openai:gpt-4o
 - `borderline`: Score ≥ 0.6 and all required rubrics met
 - `fail`: Score < 0.6 or any required rubric failed

-For complete examples and detailed patterns, see [examples/features/
+For complete examples and detailed patterns, see [examples/features/rubric/](examples/features/rubric/).

 ## Advanced Configuration

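For readers skimming only this hunk, the verdict rules above reduce to a small piece of logic. The sketch below encodes just the two rules visible here; the `pass` condition is outside this hunk, so `PASS_THRESHOLD` is a hypothetical placeholder rather than agentv's actual value.

```typescript
// Sketch of the verdict thresholds quoted above. Only `borderline` and `fail`
// appear in this hunk; PASS_THRESHOLD is a hypothetical stand-in for whatever
// higher bar the full README defines for `pass`.

type Verdict = "pass" | "borderline" | "fail";

interface RubricCheck {
  required: boolean;
  met: boolean;
}

const PASS_THRESHOLD = 0.8; // assumption, not taken from this diff

function verdict(score: number, rubrics: RubricCheck[]): Verdict {
  const requiredMet = rubrics.filter((r) => r.required).every((r) => r.met);
  if (score < 0.6 || !requiredMet) return "fail"; // fail rule as documented
  if (score >= PASS_THRESHOLD) return "pass";     // hypothetical pass bar
  return "borderline";                            // score >= 0.6 and all required rubrics met
}
```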