agentv 4.26.1 → 4.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XBUHMRX2.js → chunk-PH5MHKPL.js} +431 -49
- package/dist/chunk-PH5MHKPL.js.map +1 -0
- package/dist/{chunk-JA4WQNE6.js → chunk-VO3THAOI.js} +10 -2
- package/dist/chunk-VO3THAOI.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/{interactive-YMKWKPD7.js → interactive-UG4YNLYK.js} +2 -2
- package/dist/skills/agentv-bench/LICENSE.txt +202 -0
- package/dist/skills/agentv-bench/SKILL.md +459 -0
- package/dist/skills/agentv-bench/agents/analyzer.md +177 -0
- package/dist/skills/agentv-bench/agents/comparator.md +247 -0
- package/dist/skills/agentv-bench/agents/executor.md +30 -0
- package/dist/skills/agentv-bench/agents/grader.md +238 -0
- package/dist/skills/agentv-bench/agents/mutator.md +172 -0
- package/dist/skills/agentv-bench/references/autoresearch.md +309 -0
- package/dist/skills/agentv-bench/references/description-optimization.md +66 -0
- package/dist/skills/agentv-bench/references/environment-adaptation.md +82 -0
- package/dist/skills/agentv-bench/references/eval-yaml-spec.md +338 -0
- package/dist/skills/agentv-bench/references/migrating-from-skill-creator.md +103 -0
- package/dist/skills/agentv-bench/references/schemas.md +432 -0
- package/dist/skills/agentv-bench/references/subagent-pipeline.md +181 -0
- package/dist/skills/agentv-bench/scripts/trajectory.html +462 -0
- package/dist/skills/agentv-eval-review/SKILL.md +53 -0
- package/dist/skills/agentv-eval-review/scripts/lint_eval.py +239 -0
- package/dist/skills/agentv-eval-writer/SKILL.md +707 -0
- package/dist/skills/agentv-eval-writer/references/config-schema.json +63 -0
- package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +119 -0
- package/dist/skills/agentv-eval-writer/references/eval-schema.json +19077 -0
- package/dist/skills/agentv-eval-writer/references/rubric-evaluator.md +114 -0
- package/dist/skills/agentv-governance/SKILL.md +79 -0
- package/dist/skills/agentv-governance/references/eu-ai-act-risk-tiers.md +37 -0
- package/dist/skills/agentv-governance/references/governance-yaml-shape.md +125 -0
- package/dist/skills/agentv-governance/references/iso-42001-controls.md +46 -0
- package/dist/skills/agentv-governance/references/lint-rules.md +169 -0
- package/dist/skills/agentv-governance/references/mitre-atlas.md +38 -0
- package/dist/skills/agentv-governance/references/owasp-agentic-top-10-2025.md +28 -0
- package/dist/skills/agentv-governance/references/owasp-llm-top-10-2025.md +25 -0
- package/dist/skills/agentv-trace-analyst/SKILL.md +161 -0
- package/package.json +1 -1
- package/dist/chunk-JA4WQNE6.js.map +0 -1
- package/dist/chunk-XBUHMRX2.js.map +0 -1
- package/dist/{interactive-YMKWKPD7.js.map → interactive-UG4YNLYK.js.map} +0 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Config Schema",
|
|
4
|
+
"description": "Schema for .agentv/config.yaml configuration files",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"$schema": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Schema identifier",
|
|
10
|
+
"enum": ["agentv-config-v2"]
|
|
11
|
+
},
|
|
12
|
+
"required_version": {
|
|
13
|
+
"type": "string",
|
|
14
|
+
"description": "Minimum AgentV version required to run this project's evals. Uses semver range syntax (e.g., '>=2.11.0', '^2.11.0'). When the installed version is below the range, AgentV warns and prompts to update.",
|
|
15
|
+
"examples": [">=2.11.0", "^2.12.0", ">=2.11.0 <3.0.0"]
|
|
16
|
+
},
|
|
17
|
+
"eval_patterns": {
|
|
18
|
+
"type": "array",
|
|
19
|
+
"description": "Glob patterns for discovering eval files during interactive mode (`agentv eval` with no args). Defaults to ['**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml'] if not specified.",
|
|
20
|
+
"items": {
|
|
21
|
+
"type": "string",
|
|
22
|
+
"description": "Glob pattern (e.g., '**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml')"
|
|
23
|
+
},
|
|
24
|
+
"examples": [["**/evals/**/dataset*.yaml", "**/evals/**/eval.yaml"], ["**/evals/**/*.yaml"]]
|
|
25
|
+
},
|
|
26
|
+
"execution": {
|
|
27
|
+
"type": "object",
|
|
28
|
+
"description": "Default execution options. CLI flags take precedence over these values.",
|
|
29
|
+
"properties": {
|
|
30
|
+
"verbose": {
|
|
31
|
+
"type": "boolean",
|
|
32
|
+
"description": "Enable verbose logging (equivalent to --verbose)",
|
|
33
|
+
"default": false
|
|
34
|
+
},
|
|
35
|
+
"keep_workspaces": {
|
|
36
|
+
"type": "boolean",
|
|
37
|
+
"description": "Always keep temp workspaces after eval (equivalent to --keep-workspaces)",
|
|
38
|
+
"default": false
|
|
39
|
+
},
|
|
40
|
+
"otel_file": {
|
|
41
|
+
"type": "string",
|
|
42
|
+
"description": "Write an OTLP JSON trace to this path (equivalent to --otel-file). Supports the {timestamp} placeholder.",
|
|
43
|
+
"examples": [".agentv/results/otel-{timestamp}.json"]
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
"additionalProperties": false
|
|
47
|
+
},
|
|
48
|
+
"hooks": {
|
|
49
|
+
"type": "object",
|
|
50
|
+
"description": "Lifecycle hooks that run at specific points during agentv execution.",
|
|
51
|
+
"properties": {
|
|
52
|
+
"before_session": {
|
|
53
|
+
"type": "string",
|
|
54
|
+
"description": "Shell command to run once at agentv startup, before any command executes. stdout is parsed for env var exports (KEY=value or export KEY=\"value\") and injected into process.env. Keys already set in the environment are not overwritten. stderr is forwarded to the user. Non-zero exit aborts with an error.",
|
|
55
|
+
"examples": ["bun scripts/load-secrets.ts", "eval $(aws ssm get-parameters-by-path ...)"]
|
|
56
|
+
}
|
|
57
|
+
},
|
|
58
|
+
"additionalProperties": false
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
"required": ["$schema"],
|
|
62
|
+
"additionalProperties": false
|
|
63
|
+
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Custom Graders
|
|
2
|
+
|
|
3
|
+
## Wire Format
|
|
4
|
+
|
|
5
|
+
### Input (stdin JSON)
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"question": "string",
|
|
10
|
+
"criteria": "string",
|
|
11
|
+
"reference_answer": "string",
|
|
12
|
+
"answer": "string",
|
|
13
|
+
"input_files": ["path"],
|
|
14
|
+
"input": [{"role": "user", "content": "..."}],
|
|
15
|
+
"expected_output": [{"role": "assistant", "content": "..."}],
|
|
16
|
+
"output": [{"role": "assistant", "content": "..."}],
|
|
17
|
+
"trace": {
|
|
18
|
+
"event_count": 5,
|
|
19
|
+
"tool_calls": {"fetch": 1},
|
|
20
|
+
"error_count": 0,
|
|
21
|
+
"llm_call_count": 2
|
|
22
|
+
},
|
|
23
|
+
"token_usage": {"input": 1000, "output": 500},
|
|
24
|
+
"cost_usd": 0.0015,
|
|
25
|
+
"duration_ms": 3500,
|
|
26
|
+
"start_time": "2026-02-13T10:00:00.000Z",
|
|
27
|
+
"end_time": "2026-02-13T10:00:03.500Z"
|
|
28
|
+
}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Output (stdout JSON)
|
|
32
|
+
|
|
33
|
+
```json
|
|
34
|
+
{
|
|
35
|
+
"score": 0.85,
|
|
36
|
+
"assertions": [
|
|
37
|
+
{ "text": "passed check", "passed": true },
|
|
38
|
+
{ "text": "failed check", "passed": false }
|
|
39
|
+
],
|
|
40
|
+
"reasoning": "explanation"
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
`score` (0.0-1.0) is required. `assertions` and `reasoning` are optional.
|
|
45
|
+
|
|
46
|
+
## SDK Functions
|
|
47
|
+
|
|
48
|
+
```typescript
|
|
49
|
+
import { defineCodeGrader, createTargetClient, definePromptTemplate } from '@agentv/eval';
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
- `defineCodeGrader(fn)` - Wraps evaluation function with stdin/stdout handling
|
|
53
|
+
- `createTargetClient()` - Returns an LLM proxy client (when `target: {}` is configured)
|
|
54
|
+
- `.invoke({question, systemPrompt})` - Single LLM call
|
|
55
|
+
- `.invokeBatch(requests)` - Batch LLM calls
|
|
56
|
+
- `definePromptTemplate(fn)` - Wraps prompt generation function
|
|
57
|
+
- Context fields: `question`, `answer`, `referenceAnswer`, `criteria`, `expectedOutput`, `output`, `config`, `trace`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
|
|
58
|
+
|
|
59
|
+
## Python Example
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
#!/usr/bin/env python3
|
|
63
|
+
import json, sys
|
|
64
|
+
|
|
65
|
+
def evaluate(data: dict) -> dict:
|
|
66
|
+
candidate = data.get("answer", "")
|
|
67
|
+
assertions = []
|
|
68
|
+
for kw in ["async", "await"]:
|
|
69
|
+
assertions.append({"text": f"Keyword '{kw}'", "passed": kw in candidate})
|
|
70
|
+
passed = sum(1 for a in assertions if a["passed"])
|
|
71
|
+
return {
|
|
72
|
+
"score": passed / max(len(assertions), 1),
|
|
73
|
+
"assertions": assertions,
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
try:
|
|
78
|
+
print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
|
|
79
|
+
except Exception as e:
|
|
80
|
+
print(json.dumps({"score": 0, "assertions": [{"text": str(e), "passed": False}]}))
|
|
81
|
+
sys.exit(1)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## TypeScript Example
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
#!/usr/bin/env bun
|
|
88
|
+
import { defineCodeGrader } from '@agentv/eval';
|
|
89
|
+
|
|
90
|
+
export default defineCodeGrader(({ answer, criteria }) => {
|
|
91
|
+
const assertions: Array<{ text: string; passed: boolean }> = [];
|
|
92
|
+
if (answer.includes(criteria)) {
|
|
93
|
+
assertions.push({ text: 'Matches expected outcome', passed: true });
|
|
94
|
+
} else {
|
|
95
|
+
assertions.push({ text: 'Does not match expected outcome', passed: false });
|
|
96
|
+
}
|
|
97
|
+
const passed = assertions.filter(a => a.passed).length;
|
|
98
|
+
return {
|
|
99
|
+
score: passed / Math.max(assertions.length, 1),
|
|
100
|
+
assertions,
|
|
101
|
+
};
|
|
102
|
+
});
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Template Variables
|
|
106
|
+
|
|
107
|
+
Derived from test fields (users never author these directly):
|
|
108
|
+
|
|
109
|
+
| Variable | Source |
|
|
110
|
+
|----------|--------|
|
|
111
|
+
| `question` | First user message in `input` |
|
|
112
|
+
| `criteria` | Test `criteria` field |
|
|
113
|
+
| `reference_answer` | Last entry in `expected_output` |
|
|
114
|
+
| `answer` | Last entry in `output` (runtime) |
|
|
115
|
+
| `input` | Full resolved input array (JSON) |
|
|
116
|
+
| `expected_output` | Full resolved expected array (JSON) |
|
|
117
|
+
| `output` | Full provider output array (JSON) |
|
|
118
|
+
|
|
119
|
+
Markdown templates use `{{variable}}` syntax. TypeScript templates receive a context object.
|