agentv 4.26.1 → 4.27.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/{chunk-JA4WQNE6.js → chunk-47JX7NNZ.js} +10 -2
  2. package/dist/chunk-47JX7NNZ.js.map +1 -0
  3. package/dist/{chunk-XBUHMRX2.js → chunk-V3LWJB5X.js} +431 -49
  4. package/dist/chunk-V3LWJB5X.js.map +1 -0
  5. package/dist/cli.js +2 -2
  6. package/dist/index.js +2 -2
  7. package/dist/{interactive-YMKWKPD7.js → interactive-L6PIIFNQ.js} +2 -2
  8. package/dist/skills/agentv-bench/LICENSE.txt +202 -0
  9. package/dist/skills/agentv-bench/SKILL.md +459 -0
  10. package/dist/skills/agentv-bench/agents/analyzer.md +177 -0
  11. package/dist/skills/agentv-bench/agents/comparator.md +247 -0
  12. package/dist/skills/agentv-bench/agents/executor.md +30 -0
  13. package/dist/skills/agentv-bench/agents/grader.md +238 -0
  14. package/dist/skills/agentv-bench/agents/mutator.md +172 -0
  15. package/dist/skills/agentv-bench/references/autoresearch.md +309 -0
  16. package/dist/skills/agentv-bench/references/description-optimization.md +66 -0
  17. package/dist/skills/agentv-bench/references/environment-adaptation.md +82 -0
  18. package/dist/skills/agentv-bench/references/eval-yaml-spec.md +338 -0
  19. package/dist/skills/agentv-bench/references/migrating-from-skill-creator.md +103 -0
  20. package/dist/skills/agentv-bench/references/schemas.md +432 -0
  21. package/dist/skills/agentv-bench/references/subagent-pipeline.md +181 -0
  22. package/dist/skills/agentv-bench/scripts/trajectory.html +462 -0
  23. package/dist/skills/agentv-eval-review/SKILL.md +53 -0
  24. package/dist/skills/agentv-eval-review/scripts/lint_eval.py +239 -0
  25. package/dist/skills/agentv-eval-writer/SKILL.md +707 -0
  26. package/dist/skills/agentv-eval-writer/references/config-schema.json +63 -0
  27. package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +119 -0
  28. package/dist/skills/agentv-eval-writer/references/eval-schema.json +19077 -0
  29. package/dist/skills/agentv-eval-writer/references/rubric-evaluator.md +114 -0
  30. package/dist/skills/agentv-governance/SKILL.md +79 -0
  31. package/dist/skills/agentv-governance/references/eu-ai-act-risk-tiers.md +37 -0
  32. package/dist/skills/agentv-governance/references/governance-yaml-shape.md +125 -0
  33. package/dist/skills/agentv-governance/references/iso-42001-controls.md +46 -0
  34. package/dist/skills/agentv-governance/references/lint-rules.md +169 -0
  35. package/dist/skills/agentv-governance/references/mitre-atlas.md +38 -0
  36. package/dist/skills/agentv-governance/references/owasp-agentic-top-10-2025.md +28 -0
  37. package/dist/skills/agentv-governance/references/owasp-llm-top-10-2025.md +25 -0
  38. package/dist/skills/agentv-trace-analyst/SKILL.md +161 -0
  39. package/package.json +1 -1
  40. package/dist/chunk-JA4WQNE6.js.map +0 -1
  41. package/dist/chunk-XBUHMRX2.js.map +0 -1
  42. package/dist/{interactive-YMKWKPD7.js.map → interactive-L6PIIFNQ.js.map} +0 -0
@@ -0,0 +1,63 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "title": "AgentV Config Schema",
4
+ "description": "Schema for .agentv/config.yaml configuration files",
5
+ "type": "object",
6
+ "properties": {
7
+ "$schema": {
8
+ "type": "string",
9
+ "description": "Schema identifier",
10
+ "enum": ["agentv-config-v2"]
11
+ },
12
+ "required_version": {
13
+ "type": "string",
14
+ "description": "Minimum AgentV version required to run this project's evals. Uses semver range syntax (e.g., '>=2.11.0', '^2.11.0'). When the installed version is below the range, AgentV warns and prompts to update.",
15
+ "examples": [">=2.11.0", "^2.12.0", ">=2.11.0 <3.0.0"]
16
+ },
17
+ "eval_patterns": {
18
+ "type": "array",
19
+ "description": "Glob patterns for discovering eval files during interactive mode (`agentv eval` with no args). Defaults to ['**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml'] if not specified.",
20
+ "items": {
21
+ "type": "string",
22
+ "description": "Glob pattern (e.g., '**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml')"
23
+ },
24
+ "examples": [["**/evals/**/dataset*.yaml", "**/evals/**/eval.yaml"], ["**/evals/**/*.yaml"]]
25
+ },
26
+ "execution": {
27
+ "type": "object",
28
+ "description": "Default execution options. CLI flags take precedence over these values.",
29
+ "properties": {
30
+ "verbose": {
31
+ "type": "boolean",
32
+ "description": "Enable verbose logging (equivalent to --verbose)",
33
+ "default": false
34
+ },
35
+ "keep_workspaces": {
36
+ "type": "boolean",
37
+ "description": "Always keep temp workspaces after eval (equivalent to --keep-workspaces)",
38
+ "default": false
39
+ },
40
+ "otel_file": {
41
+ "type": "string",
42
+ "description": "Write OTLP JSON trace to this path (equivalent to --otel-file). Supports {timestamp} placeholder.",
43
+ "examples": [".agentv/results/otel-{timestamp}.json"]
44
+ }
45
+ },
46
+ "additionalProperties": false
47
+ },
48
+ "hooks": {
49
+ "type": "object",
50
+ "description": "Lifecycle hooks that run at specific points during agentv execution.",
51
+ "properties": {
52
+ "before_session": {
53
+ "type": "string",
54
+ "description": "Shell command to run once at agentv startup, before any command executes. stdout is parsed for env var exports (KEY=value or export KEY=\"value\") and injected into process.env. Keys already set in the environment are not overwritten. stderr is forwarded to the user. Non-zero exit aborts with an error.",
55
+ "examples": ["bun scripts/load-secrets.ts", "eval $(aws ssm get-parameters-by-path ...)"]
56
+ }
57
+ },
58
+ "additionalProperties": false
59
+ }
60
+ },
61
+ "required": ["$schema"],
62
+ "additionalProperties": false
63
+ }
@@ -0,0 +1,119 @@
1
+ # Custom Graders
2
+
3
+ ## Wire Format
4
+
5
+ ### Input (stdin JSON)
6
+
7
+ ```json
8
+ {
9
+ "question": "string",
10
+ "criteria": "string",
11
+ "reference_answer": "string",
12
+ "answer": "string",
13
+ "input_files": ["path"],
14
+ "input": [{"role": "user", "content": "..."}],
15
+ "expected_output": [{"role": "assistant", "content": "..."}],
16
+ "output": [{"role": "assistant", "content": "..."}],
17
+ "trace": {
18
+ "event_count": 5,
19
+ "tool_calls": {"fetch": 1},
20
+ "error_count": 0,
21
+ "llm_call_count": 2
22
+ },
23
+ "token_usage": {"input": 1000, "output": 500},
24
+ "cost_usd": 0.0015,
25
+ "duration_ms": 3500,
26
+ "start_time": "2026-02-13T10:00:00.000Z",
27
+ "end_time": "2026-02-13T10:00:03.500Z"
28
+ }
29
+ ```
30
+
31
+ ### Output (stdout JSON)
32
+
33
+ ```json
34
+ {
35
+ "score": 0.85,
36
+ "assertions": [
37
+ { "text": "passed check", "passed": true },
38
+ { "text": "failed check", "passed": false }
39
+ ],
40
+ "reasoning": "explanation"
41
+ }
42
+ ```
43
+
44
+ `score` (a number from 0.0 to 1.0) is required; `assertions` and `reasoning` are optional.
45
+
46
+ ## SDK Functions
47
+
48
+ ```typescript
49
+ import { defineCodeGrader, createTargetClient, definePromptTemplate } from '@agentv/eval';
50
+ ```
51
+
52
+ - `defineCodeGrader(fn)` - Wraps an evaluation function with stdin/stdout handling
53
+ - `createTargetClient()` - Returns an LLM proxy client (when `target: {}` is configured)
54
+ - `.invoke({question, systemPrompt})` - Single LLM call
55
+ - `.invokeBatch(requests)` - Batch LLM calls
56
+ - `definePromptTemplate(fn)` - Wraps a prompt-generation function
57
+ - Context fields: `question`, `answer`, `referenceAnswer`, `criteria`, `expectedOutput`, `output`, `config`, `trace`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
58
+
59
+ ## Python Example
60
+
61
+ ```python
62
+ #!/usr/bin/env python3
63
+ import json, sys
64
+
65
+ def evaluate(data: dict) -> dict:
66
+ candidate = data.get("answer", "")
67
+ assertions = []
68
+ for kw in ["async", "await"]:
69
+ assertions.append({"text": f"Keyword '{kw}'", "passed": kw in candidate})
70
+ passed = sum(1 for a in assertions if a["passed"])
71
+ return {
72
+ "score": passed / max(len(assertions), 1),
73
+ "assertions": assertions,
74
+ }
75
+
76
+ if __name__ == "__main__":
77
+ try:
78
+ print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
79
+ except Exception as e:
80
+ print(json.dumps({"score": 0, "assertions": [{"text": str(e), "passed": False}]}))
81
+ sys.exit(1)
82
+ ```
83
+
84
+ ## TypeScript Example
85
+
86
+ ```typescript
87
+ #!/usr/bin/env bun
88
+ import { defineCodeGrader } from '@agentv/eval';
89
+
90
+ export default defineCodeGrader(({ answer, criteria }) => {
91
+ const assertions: Array<{ text: string; passed: boolean }> = [];
92
+ if (answer.includes(criteria)) {
93
+ assertions.push({ text: 'Matches expected outcome', passed: true });
94
+ } else {
95
+ assertions.push({ text: 'Does not match expected outcome', passed: false });
96
+ }
97
+ const passed = assertions.filter(a => a.passed).length;
98
+ return {
99
+ score: passed / Math.max(assertions.length, 1),
100
+ assertions,
101
+ };
102
+ });
103
+ ```
104
+
105
+ ## Template Variables
106
+
107
+ These variables are derived from test fields; users never author them directly:
108
+
109
+ | Variable | Source |
110
+ |----------|--------|
111
+ | `question` | First user message in `input` |
112
+ | `criteria` | Test `criteria` field |
113
+ | `reference_answer` | Last entry in `expected_output` |
114
+ | `answer` | Last entry in `output` (runtime) |
115
+ | `input` | Full resolved input array (JSON) |
116
+ | `expected_output` | Full resolved expected array (JSON) |
117
+ | `output` | Full provider output array (JSON) |
118
+
119
+ Markdown templates use `{{variable}}` syntax. TypeScript templates receive a context object instead (see the `definePromptTemplate` context fields under SDK Functions).