agentv 2.6.0 → 2.7.1-next.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -33
- package/dist/chunk-3L2L5GIL.js +51 -0
- package/dist/{chunk-BKMQNEUD.js.map → chunk-3L2L5GIL.js.map} +1 -1
- package/dist/{chunk-BKMQNEUD.js → chunk-5H446C7X.js} +2 -45
- package/dist/{chunk-LJVS3JAK.js → chunk-BL4PVUAT.js} +6 -4
- package/dist/{chunk-LJVS3JAK.js.map → chunk-BL4PVUAT.js.map} +1 -1
- package/dist/{chunk-MGK6HHRR.js → chunk-BWLYFF5N.js} +9813 -11237
- package/dist/chunk-BWLYFF5N.js.map +1 -0
- package/dist/chunk-C5GOHBQM.js +84 -0
- package/dist/chunk-C5GOHBQM.js.map +1 -0
- package/dist/chunk-EJEG3DU2.js +5476 -0
- package/dist/chunk-EJEG3DU2.js.map +1 -0
- package/dist/chunk-FV32QHPB.js +565 -0
- package/dist/chunk-FV32QHPB.js.map +1 -0
- package/dist/chunk-H5FFZCKI.js +2957 -0
- package/dist/chunk-H5FFZCKI.js.map +1 -0
- package/dist/chunk-JK6V4KVD.js +114 -0
- package/dist/chunk-JK6V4KVD.js.map +1 -0
- package/dist/chunk-LRULMAAA.js +1711 -0
- package/dist/chunk-LRULMAAA.js.map +1 -0
- package/dist/chunk-SR4I5KET.js +1238 -0
- package/dist/chunk-SR4I5KET.js.map +1 -0
- package/dist/chunk-VQ2ZO7XJ.js +2098 -0
- package/dist/chunk-VQ2ZO7XJ.js.map +1 -0
- package/dist/chunk-XALGXSKB.js +21 -0
- package/dist/chunk-XALGXSKB.js.map +1 -0
- package/dist/cli.js +8 -2
- package/dist/cli.js.map +1 -1
- package/dist/dist-R3OCWGXH.js +257 -0
- package/dist/dist-R3OCWGXH.js.map +1 -0
- package/dist/esm-5Q4BZALM-5REQWAUV.js +924 -0
- package/dist/esm-5Q4BZALM-5REQWAUV.js.map +1 -0
- package/dist/esm-DX3WQKEN.js +32 -0
- package/dist/esm-DX3WQKEN.js.map +1 -0
- package/dist/esm-QNEMCJPL.js +933 -0
- package/dist/esm-QNEMCJPL.js.map +1 -0
- package/dist/esm-R77SNOF5.js +65 -0
- package/dist/esm-R77SNOF5.js.map +1 -0
- package/dist/esm-RVQPUGWH.js +1207 -0
- package/dist/esm-RVQPUGWH.js.map +1 -0
- package/dist/getMachineId-bsd-HSK5LZMG.js +41 -0
- package/dist/getMachineId-bsd-HSK5LZMG.js.map +1 -0
- package/dist/getMachineId-darwin-4DP6CCJV.js +41 -0
- package/dist/getMachineId-darwin-4DP6CCJV.js.map +1 -0
- package/dist/getMachineId-linux-44LJ5UJB.js +33 -0
- package/dist/getMachineId-linux-44LJ5UJB.js.map +1 -0
- package/dist/getMachineId-unsupported-NVK6IATM.js +24 -0
- package/dist/getMachineId-unsupported-NVK6IATM.js.map +1 -0
- package/dist/getMachineId-win-YZ36S7VA.js +43 -0
- package/dist/getMachineId-win-YZ36S7VA.js.map +1 -0
- package/dist/index.js +10 -2
- package/dist/interactive-33TCZXLF.js +333 -0
- package/dist/interactive-33TCZXLF.js.map +1 -0
- package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js +9 -0
- package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js.map +1 -0
- package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js +9 -0
- package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map +1 -0
- package/dist/src-2N5EJ2N6.js +1733 -0
- package/dist/src-2N5EJ2N6.js.map +1 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +84 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +144 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +67 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +101 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +433 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +36 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +118 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +251 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +77 -0
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +50 -0
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +78 -0
- package/dist/templates/.agentv/.env.example +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +38 -13
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +9 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +4 -4
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +7 -9
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/dist/{token-D3IYDJQZ.js → token-POXF46NU.js} +6 -4
- package/dist/{token-D3IYDJQZ.js.map → token-POXF46NU.js.map} +1 -1
- package/dist/{token-util-FWFPR2BV.js → token-util-6GWYZWGE.js} +4 -3
- package/dist/token-util-6GWYZWGE.js.map +1 -0
- package/package.json +7 -3
- package/dist/chunk-MGK6HHRR.js.map +0 -1
- /package/dist/{token-util-FWFPR2BV.js.map → chunk-5H446C7X.js.map} +0 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Example: Markdown Transcript Input
|
|
2
|
+
|
|
3
|
+
## Input
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
User: What are the SOLID principles in software engineering?
|
|
7
|
+
Assistant: The SOLID principles are five design principles for object-oriented programming:
|
|
8
|
+
|
|
9
|
+
1. **Single Responsibility** — A class should have only one reason to change
|
|
10
|
+
2. **Open/Closed** — Open for extension, closed for modification
|
|
11
|
+
3. **Liskov Substitution** — Subtypes must be substitutable for their base types
|
|
12
|
+
4. **Interface Segregation** — Prefer small, specific interfaces over large ones
|
|
13
|
+
5. **Dependency Inversion** — Depend on abstractions, not concretions
|
|
14
|
+
|
|
15
|
+
User: Can you give me an example of the Single Responsibility Principle?
|
|
16
|
+
Assistant: Sure! Instead of a class that handles both user authentication and email sending, split them:
|
|
17
|
+
|
|
18
|
+
Bad: `UserService` handles login, registration, AND sends welcome emails.
|
|
19
|
+
Good: `AuthService` handles login/registration. `EmailService` handles sending emails.
|
|
20
|
+
|
|
21
|
+
Each class has one job, making them easier to test and maintain.
|
|
22
|
+
|
|
23
|
+
User: What's the difference between an interface and an abstract class?
|
|
24
|
+
Assistant: Key differences:
|
|
25
|
+
|
|
26
|
+
- **Interface**: Defines a contract (method signatures only). A class can implement multiple interfaces. No state.
|
|
27
|
+
- **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state.
|
|
28
|
+
|
|
29
|
+
Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class).
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Output
|
|
33
|
+
|
|
34
|
+
```yaml
|
|
35
|
+
description: "Software engineering concepts — SOLID principles and OOP"
|
|
36
|
+
|
|
37
|
+
tests:
|
|
38
|
+
- id: solid-principles-overview
|
|
39
|
+
criteria: "List and briefly explain all five SOLID principles"
|
|
40
|
+
input: "What are the SOLID principles in software engineering?"
|
|
41
|
+
expected_output: |
|
|
42
|
+
The SOLID principles are five design principles for object-oriented programming:
|
|
43
|
+
|
|
44
|
+
1. **Single Responsibility** — A class should have only one reason to change
|
|
45
|
+
2. **Open/Closed** — Open for extension, closed for modification
|
|
46
|
+
3. **Liskov Substitution** — Subtypes must be substitutable for their base types
|
|
47
|
+
4. **Interface Segregation** — Prefer small, specific interfaces over large ones
|
|
48
|
+
5. **Dependency Inversion** — Depend on abstractions, not concretions
|
|
49
|
+
assert:
|
|
50
|
+
- type: rubrics
|
|
51
|
+
criteria:
|
|
52
|
+
- Lists all five SOLID principles by name
|
|
53
|
+
- Provides a concise explanation for each principle
|
|
54
|
+
- Explanations are technically accurate
|
|
55
|
+
|
|
56
|
+
- id: srp-example
|
|
57
|
+
criteria: "Provide a concrete example of the Single Responsibility Principle"
|
|
58
|
+
input:
|
|
59
|
+
- role: user
|
|
60
|
+
content: "What are the SOLID principles in software engineering?"
|
|
61
|
+
- role: assistant
|
|
62
|
+
content: "The SOLID principles are five design principles..."
|
|
63
|
+
- role: user
|
|
64
|
+
content: "Can you give me an example of the Single Responsibility Principle?"
|
|
65
|
+
expected_output: |
|
|
66
|
+
Sure! Instead of a class that handles both user authentication and email sending, split them:
|
|
67
|
+
|
|
68
|
+
Bad: `UserService` handles login, registration, AND sends welcome emails.
|
|
69
|
+
Good: `AuthService` handles login/registration. `EmailService` handles sending emails.
|
|
70
|
+
|
|
71
|
+
Each class has one job, making them easier to test and maintain.
|
|
72
|
+
assert:
|
|
73
|
+
- type: rubrics
|
|
74
|
+
criteria:
|
|
75
|
+
- Shows a bad example that violates SRP
|
|
76
|
+
- Shows a good example that follows SRP
|
|
77
|
+
- Explains why the separation is beneficial
|
|
78
|
+
|
|
79
|
+
- id: interface-vs-abstract-class
|
|
80
|
+
criteria: "Explain the difference between interfaces and abstract classes"
|
|
81
|
+
input: "What's the difference between an interface and an abstract class?"
|
|
82
|
+
expected_output: |
|
|
83
|
+
Key differences:
|
|
84
|
+
|
|
85
|
+
- **Interface**: Defines a contract (method signatures only). A class can implement multiple interfaces. No state.
|
|
86
|
+
- **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state.
|
|
87
|
+
|
|
88
|
+
Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class).
|
|
89
|
+
assert:
|
|
90
|
+
- type: rubrics
|
|
91
|
+
criteria:
|
|
92
|
+
- Correctly distinguishes interfaces from abstract classes
|
|
93
|
+
- Mentions multiple inheritance support for interfaces
|
|
94
|
+
- Provides guidance on when to use each
|
|
95
|
+
|
|
96
|
+
# Suggested additional evaluators:
|
|
97
|
+
# assert:
|
|
98
|
+
# - name: quality
|
|
99
|
+
# type: llm_judge
|
|
100
|
+
# prompt: ./prompts/quality.md
|
|
101
|
+
```
|
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentv-eval-builder
|
|
3
|
+
description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding tests, or configuring evaluators.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentV Eval Builder
|
|
7
|
+
|
|
8
|
+
Comprehensive docs: https://agentv.dev
|
|
9
|
+
|
|
10
|
+
## Quick Start
|
|
11
|
+
|
|
12
|
+
```yaml
|
|
13
|
+
description: Example eval
|
|
14
|
+
execution:
|
|
15
|
+
target: default
|
|
16
|
+
|
|
17
|
+
tests:
|
|
18
|
+
- id: greeting
|
|
19
|
+
criteria: Friendly greeting
|
|
20
|
+
input: "Say hello"
|
|
21
|
+
expected_output: "Hello! How can I help you?"
|
|
22
|
+
assert:
|
|
23
|
+
- type: rubrics
|
|
24
|
+
criteria:
|
|
25
|
+
- Greeting is friendly and warm
|
|
26
|
+
- Offers to help
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Eval File Structure
|
|
30
|
+
|
|
31
|
+
**Required:** `tests` (array or string path)
|
|
32
|
+
**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `dataset`, `workspace`, `assert`
|
|
33
|
+
|
|
34
|
+
**Test fields:**
|
|
35
|
+
|
|
36
|
+
| Field | Required | Description |
|
|
37
|
+
|-------|----------|-------------|
|
|
38
|
+
| `id` | yes | Unique identifier |
|
|
39
|
+
| `criteria` | yes | What the response should accomplish |
|
|
40
|
+
| `input` | yes | Input to the agent |
|
|
41
|
+
| `expected_output` | no | Gold-standard reference answer |
|
|
42
|
+
| `assert` | no | Evaluators: assertions, rubrics, judges |
|
|
43
|
+
| `rubrics` | no | **Deprecated** — use `assert: [{type: rubrics, criteria: [...]}]` instead |
|
|
44
|
+
| `execution` | no | Per-case execution overrides |
|
|
45
|
+
| `workspace` | no | Per-case workspace config (overrides suite-level) |
|
|
46
|
+
| `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts |
|
|
47
|
+
| `conversation_id` | no | Thread grouping |
|
|
48
|
+
|
|
49
|
+
**Shorthand aliases:**
|
|
50
|
+
- `input` (string) expands to `[{role: "user", content: "..."}]`
|
|
51
|
+
- `expected_output` (string/object) expands to `[{role: "assistant", content: ...}]`
|
|
52
|
+
- Canonical `input` / `expected_output` take precedence when both present
|
|
53
|
+
|
|
54
|
+
**Message format:** `{role, content}` where role is `system`, `user`, `assistant`, or `tool`
|
|
55
|
+
**Content types:** inline text, `{type: "file", value: "./path.md"}`
|
|
56
|
+
**File paths:** relative from eval file dir, or absolute with `/` prefix from repo root
|
|
57
|
+
|
|
58
|
+
**JSONL format:** One test per line as JSON. Optional `.yaml` sidecar for shared defaults. See `examples/features/basic-jsonl/`.
|
|
59
|
+
|
|
60
|
+
## Metadata
|
|
61
|
+
|
|
62
|
+
When `name` is present, the suite is parsed as a metadata-bearing eval:
|
|
63
|
+
|
|
64
|
+
```yaml
|
|
65
|
+
name: export-screening # required, lowercase/hyphens, max 64 chars
|
|
66
|
+
description: Evaluates export control screening accuracy
|
|
67
|
+
version: "1.0"
|
|
68
|
+
author: acme-compliance
|
|
69
|
+
tags: [compliance, agents]
|
|
70
|
+
license: Apache-2.0
|
|
71
|
+
requires:
|
|
72
|
+
agentv: ">=0.30.0"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Tests as String Path
|
|
76
|
+
|
|
77
|
+
Point `tests` to an external file instead of inlining:
|
|
78
|
+
|
|
79
|
+
```yaml
|
|
80
|
+
name: my-eval
|
|
81
|
+
description: My evaluation suite
|
|
82
|
+
tests: ./cases.yaml # relative to eval file dir
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
The external file can be YAML (array of test objects) or JSONL.
|
|
86
|
+
|
|
87
|
+
## Assert Field
|
|
88
|
+
|
|
89
|
+
`assert` defines evaluators at the suite level or per-test level. It is the canonical field for all evaluators (replaces `execution.evaluators`):
|
|
90
|
+
|
|
91
|
+
```yaml
|
|
92
|
+
# Suite-level (appended to every test)
|
|
93
|
+
assert:
|
|
94
|
+
- type: is_json
|
|
95
|
+
required: true
|
|
96
|
+
- type: contains
|
|
97
|
+
value: "status"
|
|
98
|
+
|
|
99
|
+
tests:
|
|
100
|
+
- id: test-1
|
|
101
|
+
criteria: Returns JSON
|
|
102
|
+
input: Get status
|
|
103
|
+
# Per-test assert (runs before suite-level)
|
|
104
|
+
assert:
|
|
105
|
+
- type: equals
|
|
106
|
+
value: '{"status": "ok"}'
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
`execution.evaluators` is deprecated. When both `assert` and `execution.evaluators` are present, `assert` takes precedence.
|
|
110
|
+
|
|
111
|
+
## Required Gates
|
|
112
|
+
|
|
113
|
+
Any evaluator can be marked `required` to enforce a minimum score:
|
|
114
|
+
|
|
115
|
+
```yaml
|
|
116
|
+
assert:
|
|
117
|
+
- type: contains
|
|
118
|
+
value: "DENIED"
|
|
119
|
+
required: true # must score >= 0.8 (default)
|
|
120
|
+
- type: rubrics
|
|
121
|
+
required: 0.6 # must score >= 0.6 (custom threshold)
|
|
122
|
+
criteria:
|
|
123
|
+
- id: accuracy
|
|
124
|
+
outcome: Identifies the denied party
|
|
125
|
+
weight: 5.0
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
If a required evaluator scores below its threshold, the overall verdict is forced to `fail`.
|
|
129
|
+
|
|
130
|
+
## Workspace Setup/Teardown
|
|
131
|
+
|
|
132
|
+
Run scripts before/after each test. Define at suite level or override per case:
|
|
133
|
+
|
|
134
|
+
```yaml
|
|
135
|
+
workspace:
|
|
136
|
+
template: ./workspace-templates/my-project
|
|
137
|
+
setup:
|
|
138
|
+
script: ["bun", "run", "setup.ts"]
|
|
139
|
+
timeout_ms: 120000
|
|
140
|
+
teardown:
|
|
141
|
+
script: ["bun", "run", "teardown.ts"]
|
|
142
|
+
|
|
143
|
+
tests:
|
|
144
|
+
- id: case-1
|
|
145
|
+
input: Fix the bug
|
|
146
|
+
criteria: Bug is fixed
|
|
147
|
+
metadata:
|
|
148
|
+
repo: sympy/sympy
|
|
149
|
+
base_commit: "abc123"
|
|
150
|
+
workspace:
|
|
151
|
+
setup:
|
|
152
|
+
script: ["python", "custom-setup.py"] # overrides suite-level
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**Lifecycle:** template copy → setup → git baseline → agent → file changes → teardown → cleanup
|
|
156
|
+
**Merge:** Case-level fields replace suite-level fields.
|
|
157
|
+
**Scripts receive stdin JSON:** `{workspace_path, test_id, eval_run_id, case_input, case_metadata}`
|
|
158
|
+
**Setup failure:** aborts case. **Teardown failure:** non-fatal (warning).
|
|
159
|
+
See https://agentv.dev/targets/configuration/#workspace-setupteardown
|
|
160
|
+
|
|
161
|
+
## Evaluator Types
|
|
162
|
+
|
|
163
|
+
Configure via `assert` array. Multiple evaluators produce a weighted average score.
|
|
164
|
+
|
|
165
|
+
### code_judge
|
|
166
|
+
```yaml
|
|
167
|
+
- name: format_check
|
|
168
|
+
type: code_judge
|
|
169
|
+
script: uv run validate.py
|
|
170
|
+
cwd: ./scripts # optional working directory
|
|
171
|
+
target: {} # optional: enable LLM target proxy (max_calls: 50)
|
|
172
|
+
```
|
|
173
|
+
Contract: stdin JSON -> stdout JSON `{score, hits, misses, reasoning}`
|
|
174
|
+
Input includes: `question`, `criteria`, `answer`, `reference_answer`, `output`, `trace`, `file_changes`, `workspace_path`, `config`
|
|
175
|
+
When `workspace_template` is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace).
|
|
176
|
+
See docs at https://agentv.dev/evaluators/code-judges/
|
|
177
|
+
|
|
178
|
+
### llm_judge
|
|
179
|
+
```yaml
|
|
180
|
+
- name: quality
|
|
181
|
+
type: llm_judge
|
|
182
|
+
prompt: ./prompts/eval.md # markdown template or script config
|
|
183
|
+
model: gpt-5-chat # optional model override
|
|
184
|
+
config: # passed to script templates as context.config
|
|
185
|
+
strictness: high
|
|
186
|
+
```
|
|
187
|
+
Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, `{{input}}`, `{{expected_output}}`, `{{output}}`, `{{file_changes}}`
|
|
188
|
+
- Markdown templates: use `{{variable}}` syntax
|
|
189
|
+
- TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config`
|
|
190
|
+
|
|
191
|
+
### composite
|
|
192
|
+
```yaml
|
|
193
|
+
- name: gate
|
|
194
|
+
type: composite
|
|
195
|
+
assert:
|
|
196
|
+
- name: safety
|
|
197
|
+
type: llm_judge
|
|
198
|
+
prompt: ./safety.md
|
|
199
|
+
- name: quality
|
|
200
|
+
type: llm_judge
|
|
201
|
+
aggregator:
|
|
202
|
+
type: weighted_average
|
|
203
|
+
weights: { safety: 0.3, quality: 0.7 }
|
|
204
|
+
```
|
|
205
|
+
Aggregator types: `weighted_average`, `all_or_nothing`, `minimum`, `maximum`, `safety_gate`
|
|
206
|
+
- `safety_gate`: fails immediately if the named gate evaluator scores below threshold (default 1.0)
|
|
207
|
+
|
|
208
|
+
### tool_trajectory
|
|
209
|
+
```yaml
|
|
210
|
+
- name: tool_check
|
|
211
|
+
type: tool_trajectory
|
|
212
|
+
mode: any_order # any_order | in_order | exact
|
|
213
|
+
minimums: # for any_order
|
|
214
|
+
knowledgeSearch: 2
|
|
215
|
+
expected: # for in_order/exact
|
|
216
|
+
- tool: knowledgeSearch
|
|
217
|
+
args: { query: "search term" } # partial deep equality match
|
|
218
|
+
- tool: documentRetrieve
|
|
219
|
+
args: any # any arguments accepted
|
|
220
|
+
max_duration_ms: 5000 # per-tool latency assertion
|
|
221
|
+
- tool: summarize # omit args to skip argument checking
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### field_accuracy
|
|
225
|
+
```yaml
|
|
226
|
+
- name: fields
|
|
227
|
+
type: field_accuracy
|
|
228
|
+
match_type: exact # exact | date | numeric_tolerance
|
|
229
|
+
numeric_tolerance: 0.01 # for numeric_tolerance match_type
|
|
230
|
+
aggregation: weighted_average # weighted_average | all_or_nothing
|
|
231
|
+
```
|
|
232
|
+
Compares `output` fields against `expected_output` fields.
|
|
233
|
+
|
|
234
|
+
### latency
|
|
235
|
+
```yaml
|
|
236
|
+
- name: speed
|
|
237
|
+
type: latency
|
|
238
|
+
max_ms: 5000
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### cost
|
|
242
|
+
```yaml
|
|
243
|
+
- name: budget
|
|
244
|
+
type: cost
|
|
245
|
+
max_usd: 0.10
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### token_usage
|
|
249
|
+
```yaml
|
|
250
|
+
- name: tokens
|
|
251
|
+
type: token_usage
|
|
252
|
+
max_total_tokens: 4000
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
### execution_metrics
|
|
256
|
+
```yaml
|
|
257
|
+
- name: efficiency
|
|
258
|
+
type: execution_metrics
|
|
259
|
+
max_tool_calls: 10 # Maximum tool invocations
|
|
260
|
+
max_llm_calls: 5 # Maximum LLM calls (assistant messages)
|
|
261
|
+
max_tokens: 5000 # Maximum total tokens (input + output)
|
|
262
|
+
max_cost_usd: 0.05 # Maximum cost in USD
|
|
263
|
+
max_duration_ms: 30000 # Maximum execution duration
|
|
264
|
+
target_exploration_ratio: 0.6 # Target ratio of read-only tool calls
|
|
265
|
+
exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2)
|
|
266
|
+
```
|
|
267
|
+
Declarative threshold-based checks on execution metrics. Only specified thresholds are checked.
|
|
268
|
+
Score is proportional: `hits / (hits + misses)`. Missing data counts as a miss.
|
|
269
|
+
|
|
270
|
+
### contains
|
|
271
|
+
```yaml
|
|
272
|
+
- type: contains
|
|
273
|
+
value: "DENIED"
|
|
274
|
+
required: true
|
|
275
|
+
```
|
|
276
|
+
Binary check: does output contain the substring? Name auto-generated if omitted.
|
|
277
|
+
|
|
278
|
+
### regex
|
|
279
|
+
```yaml
|
|
280
|
+
- type: regex
|
|
281
|
+
value: "\\d{3}-\\d{2}-\\d{4}"
|
|
282
|
+
```
|
|
283
|
+
Binary check: does output match the regex pattern?
|
|
284
|
+
|
|
285
|
+
### equals
|
|
286
|
+
```yaml
|
|
287
|
+
- type: equals
|
|
288
|
+
value: "42"
|
|
289
|
+
```
|
|
290
|
+
Binary check: does output exactly equal the value (both trimmed)?
|
|
291
|
+
|
|
292
|
+
### is_json
|
|
293
|
+
```yaml
|
|
294
|
+
- type: is_json
|
|
295
|
+
required: true
|
|
296
|
+
```
|
|
297
|
+
Binary check: is the output valid JSON?
|
|
298
|
+
|
|
299
|
+
### rubrics
|
|
300
|
+
```yaml
|
|
301
|
+
- type: rubrics
|
|
302
|
+
criteria:
|
|
303
|
+
- id: accuracy
|
|
304
|
+
outcome: Correctly identifies the denied party
|
|
305
|
+
weight: 5.0
|
|
306
|
+
- id: reasoning
|
|
307
|
+
outcome: Provides clear reasoning
|
|
308
|
+
weight: 3.0
|
|
309
|
+
```
|
|
310
|
+
LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields.
|
|
311
|
+
|
|
312
|
+
### rubrics (inline, deprecated)
|
|
313
|
+
Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assert` instead.
|
|
314
|
+
See `references/rubric-evaluator.md` for score-range mode and scoring formula.
|
|
315
|
+
|
|
316
|
+
## CLI Commands
|
|
317
|
+
|
|
318
|
+
```bash
|
|
319
|
+
# Run evaluation (requires API keys)
|
|
320
|
+
agentv eval <file.yaml> [--test-id <id>] [--target <name>] [--dry-run]
|
|
321
|
+
|
|
322
|
+
# Run with trace file (human-readable JSONL)
|
|
323
|
+
agentv eval <file.yaml> --trace-file traces/eval.jsonl
|
|
324
|
+
|
|
325
|
+
# Run with OTLP JSON file (importable by OTel backends)
|
|
326
|
+
agentv eval <file.yaml> --otel-file traces/eval.otlp.json
|
|
327
|
+
|
|
328
|
+
# Agent-orchestrated evals (no API keys needed)
|
|
329
|
+
agentv prompt eval <file.yaml> # orchestration overview
|
|
330
|
+
agentv prompt eval input <file.yaml> --test-id <id> # task input JSON (file paths, not embedded content)
|
|
331
|
+
agentv prompt eval judge <file.yaml> --test-id <id> --answer-file f # judge prompts / code judge results
|
|
332
|
+
|
|
333
|
+
# Validate eval file
|
|
334
|
+
agentv validate <file.yaml>
|
|
335
|
+
|
|
336
|
+
# Compare results between runs
|
|
337
|
+
agentv compare <results1.jsonl> <results2.jsonl>
|
|
338
|
+
|
|
339
|
+
# Generate rubrics from criteria
|
|
340
|
+
agentv generate rubrics <file.yaml> [--target <name>]
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## Code Judge SDK
|
|
344
|
+
|
|
345
|
+
Use `@agentv/eval` to build custom evaluators in TypeScript/JavaScript:
|
|
346
|
+
|
|
347
|
+
### defineAssertion (recommended for custom checks)
|
|
348
|
+
```typescript
|
|
349
|
+
#!/usr/bin/env bun
|
|
350
|
+
import { defineAssertion } from '@agentv/eval';
|
|
351
|
+
|
|
352
|
+
export default defineAssertion(({ answer, trace }) => ({
|
|
353
|
+
pass: answer.length > 0 && (trace?.eventCount ?? 0) <= 10,
|
|
354
|
+
reasoning: 'Checks content exists and is efficient',
|
|
355
|
+
}));
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
Assertions support both `pass: boolean` and `score: number` (0-1). If only `pass` is given, score is 1 (pass) or 0 (fail).
|
|
359
|
+
|
|
360
|
+
### defineCodeJudge (full control)
|
|
361
|
+
```typescript
|
|
362
|
+
#!/usr/bin/env bun
|
|
363
|
+
import { defineCodeJudge } from '@agentv/eval';
|
|
364
|
+
|
|
365
|
+
export default defineCodeJudge(({ trace, answer }) => ({
|
|
366
|
+
score: trace?.eventCount <= 5 ? 1.0 : 0.5,
|
|
367
|
+
hits: ['Efficient tool usage'],
|
|
368
|
+
misses: [],
|
|
369
|
+
}));
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
Both are used via `type: code_judge` in YAML with `script: bun run judge.ts`.
|
|
373
|
+
|
|
374
|
+
### Convention-Based Discovery
|
|
375
|
+
|
|
376
|
+
Place assertion files in `.agentv/assertions/` — they auto-register by filename:
|
|
377
|
+
|
|
378
|
+
```
|
|
379
|
+
.agentv/assertions/word-count.ts → type: word-count
|
|
380
|
+
.agentv/assertions/sentiment.ts → type: sentiment
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
No `script:` needed in YAML — just use `type: <filename>`.
|
|
384
|
+
|
|
385
|
+
## Programmatic API
|
|
386
|
+
|
|
387
|
+
Use `evaluate()` from `@agentv/core` to run evals as a library:
|
|
388
|
+
|
|
389
|
+
```typescript
|
|
390
|
+
import { evaluate } from '@agentv/core';
|
|
391
|
+
|
|
392
|
+
const { results, summary } = await evaluate({
|
|
393
|
+
tests: [
|
|
394
|
+
{
|
|
395
|
+
id: 'greeting',
|
|
396
|
+
input: 'Say hello',
|
|
397
|
+
assert: [{ type: 'contains', value: 'hello' }],
|
|
398
|
+
},
|
|
399
|
+
],
|
|
400
|
+
target: { provider: 'mock_agent' },
|
|
401
|
+
});
|
|
402
|
+
console.log(`${summary.passed}/${summary.total} passed`);
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
Supports inline tests (no YAML) or file-based via `specFile`.
|
|
406
|
+
|
|
407
|
+
## defineConfig
|
|
408
|
+
|
|
409
|
+
Type-safe project configuration in `agentv.config.ts`:
|
|
410
|
+
|
|
411
|
+
```typescript
|
|
412
|
+
import { defineConfig } from '@agentv/core';
|
|
413
|
+
|
|
414
|
+
export default defineConfig({
|
|
415
|
+
execution: { workers: 5, maxRetries: 2 },
|
|
416
|
+
output: { format: 'jsonl', dir: './results' },
|
|
417
|
+
limits: { maxCostUsd: 10.0 },
|
|
418
|
+
});
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
Auto-discovered from project root. Validated with Zod.
|
|
422
|
+
|
|
423
|
+
## Scaffold Commands
|
|
424
|
+
|
|
425
|
+
```bash
|
|
426
|
+
agentv create assertion <name> # → .agentv/assertions/<name>.ts
|
|
427
|
+
agentv create eval <name> # → evals/<name>.eval.yaml + .cases.jsonl
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
## Schemas
|
|
431
|
+
|
|
432
|
+
- Eval file: `references/eval-schema.json`
|
|
433
|
+
- Config: `references/config-schema.json`
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Config Schema",
|
|
4
|
+
"description": "Schema for .agentv/config.yaml configuration files",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"$schema": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Schema identifier",
|
|
10
|
+
"enum": ["agentv-config-v2"]
|
|
11
|
+
},
|
|
12
|
+
"guideline_patterns": {
|
|
13
|
+
"type": "array",
|
|
14
|
+
"description": "Glob patterns for identifying guideline files (instructions, prompts). Files matching these patterns are treated as guidelines, while non-matching files are treated as regular file content.",
|
|
15
|
+
"items": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"description": "Glob pattern (e.g., '**/*.instructions.md', '**/prompts/**')"
|
|
18
|
+
},
|
|
19
|
+
"examples": [
|
|
20
|
+
["**/*.instructions.md", "**/instructions/**", "**/*.prompt.md", "**/prompts/**"],
|
|
21
|
+
["**/*.guide.md", "**/guidelines/**", "docs/AGENTS.md"]
|
|
22
|
+
]
|
|
23
|
+
},
|
|
24
|
+
"eval_patterns": {
|
|
25
|
+
"type": "array",
|
|
26
|
+
"description": "Glob patterns for discovering eval files during interactive mode (`agentv eval` with no args). Defaults to ['**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml'] if not specified.",
|
|
27
|
+
"items": {
|
|
28
|
+
"type": "string",
|
|
29
|
+
"description": "Glob pattern (e.g., '**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml')"
|
|
30
|
+
},
|
|
31
|
+
"examples": [["**/evals/**/dataset*.yaml", "**/evals/**/eval.yaml"], ["**/evals/**/*.yaml"]]
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"required": ["$schema"],
|
|
35
|
+
"additionalProperties": false
|
|
36
|
+
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Custom Evaluators
|
|
2
|
+
|
|
3
|
+
## Wire Format
|
|
4
|
+
|
|
5
|
+
### Input (stdin JSON)
|
|
6
|
+
|
|
7
|
+
```json
|
|
8
|
+
{
|
|
9
|
+
"question": "string",
|
|
10
|
+
"criteria": "string",
|
|
11
|
+
"reference_answer": "string",
|
|
12
|
+
"answer": "string",
|
|
13
|
+
"guideline_files": ["path"],
|
|
14
|
+
"input_files": ["path"],
|
|
15
|
+
"input": [{"role": "user", "content": "..."}],
|
|
16
|
+
"expected_output": [{"role": "assistant", "content": "..."}],
|
|
17
|
+
"output": [{"role": "assistant", "content": "..."}],
|
|
18
|
+
"trace": {
|
|
19
|
+
"event_count": 5,
|
|
20
|
+
"tool_names": ["fetch"],
|
|
21
|
+
"tool_calls_by_name": {"fetch": 1},
|
|
22
|
+
"error_count": 0,
|
|
23
|
+
"llm_call_count": 2,
|
|
24
|
+
"token_usage": {"input": 1000, "output": 500},
|
|
25
|
+
"cost_usd": 0.0015,
|
|
26
|
+
"duration_ms": 3500,
|
|
27
|
+
"start_time": "2026-02-13T10:00:00.000Z",
|
|
28
|
+
"end_time": "2026-02-13T10:00:03.500Z"
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Output (stdout JSON)
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
{
|
|
37
|
+
"score": 0.85,
|
|
38
|
+
"hits": ["passed check"],
|
|
39
|
+
"misses": ["failed check"],
|
|
40
|
+
"reasoning": "explanation"
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
`score` (0.0-1.0) required. `hits`, `misses`, `reasoning` optional.
|
|
45
|
+
|
|
46
|
+
## SDK Functions
|
|
47
|
+
|
|
48
|
+
```typescript
|
|
49
|
+
import { defineCodeJudge, createTargetClient, definePromptTemplate } from '@agentv/eval';
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
- `defineCodeJudge(fn)` - Wraps evaluation function with stdin/stdout handling
|
|
53
|
+
- `createTargetClient()` - Returns LLM proxy client (when `target: {}` configured)
|
|
54
|
+
- `.invoke({question, systemPrompt})` - Single LLM call
|
|
55
|
+
- `.invokeBatch(requests)` - Batch LLM calls
|
|
56
|
+
- `definePromptTemplate(fn)` - Wraps prompt generation function
|
|
57
|
+
- Context fields: `question`, `answer`, `referenceAnswer`, `criteria`, `expectedOutput`, `output`, `config`, `trace`
|
|
58
|
+
|
|
59
|
+
## Python Example
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
#!/usr/bin/env python3
|
|
63
|
+
import json, sys
|
|
64
|
+
|
|
65
|
+
def evaluate(data: dict) -> dict:
|
|
66
|
+
candidate = data.get("answer", "")
|
|
67
|
+
hits, misses = [], []
|
|
68
|
+
for kw in ["async", "await"]:
|
|
69
|
+
(hits if kw in candidate else misses).append(f"Keyword '{kw}'")
|
|
70
|
+
return {
|
|
71
|
+
"score": len(hits) / max(len(hits) + len(misses), 1),
|
|
72
|
+
"hits": hits, "misses": misses
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
if __name__ == "__main__":
|
|
76
|
+
try:
|
|
77
|
+
print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
|
|
78
|
+
except Exception as e:
|
|
79
|
+
print(json.dumps({"score": 0, "misses": [str(e)]}))
|
|
80
|
+
sys.exit(1)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## TypeScript Example
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
#!/usr/bin/env bun
|
|
87
|
+
import { defineCodeJudge } from '@agentv/eval';
|
|
88
|
+
|
|
89
|
+
export default defineCodeJudge(({ answer, criteria }) => {
|
|
90
|
+
const hits: string[] = [];
|
|
91
|
+
const misses: string[] = [];
|
|
92
|
+
if (answer.includes(criteria)) {
|
|
93
|
+
hits.push('Matches expected outcome');
|
|
94
|
+
} else {
|
|
95
|
+
misses.push('Does not match expected outcome');
|
|
96
|
+
}
|
|
97
|
+
return {
|
|
98
|
+
score: hits.length / Math.max(hits.length + misses.length, 1),
|
|
99
|
+
hits, misses,
|
|
100
|
+
};
|
|
101
|
+
});
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Template Variables
|
|
105
|
+
|
|
106
|
+
Derived from test fields (users never author these directly):
|
|
107
|
+
|
|
108
|
+
| Variable | Source |
|
|
109
|
+
|----------|--------|
|
|
110
|
+
| `question` | First user message in `input` |
|
|
111
|
+
| `criteria` | Test `criteria` field |
|
|
112
|
+
| `reference_answer` | Last entry in `expected_output` |
|
|
113
|
+
| `answer` | Last entry in `output` (runtime) |
|
|
114
|
+
| `input` | Full resolved input array (JSON) |
|
|
115
|
+
| `expected_output` | Full resolved expected array (JSON) |
|
|
116
|
+
| `output` | Full provider output array (JSON) |
|
|
117
|
+
|
|
118
|
+
Markdown templates use `{{variable}}` syntax. TypeScript templates receive context object.
|