agentv 2.11.1 → 2.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-D6KWUG7C.js → chunk-SNABHVUB.js} +2 -2
- package/dist/{chunk-D6KWUG7C.js.map → chunk-SNABHVUB.js.map} +1 -1
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
|
@@ -1,458 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: agentv-eval-builder
|
|
3
|
-
description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding tests, or configuring evaluators.
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
# AgentV Eval Builder
|
|
7
|
-
|
|
8
|
-
Comprehensive docs: https://agentv.dev
|
|
9
|
-
|
|
10
|
-
## Quick Start
|
|
11
|
-
|
|
12
|
-
```yaml
|
|
13
|
-
description: Example eval
|
|
14
|
-
execution:
|
|
15
|
-
target: default
|
|
16
|
-
|
|
17
|
-
tests:
|
|
18
|
-
- id: greeting
|
|
19
|
-
criteria: Friendly greeting
|
|
20
|
-
input: "Say hello"
|
|
21
|
-
expected_output: "Hello! How can I help you?"
|
|
22
|
-
assert:
|
|
23
|
-
- type: rubrics
|
|
24
|
-
criteria:
|
|
25
|
-
- Greeting is friendly and warm
|
|
26
|
-
- Offers to help
|
|
27
|
-
```
|
|
28
|
-
|
|
29
|
-
## Eval File Structure
|
|
30
|
-
|
|
31
|
-
**Required:** `tests` (array or string path)
|
|
32
|
-
**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `dataset`, `workspace`, `assert`, `input`
|
|
33
|
-
|
|
34
|
-
**Test fields:**
|
|
35
|
-
|
|
36
|
-
| Field | Required | Description |
|
|
37
|
-
|-------|----------|-------------|
|
|
38
|
-
| `id` | yes | Unique identifier |
|
|
39
|
-
| `criteria` | yes | What the response should accomplish |
|
|
40
|
-
| `input` | yes | Input to the agent |
|
|
41
|
-
| `expected_output` | no | Gold-standard reference answer |
|
|
42
|
-
| `assert` | no | Evaluators: assertions, rubrics, judges |
|
|
43
|
-
| `rubrics` | no | **Deprecated** — use `assert: [{type: rubrics, criteria: [...]}]` instead |
|
|
44
|
-
| `execution` | no | Per-case execution overrides |
|
|
45
|
-
| `workspace` | no | Per-case workspace config (overrides suite-level) |
|
|
46
|
-
| `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts |
|
|
47
|
-
| `conversation_id` | no | Thread grouping |
|
|
48
|
-
|
|
49
|
-
**Shorthand aliases:**
|
|
50
|
-
- `input` (string) expands to `[{role: "user", content: "..."}]`
|
|
51
|
-
- `expected_output` (string/object) expands to `[{role: "assistant", content: ...}]`
|
|
52
|
-
- Canonical `input` / `expected_output` take precedence when both present
|
|
53
|
-
|
|
54
|
-
**Message format:** `{role, content}` where role is `system`, `user`, `assistant`, or `tool`
|
|
55
|
-
**Content types:** inline text, `{type: "file", value: "./path.md"}`
|
|
56
|
-
**File paths:** relative from eval file dir, or absolute with `/` prefix from repo root
|
|
57
|
-
|
|
58
|
-
**JSONL format:** One test per line as JSON. Optional `.yaml` sidecar for shared defaults. See `examples/features/basic-jsonl/`.
|
|
59
|
-
|
|
60
|
-
## Metadata
|
|
61
|
-
|
|
62
|
-
When `name` is present, the suite is parsed as a metadata-bearing eval:
|
|
63
|
-
|
|
64
|
-
```yaml
|
|
65
|
-
name: export-screening # required, lowercase/hyphens, max 64 chars
|
|
66
|
-
description: Evaluates export control screening accuracy
|
|
67
|
-
version: "1.0"
|
|
68
|
-
author: acme-compliance
|
|
69
|
-
tags: [compliance, agents]
|
|
70
|
-
license: Apache-2.0
|
|
71
|
-
requires:
|
|
72
|
-
agentv: ">=0.30.0"
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
## Suite-level Input
|
|
76
|
-
|
|
77
|
-
Prepend shared input messages to every test (like suite-level `assert`). Avoids repeating the same prompt file in each test:
|
|
78
|
-
|
|
79
|
-
```yaml
|
|
80
|
-
input:
|
|
81
|
-
- role: user
|
|
82
|
-
content:
|
|
83
|
-
- type: file
|
|
84
|
-
value: ./system-prompt.md
|
|
85
|
-
|
|
86
|
-
tests: ./cases.yaml
|
|
87
|
-
|
|
88
|
-
# cases.yaml — each test only needs its own query
|
|
89
|
-
# - id: test-1
|
|
90
|
-
# criteria: ...
|
|
91
|
-
# input: "User question here"
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
Effective input: `[...suite input, ...test input]`. The suite-level input is skipped for a test when `execution.skip_defaults: true`.
|
|
95
|
-
Accepts same formats as test `input` (string or message array).
|
|
96
|
-
|
|
97
|
-
## Tests as String Path
|
|
98
|
-
|
|
99
|
-
Point `tests` to an external file instead of inlining:
|
|
100
|
-
|
|
101
|
-
```yaml
|
|
102
|
-
name: my-eval
|
|
103
|
-
description: My evaluation suite
|
|
104
|
-
tests: ./cases.yaml # relative to eval file dir
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
The external file can be YAML (array of test objects) or JSONL.
|
|
108
|
-
|
|
109
|
-
## Assert Field
|
|
110
|
-
|
|
111
|
-
`assert` defines evaluators at the suite level or per-test level. It is the canonical field for all evaluators (replaces `execution.evaluators`):
|
|
112
|
-
|
|
113
|
-
```yaml
|
|
114
|
-
# Suite-level (appended to every test)
|
|
115
|
-
assert:
|
|
116
|
-
- type: is_json
|
|
117
|
-
required: true
|
|
118
|
-
- type: contains
|
|
119
|
-
value: "status"
|
|
120
|
-
|
|
121
|
-
tests:
|
|
122
|
-
- id: test-1
|
|
123
|
-
criteria: Returns JSON
|
|
124
|
-
input: Get status
|
|
125
|
-
# Per-test assert (runs before suite-level)
|
|
126
|
-
assert:
|
|
127
|
-
- type: equals
|
|
128
|
-
value: '{"status": "ok"}'
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
`execution.evaluators` is deprecated. When both `assert` and `execution.evaluators` are present, `assert` takes precedence.
|
|
132
|
-
|
|
133
|
-
## Required Gates
|
|
134
|
-
|
|
135
|
-
Any evaluator can be marked `required` to enforce a minimum score:
|
|
136
|
-
|
|
137
|
-
```yaml
|
|
138
|
-
assert:
|
|
139
|
-
- type: contains
|
|
140
|
-
value: "DENIED"
|
|
141
|
-
required: true # must score >= 0.8 (default)
|
|
142
|
-
- type: rubrics
|
|
143
|
-
required: 0.6 # must score >= 0.6 (custom threshold)
|
|
144
|
-
criteria:
|
|
145
|
-
- id: accuracy
|
|
146
|
-
outcome: Identifies the denied party
|
|
147
|
-
weight: 5.0
|
|
148
|
-
```
|
|
149
|
-
|
|
150
|
-
If a required evaluator scores below its threshold, the overall verdict is forced to `fail`.
|
|
151
|
-
|
|
152
|
-
## Workspace Setup/Teardown
|
|
153
|
-
|
|
154
|
-
Run scripts before/after each test. Define at suite level or override per case:
|
|
155
|
-
|
|
156
|
-
```yaml
|
|
157
|
-
workspace:
|
|
158
|
-
template: ./workspace-templates/my-project
|
|
159
|
-
setup:
|
|
160
|
-
command: ["bun", "run", "setup.ts"]
|
|
161
|
-
timeout_ms: 120000
|
|
162
|
-
teardown:
|
|
163
|
-
command: ["bun", "run", "teardown.ts"]
|
|
164
|
-
|
|
165
|
-
tests:
|
|
166
|
-
- id: case-1
|
|
167
|
-
input: Fix the bug
|
|
168
|
-
criteria: Bug is fixed
|
|
169
|
-
metadata:
|
|
170
|
-
repo: sympy/sympy
|
|
171
|
-
base_commit: "abc123"
|
|
172
|
-
workspace:
|
|
173
|
-
setup:
|
|
174
|
-
command: ["python", "custom-setup.py"] # overrides suite-level
|
|
175
|
-
```
|
|
176
|
-
|
|
177
|
-
**Lifecycle:** template copy → setup → git baseline → agent → file changes → teardown → cleanup
|
|
178
|
-
**Merge:** Case-level fields replace suite-level fields.
|
|
179
|
-
**Commands receive stdin JSON:** `{workspace_path, test_id, eval_run_id, case_input, case_metadata}`
|
|
180
|
-
**Setup failure:** aborts case. **Teardown failure:** non-fatal (warning).
|
|
181
|
-
See https://agentv.dev/targets/configuration/#workspace-setupteardown
|
|
182
|
-
|
|
183
|
-
## Evaluator Types
|
|
184
|
-
|
|
185
|
-
Configure via `assert` array. Multiple evaluators produce a weighted average score.
|
|
186
|
-
|
|
187
|
-
### code_judge
|
|
188
|
-
```yaml
|
|
189
|
-
- name: format_check
|
|
190
|
-
type: code_judge
|
|
191
|
-
command: [uv, run, validate.py]
|
|
192
|
-
cwd: ./scripts # optional working directory
|
|
193
|
-
target: {} # optional: enable LLM target proxy (max_calls: 50)
|
|
194
|
-
```
|
|
195
|
-
Contract: stdin JSON -> stdout JSON `{score, hits, misses, reasoning}`
|
|
196
|
-
Input includes: `question`, `criteria`, `answer`, `reference_answer`, `output`, `trace`, `token_usage`, `cost_usd`, `duration_ms`, `start_time`, `end_time`, `file_changes`, `workspace_path`, `config`
|
|
197
|
-
When `workspace_template` is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace).
|
|
198
|
-
See docs at https://agentv.dev/evaluators/code-judges/
|
|
199
|
-
|
|
200
|
-
### llm_judge
|
|
201
|
-
```yaml
|
|
202
|
-
- name: quality
|
|
203
|
-
type: llm_judge
|
|
204
|
-
prompt: ./prompts/eval.md # markdown template or command config
|
|
205
|
-
model: gpt-5-chat # optional model override
|
|
206
|
-
config: # passed to prompt templates as context.config
|
|
207
|
-
strictness: high
|
|
208
|
-
```
|
|
209
|
-
Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, `{{input}}`, `{{expected_output}}`, `{{output}}`, `{{file_changes}}`
|
|
210
|
-
- Markdown templates: use `{{variable}}` syntax
|
|
211
|
-
- TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config`
|
|
212
|
-
|
|
213
|
-
### composite
|
|
214
|
-
```yaml
|
|
215
|
-
- name: gate
|
|
216
|
-
type: composite
|
|
217
|
-
assert:
|
|
218
|
-
- name: safety
|
|
219
|
-
type: llm_judge
|
|
220
|
-
prompt: ./safety.md
|
|
221
|
-
- name: quality
|
|
222
|
-
type: llm_judge
|
|
223
|
-
aggregator:
|
|
224
|
-
type: weighted_average
|
|
225
|
-
weights: { safety: 0.3, quality: 0.7 }
|
|
226
|
-
```
|
|
227
|
-
Aggregator types: `weighted_average`, `all_or_nothing`, `minimum`, `maximum`, `safety_gate`
|
|
228
|
-
- `safety_gate`: fails immediately if the named gate evaluator scores below threshold (default 1.0)
|
|
229
|
-
|
|
230
|
-
### tool_trajectory
|
|
231
|
-
```yaml
|
|
232
|
-
- name: tool_check
|
|
233
|
-
type: tool_trajectory
|
|
234
|
-
mode: any_order # any_order | in_order | exact
|
|
235
|
-
minimums: # for any_order
|
|
236
|
-
knowledgeSearch: 2
|
|
237
|
-
expected: # for in_order/exact
|
|
238
|
-
- tool: knowledgeSearch
|
|
239
|
-
args: { query: "search term" } # partial deep equality match
|
|
240
|
-
- tool: documentRetrieve
|
|
241
|
-
args: any # any arguments accepted
|
|
242
|
-
max_duration_ms: 5000 # per-tool latency assertion
|
|
243
|
-
- tool: summarize # omit args to skip argument checking
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
### field_accuracy
|
|
247
|
-
```yaml
|
|
248
|
-
- name: fields
|
|
249
|
-
type: field_accuracy
|
|
250
|
-
match_type: exact # exact | date | numeric_tolerance
|
|
251
|
-
numeric_tolerance: 0.01 # for numeric_tolerance match_type
|
|
252
|
-
aggregation: weighted_average # weighted_average | all_or_nothing
|
|
253
|
-
```
|
|
254
|
-
Compares `output` fields against `expected_output` fields.
|
|
255
|
-
|
|
256
|
-
### latency
|
|
257
|
-
```yaml
|
|
258
|
-
- name: speed
|
|
259
|
-
type: latency
|
|
260
|
-
max_ms: 5000
|
|
261
|
-
```
|
|
262
|
-
|
|
263
|
-
### cost
|
|
264
|
-
```yaml
|
|
265
|
-
- name: budget
|
|
266
|
-
type: cost
|
|
267
|
-
max_usd: 0.10
|
|
268
|
-
```
|
|
269
|
-
|
|
270
|
-
### token_usage
|
|
271
|
-
```yaml
|
|
272
|
-
- name: tokens
|
|
273
|
-
type: token_usage
|
|
274
|
-
max_total_tokens: 4000
|
|
275
|
-
```
|
|
276
|
-
|
|
277
|
-
### execution_metrics
|
|
278
|
-
```yaml
|
|
279
|
-
- name: efficiency
|
|
280
|
-
type: execution_metrics
|
|
281
|
-
max_tool_calls: 10 # Maximum tool invocations
|
|
282
|
-
max_llm_calls: 5 # Maximum LLM calls (assistant messages)
|
|
283
|
-
max_tokens: 5000 # Maximum total tokens (input + output)
|
|
284
|
-
max_cost_usd: 0.05 # Maximum cost in USD
|
|
285
|
-
max_duration_ms: 30000 # Maximum execution duration
|
|
286
|
-
target_exploration_ratio: 0.6 # Target ratio of read-only tool calls
|
|
287
|
-
exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2)
|
|
288
|
-
```
|
|
289
|
-
Declarative threshold-based checks on execution metrics. Only specified thresholds are checked.
|
|
290
|
-
Score is proportional: `hits / (hits + misses)`. Missing data counts as a miss.
|
|
291
|
-
|
|
292
|
-
### contains
|
|
293
|
-
```yaml
|
|
294
|
-
- type: contains
|
|
295
|
-
value: "DENIED"
|
|
296
|
-
required: true
|
|
297
|
-
```
|
|
298
|
-
Binary check: does output contain the substring? Name auto-generated if omitted.
|
|
299
|
-
|
|
300
|
-
### regex
|
|
301
|
-
```yaml
|
|
302
|
-
- type: regex
|
|
303
|
-
value: "\\d{3}-\\d{2}-\\d{4}"
|
|
304
|
-
```
|
|
305
|
-
Binary check: does output match the regex pattern?
|
|
306
|
-
|
|
307
|
-
### equals
|
|
308
|
-
```yaml
|
|
309
|
-
- type: equals
|
|
310
|
-
value: "42"
|
|
311
|
-
```
|
|
312
|
-
Binary check: does output exactly equal the value (both trimmed)?
|
|
313
|
-
|
|
314
|
-
### is_json
|
|
315
|
-
```yaml
|
|
316
|
-
- type: is_json
|
|
317
|
-
required: true
|
|
318
|
-
```
|
|
319
|
-
Binary check: is the output valid JSON?
|
|
320
|
-
|
|
321
|
-
### rubrics
|
|
322
|
-
```yaml
|
|
323
|
-
- type: rubrics
|
|
324
|
-
criteria:
|
|
325
|
-
- id: accuracy
|
|
326
|
-
outcome: Correctly identifies the denied party
|
|
327
|
-
weight: 5.0
|
|
328
|
-
- id: reasoning
|
|
329
|
-
outcome: Provides clear reasoning
|
|
330
|
-
weight: 3.0
|
|
331
|
-
```
|
|
332
|
-
LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields.
|
|
333
|
-
|
|
334
|
-
### rubrics (inline, deprecated)
|
|
335
|
-
Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assert` instead.
|
|
336
|
-
See `references/rubric-evaluator.md` for score-range mode and scoring formula.
|
|
337
|
-
|
|
338
|
-
## CLI Commands
|
|
339
|
-
|
|
340
|
-
```bash
|
|
341
|
-
# Run evaluation (requires API keys)
|
|
342
|
-
agentv eval <file.yaml> [--test-id <id>] [--target <name>] [--dry-run]
|
|
343
|
-
|
|
344
|
-
# Run with trace file (human-readable JSONL)
|
|
345
|
-
agentv eval <file.yaml> --trace-file traces/eval.jsonl
|
|
346
|
-
|
|
347
|
-
# Run with OTLP JSON file (importable by OTel backends)
|
|
348
|
-
agentv eval <file.yaml> --otel-file traces/eval.otlp.json
|
|
349
|
-
|
|
350
|
-
# Agent-orchestrated evals (no API keys needed)
|
|
351
|
-
agentv prompt eval <file.yaml> # orchestration overview
|
|
352
|
-
agentv prompt eval input <file.yaml> --test-id <id> # task input JSON (file paths, not embedded content)
|
|
353
|
-
agentv prompt eval judge <file.yaml> --test-id <id> --answer-file f # judge prompts / code judge results
|
|
354
|
-
|
|
355
|
-
# Validate eval file
|
|
356
|
-
agentv validate <file.yaml>
|
|
357
|
-
|
|
358
|
-
# Compare results — N-way matrix from combined JSONL
|
|
359
|
-
agentv compare <combined-results.jsonl>
|
|
360
|
-
agentv compare <combined-results.jsonl> --baseline <target> # CI regression gate
|
|
361
|
-
agentv compare <combined-results.jsonl> --baseline <target> --candidate <target> # pairwise
|
|
362
|
-
agentv compare <results1.jsonl> <results2.jsonl> # two-file pairwise
|
|
363
|
-
|
|
364
|
-
# Generate rubrics from criteria
|
|
365
|
-
agentv generate rubrics <file.yaml> [--target <name>]
|
|
366
|
-
```
|
|
367
|
-
|
|
368
|
-
## Code Judge SDK
|
|
369
|
-
|
|
370
|
-
Use `@agentv/eval` to build custom evaluators in TypeScript/JavaScript:
|
|
371
|
-
|
|
372
|
-
### defineAssertion (recommended for custom checks)
|
|
373
|
-
```typescript
|
|
374
|
-
#!/usr/bin/env bun
|
|
375
|
-
import { defineAssertion } from '@agentv/eval';
|
|
376
|
-
|
|
377
|
-
export default defineAssertion(({ answer, trace }) => ({
|
|
378
|
-
pass: answer.length > 0 && (trace?.eventCount ?? 0) <= 10,
|
|
379
|
-
reasoning: 'Checks content exists and is efficient',
|
|
380
|
-
}));
|
|
381
|
-
```
|
|
382
|
-
|
|
383
|
-
Assertions support both `pass: boolean` and `score: number` (0-1). If only `pass` is given, score is 1 (pass) or 0 (fail).
|
|
384
|
-
|
|
385
|
-
### defineCodeJudge (full control)
|
|
386
|
-
```typescript
|
|
387
|
-
#!/usr/bin/env bun
|
|
388
|
-
import { defineCodeJudge } from '@agentv/eval';
|
|
389
|
-
|
|
390
|
-
export default defineCodeJudge(({ trace, answer }) => ({
|
|
391
|
-
score: (trace?.eventCount ?? 0) <= 5 ? 1.0 : 0.5,
|
|
392
|
-
hits: ['Efficient tool usage'],
|
|
393
|
-
misses: [],
|
|
394
|
-
}));
|
|
395
|
-
```
|
|
396
|
-
|
|
397
|
-
Both are used via `type: code_judge` in YAML with `command: [bun, run, judge.ts]`.
|
|
398
|
-
|
|
399
|
-
### Convention-Based Discovery
|
|
400
|
-
|
|
401
|
-
Place assertion files in `.agentv/assertions/` — they auto-register by filename:
|
|
402
|
-
|
|
403
|
-
```
|
|
404
|
-
.agentv/assertions/word-count.ts → type: word-count
|
|
405
|
-
.agentv/assertions/sentiment.ts → type: sentiment
|
|
406
|
-
```
|
|
407
|
-
|
|
408
|
-
No `command:` needed in YAML — just use `type: <filename>`.
|
|
409
|
-
|
|
410
|
-
## Programmatic API
|
|
411
|
-
|
|
412
|
-
Use `evaluate()` from `@agentv/core` to run evals as a library:
|
|
413
|
-
|
|
414
|
-
```typescript
|
|
415
|
-
import { evaluate } from '@agentv/core';
|
|
416
|
-
|
|
417
|
-
const { results, summary } = await evaluate({
|
|
418
|
-
tests: [
|
|
419
|
-
{
|
|
420
|
-
id: 'greeting',
|
|
421
|
-
input: 'Say hello',
|
|
422
|
-
assert: [{ type: 'contains', value: 'hello' }],
|
|
423
|
-
},
|
|
424
|
-
],
|
|
425
|
-
target: { provider: 'mock_agent' },
|
|
426
|
-
});
|
|
427
|
-
console.log(`${summary.passed}/${summary.total} passed`);
|
|
428
|
-
```
|
|
429
|
-
|
|
430
|
-
Supports inline tests (no YAML) or file-based via `specFile`.
|
|
431
|
-
|
|
432
|
-
## defineConfig
|
|
433
|
-
|
|
434
|
-
Type-safe project configuration in `agentv.config.ts`:
|
|
435
|
-
|
|
436
|
-
```typescript
|
|
437
|
-
import { defineConfig } from '@agentv/core';
|
|
438
|
-
|
|
439
|
-
export default defineConfig({
|
|
440
|
-
execution: { workers: 5, maxRetries: 2 },
|
|
441
|
-
output: { format: 'jsonl', dir: './results' },
|
|
442
|
-
limits: { maxCostUsd: 10.0 },
|
|
443
|
-
});
|
|
444
|
-
```
|
|
445
|
-
|
|
446
|
-
Auto-discovered from project root. Validated with Zod.
|
|
447
|
-
|
|
448
|
-
## Scaffold Commands
|
|
449
|
-
|
|
450
|
-
```bash
|
|
451
|
-
agentv create assertion <name> # → .agentv/assertions/<name>.ts
|
|
452
|
-
agentv create eval <name> # → evals/<name>.eval.yaml + .cases.jsonl
|
|
453
|
-
```
|
|
454
|
-
|
|
455
|
-
## Schemas
|
|
456
|
-
|
|
457
|
-
- Eval file: `references/eval-schema.json`
|
|
458
|
-
- Config: `references/config-schema.json`
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"title": "AgentV Config Schema",
|
|
4
|
-
"description": "Schema for .agentv/config.yaml configuration files",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"properties": {
|
|
7
|
-
"$schema": {
|
|
8
|
-
"type": "string",
|
|
9
|
-
"description": "Schema identifier",
|
|
10
|
-
"enum": ["agentv-config-v2"]
|
|
11
|
-
},
|
|
12
|
-
"guideline_patterns": {
|
|
13
|
-
"type": "array",
|
|
14
|
-
"description": "Glob patterns for identifying guideline files (instructions, prompts). Files matching these patterns are treated as guidelines, while non-matching files are treated as regular file content.",
|
|
15
|
-
"items": {
|
|
16
|
-
"type": "string",
|
|
17
|
-
"description": "Glob pattern (e.g., '**/*.instructions.md', '**/prompts/**')"
|
|
18
|
-
},
|
|
19
|
-
"examples": [
|
|
20
|
-
["**/*.instructions.md", "**/instructions/**", "**/*.prompt.md", "**/prompts/**"],
|
|
21
|
-
["**/*.guide.md", "**/guidelines/**", "docs/AGENTS.md"]
|
|
22
|
-
]
|
|
23
|
-
},
|
|
24
|
-
"eval_patterns": {
|
|
25
|
-
"type": "array",
|
|
26
|
-
"description": "Glob patterns for discovering eval files during interactive mode (`agentv eval` with no args). Defaults to ['**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml'] if not specified.",
|
|
27
|
-
"items": {
|
|
28
|
-
"type": "string",
|
|
29
|
-
"description": "Glob pattern (e.g., '**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml')"
|
|
30
|
-
},
|
|
31
|
-
"examples": [["**/evals/**/dataset*.yaml", "**/evals/**/eval.yaml"], ["**/evals/**/*.yaml"]]
|
|
32
|
-
}
|
|
33
|
-
},
|
|
34
|
-
"required": ["$schema"],
|
|
35
|
-
"additionalProperties": false
|
|
36
|
-
}
|
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
# Custom Evaluators
|
|
2
|
-
|
|
3
|
-
## Wire Format
|
|
4
|
-
|
|
5
|
-
### Input (stdin JSON)
|
|
6
|
-
|
|
7
|
-
```json
|
|
8
|
-
{
|
|
9
|
-
"question": "string",
|
|
10
|
-
"criteria": "string",
|
|
11
|
-
"reference_answer": "string",
|
|
12
|
-
"answer": "string",
|
|
13
|
-
"guideline_files": ["path"],
|
|
14
|
-
"input_files": ["path"],
|
|
15
|
-
"input": [{"role": "user", "content": "..."}],
|
|
16
|
-
"expected_output": [{"role": "assistant", "content": "..."}],
|
|
17
|
-
"output": [{"role": "assistant", "content": "..."}],
|
|
18
|
-
"trace": {
|
|
19
|
-
"event_count": 5,
|
|
20
|
-
"tool_names": ["fetch"],
|
|
21
|
-
"tool_calls_by_name": {"fetch": 1},
|
|
22
|
-
"error_count": 0,
|
|
23
|
-
"llm_call_count": 2
|
|
24
|
-
},
|
|
25
|
-
"token_usage": {"input": 1000, "output": 500},
|
|
26
|
-
"cost_usd": 0.0015,
|
|
27
|
-
"duration_ms": 3500,
|
|
28
|
-
"start_time": "2026-02-13T10:00:00.000Z",
|
|
29
|
-
"end_time": "2026-02-13T10:00:03.500Z"
|
|
30
|
-
}
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
### Output (stdout JSON)
|
|
34
|
-
|
|
35
|
-
```json
|
|
36
|
-
{
|
|
37
|
-
"score": 0.85,
|
|
38
|
-
"hits": ["passed check"],
|
|
39
|
-
"misses": ["failed check"],
|
|
40
|
-
"reasoning": "explanation"
|
|
41
|
-
}
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
`score` (0.0-1.0) required. `hits`, `misses`, `reasoning` optional.
|
|
45
|
-
|
|
46
|
-
## SDK Functions
|
|
47
|
-
|
|
48
|
-
```typescript
|
|
49
|
-
import { defineCodeJudge, createTargetClient, definePromptTemplate } from '@agentv/eval';
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
- `defineCodeJudge(fn)` - Wraps evaluation function with stdin/stdout handling
|
|
53
|
-
- `createTargetClient()` - Returns LLM proxy client (when `target: {}` configured)
|
|
54
|
-
- `.invoke({question, systemPrompt})` - Single LLM call
|
|
55
|
-
- `.invokeBatch(requests)` - Batch LLM calls
|
|
56
|
-
- `definePromptTemplate(fn)` - Wraps prompt generation function
|
|
57
|
-
- Context fields: `question`, `answer`, `referenceAnswer`, `criteria`, `expectedOutput`, `output`, `config`, `trace`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
|
|
58
|
-
|
|
59
|
-
## Python Example
|
|
60
|
-
|
|
61
|
-
```python
|
|
62
|
-
#!/usr/bin/env python3
|
|
63
|
-
import json, sys
|
|
64
|
-
|
|
65
|
-
def evaluate(data: dict) -> dict:
|
|
66
|
-
candidate = data.get("answer", "")
|
|
67
|
-
hits, misses = [], []
|
|
68
|
-
for kw in ["async", "await"]:
|
|
69
|
-
(hits if kw in candidate else misses).append(f"Keyword '{kw}'")
|
|
70
|
-
return {
|
|
71
|
-
"score": len(hits) / max(len(hits) + len(misses), 1),
|
|
72
|
-
"hits": hits, "misses": misses
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
if __name__ == "__main__":
|
|
76
|
-
try:
|
|
77
|
-
print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
|
|
78
|
-
except Exception as e:
|
|
79
|
-
print(json.dumps({"score": 0, "misses": [str(e)]}))
|
|
80
|
-
sys.exit(1)
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
## TypeScript Example
|
|
84
|
-
|
|
85
|
-
```typescript
|
|
86
|
-
#!/usr/bin/env bun
|
|
87
|
-
import { defineCodeJudge } from '@agentv/eval';
|
|
88
|
-
|
|
89
|
-
export default defineCodeJudge(({ answer, criteria }) => {
|
|
90
|
-
const hits: string[] = [];
|
|
91
|
-
const misses: string[] = [];
|
|
92
|
-
if (answer.includes(criteria)) {
|
|
93
|
-
hits.push('Matches expected outcome');
|
|
94
|
-
} else {
|
|
95
|
-
misses.push('Does not match expected outcome');
|
|
96
|
-
}
|
|
97
|
-
return {
|
|
98
|
-
score: hits.length / Math.max(hits.length + misses.length, 1),
|
|
99
|
-
hits, misses,
|
|
100
|
-
};
|
|
101
|
-
});
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Template Variables
|
|
105
|
-
|
|
106
|
-
Derived from test fields (users never author these directly):
|
|
107
|
-
|
|
108
|
-
| Variable | Source |
|
|
109
|
-
|----------|--------|
|
|
110
|
-
| `question` | First user message in `input` |
|
|
111
|
-
| `criteria` | Test `criteria` field |
|
|
112
|
-
| `reference_answer` | Last entry in `expected_output` |
|
|
113
|
-
| `answer` | Last entry in `output` (runtime) |
|
|
114
|
-
| `input` | Full resolved input array (JSON) |
|
|
115
|
-
| `expected_output` | Full resolved expected array (JSON) |
|
|
116
|
-
| `output` | Full provider output array (JSON) |
|
|
117
|
-
|
|
118
|
-
Markdown templates use `{{variable}}` syntax. TypeScript templates receive context object.
|