agentv 2.6.0 → 2.7.1-next.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +128 -33
  2. package/dist/chunk-3L2L5GIL.js +51 -0
  3. package/dist/{chunk-BKMQNEUD.js.map → chunk-3L2L5GIL.js.map} +1 -1
  4. package/dist/{chunk-BKMQNEUD.js → chunk-5H446C7X.js} +2 -45
  5. package/dist/{chunk-LJVS3JAK.js → chunk-BL4PVUAT.js} +6 -4
  6. package/dist/{chunk-LJVS3JAK.js.map → chunk-BL4PVUAT.js.map} +1 -1
  7. package/dist/{chunk-MGK6HHRR.js → chunk-BWLYFF5N.js} +9813 -11237
  8. package/dist/chunk-BWLYFF5N.js.map +1 -0
  9. package/dist/chunk-C5GOHBQM.js +84 -0
  10. package/dist/chunk-C5GOHBQM.js.map +1 -0
  11. package/dist/chunk-EJEG3DU2.js +5476 -0
  12. package/dist/chunk-EJEG3DU2.js.map +1 -0
  13. package/dist/chunk-FV32QHPB.js +565 -0
  14. package/dist/chunk-FV32QHPB.js.map +1 -0
  15. package/dist/chunk-H5FFZCKI.js +2957 -0
  16. package/dist/chunk-H5FFZCKI.js.map +1 -0
  17. package/dist/chunk-JK6V4KVD.js +114 -0
  18. package/dist/chunk-JK6V4KVD.js.map +1 -0
  19. package/dist/chunk-LRULMAAA.js +1711 -0
  20. package/dist/chunk-LRULMAAA.js.map +1 -0
  21. package/dist/chunk-SR4I5KET.js +1238 -0
  22. package/dist/chunk-SR4I5KET.js.map +1 -0
  23. package/dist/chunk-VQ2ZO7XJ.js +2098 -0
  24. package/dist/chunk-VQ2ZO7XJ.js.map +1 -0
  25. package/dist/chunk-XALGXSKB.js +21 -0
  26. package/dist/chunk-XALGXSKB.js.map +1 -0
  27. package/dist/cli.js +8 -2
  28. package/dist/cli.js.map +1 -1
  29. package/dist/dist-R3OCWGXH.js +257 -0
  30. package/dist/dist-R3OCWGXH.js.map +1 -0
  31. package/dist/esm-5Q4BZALM-5REQWAUV.js +924 -0
  32. package/dist/esm-5Q4BZALM-5REQWAUV.js.map +1 -0
  33. package/dist/esm-DX3WQKEN.js +32 -0
  34. package/dist/esm-DX3WQKEN.js.map +1 -0
  35. package/dist/esm-QNEMCJPL.js +933 -0
  36. package/dist/esm-QNEMCJPL.js.map +1 -0
  37. package/dist/esm-R77SNOF5.js +65 -0
  38. package/dist/esm-R77SNOF5.js.map +1 -0
  39. package/dist/esm-RVQPUGWH.js +1207 -0
  40. package/dist/esm-RVQPUGWH.js.map +1 -0
  41. package/dist/getMachineId-bsd-HSK5LZMG.js +41 -0
  42. package/dist/getMachineId-bsd-HSK5LZMG.js.map +1 -0
  43. package/dist/getMachineId-darwin-4DP6CCJV.js +41 -0
  44. package/dist/getMachineId-darwin-4DP6CCJV.js.map +1 -0
  45. package/dist/getMachineId-linux-44LJ5UJB.js +33 -0
  46. package/dist/getMachineId-linux-44LJ5UJB.js.map +1 -0
  47. package/dist/getMachineId-unsupported-NVK6IATM.js +24 -0
  48. package/dist/getMachineId-unsupported-NVK6IATM.js.map +1 -0
  49. package/dist/getMachineId-win-YZ36S7VA.js +43 -0
  50. package/dist/getMachineId-win-YZ36S7VA.js.map +1 -0
  51. package/dist/index.js +10 -2
  52. package/dist/interactive-33TCZXLF.js +333 -0
  53. package/dist/interactive-33TCZXLF.js.map +1 -0
  54. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js +9 -0
  55. package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js.map +1 -0
  56. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js +9 -0
  57. package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map +1 -0
  58. package/dist/src-2N5EJ2N6.js +1733 -0
  59. package/dist/src-2N5EJ2N6.js.map +1 -0
  60. package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +84 -0
  61. package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +144 -0
  62. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +67 -0
  63. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +101 -0
  64. package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +433 -0
  65. package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +36 -0
  66. package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +118 -0
  67. package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +251 -0
  68. package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +77 -0
  69. package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +50 -0
  70. package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +78 -0
  71. package/dist/templates/.agentv/.env.example +23 -23
  72. package/dist/templates/.agentv/config.yaml +15 -15
  73. package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +38 -13
  74. package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +9 -6
  75. package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +4 -4
  76. package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +7 -9
  77. package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
  78. package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
  79. package/dist/{token-D3IYDJQZ.js → token-POXF46NU.js} +6 -4
  80. package/dist/{token-D3IYDJQZ.js.map → token-POXF46NU.js.map} +1 -1
  81. package/dist/{token-util-FWFPR2BV.js → token-util-6GWYZWGE.js} +4 -3
  82. package/dist/token-util-6GWYZWGE.js.map +1 -0
  83. package/package.json +7 -3
  84. package/dist/chunk-MGK6HHRR.js.map +0 -1
  85. /package/dist/{token-util-FWFPR2BV.js.map → chunk-5H446C7X.js.map} +0 -0
@@ -0,0 +1,101 @@
1
+ # Example: Markdown Transcript Input
2
+
3
+ ## Input
4
+
5
+ ```
6
+ User: What are the SOLID principles in software engineering?
7
+ Assistant: The SOLID principles are five design principles for object-oriented programming:
8
+
9
+ 1. **Single Responsibility** — A class should have only one reason to change
10
+ 2. **Open/Closed** — Open for extension, closed for modification
11
+ 3. **Liskov Substitution** — Subtypes must be substitutable for their base types
12
+ 4. **Interface Segregation** — Prefer small, specific interfaces over large ones
13
+ 5. **Dependency Inversion** — Depend on abstractions, not concretions
14
+
15
+ User: Can you give me an example of the Single Responsibility Principle?
16
+ Assistant: Sure! Instead of a class that handles both user authentication and email sending, split them:
17
+
18
+ Bad: `UserService` handles login, registration, AND sends welcome emails.
19
+ Good: `AuthService` handles login/registration. `EmailService` handles sending emails.
20
+
21
+ Each class has one job, making them easier to test and maintain.
22
+
23
+ User: What's the difference between an interface and an abstract class?
24
+ Assistant: Key differences:
25
+
26
+ - **Interface**: Defines a contract (method signatures only). A class can implement multiple interfaces. No state.
27
+ - **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state.
28
+
29
+ Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class).
30
+ ```
31
+
32
+ ## Output
33
+
34
+ ```yaml
35
+ description: "Software engineering concepts — SOLID principles and OOP"
36
+
37
+ tests:
38
+ - id: solid-principles-overview
39
+ criteria: "List and briefly explain all five SOLID principles"
40
+ input: "What are the SOLID principles in software engineering?"
41
+ expected_output: |
42
+ The SOLID principles are five design principles for object-oriented programming:
43
+
44
+ 1. **Single Responsibility** — A class should have only one reason to change
45
+ 2. **Open/Closed** — Open for extension, closed for modification
46
+ 3. **Liskov Substitution** — Subtypes must be substitutable for their base types
47
+ 4. **Interface Segregation** — Prefer small, specific interfaces over large ones
48
+ 5. **Dependency Inversion** — Depend on abstractions, not concretions
49
+ assert:
50
+ - type: rubrics
51
+ criteria:
52
+ - Lists all five SOLID principles by name
53
+ - Provides a concise explanation for each principle
54
+ - Explanations are technically accurate
55
+
56
+ - id: srp-example
57
+ criteria: "Provide a concrete example of the Single Responsibility Principle"
58
+ input:
59
+ - role: user
60
+ content: "What are the SOLID principles in software engineering?"
61
+ - role: assistant
62
+ content: "The SOLID principles are five design principles..."
63
+ - role: user
64
+ content: "Can you give me an example of the Single Responsibility Principle?"
65
+ expected_output: |
66
+ Sure! Instead of a class that handles both user authentication and email sending, split them:
67
+
68
+ Bad: `UserService` handles login, registration, AND sends welcome emails.
69
+ Good: `AuthService` handles login/registration. `EmailService` handles sending emails.
70
+
71
+ Each class has one job, making them easier to test and maintain.
72
+ assert:
73
+ - type: rubrics
74
+ criteria:
75
+ - Shows a bad example that violates SRP
76
+ - Shows a good example that follows SRP
77
+ - Explains why the separation is beneficial
78
+
79
+ - id: interface-vs-abstract-class
80
+ criteria: "Explain the difference between interfaces and abstract classes"
81
+ input: "What's the difference between an interface and an abstract class?"
82
+ expected_output: |
83
+ Key differences:
84
+
85
+ - **Interface**: Defines a contract (method signatures only). A class can implement multiple interfaces. No state.
86
+ - **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state.
87
+
88
+ Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class).
89
+ assert:
90
+ - type: rubrics
91
+ criteria:
92
+ - Correctly distinguishes interfaces from abstract classes
93
+ - Mentions multiple inheritance support for interfaces
94
+ - Provides guidance on when to use each
95
+
96
+ # Suggested additional evaluators:
97
+ # assert:
98
+ # - name: quality
99
+ # type: llm_judge
100
+ # prompt: ./prompts/quality.md
101
+ ```
@@ -0,0 +1,433 @@
1
+ ---
2
+ name: agentv-eval-builder
3
+ description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding tests, or configuring evaluators.
4
+ ---
5
+
6
+ # AgentV Eval Builder
7
+
8
+ Comprehensive docs: https://agentv.dev
9
+
10
+ ## Quick Start
11
+
12
+ ```yaml
13
+ description: Example eval
14
+ execution:
15
+ target: default
16
+
17
+ tests:
18
+ - id: greeting
19
+ criteria: Friendly greeting
20
+ input: "Say hello"
21
+ expected_output: "Hello! How can I help you?"
22
+ assert:
23
+ - type: rubrics
24
+ criteria:
25
+ - Greeting is friendly and warm
26
+ - Offers to help
27
+ ```
28
+
29
+ ## Eval File Structure
30
+
31
+ **Required:** `tests` (array or string path)
32
+ **Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `dataset`, `workspace`, `assert`
33
+
34
+ **Test fields:**
35
+
36
+ | Field | Required | Description |
37
+ |-------|----------|-------------|
38
+ | `id` | yes | Unique identifier |
39
+ | `criteria` | yes | What the response should accomplish |
40
+ | `input` | yes | Input to the agent |
41
+ | `expected_output` | no | Gold-standard reference answer |
42
+ | `assert` | no | Evaluators: assertions, rubrics, judges |
43
+ | `rubrics` | no | **Deprecated** — use `assert: [{type: rubrics, criteria: [...]}]` instead |
44
+ | `execution` | no | Per-case execution overrides |
45
+ | `workspace` | no | Per-case workspace config (overrides suite-level) |
46
+ | `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts |
47
+ | `conversation_id` | no | Thread grouping |
48
+
49
+ **Shorthand aliases:**
50
+ - `input` (string) expands to `[{role: "user", content: "..."}]`
51
+ - `expected_output` (string/object) expands to `[{role: "assistant", content: ...}]`
52
+ - Canonical `input` / `expected_output` take precedence when both present
53
+
54
+ **Message format:** `{role, content}` where role is `system`, `user`, `assistant`, or `tool`
55
+ **Content types:** inline text, `{type: "file", value: "./path.md"}`
56
+ **File paths:** relative from eval file dir, or absolute with `/` prefix from repo root
57
+
58
+ **JSONL format:** One test per line as JSON. Optional `.yaml` sidecar for shared defaults. See `examples/features/basic-jsonl/`.
59
+
60
+ ## Metadata
61
+
62
+ When `name` is present, the suite is parsed as a metadata-bearing eval:
63
+
64
+ ```yaml
65
+ name: export-screening # required, lowercase/hyphens, max 64 chars
66
+ description: Evaluates export control screening accuracy
67
+ version: "1.0"
68
+ author: acme-compliance
69
+ tags: [compliance, agents]
70
+ license: Apache-2.0
71
+ requires:
72
+ agentv: ">=0.30.0"
73
+ ```
74
+
75
+ ## Tests as String Path
76
+
77
+ Point `tests` to an external file instead of inlining:
78
+
79
+ ```yaml
80
+ name: my-eval
81
+ description: My evaluation suite
82
+ tests: ./cases.yaml # relative to eval file dir
83
+ ```
84
+
85
+ The external file can be YAML (array of test objects) or JSONL.
86
+
87
+ ## Assert Field
88
+
89
+ `assert` defines evaluators at the suite level or per-test level. It is the canonical field for all evaluators (replaces `execution.evaluators`):
90
+
91
+ ```yaml
92
+ # Suite-level (appended to every test)
93
+ assert:
94
+ - type: is_json
95
+ required: true
96
+ - type: contains
97
+ value: "status"
98
+
99
+ tests:
100
+ - id: test-1
101
+ criteria: Returns JSON
102
+ input: Get status
103
+ # Per-test assert (runs before suite-level)
104
+ assert:
105
+ - type: equals
106
+ value: '{"status": "ok"}'
107
+ ```
108
+
109
+ `execution.evaluators` is deprecated. When both `assert` and `execution.evaluators` are present, `assert` takes precedence.
110
+
111
+ ## Required Gates
112
+
113
+ Any evaluator can be marked `required` to enforce a minimum score:
114
+
115
+ ```yaml
116
+ assert:
117
+ - type: contains
118
+ value: "DENIED"
119
+ required: true # must score >= 0.8 (default)
120
+ - type: rubrics
121
+ required: 0.6 # must score >= 0.6 (custom threshold)
122
+ criteria:
123
+ - id: accuracy
124
+ outcome: Identifies the denied party
125
+ weight: 5.0
126
+ ```
127
+
128
+ If a required evaluator scores below its threshold, the overall verdict is forced to `fail`.
129
+
130
+ ## Workspace Setup/Teardown
131
+
132
+ Run scripts before/after each test. Define at suite level or override per case:
133
+
134
+ ```yaml
135
+ workspace:
136
+ template: ./workspace-templates/my-project
137
+ setup:
138
+ script: ["bun", "run", "setup.ts"]
139
+ timeout_ms: 120000
140
+ teardown:
141
+ script: ["bun", "run", "teardown.ts"]
142
+
143
+ tests:
144
+ - id: case-1
145
+ input: Fix the bug
146
+ criteria: Bug is fixed
147
+ metadata:
148
+ repo: sympy/sympy
149
+ base_commit: "abc123"
150
+ workspace:
151
+ setup:
152
+ script: ["python", "custom-setup.py"] # overrides suite-level
153
+ ```
154
+
155
+ **Lifecycle:** template copy → setup → git baseline → agent → file changes → teardown → cleanup
156
+ **Merge:** Case-level fields replace suite-level fields.
157
+ **Scripts receive stdin JSON:** `{workspace_path, test_id, eval_run_id, case_input, case_metadata}`
158
+ **Setup failure:** aborts case. **Teardown failure:** non-fatal (warning).
159
+ See https://agentv.dev/targets/configuration/#workspace-setupteardown
160
+
161
+ ## Evaluator Types
162
+
163
+ Configure via `assert` array. Multiple evaluators produce a weighted average score.
164
+
165
+ ### code_judge
166
+ ```yaml
167
+ - name: format_check
168
+ type: code_judge
169
+ script: uv run validate.py
170
+ cwd: ./scripts # optional working directory
171
+ target: {} # optional: enable LLM target proxy (max_calls: 50)
172
+ ```
173
+ Contract: stdin JSON -> stdout JSON `{score, hits, misses, reasoning}`
174
+ Input includes: `question`, `criteria`, `answer`, `reference_answer`, `output`, `trace`, `file_changes`, `workspace_path`, `config`
175
+ When `workspace_template` is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace).
176
+ See docs at https://agentv.dev/evaluators/code-judges/
177
+
178
+ ### llm_judge
179
+ ```yaml
180
+ - name: quality
181
+ type: llm_judge
182
+ prompt: ./prompts/eval.md # markdown template or script config
183
+ model: gpt-5-chat # optional model override
184
+ config: # passed to script templates as context.config
185
+ strictness: high
186
+ ```
187
+ Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, `{{input}}`, `{{expected_output}}`, `{{output}}`, `{{file_changes}}`
188
+ - Markdown templates: use `{{variable}}` syntax
189
+ - TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config`
190
+
191
+ ### composite
192
+ ```yaml
193
+ - name: gate
194
+ type: composite
195
+ assert:
196
+ - name: safety
197
+ type: llm_judge
198
+ prompt: ./safety.md
199
+ - name: quality
200
+ type: llm_judge
201
+ aggregator:
202
+ type: weighted_average
203
+ weights: { safety: 0.3, quality: 0.7 }
204
+ ```
205
+ Aggregator types: `weighted_average`, `all_or_nothing`, `minimum`, `maximum`, `safety_gate`
206
+ - `safety_gate`: fails immediately if the named gate evaluator scores below threshold (default 1.0)
207
+
208
+ ### tool_trajectory
209
+ ```yaml
210
+ - name: tool_check
211
+ type: tool_trajectory
212
+ mode: any_order # any_order | in_order | exact
213
+ minimums: # for any_order
214
+ knowledgeSearch: 2
215
+ expected: # for in_order/exact
216
+ - tool: knowledgeSearch
217
+ args: { query: "search term" } # partial deep equality match
218
+ - tool: documentRetrieve
219
+ args: any # any arguments accepted
220
+ max_duration_ms: 5000 # per-tool latency assertion
221
+ - tool: summarize # omit args to skip argument checking
222
+ ```
223
+
224
+ ### field_accuracy
225
+ ```yaml
226
+ - name: fields
227
+ type: field_accuracy
228
+ match_type: exact # exact | date | numeric_tolerance
229
+ numeric_tolerance: 0.01 # for numeric_tolerance match_type
230
+ aggregation: weighted_average # weighted_average | all_or_nothing
231
+ ```
232
+ Compares `output` fields against `expected_output` fields.
233
+
234
+ ### latency
235
+ ```yaml
236
+ - name: speed
237
+ type: latency
238
+ max_ms: 5000
239
+ ```
240
+
241
+ ### cost
242
+ ```yaml
243
+ - name: budget
244
+ type: cost
245
+ max_usd: 0.10
246
+ ```
247
+
248
+ ### token_usage
249
+ ```yaml
250
+ - name: tokens
251
+ type: token_usage
252
+ max_total_tokens: 4000
253
+ ```
254
+
255
+ ### execution_metrics
256
+ ```yaml
257
+ - name: efficiency
258
+ type: execution_metrics
259
+ max_tool_calls: 10 # Maximum tool invocations
260
+ max_llm_calls: 5 # Maximum LLM calls (assistant messages)
261
+ max_tokens: 5000 # Maximum total tokens (input + output)
262
+ max_cost_usd: 0.05 # Maximum cost in USD
263
+ max_duration_ms: 30000 # Maximum execution duration
264
+ target_exploration_ratio: 0.6 # Target ratio of read-only tool calls
265
+ exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2)
266
+ ```
267
+ Declarative threshold-based checks on execution metrics. Only specified thresholds are checked.
268
+ Score is proportional: `hits / (hits + misses)`. Missing data counts as a miss.
269
+
270
+ ### contains
271
+ ```yaml
272
+ - type: contains
273
+ value: "DENIED"
274
+ required: true
275
+ ```
276
+ Binary check: does output contain the substring? Name auto-generated if omitted.
277
+
278
+ ### regex
279
+ ```yaml
280
+ - type: regex
281
+ value: "\\d{3}-\\d{2}-\\d{4}"
282
+ ```
283
+ Binary check: does output match the regex pattern?
284
+
285
+ ### equals
286
+ ```yaml
287
+ - type: equals
288
+ value: "42"
289
+ ```
290
+ Binary check: does output exactly equal the value (both trimmed)?
291
+
292
+ ### is_json
293
+ ```yaml
294
+ - type: is_json
295
+ required: true
296
+ ```
297
+ Binary check: is the output valid JSON?
298
+
299
+ ### rubrics
300
+ ```yaml
301
+ - type: rubrics
302
+ criteria:
303
+ - id: accuracy
304
+ outcome: Correctly identifies the denied party
305
+ weight: 5.0
306
+ - id: reasoning
307
+ outcome: Provides clear reasoning
308
+ weight: 3.0
309
+ ```
310
+ LLM-judged structured evaluation with weighted criteria. Criteria items support `id`, `outcome`, `weight`, and `required` fields.
311
+
312
+ ### rubrics (inline, deprecated)
313
+ Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assert` instead.
314
+ See `references/rubric-evaluator.md` for score-range mode and scoring formula.
315
+
316
+ ## CLI Commands
317
+
318
+ ```bash
319
+ # Run evaluation (requires API keys)
320
+ agentv eval <file.yaml> [--test-id <id>] [--target <name>] [--dry-run]
321
+
322
+ # Run with trace file (human-readable JSONL)
323
+ agentv eval <file.yaml> --trace-file traces/eval.jsonl
324
+
325
+ # Run with OTLP JSON file (importable by OTel backends)
326
+ agentv eval <file.yaml> --otel-file traces/eval.otlp.json
327
+
328
+ # Agent-orchestrated evals (no API keys needed)
329
+ agentv prompt eval <file.yaml> # orchestration overview
330
+ agentv prompt eval input <file.yaml> --test-id <id> # task input JSON (file paths, not embedded content)
331
+ agentv prompt eval judge <file.yaml> --test-id <id> --answer-file f # judge prompts / code judge results
332
+
333
+ # Validate eval file
334
+ agentv validate <file.yaml>
335
+
336
+ # Compare results between runs
337
+ agentv compare <results1.jsonl> <results2.jsonl>
338
+
339
+ # Generate rubrics from criteria
340
+ agentv generate rubrics <file.yaml> [--target <name>]
341
+ ```
342
+
343
+ ## Code Judge SDK
344
+
345
+ Use `@agentv/eval` to build custom evaluators in TypeScript/JavaScript:
346
+
347
+ ### defineAssertion (recommended for custom checks)
348
+ ```typescript
349
+ #!/usr/bin/env bun
350
+ import { defineAssertion } from '@agentv/eval';
351
+
352
+ export default defineAssertion(({ answer, trace }) => ({
353
+ pass: answer.length > 0 && (trace?.eventCount ?? 0) <= 10,
354
+ reasoning: 'Checks content exists and is efficient',
355
+ }));
356
+ ```
357
+
358
+ Assertions support both `pass: boolean` and `score: number` (0-1). If only `pass` is given, score is 1 (pass) or 0 (fail).
359
+
360
+ ### defineCodeJudge (full control)
361
+ ```typescript
362
+ #!/usr/bin/env bun
363
+ import { defineCodeJudge } from '@agentv/eval';
364
+
365
+ export default defineCodeJudge(({ trace, answer }) => ({
366
+ score: trace?.eventCount <= 5 ? 1.0 : 0.5,
367
+ hits: ['Efficient tool usage'],
368
+ misses: [],
369
+ }));
370
+ ```
371
+
372
+ Both are used via `type: code_judge` in YAML with `script: bun run judge.ts`.
373
+
374
+ ### Convention-Based Discovery
375
+
376
+ Place assertion files in `.agentv/assertions/` — they auto-register by filename:
377
+
378
+ ```
379
+ .agentv/assertions/word-count.ts → type: word-count
380
+ .agentv/assertions/sentiment.ts → type: sentiment
381
+ ```
382
+
383
+ No `script:` needed in YAML — just use `type: <filename>`.
384
+
385
+ ## Programmatic API
386
+
387
+ Use `evaluate()` from `@agentv/core` to run evals as a library:
388
+
389
+ ```typescript
390
+ import { evaluate } from '@agentv/core';
391
+
392
+ const { results, summary } = await evaluate({
393
+ tests: [
394
+ {
395
+ id: 'greeting',
396
+ input: 'Say hello',
397
+ assert: [{ type: 'contains', value: 'hello' }],
398
+ },
399
+ ],
400
+ target: { provider: 'mock_agent' },
401
+ });
402
+ console.log(`${summary.passed}/${summary.total} passed`);
403
+ ```
404
+
405
+ Supports inline tests (no YAML) or file-based via `specFile`.
406
+
407
+ ## defineConfig
408
+
409
+ Type-safe project configuration in `agentv.config.ts`:
410
+
411
+ ```typescript
412
+ import { defineConfig } from '@agentv/core';
413
+
414
+ export default defineConfig({
415
+ execution: { workers: 5, maxRetries: 2 },
416
+ output: { format: 'jsonl', dir: './results' },
417
+ limits: { maxCostUsd: 10.0 },
418
+ });
419
+ ```
420
+
421
+ Auto-discovered from project root. Validated with Zod.
422
+
423
+ ## Scaffold Commands
424
+
425
+ ```bash
426
+ agentv create assertion <name> # → .agentv/assertions/<name>.ts
427
+ agentv create eval <name> # → evals/<name>.eval.yaml + .cases.jsonl
428
+ ```
429
+
430
+ ## Schemas
431
+
432
+ - Eval file: `references/eval-schema.json`
433
+ - Config: `references/config-schema.json`
@@ -0,0 +1,36 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "title": "AgentV Config Schema",
4
+ "description": "Schema for .agentv/config.yaml configuration files",
5
+ "type": "object",
6
+ "properties": {
7
+ "$schema": {
8
+ "type": "string",
9
+ "description": "Schema identifier",
10
+ "enum": ["agentv-config-v2"]
11
+ },
12
+ "guideline_patterns": {
13
+ "type": "array",
14
+ "description": "Glob patterns for identifying guideline files (instructions, prompts). Files matching these patterns are treated as guidelines, while non-matching files are treated as regular file content.",
15
+ "items": {
16
+ "type": "string",
17
+ "description": "Glob pattern (e.g., '**/*.instructions.md', '**/prompts/**')"
18
+ },
19
+ "examples": [
20
+ ["**/*.instructions.md", "**/instructions/**", "**/*.prompt.md", "**/prompts/**"],
21
+ ["**/*.guide.md", "**/guidelines/**", "docs/AGENTS.md"]
22
+ ]
23
+ },
24
+ "eval_patterns": {
25
+ "type": "array",
26
+ "description": "Glob patterns for discovering eval files during interactive mode (`agentv eval` with no args). Defaults to ['**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml'] if not specified.",
27
+ "items": {
28
+ "type": "string",
29
+ "description": "Glob pattern (e.g., '**/evals/**/dataset*.yaml', '**/evals/**/eval.yaml')"
30
+ },
31
+ "examples": [["**/evals/**/dataset*.yaml", "**/evals/**/eval.yaml"], ["**/evals/**/*.yaml"]]
32
+ }
33
+ },
34
+ "required": ["$schema"],
35
+ "additionalProperties": false
36
+ }
@@ -0,0 +1,118 @@
1
+ # Custom Evaluators
2
+
3
+ ## Wire Format
4
+
5
+ ### Input (stdin JSON)
6
+
7
+ ```json
8
+ {
9
+ "question": "string",
10
+ "criteria": "string",
11
+ "reference_answer": "string",
12
+ "answer": "string",
13
+ "guideline_files": ["path"],
14
+ "input_files": ["path"],
15
+ "input": [{"role": "user", "content": "..."}],
16
+ "expected_output": [{"role": "assistant", "content": "..."}],
17
+ "output": [{"role": "assistant", "content": "..."}],
18
+ "trace": {
19
+ "event_count": 5,
20
+ "tool_names": ["fetch"],
21
+ "tool_calls_by_name": {"fetch": 1},
22
+ "error_count": 0,
23
+ "llm_call_count": 2,
24
+ "token_usage": {"input": 1000, "output": 500},
25
+ "cost_usd": 0.0015,
26
+ "duration_ms": 3500,
27
+ "start_time": "2026-02-13T10:00:00.000Z",
28
+ "end_time": "2026-02-13T10:00:03.500Z"
29
+ }
30
+ }
31
+ ```
32
+
33
+ ### Output (stdout JSON)
34
+
35
+ ```json
36
+ {
37
+ "score": 0.85,
38
+ "hits": ["passed check"],
39
+ "misses": ["failed check"],
40
+ "reasoning": "explanation"
41
+ }
42
+ ```
43
+
44
+ `score` (0.0-1.0) required. `hits`, `misses`, `reasoning` optional.
45
+
46
+ ## SDK Functions
47
+
48
+ ```typescript
49
+ import { defineCodeJudge, createTargetClient, definePromptTemplate } from '@agentv/eval';
50
+ ```
51
+
52
+ - `defineCodeJudge(fn)` - Wraps evaluation function with stdin/stdout handling
53
+ - `createTargetClient()` - Returns LLM proxy client (when `target: {}` configured)
54
+ - `.invoke({question, systemPrompt})` - Single LLM call
55
+ - `.invokeBatch(requests)` - Batch LLM calls
56
+ - `definePromptTemplate(fn)` - Wraps prompt generation function
57
+ - Context fields: `question`, `answer`, `referenceAnswer`, `criteria`, `expectedOutput`, `output`, `config`, `trace`
58
+
59
+ ## Python Example
60
+
61
+ ```python
62
+ #!/usr/bin/env python3
63
+ import json, sys
64
+
65
+ def evaluate(data: dict) -> dict:
66
+ candidate = data.get("answer", "")
67
+ hits, misses = [], []
68
+ for kw in ["async", "await"]:
69
+ (hits if kw in candidate else misses).append(f"Keyword '{kw}'")
70
+ return {
71
+ "score": len(hits) / max(len(hits) + len(misses), 1),
72
+ "hits": hits, "misses": misses
73
+ }
74
+
75
+ if __name__ == "__main__":
76
+ try:
77
+ print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
78
+ except Exception as e:
79
+ print(json.dumps({"score": 0, "misses": [str(e)]}))
80
+ sys.exit(1)
81
+ ```
82
+
83
+ ## TypeScript Example
84
+
85
+ ```typescript
86
+ #!/usr/bin/env bun
87
+ import { defineCodeJudge } from '@agentv/eval';
88
+
89
+ export default defineCodeJudge(({ answer, criteria }) => {
90
+ const hits: string[] = [];
91
+ const misses: string[] = [];
92
+ if (answer.includes(criteria)) {
93
+ hits.push('Matches expected outcome');
94
+ } else {
95
+ misses.push('Does not match expected outcome');
96
+ }
97
+ return {
98
+ score: hits.length / Math.max(hits.length + misses.length, 1),
99
+ hits, misses,
100
+ };
101
+ });
102
+ ```
103
+
104
+ ## Template Variables
105
+
106
+ Derived from test fields (users never author these directly):
107
+
108
+ | Variable | Source |
109
+ |----------|--------|
110
+ | `question` | First user message in `input` |
111
+ | `criteria` | Test `criteria` field |
112
+ | `reference_answer` | Last entry in `expected_output` |
113
+ | `answer` | Last entry in `output` (runtime) |
114
+ | `input` | Full resolved input array (JSON) |
115
+ | `expected_output` | Full resolved expected array (JSON) |
116
+ | `output` | Full provider output array (JSON) |
117
+
118
+ Markdown templates use `{{variable}}` syntax. TypeScript templates receive context object.