agentv 3.10.2 → 3.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6UE665XI.js → chunk-ETMDLQ72.js} +1141 -60
- package/dist/chunk-ETMDLQ72.js.map +1 -0
- package/dist/{chunk-KGK5NUFG.js → chunk-EZGWZVVK.js} +377 -163
- package/dist/chunk-EZGWZVVK.js.map +1 -0
- package/dist/{chunk-F7LAJMTO.js → chunk-JEW3FEO7.js} +68 -32
- package/dist/chunk-JEW3FEO7.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-3QUJEJUT.js → dist-QERRYDSC.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-EO6AR2R3.js → interactive-AD4PRYDN.js} +3 -3
- package/package.json +3 -1
- package/dist/chunk-6UE665XI.js.map +0 -1
- package/dist/chunk-F7LAJMTO.js.map +0 -1
- package/dist/chunk-KGK5NUFG.js.map +0 -1
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.agentv/.env.example +0 -25
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
- /package/dist/{dist-3QUJEJUT.js.map → dist-QERRYDSC.js.map} +0 -0
- /package/dist/{interactive-EO6AR2R3.js.map → interactive-AD4PRYDN.js.map} +0 -0
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
# Custom Evaluators
|
|
2
|
-
|
|
3
|
-
## Wire Format
|
|
4
|
-
|
|
5
|
-
### Input (stdin JSON)
|
|
6
|
-
|
|
7
|
-
```json
|
|
8
|
-
{
|
|
9
|
-
"question": "string",
|
|
10
|
-
"expected_outcome": "string",
|
|
11
|
-
"reference_answer": "string",
|
|
12
|
-
"candidate_answer": "string",
|
|
13
|
-
"guideline_files": ["path"],
|
|
14
|
-
"input_files": ["path"],
|
|
15
|
-
"input_messages": [{"role": "user", "content": "..."}],
|
|
16
|
-
"expected_messages": [{"role": "assistant", "content": "..."}],
|
|
17
|
-
"output_messages": [{"role": "assistant", "content": "..."}],
|
|
18
|
-
"trace_summary": {
|
|
19
|
-
"event_count": 5,
|
|
20
|
-
"tool_names": ["fetch"],
|
|
21
|
-
"tool_calls_by_name": {"fetch": 1},
|
|
22
|
-
"error_count": 0,
|
|
23
|
-
"token_usage": {"input": 1000, "output": 500},
|
|
24
|
-
"cost_usd": 0.0015,
|
|
25
|
-
"duration_ms": 3500
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
### Output (stdout JSON)
|
|
31
|
-
|
|
32
|
-
```json
|
|
33
|
-
{
|
|
34
|
-
"score": 0.85,
|
|
35
|
-
"hits": ["passed check"],
|
|
36
|
-
"misses": ["failed check"],
|
|
37
|
-
"reasoning": "explanation"
|
|
38
|
-
}
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
`score` (0.0-1.0) is required; `hits`, `misses`, and `reasoning` are optional.
|
|
42
|
-
|
|
43
|
-
## SDK Functions
|
|
44
|
-
|
|
45
|
-
```typescript
|
|
46
|
-
import { defineCodeJudge, createTargetClient, definePromptTemplate } from '@agentv/eval';
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
- `defineCodeJudge(fn)` - Wraps evaluation function with stdin/stdout handling
|
|
50
|
-
- `createTargetClient()` - Returns an LLM proxy client (when `target: {}` is configured)
|
|
51
|
-
- `.invoke({question, systemPrompt})` - Single LLM call
|
|
52
|
-
- `.invokeBatch(requests)` - Batch LLM calls
|
|
53
|
-
- `definePromptTemplate(fn)` - Wraps prompt generation function
|
|
54
|
-
- Context fields: `question`, `candidateAnswer`, `referenceAnswer`, `expectedOutcome`, `expectedMessages`, `outputMessages`, `config`, `traceSummary`
|
|
55
|
-
|
|
56
|
-
## Python Example
|
|
57
|
-
|
|
58
|
-
```python
|
|
59
|
-
#!/usr/bin/env python3
|
|
60
|
-
import json, sys
|
|
61
|
-
|
|
62
|
-
def evaluate(data: dict) -> dict:
|
|
63
|
-
candidate = data.get("candidate_answer", "")
|
|
64
|
-
hits, misses = [], []
|
|
65
|
-
for kw in ["async", "await"]:
|
|
66
|
-
(hits if kw in candidate else misses).append(f"Keyword '{kw}'")
|
|
67
|
-
return {
|
|
68
|
-
"score": len(hits) / max(len(hits) + len(misses), 1),
|
|
69
|
-
"hits": hits, "misses": misses
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
if __name__ == "__main__":
|
|
73
|
-
try:
|
|
74
|
-
print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
|
|
75
|
-
except Exception as e:
|
|
76
|
-
print(json.dumps({"score": 0, "misses": [str(e)]}))
|
|
77
|
-
sys.exit(1)
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
## TypeScript Example
|
|
81
|
-
|
|
82
|
-
```typescript
|
|
83
|
-
#!/usr/bin/env bun
|
|
84
|
-
import { defineCodeJudge } from '@agentv/eval';
|
|
85
|
-
|
|
86
|
-
export default defineCodeJudge(({ candidateAnswer, expectedOutcome }) => {
|
|
87
|
-
const hits: string[] = [];
|
|
88
|
-
const misses: string[] = [];
|
|
89
|
-
if (candidateAnswer.includes(expectedOutcome)) {
|
|
90
|
-
hits.push('Matches expected outcome');
|
|
91
|
-
} else {
|
|
92
|
-
misses.push('Does not match expected outcome');
|
|
93
|
-
}
|
|
94
|
-
return {
|
|
95
|
-
score: hits.length / Math.max(hits.length + misses.length, 1),
|
|
96
|
-
hits, misses,
|
|
97
|
-
};
|
|
98
|
-
});
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
## Template Variables
|
|
102
|
-
|
|
103
|
-
Derived from eval case fields (users never author these directly):
|
|
104
|
-
|
|
105
|
-
| Variable | Source |
|
|
106
|
-
|----------|--------|
|
|
107
|
-
| `question` | First user message in `input_messages` |
|
|
108
|
-
| `expected_outcome` | Eval case `expected_outcome` field |
|
|
109
|
-
| `reference_answer` | Last entry in `expected_messages` |
|
|
110
|
-
| `candidate_answer` | Last entry in `output_messages` (runtime) |
|
|
111
|
-
| `input_messages` | Full resolved input array (JSON) |
|
|
112
|
-
| `expected_messages` | Full resolved expected array (JSON) |
|
|
113
|
-
| `output_messages` | Full provider output array (JSON) |
|
|
114
|
-
|
|
115
|
-
Markdown templates use `{{variable}}` syntax. TypeScript templates receive a context object.
|
|
@@ -1,278 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"title": "AgentV Eval Schema",
|
|
4
|
-
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"properties": {
|
|
7
|
-
"description": {
|
|
8
|
-
"type": "string",
|
|
9
|
-
"description": "Description of what this eval suite covers"
|
|
10
|
-
},
|
|
11
|
-
"target": {
|
|
12
|
-
"type": "string",
|
|
13
|
-
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
|
|
14
|
-
},
|
|
15
|
-
"execution": {
|
|
16
|
-
"type": "object",
|
|
17
|
-
"description": "Default execution configuration for all eval cases (can be overridden per case)",
|
|
18
|
-
"properties": {
|
|
19
|
-
"target": {
|
|
20
|
-
"type": "string",
|
|
21
|
-
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
22
|
-
},
|
|
23
|
-
"evaluators": {
|
|
24
|
-
"type": "array",
|
|
25
|
-
"description": "Default evaluators for all eval cases (code-based and LLM judges)",
|
|
26
|
-
"items": {
|
|
27
|
-
"type": "object",
|
|
28
|
-
"properties": {
|
|
29
|
-
"name": {
|
|
30
|
-
"type": "string",
|
|
31
|
-
"description": "Evaluator name/identifier"
|
|
32
|
-
},
|
|
33
|
-
"type": {
|
|
34
|
-
"type": "string",
|
|
35
|
-
"enum": [
|
|
36
|
-
"code",
|
|
37
|
-
"llm_judge",
|
|
38
|
-
"composite",
|
|
39
|
-
"tool_trajectory",
|
|
40
|
-
"field_accuracy",
|
|
41
|
-
"latency",
|
|
42
|
-
"cost",
|
|
43
|
-
"token_usage"
|
|
44
|
-
],
|
|
45
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
46
|
-
},
|
|
47
|
-
"script": {
|
|
48
|
-
"type": "string",
|
|
49
|
-
"description": "Path to evaluator script (for type: code)"
|
|
50
|
-
},
|
|
51
|
-
"prompt": {
|
|
52
|
-
"type": "string",
|
|
53
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
54
|
-
}
|
|
55
|
-
},
|
|
56
|
-
"required": ["name", "type"],
|
|
57
|
-
"additionalProperties": true
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
},
|
|
61
|
-
"additionalProperties": true
|
|
62
|
-
},
|
|
63
|
-
"evalcases": {
|
|
64
|
-
"type": "array",
|
|
65
|
-
"description": "Array of evaluation cases",
|
|
66
|
-
"minItems": 1,
|
|
67
|
-
"items": {
|
|
68
|
-
"type": "object",
|
|
69
|
-
"properties": {
|
|
70
|
-
"id": {
|
|
71
|
-
"type": "string",
|
|
72
|
-
"description": "Unique identifier for the eval case"
|
|
73
|
-
},
|
|
74
|
-
"conversation_id": {
|
|
75
|
-
"type": "string",
|
|
76
|
-
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
77
|
-
},
|
|
78
|
-
"expected_outcome": {
|
|
79
|
-
"type": "string",
|
|
80
|
-
"description": "Description of what the AI should accomplish in this eval"
|
|
81
|
-
},
|
|
82
|
-
"note": {
|
|
83
|
-
"type": "string",
|
|
84
|
-
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
85
|
-
},
|
|
86
|
-
"input_messages": {
|
|
87
|
-
"type": "array",
|
|
88
|
-
"description": "Input messages for the conversation",
|
|
89
|
-
"minItems": 1,
|
|
90
|
-
"items": {
|
|
91
|
-
"type": "object",
|
|
92
|
-
"properties": {
|
|
93
|
-
"role": {
|
|
94
|
-
"type": "string",
|
|
95
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
96
|
-
"description": "Message role"
|
|
97
|
-
},
|
|
98
|
-
"content": {
|
|
99
|
-
"oneOf": [
|
|
100
|
-
{
|
|
101
|
-
"type": "string",
|
|
102
|
-
"description": "Simple text content"
|
|
103
|
-
},
|
|
104
|
-
{
|
|
105
|
-
"type": "array",
|
|
106
|
-
"description": "Mixed content items (text and file references)",
|
|
107
|
-
"items": {
|
|
108
|
-
"type": "object",
|
|
109
|
-
"properties": {
|
|
110
|
-
"type": {
|
|
111
|
-
"type": "string",
|
|
112
|
-
"enum": ["text", "file"],
|
|
113
|
-
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
114
|
-
},
|
|
115
|
-
"value": {
|
|
116
|
-
"type": "string",
|
|
117
|
-
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
|
|
118
|
-
}
|
|
119
|
-
},
|
|
120
|
-
"required": ["type", "value"],
|
|
121
|
-
"additionalProperties": false
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
]
|
|
125
|
-
}
|
|
126
|
-
},
|
|
127
|
-
"required": ["role", "content"],
|
|
128
|
-
"additionalProperties": false
|
|
129
|
-
}
|
|
130
|
-
},
|
|
131
|
-
"input": {
|
|
132
|
-
"description": "Alias for input_messages with shorthand support. String expands to single user message, array of messages passes through.",
|
|
133
|
-
"oneOf": [
|
|
134
|
-
{
|
|
135
|
-
"type": "string",
|
|
136
|
-
"description": "Shorthand: single user message content"
|
|
137
|
-
},
|
|
138
|
-
{
|
|
139
|
-
"type": "array",
|
|
140
|
-
"description": "Array of messages (same format as input_messages)",
|
|
141
|
-
"items": {
|
|
142
|
-
"type": "object",
|
|
143
|
-
"properties": {
|
|
144
|
-
"role": {
|
|
145
|
-
"type": "string",
|
|
146
|
-
"enum": ["system", "user", "assistant", "tool"]
|
|
147
|
-
},
|
|
148
|
-
"content": {
|
|
149
|
-
"oneOf": [{ "type": "string" }, { "type": "array" }]
|
|
150
|
-
}
|
|
151
|
-
},
|
|
152
|
-
"required": ["role", "content"]
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
]
|
|
156
|
-
},
|
|
157
|
-
"expected_messages": {
|
|
158
|
-
"type": "array",
|
|
159
|
-
"description": "Expected response messages. Canonical form — use this or expected_output (alias). The content of the last entry is derived as the template variable 'reference_answer' for evaluator prompts.",
|
|
160
|
-
"minItems": 1,
|
|
161
|
-
"items": {
|
|
162
|
-
"type": "object",
|
|
163
|
-
"properties": {
|
|
164
|
-
"role": {
|
|
165
|
-
"type": "string",
|
|
166
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
167
|
-
"description": "Message role"
|
|
168
|
-
},
|
|
169
|
-
"content": {
|
|
170
|
-
"oneOf": [
|
|
171
|
-
{
|
|
172
|
-
"type": "string",
|
|
173
|
-
"description": "Simple text content"
|
|
174
|
-
},
|
|
175
|
-
{
|
|
176
|
-
"type": "array",
|
|
177
|
-
"description": "Mixed content items",
|
|
178
|
-
"items": {
|
|
179
|
-
"type": "object",
|
|
180
|
-
"properties": {
|
|
181
|
-
"type": {
|
|
182
|
-
"type": "string",
|
|
183
|
-
"enum": ["text", "file"]
|
|
184
|
-
},
|
|
185
|
-
"value": {
|
|
186
|
-
"type": "string"
|
|
187
|
-
}
|
|
188
|
-
},
|
|
189
|
-
"required": ["type", "value"],
|
|
190
|
-
"additionalProperties": false
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
]
|
|
194
|
-
}
|
|
195
|
-
},
|
|
196
|
-
"required": ["role", "content"],
|
|
197
|
-
"additionalProperties": false
|
|
198
|
-
}
|
|
199
|
-
},
|
|
200
|
-
"expected_output": {
|
|
201
|
-
"description": "Alias for expected_messages with shorthand support. String expands to single assistant message, object wraps as assistant message content. Resolves to expected_messages internally — the content of the last resolved entry becomes the template variable 'reference_answer'.",
|
|
202
|
-
"oneOf": [
|
|
203
|
-
{
|
|
204
|
-
"type": "string",
|
|
205
|
-
"description": "Shorthand: single assistant message content"
|
|
206
|
-
},
|
|
207
|
-
{
|
|
208
|
-
"type": "object",
|
|
209
|
-
"description": "Shorthand: structured content wraps as assistant message"
|
|
210
|
-
},
|
|
211
|
-
{
|
|
212
|
-
"type": "array",
|
|
213
|
-
"description": "Array of messages (same format as expected_messages)",
|
|
214
|
-
"items": {
|
|
215
|
-
"type": "object",
|
|
216
|
-
"properties": {
|
|
217
|
-
"role": {
|
|
218
|
-
"type": "string",
|
|
219
|
-
"enum": ["system", "user", "assistant", "tool"]
|
|
220
|
-
},
|
|
221
|
-
"content": {
|
|
222
|
-
"oneOf": [{ "type": "string" }, { "type": "object" }, { "type": "array" }]
|
|
223
|
-
}
|
|
224
|
-
},
|
|
225
|
-
"required": ["role", "content"]
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
]
|
|
229
|
-
},
|
|
230
|
-
"execution": {
|
|
231
|
-
"type": "object",
|
|
232
|
-
"description": "Per-case execution configuration",
|
|
233
|
-
"properties": {
|
|
234
|
-
"target": {
|
|
235
|
-
"type": "string",
|
|
236
|
-
"description": "Override target for this specific eval case"
|
|
237
|
-
},
|
|
238
|
-
"evaluators": {
|
|
239
|
-
"type": "array",
|
|
240
|
-
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
241
|
-
"items": {
|
|
242
|
-
"type": "object",
|
|
243
|
-
"properties": {
|
|
244
|
-
"name": {
|
|
245
|
-
"type": "string",
|
|
246
|
-
"description": "Evaluator name/identifier"
|
|
247
|
-
},
|
|
248
|
-
"type": {
|
|
249
|
-
"type": "string",
|
|
250
|
-
"enum": ["code", "llm_judge"],
|
|
251
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
252
|
-
},
|
|
253
|
-
"script": {
|
|
254
|
-
"type": "string",
|
|
255
|
-
"description": "Path to evaluator script (for type: code)"
|
|
256
|
-
},
|
|
257
|
-
"prompt": {
|
|
258
|
-
"type": "string",
|
|
259
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
260
|
-
}
|
|
261
|
-
},
|
|
262
|
-
"required": ["name", "type"],
|
|
263
|
-
"additionalProperties": true
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
},
|
|
267
|
-
"additionalProperties": true
|
|
268
|
-
}
|
|
269
|
-
},
|
|
270
|
-
"required": ["id", "expected_outcome"],
|
|
271
|
-
"anyOf": [{ "required": ["input_messages"] }, { "required": ["input"] }],
|
|
272
|
-
"additionalProperties": true
|
|
273
|
-
}
|
|
274
|
-
}
|
|
275
|
-
},
|
|
276
|
-
"required": ["evalcases"],
|
|
277
|
-
"additionalProperties": false
|
|
278
|
-
}
|