agentv 0.26.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6ZM7WVSC.js → chunk-RIJO5WBF.js} +13 -13
- package/dist/chunk-RIJO5WBF.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +36 -19
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +94 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +8 -8
- package/package.json +1 -1
- package/dist/chunk-6ZM7WVSC.js.map +0 -1
- package/dist/templates/agentv/.env.template +0 -23
package/dist/cli.js
CHANGED
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli().catch((error) => {\n console.error(error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
package/dist/index.js
CHANGED
|
@@ -61,8 +61,42 @@ execution:
|
|
|
61
61
|
model: gpt-5-chat
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
-
###
|
|
65
|
-
|
|
64
|
+
### Tool Trajectory Evaluators
|
|
65
|
+
Validate agent tool usage patterns (requires trace data from provider):
|
|
66
|
+
|
|
67
|
+
```yaml
|
|
68
|
+
execution:
|
|
69
|
+
evaluators:
|
|
70
|
+
- name: research_check
|
|
71
|
+
type: tool_trajectory
|
|
72
|
+
mode: any_order # Options: any_order, in_order, exact
|
|
73
|
+
minimums: # For any_order mode
|
|
74
|
+
knowledgeSearch: 2
|
|
75
|
+
expected: # For in_order/exact modes
|
|
76
|
+
- tool: knowledgeSearch
|
|
77
|
+
- tool: documentRetrieve
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
See `references/tool-trajectory-evaluator.md` for modes and configuration.
|
|
81
|
+
|
|
82
|
+
### Expected Tool Calls Evaluators
|
|
83
|
+
Validate tool calls and inputs inline with conversation flow:
|
|
84
|
+
|
|
85
|
+
```yaml
|
|
86
|
+
expected_messages:
|
|
87
|
+
- role: assistant
|
|
88
|
+
tool_calls:
|
|
89
|
+
- tool: getMetrics
|
|
90
|
+
input: { server: "prod-1" }
|
|
91
|
+
|
|
92
|
+
execution:
|
|
93
|
+
evaluators:
|
|
94
|
+
- name: input_check
|
|
95
|
+
type: expected_tool_calls
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Multiple Evaluators
|
|
99
|
+
Define multiple evaluators to run sequentially. The final score is a weighted average of all results.
|
|
66
100
|
|
|
67
101
|
```yaml
|
|
68
102
|
execution:
|
|
@@ -119,23 +153,6 @@ execution:
|
|
|
119
153
|
|
|
120
154
|
See `references/composite-evaluator.md` for aggregation types and patterns.
|
|
121
155
|
|
|
122
|
-
### Tool Trajectory Evaluator
|
|
123
|
-
Validate agent tool usage from trace data:
|
|
124
|
-
|
|
125
|
-
```yaml
|
|
126
|
-
execution:
|
|
127
|
-
evaluators:
|
|
128
|
-
- name: workflow-check
|
|
129
|
-
type: tool_trajectory
|
|
130
|
-
mode: in_order # or: any_order, exact
|
|
131
|
-
expected:
|
|
132
|
-
- tool: fetchData
|
|
133
|
-
- tool: processData
|
|
134
|
-
- tool: saveResults
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
See `references/tool-trajectory-evaluator.md` for modes and configuration.
|
|
138
|
-
|
|
139
156
|
## Example
|
|
140
157
|
```yaml
|
|
141
158
|
$schema: agentv-eval-v2
|
|
@@ -1,217 +1,217 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"title": "AgentV Eval Schema",
|
|
4
|
-
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"properties": {
|
|
7
|
-
"$schema": {
|
|
8
|
-
"type": "string",
|
|
9
|
-
"description": "Schema identifier",
|
|
10
|
-
"enum": ["agentv-eval-v2"]
|
|
11
|
-
},
|
|
12
|
-
"description": {
|
|
13
|
-
"type": "string",
|
|
14
|
-
"description": "Description of what this eval suite covers"
|
|
15
|
-
},
|
|
16
|
-
"target": {
|
|
17
|
-
"type": "string",
|
|
18
|
-
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
|
|
19
|
-
},
|
|
20
|
-
"execution": {
|
|
21
|
-
"type": "object",
|
|
22
|
-
"description": "Default execution configuration for all eval cases (can be overridden per case)",
|
|
23
|
-
"properties": {
|
|
24
|
-
"target": {
|
|
25
|
-
"type": "string",
|
|
26
|
-
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
27
|
-
},
|
|
28
|
-
"evaluators": {
|
|
29
|
-
"type": "array",
|
|
30
|
-
"description": "Default evaluators for all eval cases (code-based and LLM judges)",
|
|
31
|
-
"items": {
|
|
32
|
-
"type": "object",
|
|
33
|
-
"properties": {
|
|
34
|
-
"name": {
|
|
35
|
-
"type": "string",
|
|
36
|
-
"description": "Evaluator name/identifier"
|
|
37
|
-
},
|
|
38
|
-
"type": {
|
|
39
|
-
"type": "string",
|
|
40
|
-
"enum": ["code", "llm_judge"],
|
|
41
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
42
|
-
},
|
|
43
|
-
"script": {
|
|
44
|
-
"type": "string",
|
|
45
|
-
"description": "Path to evaluator script (for type: code)"
|
|
46
|
-
},
|
|
47
|
-
"prompt": {
|
|
48
|
-
"type": "string",
|
|
49
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
50
|
-
}
|
|
51
|
-
},
|
|
52
|
-
"required": ["name", "type"],
|
|
53
|
-
"additionalProperties": true
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
},
|
|
57
|
-
"additionalProperties": true
|
|
58
|
-
},
|
|
59
|
-
"evalcases": {
|
|
60
|
-
"type": "array",
|
|
61
|
-
"description": "Array of evaluation cases",
|
|
62
|
-
"minItems": 1,
|
|
63
|
-
"items": {
|
|
64
|
-
"type": "object",
|
|
65
|
-
"properties": {
|
|
66
|
-
"id": {
|
|
67
|
-
"type": "string",
|
|
68
|
-
"description": "Unique identifier for the eval case"
|
|
69
|
-
},
|
|
70
|
-
"conversation_id": {
|
|
71
|
-
"type": "string",
|
|
72
|
-
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
73
|
-
},
|
|
74
|
-
"expected_outcome": {
|
|
75
|
-
"type": "string",
|
|
76
|
-
"description": "Description of what the AI should accomplish in this eval"
|
|
77
|
-
},
|
|
78
|
-
"note": {
|
|
79
|
-
"type": "string",
|
|
80
|
-
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
81
|
-
},
|
|
82
|
-
"input_messages": {
|
|
83
|
-
"type": "array",
|
|
84
|
-
"description": "Input messages for the conversation",
|
|
85
|
-
"minItems": 1,
|
|
86
|
-
"items": {
|
|
87
|
-
"type": "object",
|
|
88
|
-
"properties": {
|
|
89
|
-
"role": {
|
|
90
|
-
"type": "string",
|
|
91
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
92
|
-
"description": "Message role"
|
|
93
|
-
},
|
|
94
|
-
"content": {
|
|
95
|
-
"oneOf": [
|
|
96
|
-
{
|
|
97
|
-
"type": "string",
|
|
98
|
-
"description": "Simple text content"
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
"type": "array",
|
|
102
|
-
"description": "Mixed content items (text and file references)",
|
|
103
|
-
"items": {
|
|
104
|
-
"type": "object",
|
|
105
|
-
"properties": {
|
|
106
|
-
"type": {
|
|
107
|
-
"type": "string",
|
|
108
|
-
"enum": ["text", "file"],
|
|
109
|
-
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
110
|
-
},
|
|
111
|
-
"value": {
|
|
112
|
-
"type": "string",
|
|
113
|
-
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
|
|
114
|
-
}
|
|
115
|
-
},
|
|
116
|
-
"required": ["type", "value"],
|
|
117
|
-
"additionalProperties": false
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
]
|
|
121
|
-
}
|
|
122
|
-
},
|
|
123
|
-
"required": ["role", "content"],
|
|
124
|
-
"additionalProperties": false
|
|
125
|
-
}
|
|
126
|
-
},
|
|
127
|
-
"expected_messages": {
|
|
128
|
-
"type": "array",
|
|
129
|
-
"description": "Expected response messages",
|
|
130
|
-
"minItems": 1,
|
|
131
|
-
"items": {
|
|
132
|
-
"type": "object",
|
|
133
|
-
"properties": {
|
|
134
|
-
"role": {
|
|
135
|
-
"type": "string",
|
|
136
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
137
|
-
"description": "Message role"
|
|
138
|
-
},
|
|
139
|
-
"content": {
|
|
140
|
-
"oneOf": [
|
|
141
|
-
{
|
|
142
|
-
"type": "string",
|
|
143
|
-
"description": "Simple text content"
|
|
144
|
-
},
|
|
145
|
-
{
|
|
146
|
-
"type": "array",
|
|
147
|
-
"description": "Mixed content items",
|
|
148
|
-
"items": {
|
|
149
|
-
"type": "object",
|
|
150
|
-
"properties": {
|
|
151
|
-
"type": {
|
|
152
|
-
"type": "string",
|
|
153
|
-
"enum": ["text", "file"]
|
|
154
|
-
},
|
|
155
|
-
"value": {
|
|
156
|
-
"type": "string"
|
|
157
|
-
}
|
|
158
|
-
},
|
|
159
|
-
"required": ["type", "value"],
|
|
160
|
-
"additionalProperties": false
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
]
|
|
164
|
-
}
|
|
165
|
-
},
|
|
166
|
-
"required": ["role", "content"],
|
|
167
|
-
"additionalProperties": false
|
|
168
|
-
}
|
|
169
|
-
},
|
|
170
|
-
"execution": {
|
|
171
|
-
"type": "object",
|
|
172
|
-
"description": "Per-case execution configuration",
|
|
173
|
-
"properties": {
|
|
174
|
-
"target": {
|
|
175
|
-
"type": "string",
|
|
176
|
-
"description": "Override target for this specific eval case"
|
|
177
|
-
},
|
|
178
|
-
"evaluators": {
|
|
179
|
-
"type": "array",
|
|
180
|
-
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
181
|
-
"items": {
|
|
182
|
-
"type": "object",
|
|
183
|
-
"properties": {
|
|
184
|
-
"name": {
|
|
185
|
-
"type": "string",
|
|
186
|
-
"description": "Evaluator name/identifier"
|
|
187
|
-
},
|
|
188
|
-
"type": {
|
|
189
|
-
"type": "string",
|
|
190
|
-
"enum": ["code", "llm_judge"],
|
|
191
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
192
|
-
},
|
|
193
|
-
"script": {
|
|
194
|
-
"type": "string",
|
|
195
|
-
"description": "Path to evaluator script (for type: code)"
|
|
196
|
-
},
|
|
197
|
-
"prompt": {
|
|
198
|
-
"type": "string",
|
|
199
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
200
|
-
}
|
|
201
|
-
},
|
|
202
|
-
"required": ["name", "type"],
|
|
203
|
-
"additionalProperties": true
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
},
|
|
207
|
-
"additionalProperties": true
|
|
208
|
-
}
|
|
209
|
-
},
|
|
210
|
-
"required": ["id", "expected_outcome", "input_messages", "expected_messages"],
|
|
211
|
-
"additionalProperties": false
|
|
212
|
-
}
|
|
213
|
-
}
|
|
214
|
-
},
|
|
215
|
-
"required": ["evalcases"],
|
|
216
|
-
"additionalProperties": false
|
|
217
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Eval Schema",
|
|
4
|
+
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"$schema": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Schema identifier",
|
|
10
|
+
"enum": ["agentv-eval-v2"]
|
|
11
|
+
},
|
|
12
|
+
"description": {
|
|
13
|
+
"type": "string",
|
|
14
|
+
"description": "Description of what this eval suite covers"
|
|
15
|
+
},
|
|
16
|
+
"target": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
|
|
19
|
+
},
|
|
20
|
+
"execution": {
|
|
21
|
+
"type": "object",
|
|
22
|
+
"description": "Default execution configuration for all eval cases (can be overridden per case)",
|
|
23
|
+
"properties": {
|
|
24
|
+
"target": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
27
|
+
},
|
|
28
|
+
"evaluators": {
|
|
29
|
+
"type": "array",
|
|
30
|
+
"description": "Default evaluators for all eval cases (code-based and LLM judges)",
|
|
31
|
+
"items": {
|
|
32
|
+
"type": "object",
|
|
33
|
+
"properties": {
|
|
34
|
+
"name": {
|
|
35
|
+
"type": "string",
|
|
36
|
+
"description": "Evaluator name/identifier"
|
|
37
|
+
},
|
|
38
|
+
"type": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"enum": ["code", "llm_judge"],
|
|
41
|
+
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
42
|
+
},
|
|
43
|
+
"script": {
|
|
44
|
+
"type": "string",
|
|
45
|
+
"description": "Path to evaluator script (for type: code)"
|
|
46
|
+
},
|
|
47
|
+
"prompt": {
|
|
48
|
+
"type": "string",
|
|
49
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
"required": ["name", "type"],
|
|
53
|
+
"additionalProperties": true
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
},
|
|
57
|
+
"additionalProperties": true
|
|
58
|
+
},
|
|
59
|
+
"evalcases": {
|
|
60
|
+
"type": "array",
|
|
61
|
+
"description": "Array of evaluation cases",
|
|
62
|
+
"minItems": 1,
|
|
63
|
+
"items": {
|
|
64
|
+
"type": "object",
|
|
65
|
+
"properties": {
|
|
66
|
+
"id": {
|
|
67
|
+
"type": "string",
|
|
68
|
+
"description": "Unique identifier for the eval case"
|
|
69
|
+
},
|
|
70
|
+
"conversation_id": {
|
|
71
|
+
"type": "string",
|
|
72
|
+
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
73
|
+
},
|
|
74
|
+
"expected_outcome": {
|
|
75
|
+
"type": "string",
|
|
76
|
+
"description": "Description of what the AI should accomplish in this eval"
|
|
77
|
+
},
|
|
78
|
+
"note": {
|
|
79
|
+
"type": "string",
|
|
80
|
+
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
81
|
+
},
|
|
82
|
+
"input_messages": {
|
|
83
|
+
"type": "array",
|
|
84
|
+
"description": "Input messages for the conversation",
|
|
85
|
+
"minItems": 1,
|
|
86
|
+
"items": {
|
|
87
|
+
"type": "object",
|
|
88
|
+
"properties": {
|
|
89
|
+
"role": {
|
|
90
|
+
"type": "string",
|
|
91
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
92
|
+
"description": "Message role"
|
|
93
|
+
},
|
|
94
|
+
"content": {
|
|
95
|
+
"oneOf": [
|
|
96
|
+
{
|
|
97
|
+
"type": "string",
|
|
98
|
+
"description": "Simple text content"
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
"type": "array",
|
|
102
|
+
"description": "Mixed content items (text and file references)",
|
|
103
|
+
"items": {
|
|
104
|
+
"type": "object",
|
|
105
|
+
"properties": {
|
|
106
|
+
"type": {
|
|
107
|
+
"type": "string",
|
|
108
|
+
"enum": ["text", "file"],
|
|
109
|
+
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
110
|
+
},
|
|
111
|
+
"value": {
|
|
112
|
+
"type": "string",
|
|
113
|
+
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
|
|
114
|
+
}
|
|
115
|
+
},
|
|
116
|
+
"required": ["type", "value"],
|
|
117
|
+
"additionalProperties": false
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
]
|
|
121
|
+
}
|
|
122
|
+
},
|
|
123
|
+
"required": ["role", "content"],
|
|
124
|
+
"additionalProperties": false
|
|
125
|
+
}
|
|
126
|
+
},
|
|
127
|
+
"expected_messages": {
|
|
128
|
+
"type": "array",
|
|
129
|
+
"description": "Expected response messages",
|
|
130
|
+
"minItems": 1,
|
|
131
|
+
"items": {
|
|
132
|
+
"type": "object",
|
|
133
|
+
"properties": {
|
|
134
|
+
"role": {
|
|
135
|
+
"type": "string",
|
|
136
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
137
|
+
"description": "Message role"
|
|
138
|
+
},
|
|
139
|
+
"content": {
|
|
140
|
+
"oneOf": [
|
|
141
|
+
{
|
|
142
|
+
"type": "string",
|
|
143
|
+
"description": "Simple text content"
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"type": "array",
|
|
147
|
+
"description": "Mixed content items",
|
|
148
|
+
"items": {
|
|
149
|
+
"type": "object",
|
|
150
|
+
"properties": {
|
|
151
|
+
"type": {
|
|
152
|
+
"type": "string",
|
|
153
|
+
"enum": ["text", "file"]
|
|
154
|
+
},
|
|
155
|
+
"value": {
|
|
156
|
+
"type": "string"
|
|
157
|
+
}
|
|
158
|
+
},
|
|
159
|
+
"required": ["type", "value"],
|
|
160
|
+
"additionalProperties": false
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
]
|
|
164
|
+
}
|
|
165
|
+
},
|
|
166
|
+
"required": ["role", "content"],
|
|
167
|
+
"additionalProperties": false
|
|
168
|
+
}
|
|
169
|
+
},
|
|
170
|
+
"execution": {
|
|
171
|
+
"type": "object",
|
|
172
|
+
"description": "Per-case execution configuration",
|
|
173
|
+
"properties": {
|
|
174
|
+
"target": {
|
|
175
|
+
"type": "string",
|
|
176
|
+
"description": "Override target for this specific eval case"
|
|
177
|
+
},
|
|
178
|
+
"evaluators": {
|
|
179
|
+
"type": "array",
|
|
180
|
+
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
181
|
+
"items": {
|
|
182
|
+
"type": "object",
|
|
183
|
+
"properties": {
|
|
184
|
+
"name": {
|
|
185
|
+
"type": "string",
|
|
186
|
+
"description": "Evaluator name/identifier"
|
|
187
|
+
},
|
|
188
|
+
"type": {
|
|
189
|
+
"type": "string",
|
|
190
|
+
"enum": ["code", "llm_judge"],
|
|
191
|
+
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
192
|
+
},
|
|
193
|
+
"script": {
|
|
194
|
+
"type": "string",
|
|
195
|
+
"description": "Path to evaluator script (for type: code)"
|
|
196
|
+
},
|
|
197
|
+
"prompt": {
|
|
198
|
+
"type": "string",
|
|
199
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
200
|
+
}
|
|
201
|
+
},
|
|
202
|
+
"required": ["name", "type"],
|
|
203
|
+
"additionalProperties": true
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
},
|
|
207
|
+
"additionalProperties": true
|
|
208
|
+
}
|
|
209
|
+
},
|
|
210
|
+
"required": ["id", "expected_outcome", "input_messages", "expected_messages"],
|
|
211
|
+
"additionalProperties": false
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
},
|
|
215
|
+
"required": ["evalcases"],
|
|
216
|
+
"additionalProperties": false
|
|
217
|
+
}
|
|
@@ -78,13 +78,12 @@ evalcases:
|
|
|
78
78
|
execution:
|
|
79
79
|
evaluators:
|
|
80
80
|
- name: json_format_validator
|
|
81
|
-
type:
|
|
81
|
+
type: code_judge
|
|
82
82
|
script: uv run validate_json.py
|
|
83
83
|
cwd: ./evaluators
|
|
84
84
|
- name: content_evaluator
|
|
85
85
|
type: llm_judge
|
|
86
86
|
prompt: ./judges/semantic_correctness.md
|
|
87
|
-
model: gpt-5-chat
|
|
88
87
|
|
|
89
88
|
input_messages:
|
|
90
89
|
- role: user
|
|
@@ -102,6 +101,99 @@ evalcases:
|
|
|
102
101
|
}
|
|
103
102
|
```
|
|
104
103
|
|
|
104
|
+
## Tool Trajectory Evaluation
|
|
105
|
+
|
|
106
|
+
Validate that an agent uses specific tools during execution.
|
|
107
|
+
|
|
108
|
+
```yaml
|
|
109
|
+
$schema: agentv-eval-v2
|
|
110
|
+
description: Tool usage validation
|
|
111
|
+
target: mock_agent
|
|
112
|
+
|
|
113
|
+
evalcases:
|
|
114
|
+
# Validate minimum tool usage (order doesn't matter)
|
|
115
|
+
- id: research-depth
|
|
116
|
+
expected_outcome: Agent researches thoroughly
|
|
117
|
+
input_messages:
|
|
118
|
+
- role: user
|
|
119
|
+
content: Research REST vs GraphQL
|
|
120
|
+
execution:
|
|
121
|
+
evaluators:
|
|
122
|
+
- name: research-check
|
|
123
|
+
type: tool_trajectory
|
|
124
|
+
mode: any_order
|
|
125
|
+
minimums:
|
|
126
|
+
knowledgeSearch: 2
|
|
127
|
+
documentRetrieve: 1
|
|
128
|
+
|
|
129
|
+
# Validate exact tool sequence
|
|
130
|
+
- id: auth-flow
|
|
131
|
+
expected_outcome: Agent follows auth sequence
|
|
132
|
+
input_messages:
|
|
133
|
+
- role: user
|
|
134
|
+
content: Authenticate user
|
|
135
|
+
execution:
|
|
136
|
+
evaluators:
|
|
137
|
+
- name: auth-sequence
|
|
138
|
+
type: tool_trajectory
|
|
139
|
+
mode: exact
|
|
140
|
+
expected:
|
|
141
|
+
- tool: checkCredentials
|
|
142
|
+
- tool: generateToken
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Expected Messages with Tool Calls
|
|
146
|
+
|
|
147
|
+
Validate precise tool inputs inline with expected messages.
|
|
148
|
+
|
|
149
|
+
```yaml
|
|
150
|
+
$schema: agentv-eval-v2
|
|
151
|
+
description: Tool input validation
|
|
152
|
+
target: mock_agent
|
|
153
|
+
|
|
154
|
+
evalcases:
|
|
155
|
+
- id: precise-inputs
|
|
156
|
+
expected_outcome: Agent calls tools with correct parameters
|
|
157
|
+
input_messages:
|
|
158
|
+
- role: user
|
|
159
|
+
content: Check CPU metrics for prod-1
|
|
160
|
+
expected_messages:
|
|
161
|
+
- role: assistant
|
|
162
|
+
content: Checking metrics...
|
|
163
|
+
tool_calls:
|
|
164
|
+
- tool: getCpuMetrics
|
|
165
|
+
input: { server: "prod-1" }
|
|
166
|
+
execution:
|
|
167
|
+
evaluators:
|
|
168
|
+
- name: input-validator
|
|
169
|
+
type: expected_tool_calls
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Static Trace Evaluation
|
|
173
|
+
|
|
174
|
+
Evaluate pre-existing trace files without running an agent.
|
|
175
|
+
|
|
176
|
+
```yaml
|
|
177
|
+
$schema: agentv-eval-v2
|
|
178
|
+
description: Static trace evaluation
|
|
179
|
+
target: static_trace
|
|
180
|
+
|
|
181
|
+
evalcases:
|
|
182
|
+
- id: validate-trace-file
|
|
183
|
+
expected_outcome: Trace contains required steps
|
|
184
|
+
input_messages:
|
|
185
|
+
- role: user
|
|
186
|
+
content: Analyze trace
|
|
187
|
+
execution:
|
|
188
|
+
evaluators:
|
|
189
|
+
- name: trace-check
|
|
190
|
+
type: tool_trajectory
|
|
191
|
+
mode: in_order
|
|
192
|
+
expected:
|
|
193
|
+
- tool: webSearch
|
|
194
|
+
- tool: readFile
|
|
195
|
+
```
|
|
196
|
+
|
|
105
197
|
## Multi-Turn Conversation (Single Eval Case)
|
|
106
198
|
|
|
107
199
|
```yaml
|