agentv 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/dist/{chunk-ZVSFP6NK.js → chunk-6ZM7WVSC.js} +85 -24
- package/dist/chunk-6ZM7WVSC.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
- package/dist/templates/agentv/.env.template +23 -0
- package/package.json +1 -1
- package/dist/chunk-ZVSFP6NK.js.map +0 -1
package/dist/cli.js
CHANGED
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli().catch((error) => {\n console.error(error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { runCli } from './index.js';\r\n\r\nrunCli().catch((error) => {\r\n console.error(error);\r\n process.exit(1);\r\n});\r\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
package/dist/index.js
CHANGED
|
@@ -44,7 +44,7 @@ execution:
|
|
|
44
44
|
```
|
|
45
45
|
|
|
46
46
|
**Contract:**
|
|
47
|
-
- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `
|
|
47
|
+
- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_files` (file paths), `input_files` (file paths, excludes guidelines), `input_messages`
|
|
48
48
|
- Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
|
|
49
49
|
|
|
50
50
|
**Template:** See `references/custom-evaluators.md` for Python code evaluator template
|
|
@@ -1,217 +1,217 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"title": "AgentV Eval Schema",
|
|
4
|
-
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"properties": {
|
|
7
|
-
"$schema": {
|
|
8
|
-
"type": "string",
|
|
9
|
-
"description": "Schema identifier",
|
|
10
|
-
"enum": ["agentv-eval-v2"]
|
|
11
|
-
},
|
|
12
|
-
"description": {
|
|
13
|
-
"type": "string",
|
|
14
|
-
"description": "Description of what this eval suite covers"
|
|
15
|
-
},
|
|
16
|
-
"target": {
|
|
17
|
-
"type": "string",
|
|
18
|
-
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
|
|
19
|
-
},
|
|
20
|
-
"execution": {
|
|
21
|
-
"type": "object",
|
|
22
|
-
"description": "Default execution configuration for all eval cases (can be overridden per case)",
|
|
23
|
-
"properties": {
|
|
24
|
-
"target": {
|
|
25
|
-
"type": "string",
|
|
26
|
-
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
27
|
-
},
|
|
28
|
-
"evaluators": {
|
|
29
|
-
"type": "array",
|
|
30
|
-
"description": "Default evaluators for all eval cases (code-based and LLM judges)",
|
|
31
|
-
"items": {
|
|
32
|
-
"type": "object",
|
|
33
|
-
"properties": {
|
|
34
|
-
"name": {
|
|
35
|
-
"type": "string",
|
|
36
|
-
"description": "Evaluator name/identifier"
|
|
37
|
-
},
|
|
38
|
-
"type": {
|
|
39
|
-
"type": "string",
|
|
40
|
-
"enum": ["code", "llm_judge"],
|
|
41
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
42
|
-
},
|
|
43
|
-
"script": {
|
|
44
|
-
"type": "string",
|
|
45
|
-
"description": "Path to evaluator script (for type: code)"
|
|
46
|
-
},
|
|
47
|
-
"prompt": {
|
|
48
|
-
"type": "string",
|
|
49
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
50
|
-
}
|
|
51
|
-
},
|
|
52
|
-
"required": ["name", "type"],
|
|
53
|
-
"additionalProperties": true
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
},
|
|
57
|
-
"additionalProperties": true
|
|
58
|
-
},
|
|
59
|
-
"evalcases": {
|
|
60
|
-
"type": "array",
|
|
61
|
-
"description": "Array of evaluation cases",
|
|
62
|
-
"minItems": 1,
|
|
63
|
-
"items": {
|
|
64
|
-
"type": "object",
|
|
65
|
-
"properties": {
|
|
66
|
-
"id": {
|
|
67
|
-
"type": "string",
|
|
68
|
-
"description": "Unique identifier for the eval case"
|
|
69
|
-
},
|
|
70
|
-
"conversation_id": {
|
|
71
|
-
"type": "string",
|
|
72
|
-
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
73
|
-
},
|
|
74
|
-
"expected_outcome": {
|
|
75
|
-
"type": "string",
|
|
76
|
-
"description": "Description of what the AI should accomplish in this eval"
|
|
77
|
-
},
|
|
78
|
-
"note": {
|
|
79
|
-
"type": "string",
|
|
80
|
-
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
81
|
-
},
|
|
82
|
-
"input_messages": {
|
|
83
|
-
"type": "array",
|
|
84
|
-
"description": "Input messages for the conversation",
|
|
85
|
-
"minItems": 1,
|
|
86
|
-
"items": {
|
|
87
|
-
"type": "object",
|
|
88
|
-
"properties": {
|
|
89
|
-
"role": {
|
|
90
|
-
"type": "string",
|
|
91
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
92
|
-
"description": "Message role"
|
|
93
|
-
},
|
|
94
|
-
"content": {
|
|
95
|
-
"oneOf": [
|
|
96
|
-
{
|
|
97
|
-
"type": "string",
|
|
98
|
-
"description": "Simple text content"
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
"type": "array",
|
|
102
|
-
"description": "Mixed content items (text and file references)",
|
|
103
|
-
"items": {
|
|
104
|
-
"type": "object",
|
|
105
|
-
"properties": {
|
|
106
|
-
"type": {
|
|
107
|
-
"type": "string",
|
|
108
|
-
"enum": ["text", "file"],
|
|
109
|
-
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
110
|
-
},
|
|
111
|
-
"value": {
|
|
112
|
-
"type": "string",
|
|
113
|
-
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
|
|
114
|
-
}
|
|
115
|
-
},
|
|
116
|
-
"required": ["type", "value"],
|
|
117
|
-
"additionalProperties": false
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
]
|
|
121
|
-
}
|
|
122
|
-
},
|
|
123
|
-
"required": ["role", "content"],
|
|
124
|
-
"additionalProperties": false
|
|
125
|
-
}
|
|
126
|
-
},
|
|
127
|
-
"expected_messages": {
|
|
128
|
-
"type": "array",
|
|
129
|
-
"description": "Expected response messages",
|
|
130
|
-
"minItems": 1,
|
|
131
|
-
"items": {
|
|
132
|
-
"type": "object",
|
|
133
|
-
"properties": {
|
|
134
|
-
"role": {
|
|
135
|
-
"type": "string",
|
|
136
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
137
|
-
"description": "Message role"
|
|
138
|
-
},
|
|
139
|
-
"content": {
|
|
140
|
-
"oneOf": [
|
|
141
|
-
{
|
|
142
|
-
"type": "string",
|
|
143
|
-
"description": "Simple text content"
|
|
144
|
-
},
|
|
145
|
-
{
|
|
146
|
-
"type": "array",
|
|
147
|
-
"description": "Mixed content items",
|
|
148
|
-
"items": {
|
|
149
|
-
"type": "object",
|
|
150
|
-
"properties": {
|
|
151
|
-
"type": {
|
|
152
|
-
"type": "string",
|
|
153
|
-
"enum": ["text", "file"]
|
|
154
|
-
},
|
|
155
|
-
"value": {
|
|
156
|
-
"type": "string"
|
|
157
|
-
}
|
|
158
|
-
},
|
|
159
|
-
"required": ["type", "value"],
|
|
160
|
-
"additionalProperties": false
|
|
161
|
-
}
|
|
162
|
-
}
|
|
163
|
-
]
|
|
164
|
-
}
|
|
165
|
-
},
|
|
166
|
-
"required": ["role", "content"],
|
|
167
|
-
"additionalProperties": false
|
|
168
|
-
}
|
|
169
|
-
},
|
|
170
|
-
"execution": {
|
|
171
|
-
"type": "object",
|
|
172
|
-
"description": "Per-case execution configuration",
|
|
173
|
-
"properties": {
|
|
174
|
-
"target": {
|
|
175
|
-
"type": "string",
|
|
176
|
-
"description": "Override target for this specific eval case"
|
|
177
|
-
},
|
|
178
|
-
"evaluators": {
|
|
179
|
-
"type": "array",
|
|
180
|
-
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
181
|
-
"items": {
|
|
182
|
-
"type": "object",
|
|
183
|
-
"properties": {
|
|
184
|
-
"name": {
|
|
185
|
-
"type": "string",
|
|
186
|
-
"description": "Evaluator name/identifier"
|
|
187
|
-
},
|
|
188
|
-
"type": {
|
|
189
|
-
"type": "string",
|
|
190
|
-
"enum": ["code", "llm_judge"],
|
|
191
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
192
|
-
},
|
|
193
|
-
"script": {
|
|
194
|
-
"type": "string",
|
|
195
|
-
"description": "Path to evaluator script (for type: code)"
|
|
196
|
-
},
|
|
197
|
-
"prompt": {
|
|
198
|
-
"type": "string",
|
|
199
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
200
|
-
}
|
|
201
|
-
},
|
|
202
|
-
"required": ["name", "type"],
|
|
203
|
-
"additionalProperties": true
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
},
|
|
207
|
-
"additionalProperties": true
|
|
208
|
-
}
|
|
209
|
-
},
|
|
210
|
-
"required": ["id", "expected_outcome", "input_messages", "expected_messages"],
|
|
211
|
-
"additionalProperties": false
|
|
212
|
-
}
|
|
213
|
-
}
|
|
214
|
-
},
|
|
215
|
-
"required": ["evalcases"],
|
|
216
|
-
"additionalProperties": false
|
|
217
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Eval Schema",
|
|
4
|
+
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"$schema": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Schema identifier",
|
|
10
|
+
"enum": ["agentv-eval-v2"]
|
|
11
|
+
},
|
|
12
|
+
"description": {
|
|
13
|
+
"type": "string",
|
|
14
|
+
"description": "Description of what this eval suite covers"
|
|
15
|
+
},
|
|
16
|
+
"target": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
|
|
19
|
+
},
|
|
20
|
+
"execution": {
|
|
21
|
+
"type": "object",
|
|
22
|
+
"description": "Default execution configuration for all eval cases (can be overridden per case)",
|
|
23
|
+
"properties": {
|
|
24
|
+
"target": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
27
|
+
},
|
|
28
|
+
"evaluators": {
|
|
29
|
+
"type": "array",
|
|
30
|
+
"description": "Default evaluators for all eval cases (code-based and LLM judges)",
|
|
31
|
+
"items": {
|
|
32
|
+
"type": "object",
|
|
33
|
+
"properties": {
|
|
34
|
+
"name": {
|
|
35
|
+
"type": "string",
|
|
36
|
+
"description": "Evaluator name/identifier"
|
|
37
|
+
},
|
|
38
|
+
"type": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"enum": ["code", "llm_judge"],
|
|
41
|
+
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
42
|
+
},
|
|
43
|
+
"script": {
|
|
44
|
+
"type": "string",
|
|
45
|
+
"description": "Path to evaluator script (for type: code)"
|
|
46
|
+
},
|
|
47
|
+
"prompt": {
|
|
48
|
+
"type": "string",
|
|
49
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
"required": ["name", "type"],
|
|
53
|
+
"additionalProperties": true
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
},
|
|
57
|
+
"additionalProperties": true
|
|
58
|
+
},
|
|
59
|
+
"evalcases": {
|
|
60
|
+
"type": "array",
|
|
61
|
+
"description": "Array of evaluation cases",
|
|
62
|
+
"minItems": 1,
|
|
63
|
+
"items": {
|
|
64
|
+
"type": "object",
|
|
65
|
+
"properties": {
|
|
66
|
+
"id": {
|
|
67
|
+
"type": "string",
|
|
68
|
+
"description": "Unique identifier for the eval case"
|
|
69
|
+
},
|
|
70
|
+
"conversation_id": {
|
|
71
|
+
"type": "string",
|
|
72
|
+
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
73
|
+
},
|
|
74
|
+
"expected_outcome": {
|
|
75
|
+
"type": "string",
|
|
76
|
+
"description": "Description of what the AI should accomplish in this eval"
|
|
77
|
+
},
|
|
78
|
+
"note": {
|
|
79
|
+
"type": "string",
|
|
80
|
+
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
81
|
+
},
|
|
82
|
+
"input_messages": {
|
|
83
|
+
"type": "array",
|
|
84
|
+
"description": "Input messages for the conversation",
|
|
85
|
+
"minItems": 1,
|
|
86
|
+
"items": {
|
|
87
|
+
"type": "object",
|
|
88
|
+
"properties": {
|
|
89
|
+
"role": {
|
|
90
|
+
"type": "string",
|
|
91
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
92
|
+
"description": "Message role"
|
|
93
|
+
},
|
|
94
|
+
"content": {
|
|
95
|
+
"oneOf": [
|
|
96
|
+
{
|
|
97
|
+
"type": "string",
|
|
98
|
+
"description": "Simple text content"
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
"type": "array",
|
|
102
|
+
"description": "Mixed content items (text and file references)",
|
|
103
|
+
"items": {
|
|
104
|
+
"type": "object",
|
|
105
|
+
"properties": {
|
|
106
|
+
"type": {
|
|
107
|
+
"type": "string",
|
|
108
|
+
"enum": ["text", "file"],
|
|
109
|
+
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
110
|
+
},
|
|
111
|
+
"value": {
|
|
112
|
+
"type": "string",
|
|
113
|
+
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
|
|
114
|
+
}
|
|
115
|
+
},
|
|
116
|
+
"required": ["type", "value"],
|
|
117
|
+
"additionalProperties": false
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
]
|
|
121
|
+
}
|
|
122
|
+
},
|
|
123
|
+
"required": ["role", "content"],
|
|
124
|
+
"additionalProperties": false
|
|
125
|
+
}
|
|
126
|
+
},
|
|
127
|
+
"expected_messages": {
|
|
128
|
+
"type": "array",
|
|
129
|
+
"description": "Expected response messages",
|
|
130
|
+
"minItems": 1,
|
|
131
|
+
"items": {
|
|
132
|
+
"type": "object",
|
|
133
|
+
"properties": {
|
|
134
|
+
"role": {
|
|
135
|
+
"type": "string",
|
|
136
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
137
|
+
"description": "Message role"
|
|
138
|
+
},
|
|
139
|
+
"content": {
|
|
140
|
+
"oneOf": [
|
|
141
|
+
{
|
|
142
|
+
"type": "string",
|
|
143
|
+
"description": "Simple text content"
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"type": "array",
|
|
147
|
+
"description": "Mixed content items",
|
|
148
|
+
"items": {
|
|
149
|
+
"type": "object",
|
|
150
|
+
"properties": {
|
|
151
|
+
"type": {
|
|
152
|
+
"type": "string",
|
|
153
|
+
"enum": ["text", "file"]
|
|
154
|
+
},
|
|
155
|
+
"value": {
|
|
156
|
+
"type": "string"
|
|
157
|
+
}
|
|
158
|
+
},
|
|
159
|
+
"required": ["type", "value"],
|
|
160
|
+
"additionalProperties": false
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
]
|
|
164
|
+
}
|
|
165
|
+
},
|
|
166
|
+
"required": ["role", "content"],
|
|
167
|
+
"additionalProperties": false
|
|
168
|
+
}
|
|
169
|
+
},
|
|
170
|
+
"execution": {
|
|
171
|
+
"type": "object",
|
|
172
|
+
"description": "Per-case execution configuration",
|
|
173
|
+
"properties": {
|
|
174
|
+
"target": {
|
|
175
|
+
"type": "string",
|
|
176
|
+
"description": "Override target for this specific eval case"
|
|
177
|
+
},
|
|
178
|
+
"evaluators": {
|
|
179
|
+
"type": "array",
|
|
180
|
+
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
181
|
+
"items": {
|
|
182
|
+
"type": "object",
|
|
183
|
+
"properties": {
|
|
184
|
+
"name": {
|
|
185
|
+
"type": "string",
|
|
186
|
+
"description": "Evaluator name/identifier"
|
|
187
|
+
},
|
|
188
|
+
"type": {
|
|
189
|
+
"type": "string",
|
|
190
|
+
"enum": ["code", "llm_judge"],
|
|
191
|
+
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
192
|
+
},
|
|
193
|
+
"script": {
|
|
194
|
+
"type": "string",
|
|
195
|
+
"description": "Path to evaluator script (for type: code)"
|
|
196
|
+
},
|
|
197
|
+
"prompt": {
|
|
198
|
+
"type": "string",
|
|
199
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
200
|
+
}
|
|
201
|
+
},
|
|
202
|
+
"required": ["name", "type"],
|
|
203
|
+
"additionalProperties": true
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
},
|
|
207
|
+
"additionalProperties": true
|
|
208
|
+
}
|
|
209
|
+
},
|
|
210
|
+
"required": ["id", "expected_outcome", "input_messages", "expected_messages"],
|
|
211
|
+
"additionalProperties": false
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
},
|
|
215
|
+
"required": ["evalcases"],
|
|
216
|
+
"additionalProperties": false
|
|
217
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Example environment configuration for AgentV
|
|
2
|
+
# Copy this file to .env and fill in your credentials
|
|
3
|
+
|
|
4
|
+
# Model Provider Selection (Optional - can be configured via targets.yaml)
|
|
5
|
+
PROVIDER=azure
|
|
6
|
+
|
|
7
|
+
# Azure OpenAI Configuration
|
|
8
|
+
# These are the default environment variable names used in the provided targets.yaml
|
|
9
|
+
AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
|
|
10
|
+
AZURE_OPENAI_API_KEY=your-api-key-here
|
|
11
|
+
AZURE_DEPLOYMENT_NAME=gpt-4o
|
|
12
|
+
|
|
13
|
+
# Anthropic Configuration (if using Anthropic provider)
|
|
14
|
+
ANTHROPIC_API_KEY=your-anthropic-api-key-here
|
|
15
|
+
|
|
16
|
+
# VS Code Workspace Paths for Execution Targets
|
|
17
|
+
# Note: Using forward slashes is recommended for paths in .env files
|
|
18
|
+
# to avoid issues with escape characters.
|
|
19
|
+
PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
|
|
20
|
+
|
|
21
|
+
# CLI provider sample (used by the local_cli target)
|
|
22
|
+
PROJECT_ROOT=D:/GitHub/your-username/agentv/docs/examples/simple
|
|
23
|
+
LOCAL_AGENT_TOKEN=your-cli-token
|