agentv 2.6.0 → 2.7.1-next.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +128 -33
- package/dist/chunk-3L2L5GIL.js +51 -0
- package/dist/{chunk-BKMQNEUD.js.map → chunk-3L2L5GIL.js.map} +1 -1
- package/dist/{chunk-BKMQNEUD.js → chunk-5H446C7X.js} +2 -45
- package/dist/{chunk-LJVS3JAK.js → chunk-BL4PVUAT.js} +6 -4
- package/dist/{chunk-LJVS3JAK.js.map → chunk-BL4PVUAT.js.map} +1 -1
- package/dist/{chunk-MGK6HHRR.js → chunk-BWLYFF5N.js} +9813 -11237
- package/dist/chunk-BWLYFF5N.js.map +1 -0
- package/dist/chunk-C5GOHBQM.js +84 -0
- package/dist/chunk-C5GOHBQM.js.map +1 -0
- package/dist/chunk-EJEG3DU2.js +5476 -0
- package/dist/chunk-EJEG3DU2.js.map +1 -0
- package/dist/chunk-FV32QHPB.js +565 -0
- package/dist/chunk-FV32QHPB.js.map +1 -0
- package/dist/chunk-H5FFZCKI.js +2957 -0
- package/dist/chunk-H5FFZCKI.js.map +1 -0
- package/dist/chunk-JK6V4KVD.js +114 -0
- package/dist/chunk-JK6V4KVD.js.map +1 -0
- package/dist/chunk-LRULMAAA.js +1711 -0
- package/dist/chunk-LRULMAAA.js.map +1 -0
- package/dist/chunk-SR4I5KET.js +1238 -0
- package/dist/chunk-SR4I5KET.js.map +1 -0
- package/dist/chunk-VQ2ZO7XJ.js +2098 -0
- package/dist/chunk-VQ2ZO7XJ.js.map +1 -0
- package/dist/chunk-XALGXSKB.js +21 -0
- package/dist/chunk-XALGXSKB.js.map +1 -0
- package/dist/cli.js +8 -2
- package/dist/cli.js.map +1 -1
- package/dist/dist-R3OCWGXH.js +257 -0
- package/dist/dist-R3OCWGXH.js.map +1 -0
- package/dist/esm-5Q4BZALM-5REQWAUV.js +924 -0
- package/dist/esm-5Q4BZALM-5REQWAUV.js.map +1 -0
- package/dist/esm-DX3WQKEN.js +32 -0
- package/dist/esm-DX3WQKEN.js.map +1 -0
- package/dist/esm-QNEMCJPL.js +933 -0
- package/dist/esm-QNEMCJPL.js.map +1 -0
- package/dist/esm-R77SNOF5.js +65 -0
- package/dist/esm-R77SNOF5.js.map +1 -0
- package/dist/esm-RVQPUGWH.js +1207 -0
- package/dist/esm-RVQPUGWH.js.map +1 -0
- package/dist/getMachineId-bsd-HSK5LZMG.js +41 -0
- package/dist/getMachineId-bsd-HSK5LZMG.js.map +1 -0
- package/dist/getMachineId-darwin-4DP6CCJV.js +41 -0
- package/dist/getMachineId-darwin-4DP6CCJV.js.map +1 -0
- package/dist/getMachineId-linux-44LJ5UJB.js +33 -0
- package/dist/getMachineId-linux-44LJ5UJB.js.map +1 -0
- package/dist/getMachineId-unsupported-NVK6IATM.js +24 -0
- package/dist/getMachineId-unsupported-NVK6IATM.js.map +1 -0
- package/dist/getMachineId-win-YZ36S7VA.js +43 -0
- package/dist/getMachineId-win-YZ36S7VA.js.map +1 -0
- package/dist/index.js +10 -2
- package/dist/interactive-33TCZXLF.js +333 -0
- package/dist/interactive-33TCZXLF.js.map +1 -0
- package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js +9 -0
- package/dist/otlp-json-file-exporter-77FDBRSY-EZAPHWP6.js.map +1 -0
- package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js +9 -0
- package/dist/simple-trace-file-exporter-S76DMABU-5FCJESD2.js.map +1 -0
- package/dist/src-2N5EJ2N6.js +1733 -0
- package/dist/src-2N5EJ2N6.js.map +1 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +84 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +144 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +67 -0
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +101 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +433 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +36 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +118 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +251 -0
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +77 -0
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +50 -0
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +78 -0
- package/dist/templates/.agentv/.env.example +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +38 -13
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +9 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +4 -4
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +7 -9
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/dist/{token-D3IYDJQZ.js → token-POXF46NU.js} +6 -4
- package/dist/{token-D3IYDJQZ.js.map → token-POXF46NU.js.map} +1 -1
- package/dist/{token-util-FWFPR2BV.js → token-util-6GWYZWGE.js} +4 -3
- package/dist/token-util-6GWYZWGE.js.map +1 -0
- package/package.json +7 -3
- package/dist/chunk-MGK6HHRR.js.map +0 -1
- /package/dist/{token-util-FWFPR2BV.js.map → chunk-5H446C7X.js.map} +0 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "AgentV Eval Schema",
|
|
4
|
+
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
+
"type": "object",
|
|
6
|
+
"properties": {
|
|
7
|
+
"description": {
|
|
8
|
+
"type": "string",
|
|
9
|
+
"description": "Description of what this eval suite covers"
|
|
10
|
+
},
|
|
11
|
+
"target": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per test."
|
|
14
|
+
},
|
|
15
|
+
"execution": {
|
|
16
|
+
"type": "object",
|
|
17
|
+
"description": "Default execution configuration for all tests (can be overridden per test)",
|
|
18
|
+
"properties": {
|
|
19
|
+
"target": {
|
|
20
|
+
"type": "string",
|
|
21
|
+
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per test."
|
|
22
|
+
},
|
|
23
|
+
"evaluators": {
|
|
24
|
+
"type": "array",
|
|
25
|
+
"description": "Default evaluators appended to every test's evaluators (unless skip_defaults is set per test)",
|
|
26
|
+
"items": {
|
|
27
|
+
"type": "object",
|
|
28
|
+
"properties": {
|
|
29
|
+
"name": {
|
|
30
|
+
"type": "string",
|
|
31
|
+
"description": "Evaluator name/identifier"
|
|
32
|
+
},
|
|
33
|
+
"type": {
|
|
34
|
+
"type": "string",
|
|
35
|
+
"enum": [
|
|
36
|
+
"code",
|
|
37
|
+
"llm_judge",
|
|
38
|
+
"composite",
|
|
39
|
+
"tool_trajectory",
|
|
40
|
+
"field_accuracy",
|
|
41
|
+
"latency",
|
|
42
|
+
"cost",
|
|
43
|
+
"token_usage"
|
|
44
|
+
],
|
|
45
|
+
"description": "Evaluator type, one of the enum values above (e.g., 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation)"
|
|
46
|
+
},
|
|
47
|
+
"script": {
|
|
48
|
+
"type": "string",
|
|
49
|
+
"description": "Path to evaluator script (for type: code)"
|
|
50
|
+
},
|
|
51
|
+
"prompt": {
|
|
52
|
+
"type": "string",
|
|
53
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
54
|
+
}
|
|
55
|
+
},
|
|
56
|
+
"required": ["name", "type"],
|
|
57
|
+
"additionalProperties": true
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
"additionalProperties": true
|
|
62
|
+
},
|
|
63
|
+
"tests": {
|
|
64
|
+
"type": "array",
|
|
65
|
+
"description": "Array of evaluation tests",
|
|
66
|
+
"minItems": 1,
|
|
67
|
+
"items": {
|
|
68
|
+
"type": "object",
|
|
69
|
+
"properties": {
|
|
70
|
+
"id": {
|
|
71
|
+
"type": "string",
|
|
72
|
+
"description": "Unique identifier for the test"
|
|
73
|
+
},
|
|
74
|
+
"conversation_id": {
|
|
75
|
+
"type": "string",
|
|
76
|
+
"description": "Optional conversation identifier for threading multiple tests together"
|
|
77
|
+
},
|
|
78
|
+
"criteria": {
|
|
79
|
+
"type": "string",
|
|
80
|
+
"description": "Description of what the AI should accomplish in this eval"
|
|
81
|
+
},
|
|
82
|
+
"note": {
|
|
83
|
+
"type": "string",
|
|
84
|
+
"description": "Optional note or additional context for the test. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
85
|
+
},
|
|
86
|
+
"input": {
|
|
87
|
+
"description": "Input messages for the conversation. String expands to single user message, array of messages passes through.",
|
|
88
|
+
"oneOf": [
|
|
89
|
+
{
|
|
90
|
+
"type": "string",
|
|
91
|
+
"description": "Shorthand: single user message content"
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"type": "array",
|
|
95
|
+
"description": "Array of messages",
|
|
96
|
+
"minItems": 1,
|
|
97
|
+
"items": {
|
|
98
|
+
"type": "object",
|
|
99
|
+
"properties": {
|
|
100
|
+
"role": {
|
|
101
|
+
"type": "string",
|
|
102
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
103
|
+
"description": "Message role"
|
|
104
|
+
},
|
|
105
|
+
"content": {
|
|
106
|
+
"oneOf": [
|
|
107
|
+
{
|
|
108
|
+
"type": "string",
|
|
109
|
+
"description": "Simple text content"
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
"type": "array",
|
|
113
|
+
"description": "Mixed content items (text and file references)",
|
|
114
|
+
"items": {
|
|
115
|
+
"type": "object",
|
|
116
|
+
"properties": {
|
|
117
|
+
"type": {
|
|
118
|
+
"type": "string",
|
|
119
|
+
"enum": ["text", "file"],
|
|
120
|
+
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
121
|
+
},
|
|
122
|
+
"value": {
|
|
123
|
+
"type": "string",
|
|
124
|
+
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from the eval file's directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from the repo root."
|
|
125
|
+
}
|
|
126
|
+
},
|
|
127
|
+
"required": ["type", "value"],
|
|
128
|
+
"additionalProperties": false
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
]
|
|
132
|
+
}
|
|
133
|
+
},
|
|
134
|
+
"required": ["role", "content"],
|
|
135
|
+
"additionalProperties": false
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
]
|
|
139
|
+
},
|
|
140
|
+
"expected_output": {
|
|
141
|
+
"description": "Expected response messages. String expands to single assistant message, object wraps as assistant message content. The content of the last resolved entry becomes the template variable 'reference_answer'.",
|
|
142
|
+
"oneOf": [
|
|
143
|
+
{
|
|
144
|
+
"type": "string",
|
|
145
|
+
"description": "Shorthand: single assistant message content"
|
|
146
|
+
},
|
|
147
|
+
{
|
|
148
|
+
"type": "object",
|
|
149
|
+
"description": "Shorthand: structured content wraps as assistant message"
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
"type": "array",
|
|
153
|
+
"description": "Array of messages",
|
|
154
|
+
"items": {
|
|
155
|
+
"type": "object",
|
|
156
|
+
"properties": {
|
|
157
|
+
"role": {
|
|
158
|
+
"type": "string",
|
|
159
|
+
"enum": ["system", "user", "assistant", "tool"],
|
|
160
|
+
"description": "Message role"
|
|
161
|
+
},
|
|
162
|
+
"content": {
|
|
163
|
+
"oneOf": [
|
|
164
|
+
{
|
|
165
|
+
"type": "string",
|
|
166
|
+
"description": "Simple text content"
|
|
167
|
+
},
|
|
168
|
+
{
|
|
169
|
+
"type": "object",
|
|
170
|
+
"description": "Structured content object"
|
|
171
|
+
},
|
|
172
|
+
{
|
|
173
|
+
"type": "array",
|
|
174
|
+
"description": "Mixed content items",
|
|
175
|
+
"items": {
|
|
176
|
+
"type": "object",
|
|
177
|
+
"properties": {
|
|
178
|
+
"type": {
|
|
179
|
+
"type": "string",
|
|
180
|
+
"enum": ["text", "file"]
|
|
181
|
+
},
|
|
182
|
+
"value": {
|
|
183
|
+
"type": "string"
|
|
184
|
+
}
|
|
185
|
+
},
|
|
186
|
+
"required": ["type", "value"],
|
|
187
|
+
"additionalProperties": false
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
]
|
|
191
|
+
}
|
|
192
|
+
},
|
|
193
|
+
"required": ["role", "content"],
|
|
194
|
+
"additionalProperties": false
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
]
|
|
198
|
+
},
|
|
199
|
+
"execution": {
|
|
200
|
+
"type": "object",
|
|
201
|
+
"description": "Per-test execution configuration (overrides the root-level execution defaults)",
|
|
202
|
+
"properties": {
|
|
203
|
+
"target": {
|
|
204
|
+
"type": "string",
|
|
205
|
+
"description": "Override target for this specific test"
|
|
206
|
+
},
|
|
207
|
+
"skip_defaults": {
|
|
208
|
+
"type": "boolean",
|
|
209
|
+
"description": "When true, root-level execution.evaluators are not appended to this test's evaluators"
|
|
210
|
+
},
|
|
211
|
+
"evaluators": {
|
|
212
|
+
"type": "array",
|
|
213
|
+
"description": "Per-test evaluators (root-level evaluators are appended unless skip_defaults is true)",
|
|
214
|
+
"items": {
|
|
215
|
+
"type": "object",
|
|
216
|
+
"properties": {
|
|
217
|
+
"name": {
|
|
218
|
+
"type": "string",
|
|
219
|
+
"description": "Evaluator name/identifier"
|
|
220
|
+
},
|
|
221
|
+
"type": {
|
|
222
|
+
"type": "string",
|
|
223
|
+
"enum": ["code", "llm_judge"],
|
|
224
|
+
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
225
|
+
},
|
|
226
|
+
"script": {
|
|
227
|
+
"type": "string",
|
|
228
|
+
"description": "Path to evaluator script (for type: code)"
|
|
229
|
+
},
|
|
230
|
+
"prompt": {
|
|
231
|
+
"type": "string",
|
|
232
|
+
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
233
|
+
}
|
|
234
|
+
},
|
|
235
|
+
"required": ["name", "type"],
|
|
236
|
+
"additionalProperties": true
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
},
|
|
240
|
+
"additionalProperties": true
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
"required": ["id", "criteria"],
|
|
244
|
+
"anyOf": [{ "required": ["input"] }],
|
|
245
|
+
"additionalProperties": true
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
},
|
|
249
|
+
"required": ["tests"],
|
|
250
|
+
"additionalProperties": false
|
|
251
|
+
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Rubric Evaluator
|
|
2
|
+
|
|
3
|
+
## Field Reference
|
|
4
|
+
|
|
5
|
+
| Field | Type | Default | Description |
|
|
6
|
+
|-------|------|---------|-------------|
|
|
7
|
+
| `id` | string | auto-generated | Unique identifier |
|
|
8
|
+
| `outcome` | string | required* | Criterion being evaluated (*optional if `score_ranges` used) |
|
|
9
|
+
| `weight` | number | 1.0 | Relative importance |
|
|
10
|
+
| `required` | boolean | true | Failing forces verdict to 'fail' (checklist mode) |
|
|
11
|
+
| `required_min_score` | integer | - | Minimum 0-10 score to pass (score-range mode) |
|
|
12
|
+
| `score_ranges` | map or array | - | Score range definitions for analytic scoring |
|
|
13
|
+
|
|
14
|
+
## Checklist Mode
|
|
15
|
+
|
|
16
|
+
```yaml
|
|
17
|
+
rubrics:
|
|
18
|
+
- Mentions divide-and-conquer approach
|
|
19
|
+
- id: complexity
|
|
20
|
+
outcome: States time complexity correctly
|
|
21
|
+
weight: 2.0
|
|
22
|
+
required: true
|
|
23
|
+
- id: examples
|
|
24
|
+
outcome: Includes code examples
|
|
25
|
+
weight: 1.0
|
|
26
|
+
required: false
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Score-Range Mode
|
|
30
|
+
|
|
31
|
+
Shorthand map format (recommended):
|
|
32
|
+
|
|
33
|
+
```yaml
|
|
34
|
+
rubrics:
|
|
35
|
+
- id: correctness
|
|
36
|
+
weight: 2.0
|
|
37
|
+
required_min_score: 7
|
|
38
|
+
score_ranges:
|
|
39
|
+
0: Critical bugs
|
|
40
|
+
3: Minor bugs
|
|
41
|
+
6: Correct with minor issues
|
|
42
|
+
9: Fully correct
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Map keys are lower bounds (0-10). Each range extends from its key to (next key - 1), with the last extending to 10. Must start at 0.
|
|
46
|
+
|
|
47
|
+
Array format is also accepted:
|
|
48
|
+
|
|
49
|
+
```yaml
|
|
50
|
+
score_ranges:
|
|
51
|
+
- score_range: [0, 2]
|
|
52
|
+
outcome: Critical bugs
|
|
53
|
+
- score_range: [3, 5]
|
|
54
|
+
outcome: Minor bugs
|
|
55
|
+
- score_range: [6, 8]
|
|
56
|
+
outcome: Correct with minor issues
|
|
57
|
+
- score_range: [9, 10]
|
|
58
|
+
outcome: Fully correct
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Ranges must be integers 0-10, non-overlapping, covering all values 0-10.
|
|
62
|
+
|
|
63
|
+
## Scoring
|
|
64
|
+
|
|
65
|
+
**Checklist:** `score = sum(satisfied weights) / sum(all weights)`
|
|
66
|
+
|
|
67
|
+
**Score-range:** `score = weighted_average(raw_score / 10)` across criteria, where each criterion's 0-10 `raw_score` is weighted by its `weight`
|
|
68
|
+
|
|
69
|
+
## Verdicts
|
|
70
|
+
|
|
71
|
+
| Verdict | Condition |
|
|
72
|
+
|---------|-----------|
|
|
73
|
+
| `pass` | score >= 0.8 AND all gating criteria satisfied |
|
|
74
|
+
| `borderline` | 0.6 <= score < 0.8 AND all gating criteria satisfied |
|
|
75
|
+
| `fail` | score < 0.6 OR any gating criterion failed |
|
|
76
|
+
|
|
77
|
+
Gating: checklist uses `required: true`, score-range uses `required_min_score: N`.
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentv-eval-orchestrator
|
|
3
|
+
description: Run AgentV evaluations without API keys by orchestrating eval subcommands. Use this skill when asked to run evals, evaluate an agent, or test prompt quality using agentv.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentV Eval Orchestrator
|
|
7
|
+
|
|
8
|
+
Run AgentV evaluations by acting as the LLM yourself — no API keys needed.
|
|
9
|
+
|
|
10
|
+
## Quick Start
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
agentv prompt eval <eval-file.yaml>
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
This outputs a complete orchestration prompt with step-by-step instructions and all test IDs. Follow its instructions.
|
|
17
|
+
|
|
18
|
+
## Workflow
|
|
19
|
+
|
|
20
|
+
For each test, run these three steps:
|
|
21
|
+
|
|
22
|
+
### 1. Get Task Input
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
agentv prompt eval input <path> --test-id <id>
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Returns JSON with `input`, `guideline_paths`, and `criteria`. File references in messages use absolute paths — read them from the filesystem.
|
|
29
|
+
|
|
30
|
+
### 2. Execute the Task
|
|
31
|
+
|
|
32
|
+
You ARE the candidate LLM. Read `input` from step 1, read any referenced files, and answer the task. Save your response to a temp file.
|
|
33
|
+
|
|
34
|
+
**Important**: Do not leak `criteria` into your answer — it's for your reference when judging, not part of the task.
|
|
35
|
+
|
|
36
|
+
### 3. Judge the Result
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
agentv prompt eval judge <path> --test-id <id> --answer-file /tmp/eval_<id>.txt
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Returns JSON with an `evaluators` array. Each evaluator has a `status`:
|
|
43
|
+
|
|
44
|
+
- **`"completed"`** — Deterministic score is final. Read `result.score` (0.0–1.0).
|
|
45
|
+
- **`"prompt_ready"`** — LLM grading required. Act as the judge yourself: use `prompt.system_prompt` as the system message and `prompt.user_prompt` as the user message, then parse your JSON response to get `score`, `hits`, `misses`.
|
|
46
|
+
|
|
47
|
+
## When to use this vs `agentv eval`
|
|
48
|
+
|
|
49
|
+
- **`agentv eval`** — You have API keys configured. Runs everything end-to-end automatically.
|
|
50
|
+
- **`agentv prompt`** — No API keys. You orchestrate: get input, answer the task yourself, judge the result.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentv-prompt-optimizer
|
|
3
|
+
description: Iteratively optimize prompt files against AgentV evaluation datasets by analyzing failures and refining instructions.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentV Prompt Optimizer
|
|
7
|
+
|
|
8
|
+
## Input Variables
|
|
9
|
+
- `eval-path`: Path or glob pattern to the AgentV evaluation file(s) to optimize against
|
|
10
|
+
- `optimization-log-path` (optional): Path where optimization progress should be logged
|
|
11
|
+
|
|
12
|
+
## Workflow
|
|
13
|
+
|
|
14
|
+
1. **Initialize**
|
|
15
|
+
- Verify `<eval-path>` (file or glob) targets the correct system.
|
|
16
|
+
- **Identify Prompt Files**:
|
|
17
|
+
- Infer prompt files from the eval file content (look for `file:` references in `input` that point to prompt or instruction files).
|
|
18
|
+
- Recursively check referenced prompt files for *other* prompt references (dependencies).
|
|
19
|
+
- If multiple prompts are found, consider ALL of them as candidates for optimization.
|
|
20
|
+
- **Identify Optimization Log**:
|
|
21
|
+
- If `<optimization-log-path>` is provided, use it.
|
|
22
|
+
- If not, create a new one in the parent directory of the eval files: `optimization-[timestamp].md`.
|
|
23
|
+
- Read the content of each identified prompt file.
|
|
24
|
+
|
|
25
|
+
2. **Optimization Loop** (Max 10 iterations)
|
|
26
|
+
- **Execute (The Generator)**: Run `agentv eval <eval-path>`.
|
|
27
|
+
- *Targeted Run*: If iterating on specific stubborn failures, use `--test-id <test_id>` to run only the relevant tests.
|
|
28
|
+
- **Analyze (The Reflector)**:
|
|
29
|
+
- Locate the results file path from the console output (e.g., `.agentv/results/eval_...jsonl`).
|
|
30
|
+
- **Orchestrate Subagent**: Use `runSubagent` to analyze the results.
|
|
31
|
+
- **Task**: Read the results file, calculate pass rate, and perform root cause analysis.
|
|
32
|
+
- **Output**: Return a structured analysis including:
|
|
33
|
+
- **Score**: Current pass rate.
|
|
34
|
+
- **Root Cause**: Why failures occurred (e.g., "Ambiguous definition", "Hallucination").
|
|
35
|
+
- **Insight**: Key learning or pattern identified from the failures.
|
|
36
|
+
- **Strategy**: High-level plan to fix the prompt (e.g., "Clarify section X", "Add negative constraint").
|
|
37
|
+
- **Decide**:
|
|
38
|
+
- If **100% pass**: STOP and report success.
|
|
39
|
+
- If **Score decreased**: Revert last change, try different approach.
|
|
40
|
+
- If **No improvement** in 2 consecutive iterations: STOP and report stagnation.
|
|
41
|
+
- **Refine (The Curator)**:
|
|
42
|
+
- **Orchestrate Subagent**: Use `runSubagent` to apply the fix.
|
|
43
|
+
- **Task**: Read the relevant prompt file(s), apply the **Strategy** from the Reflector, and generate the log entry.
|
|
44
|
+
- **Output**: The **Log Entry** describing the specific operation performed.
|
|
45
|
+
```markdown
|
|
46
|
+
### Iteration [N]
|
|
47
|
+
- **Operation**: [ADD / UPDATE / DELETE]
|
|
48
|
+
- **Target**: [Section Name]
|
|
49
|
+
- **Change**: [Specific text added/modified]
|
|
50
|
+
- **Trigger**: [Specific failing test case or error pattern]
|
|
51
|
+
- **Rationale**: [From Reflector: Root Cause]
|
|
52
|
+
- **Score**: [From Reflector: Current Pass Rate]
|
|
53
|
+
- **Insight**: [From Reflector: Key Learning]
|
|
54
|
+
```
|
|
55
|
+
- **Strategy**: Treat the prompt as a structured set of rules. Execute atomic operations:
|
|
56
|
+
- **ADD**: Insert a new rule if a constraint was missed.
|
|
57
|
+
- **UPDATE**: Refine an existing rule to be clearer or more general.
|
|
58
|
+
- *Clarify*: Make ambiguous instructions specific.
|
|
59
|
+
- *Generalize*: Refactor specific fixes into high-level principles (First Principles).
|
|
60
|
+
- **DELETE**: Remove obsolete, redundant, or harmful rules.
|
|
61
|
+
- *Prune*: If a general rule covers specific cases, delete the specific ones.
|
|
62
|
+
- **Negative Constraint**: If hallucinating, explicitly state what NOT to do. Prefer generalized prohibitions over specific forbidden tokens where possible.
|
|
63
|
+
- **Safety Check**: Ensure new rules don't contradict existing ones (unless intended).
|
|
64
|
+
- **Constraint**: Avoid rewriting large sections. Make surgical, additive changes to preserve existing behavior.
|
|
65
|
+
- **Log Result**:
|
|
66
|
+
- Append the **Log Entry** returned by the Curator to the optimization log file.
|
|
67
|
+
|
|
68
|
+
3. **Completion**
|
|
69
|
+
- Report final score.
|
|
70
|
+
- Summarize key changes made to the prompt.
|
|
71
|
+
- **Finalize Optimization Log**: Add a summary header to the optimization log file indicating the session completion and final score.
|
|
72
|
+
|
|
73
|
+
## Guidelines
|
|
74
|
+
- **Generalization First**: Prefer broad, principle-based guidelines over specific examples or "hotfixes". Only use specific rules if generalized instructions fail to achieve the desired score.
|
|
75
|
+
- **Simplicity ("Less is More")**: Avoid overfitting to the test set. If a specific rule doesn't significantly improve the score compared to a general one, choose the general one.
|
|
76
|
+
- **Structure**: Maintain existing Markdown headers/sections.
|
|
77
|
+
- **Progressive Disclosure**: If the prompt grows too large (>200 lines), consider moving specialized logic into a separate file or skill.
|
|
78
|
+
- **Quality Criteria**: Ensure the prompt defines a clear persona, specific task, and measurable success criteria.
|
|
@@ -1,23 +1,23 @@
|
|
|
1
|
-
# Copy this file to .env and fill in your credentials
|
|
2
|
-
|
|
3
|
-
# Azure OpenAI Configuration
|
|
4
|
-
AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
|
|
5
|
-
AZURE_OPENAI_API_KEY=your-openai-api-key-here
|
|
6
|
-
AZURE_DEPLOYMENT_NAME=gpt-5-chat
|
|
7
|
-
AZURE_OPENAI_API_VERSION=2024-12-01-preview
|
|
8
|
-
|
|
9
|
-
# Google Gemini
|
|
10
|
-
GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
|
|
11
|
-
GEMINI_MODEL_NAME=gemini-2.5-flash
|
|
12
|
-
|
|
13
|
-
# Anthropic
|
|
14
|
-
ANTHROPIC_API_KEY=your-anthropic-api-key-here
|
|
15
|
-
|
|
16
|
-
# VS Code Workspace Paths for Execution Targets
|
|
17
|
-
# Note: Using forward slashes is recommended for paths in .env files
|
|
18
|
-
# to avoid issues with escape characters.
|
|
19
|
-
PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
|
|
20
|
-
|
|
21
|
-
# CLI provider sample (used by the local_cli target)
|
|
22
|
-
CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
|
|
23
|
-
LOCAL_AGENT_TOKEN=dummytoken
|
|
1
|
+
# Copy this file to .env and fill in your credentials
|
|
2
|
+
|
|
3
|
+
# Azure OpenAI Configuration
|
|
4
|
+
AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
|
|
5
|
+
AZURE_OPENAI_API_KEY=your-openai-api-key-here
|
|
6
|
+
AZURE_DEPLOYMENT_NAME=gpt-5-chat
|
|
7
|
+
AZURE_OPENAI_API_VERSION=2024-12-01-preview
|
|
8
|
+
|
|
9
|
+
# Google Gemini
|
|
10
|
+
GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
|
|
11
|
+
GEMINI_MODEL_NAME=gemini-2.5-flash
|
|
12
|
+
|
|
13
|
+
# Anthropic
|
|
14
|
+
ANTHROPIC_API_KEY=your-anthropic-api-key-here
|
|
15
|
+
|
|
16
|
+
# VS Code Workspace Paths for Execution Targets
|
|
17
|
+
# Note: Using forward slashes is recommended for paths in .env files
|
|
18
|
+
# to avoid issues with escape characters.
|
|
19
|
+
PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
|
|
20
|
+
|
|
21
|
+
# CLI provider sample (used by the local_cli target)
|
|
22
|
+
CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
|
|
23
|
+
LOCAL_AGENT_TOKEN=dummytoken
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
$schema: agentv-config-v2
|
|
2
|
-
|
|
3
|
-
# Customize which files are treated as guidelines vs regular file content
|
|
4
|
-
|
|
5
|
-
# Custom guideline patterns:
|
|
6
|
-
guideline_patterns:
|
|
7
|
-
- "**/*.instructions.md"
|
|
8
|
-
- "**/*.prompt.md"
|
|
9
|
-
- "**/SKILL.md"
|
|
10
|
-
|
|
11
|
-
# Notes:
|
|
12
|
-
# - Patterns use standard glob syntax (via micromatch library)
|
|
13
|
-
# - Paths are normalized to forward slashes for cross-platform compatibility
|
|
14
|
-
# - Only files matching these patterns are loaded as guidelines
|
|
15
|
-
# - All other files referenced in eval cases are treated as regular file content
|
|
1
|
+
$schema: agentv-config-v2
|
|
2
|
+
|
|
3
|
+
# Customize which files are treated as guidelines vs regular file content
|
|
4
|
+
|
|
5
|
+
# Custom guideline patterns:
|
|
6
|
+
guideline_patterns:
|
|
7
|
+
- "**/*.instructions.md"
|
|
8
|
+
- "**/*.prompt.md"
|
|
9
|
+
- "**/SKILL.md"
|
|
10
|
+
|
|
11
|
+
# Notes:
|
|
12
|
+
# - Patterns use standard glob syntax (via micromatch library)
|
|
13
|
+
# - Paths are normalized to forward slashes for cross-platform compatibility
|
|
14
|
+
# - Only files matching these patterns are loaded as guidelines
|
|
15
|
+
# - All other files referenced in eval cases are treated as regular file content
|
|
@@ -14,9 +14,9 @@ description: Example eval
|
|
|
14
14
|
execution:
|
|
15
15
|
target: default
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
cases:
|
|
18
18
|
- id: greeting
|
|
19
|
-
|
|
19
|
+
criteria: Friendly greeting
|
|
20
20
|
input: "Say hello"
|
|
21
21
|
expected_output: "Hello! How can I help you?"
|
|
22
22
|
rubrics:
|
|
@@ -26,7 +26,7 @@ evalcases:
|
|
|
26
26
|
|
|
27
27
|
## Eval File Structure
|
|
28
28
|
|
|
29
|
-
**Required:** `
|
|
29
|
+
**Required:** `cases` (array)
|
|
30
30
|
**Optional:** `description`, `execution`, `dataset`
|
|
31
31
|
|
|
32
32
|
**Eval case fields:**
|
|
@@ -34,7 +34,7 @@ evalcases:
|
|
|
34
34
|
| Field | Required | Description |
|
|
35
35
|
|-------|----------|-------------|
|
|
36
36
|
| `id` | yes | Unique identifier |
|
|
37
|
-
| `
|
|
37
|
+
| `criteria` | yes | What the response should accomplish |
|
|
38
38
|
| `input` / `input_messages` | yes | Input to the agent |
|
|
39
39
|
| `expected_output` / `expected_messages` | no | Gold-standard reference answer |
|
|
40
40
|
| `rubrics` | no | Inline evaluation criteria |
|
|
@@ -65,7 +65,9 @@ Configure via `execution.evaluators` array. Multiple evaluators produce a weight
|
|
|
65
65
|
target: {} # optional: enable LLM target proxy (max_calls: 50)
|
|
66
66
|
```
|
|
67
67
|
Contract: stdin JSON -> stdout JSON `{score, hits, misses, reasoning}`
|
|
68
|
-
|
|
68
|
+
Input includes: `question`, `criteria`, `candidate_answer`, `reference_answer`, `output_messages`, `trace_summary`, `file_changes`, `workspace_path`, `config`
|
|
69
|
+
When `workspace_template` is configured, `workspace_path` is the absolute path to the workspace dir (also available as `AGENTV_WORKSPACE_PATH` env var). Use this for functional grading (e.g., running `npm test` in the workspace).
|
|
70
|
+
See docs at https://agentv.dev/evaluators/code-judges/
|
|
69
71
|
|
|
70
72
|
### llm_judge
|
|
71
73
|
```yaml
|
|
@@ -76,7 +78,7 @@ See `references/custom-evaluators.md` for templates.
|
|
|
76
78
|
config: # passed to script templates as context.config
|
|
77
79
|
strictness: high
|
|
78
80
|
```
|
|
79
|
-
Variables: `{{question}}`, `{{
|
|
81
|
+
Variables: `{{question}}`, `{{criteria}}`, `{{candidate_answer}}`, `{{reference_answer}}`, `{{input_messages}}`, `{{expected_messages}}`, `{{output_messages}}`, `{{file_changes}}`
|
|
80
82
|
- Markdown templates: use `{{variable}}` syntax
|
|
81
83
|
- TypeScript templates: use `definePromptTemplate(fn)` from `@agentv/eval`, receives context object with all variables + `config`
|
|
82
84
|
|
|
@@ -144,12 +146,27 @@ Compares `output_messages` fields against `expected_messages` fields.
|
|
|
144
146
|
max_total_tokens: 4000
|
|
145
147
|
```
|
|
146
148
|
|
|
149
|
+
### execution_metrics
|
|
150
|
+
```yaml
|
|
151
|
+
- name: efficiency
|
|
152
|
+
type: execution_metrics
|
|
153
|
+
max_tool_calls: 10 # Maximum tool invocations
|
|
154
|
+
max_llm_calls: 5 # Maximum LLM calls (assistant messages)
|
|
155
|
+
max_tokens: 5000 # Maximum total tokens (input + output)
|
|
156
|
+
max_cost_usd: 0.05 # Maximum cost in USD
|
|
157
|
+
max_duration_ms: 30000 # Maximum execution duration
|
|
158
|
+
target_exploration_ratio: 0.6 # Target ratio of read-only tool calls
|
|
159
|
+
exploration_tolerance: 0.2 # Tolerance for ratio check (default: 0.2)
|
|
160
|
+
```
|
|
161
|
+
Declarative threshold-based checks on execution metrics. Only specified thresholds are checked.
|
|
162
|
+
Score is proportional: `hits / (hits + misses)`. Missing data counts as a miss.
|
|
163
|
+
|
|
147
164
|
### rubric (inline)
|
|
148
165
|
```yaml
|
|
149
166
|
rubrics:
|
|
150
167
|
- Simple string criterion
|
|
151
168
|
- id: weighted
|
|
152
|
-
|
|
169
|
+
criteria: Detailed criterion
|
|
153
170
|
weight: 2.0
|
|
154
171
|
required: true
|
|
155
172
|
```
|
|
@@ -158,17 +175,25 @@ See `references/rubric-evaluator.md` for score-range mode and scoring formula.
|
|
|
158
175
|
## CLI Commands
|
|
159
176
|
|
|
160
177
|
```bash
|
|
161
|
-
# Run evaluation
|
|
162
|
-
|
|
178
|
+
# Run evaluation (requires API keys)
|
|
179
|
+
agentv eval <file.yaml> [--eval-id <id>] [--target <name>] [--dry-run]
|
|
180
|
+
|
|
181
|
+
# Run with trace persistence (writes to .agentv/traces/)
|
|
182
|
+
agentv eval <file.yaml> --trace
|
|
183
|
+
|
|
184
|
+
# Agent-orchestrated evals (no API keys needed)
|
|
185
|
+
agentv eval prompt <file.yaml> # orchestration overview
|
|
186
|
+
agentv eval prompt input <file.yaml> --eval-id <id> # task input JSON (file paths, not embedded content)
|
|
187
|
+
agentv eval prompt judge <file.yaml> --eval-id <id> --answer-file f # judge prompts / code judge results
|
|
163
188
|
|
|
164
189
|
# Validate eval file
|
|
165
|
-
|
|
190
|
+
agentv validate <file.yaml>
|
|
166
191
|
|
|
167
192
|
# Compare results between runs
|
|
168
|
-
|
|
193
|
+
agentv compare <results1.jsonl> <results2.jsonl>
|
|
169
194
|
|
|
170
|
-
# Generate rubrics from
|
|
171
|
-
|
|
195
|
+
# Generate rubrics from criteria
|
|
196
|
+
agentv generate rubrics <file.yaml> [--target <name>]
|
|
172
197
|
```
|
|
173
198
|
|
|
174
199
|
## Schemas
|