agentv 3.10.2 → 3.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6UE665XI.js → chunk-7LC3VNOC.js} +4 -4
- package/dist/{chunk-KGK5NUFG.js → chunk-JUQCB3ZW.js} +56 -15
- package/dist/chunk-JUQCB3ZW.js.map +1 -0
- package/dist/{chunk-F7LAJMTO.js → chunk-U556GRI3.js} +4 -4
- package/dist/{chunk-F7LAJMTO.js.map → chunk-U556GRI3.js.map} +1 -1
- package/dist/cli.js +3 -3
- package/dist/{dist-3QUJEJUT.js → dist-2X7A3TTC.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-EO6AR2R3.js → interactive-CSA4KIND.js} +3 -3
- package/dist/templates/.agentv/.env.example +9 -11
- package/dist/templates/.agentv/config.yaml +13 -4
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-KGK5NUFG.js.map +0 -1
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
- /package/dist/{chunk-6UE665XI.js.map → chunk-7LC3VNOC.js.map} +0 -0
- /package/dist/{dist-3QUJEJUT.js.map → dist-2X7A3TTC.js.map} +0 -0
- /package/dist/{interactive-EO6AR2R3.js.map → interactive-CSA4KIND.js.map} +0 -0
|
@@ -1,278 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"title": "AgentV Eval Schema",
|
|
4
|
-
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
|
-
"type": "object",
|
|
6
|
-
"properties": {
|
|
7
|
-
"description": {
|
|
8
|
-
"type": "string",
|
|
9
|
-
"description": "Description of what this eval suite covers"
|
|
10
|
-
},
|
|
11
|
-
"target": {
|
|
12
|
-
"type": "string",
|
|
13
|
-
"description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
|
|
14
|
-
},
|
|
15
|
-
"execution": {
|
|
16
|
-
"type": "object",
|
|
17
|
-
"description": "Default execution configuration for all eval cases (can be overridden per case)",
|
|
18
|
-
"properties": {
|
|
19
|
-
"target": {
|
|
20
|
-
"type": "string",
|
|
21
|
-
"description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
|
|
22
|
-
},
|
|
23
|
-
"evaluators": {
|
|
24
|
-
"type": "array",
|
|
25
|
-
"description": "Default evaluators for all eval cases (code-based and LLM judges)",
|
|
26
|
-
"items": {
|
|
27
|
-
"type": "object",
|
|
28
|
-
"properties": {
|
|
29
|
-
"name": {
|
|
30
|
-
"type": "string",
|
|
31
|
-
"description": "Evaluator name/identifier"
|
|
32
|
-
},
|
|
33
|
-
"type": {
|
|
34
|
-
"type": "string",
|
|
35
|
-
"enum": [
|
|
36
|
-
"code",
|
|
37
|
-
"llm_judge",
|
|
38
|
-
"composite",
|
|
39
|
-
"tool_trajectory",
|
|
40
|
-
"field_accuracy",
|
|
41
|
-
"latency",
|
|
42
|
-
"cost",
|
|
43
|
-
"token_usage"
|
|
44
|
-
],
|
|
45
|
-
"description": "Evaluator type; must be one of the listed enum values (e.g., 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation)"
|
|
46
|
-
},
|
|
47
|
-
"script": {
|
|
48
|
-
"type": "string",
|
|
49
|
-
"description": "Path to evaluator script (for type: code)"
|
|
50
|
-
},
|
|
51
|
-
"prompt": {
|
|
52
|
-
"type": "string",
|
|
53
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
54
|
-
}
|
|
55
|
-
},
|
|
56
|
-
"required": ["name", "type"],
|
|
57
|
-
"additionalProperties": true
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
},
|
|
61
|
-
"additionalProperties": true
|
|
62
|
-
},
|
|
63
|
-
"evalcases": {
|
|
64
|
-
"type": "array",
|
|
65
|
-
"description": "Array of evaluation cases",
|
|
66
|
-
"minItems": 1,
|
|
67
|
-
"items": {
|
|
68
|
-
"type": "object",
|
|
69
|
-
"properties": {
|
|
70
|
-
"id": {
|
|
71
|
-
"type": "string",
|
|
72
|
-
"description": "Unique identifier for the eval case"
|
|
73
|
-
},
|
|
74
|
-
"conversation_id": {
|
|
75
|
-
"type": "string",
|
|
76
|
-
"description": "Optional conversation identifier for threading multiple eval cases together"
|
|
77
|
-
},
|
|
78
|
-
"expected_outcome": {
|
|
79
|
-
"type": "string",
|
|
80
|
-
"description": "Description of what the AI should accomplish in this eval"
|
|
81
|
-
},
|
|
82
|
-
"note": {
|
|
83
|
-
"type": "string",
|
|
84
|
-
"description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
|
|
85
|
-
},
|
|
86
|
-
"input_messages": {
|
|
87
|
-
"type": "array",
|
|
88
|
-
"description": "Input messages for the conversation",
|
|
89
|
-
"minItems": 1,
|
|
90
|
-
"items": {
|
|
91
|
-
"type": "object",
|
|
92
|
-
"properties": {
|
|
93
|
-
"role": {
|
|
94
|
-
"type": "string",
|
|
95
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
96
|
-
"description": "Message role"
|
|
97
|
-
},
|
|
98
|
-
"content": {
|
|
99
|
-
"oneOf": [
|
|
100
|
-
{
|
|
101
|
-
"type": "string",
|
|
102
|
-
"description": "Simple text content"
|
|
103
|
-
},
|
|
104
|
-
{
|
|
105
|
-
"type": "array",
|
|
106
|
-
"description": "Mixed content items (text and file references)",
|
|
107
|
-
"items": {
|
|
108
|
-
"type": "object",
|
|
109
|
-
"properties": {
|
|
110
|
-
"type": {
|
|
111
|
-
"type": "string",
|
|
112
|
-
"enum": ["text", "file"],
|
|
113
|
-
"description": "Content type: 'text' for inline content, 'file' for file references"
|
|
114
|
-
},
|
|
115
|
-
"value": {
|
|
116
|
-
"type": "string",
|
|
117
|
-
"description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
|
|
118
|
-
}
|
|
119
|
-
},
|
|
120
|
-
"required": ["type", "value"],
|
|
121
|
-
"additionalProperties": false
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
]
|
|
125
|
-
}
|
|
126
|
-
},
|
|
127
|
-
"required": ["role", "content"],
|
|
128
|
-
"additionalProperties": false
|
|
129
|
-
}
|
|
130
|
-
},
|
|
131
|
-
"input": {
|
|
132
|
-
"description": "Alias for input_messages with shorthand support. String expands to single user message, array of messages passes through.",
|
|
133
|
-
"oneOf": [
|
|
134
|
-
{
|
|
135
|
-
"type": "string",
|
|
136
|
-
"description": "Shorthand: single user message content"
|
|
137
|
-
},
|
|
138
|
-
{
|
|
139
|
-
"type": "array",
|
|
140
|
-
"description": "Array of messages (same format as input_messages)",
|
|
141
|
-
"items": {
|
|
142
|
-
"type": "object",
|
|
143
|
-
"properties": {
|
|
144
|
-
"role": {
|
|
145
|
-
"type": "string",
|
|
146
|
-
"enum": ["system", "user", "assistant", "tool"]
|
|
147
|
-
},
|
|
148
|
-
"content": {
|
|
149
|
-
"oneOf": [{ "type": "string" }, { "type": "array" }]
|
|
150
|
-
}
|
|
151
|
-
},
|
|
152
|
-
"required": ["role", "content"]
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
]
|
|
156
|
-
},
|
|
157
|
-
"expected_messages": {
|
|
158
|
-
"type": "array",
|
|
159
|
-
"description": "Expected response messages. Canonical form — use this or expected_output (alias). The content of the last entry is derived as the template variable 'reference_answer' for evaluator prompts.",
|
|
160
|
-
"minItems": 1,
|
|
161
|
-
"items": {
|
|
162
|
-
"type": "object",
|
|
163
|
-
"properties": {
|
|
164
|
-
"role": {
|
|
165
|
-
"type": "string",
|
|
166
|
-
"enum": ["system", "user", "assistant", "tool"],
|
|
167
|
-
"description": "Message role"
|
|
168
|
-
},
|
|
169
|
-
"content": {
|
|
170
|
-
"oneOf": [
|
|
171
|
-
{
|
|
172
|
-
"type": "string",
|
|
173
|
-
"description": "Simple text content"
|
|
174
|
-
},
|
|
175
|
-
{
|
|
176
|
-
"type": "array",
|
|
177
|
-
"description": "Mixed content items",
|
|
178
|
-
"items": {
|
|
179
|
-
"type": "object",
|
|
180
|
-
"properties": {
|
|
181
|
-
"type": {
|
|
182
|
-
"type": "string",
|
|
183
|
-
"enum": ["text", "file"]
|
|
184
|
-
},
|
|
185
|
-
"value": {
|
|
186
|
-
"type": "string"
|
|
187
|
-
}
|
|
188
|
-
},
|
|
189
|
-
"required": ["type", "value"],
|
|
190
|
-
"additionalProperties": false
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
]
|
|
194
|
-
}
|
|
195
|
-
},
|
|
196
|
-
"required": ["role", "content"],
|
|
197
|
-
"additionalProperties": false
|
|
198
|
-
}
|
|
199
|
-
},
|
|
200
|
-
"expected_output": {
|
|
201
|
-
"description": "Alias for expected_messages with shorthand support. String expands to single assistant message, object wraps as assistant message content. Resolves to expected_messages internally — the content of the last resolved entry becomes the template variable 'reference_answer'.",
|
|
202
|
-
"oneOf": [
|
|
203
|
-
{
|
|
204
|
-
"type": "string",
|
|
205
|
-
"description": "Shorthand: single assistant message content"
|
|
206
|
-
},
|
|
207
|
-
{
|
|
208
|
-
"type": "object",
|
|
209
|
-
"description": "Shorthand: structured content wraps as assistant message"
|
|
210
|
-
},
|
|
211
|
-
{
|
|
212
|
-
"type": "array",
|
|
213
|
-
"description": "Array of messages (same format as expected_messages)",
|
|
214
|
-
"items": {
|
|
215
|
-
"type": "object",
|
|
216
|
-
"properties": {
|
|
217
|
-
"role": {
|
|
218
|
-
"type": "string",
|
|
219
|
-
"enum": ["system", "user", "assistant", "tool"]
|
|
220
|
-
},
|
|
221
|
-
"content": {
|
|
222
|
-
"oneOf": [{ "type": "string" }, { "type": "object" }, { "type": "array" }]
|
|
223
|
-
}
|
|
224
|
-
},
|
|
225
|
-
"required": ["role", "content"]
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
]
|
|
229
|
-
},
|
|
230
|
-
"execution": {
|
|
231
|
-
"type": "object",
|
|
232
|
-
"description": "Per-case execution configuration",
|
|
233
|
-
"properties": {
|
|
234
|
-
"target": {
|
|
235
|
-
"type": "string",
|
|
236
|
-
"description": "Override target for this specific eval case"
|
|
237
|
-
},
|
|
238
|
-
"evaluators": {
|
|
239
|
-
"type": "array",
|
|
240
|
-
"description": "Multiple evaluators (code-based and LLM judges)",
|
|
241
|
-
"items": {
|
|
242
|
-
"type": "object",
|
|
243
|
-
"properties": {
|
|
244
|
-
"name": {
|
|
245
|
-
"type": "string",
|
|
246
|
-
"description": "Evaluator name/identifier"
|
|
247
|
-
},
|
|
248
|
-
"type": {
|
|
249
|
-
"type": "string",
|
|
250
|
-
"enum": ["code", "llm_judge"],
|
|
251
|
-
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
|
|
252
|
-
},
|
|
253
|
-
"script": {
|
|
254
|
-
"type": "string",
|
|
255
|
-
"description": "Path to evaluator script (for type: code)"
|
|
256
|
-
},
|
|
257
|
-
"prompt": {
|
|
258
|
-
"type": "string",
|
|
259
|
-
"description": "Path to judge prompt file (for type: llm_judge)"
|
|
260
|
-
}
|
|
261
|
-
},
|
|
262
|
-
"required": ["name", "type"],
|
|
263
|
-
"additionalProperties": true
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
},
|
|
267
|
-
"additionalProperties": true
|
|
268
|
-
}
|
|
269
|
-
},
|
|
270
|
-
"required": ["id", "expected_outcome"],
|
|
271
|
-
"anyOf": [{ "required": ["input_messages"] }, { "required": ["input"] }],
|
|
272
|
-
"additionalProperties": true
|
|
273
|
-
}
|
|
274
|
-
}
|
|
275
|
-
},
|
|
276
|
-
"required": ["evalcases"],
|
|
277
|
-
"additionalProperties": false
|
|
278
|
-
}
|
|
@@ -1,333 +0,0 @@
|
|
|
1
|
-
# Example Eval Files
|
|
2
|
-
|
|
3
|
-
This document contains complete examples of well-structured eval files demonstrating various AgentV patterns and best practices.
|
|
4
|
-
|
|
5
|
-
## Basic Example: Simple Q&A Eval
|
|
6
|
-
|
|
7
|
-
```yaml
|
|
8
|
-
description: Basic arithmetic evaluation
|
|
9
|
-
execution:
|
|
10
|
-
target: default
|
|
11
|
-
|
|
12
|
-
evalcases:
|
|
13
|
-
- id: simple-addition
|
|
14
|
-
expected_outcome: Correctly calculates 2+2
|
|
15
|
-
|
|
16
|
-
input_messages:
|
|
17
|
-
- role: user
|
|
18
|
-
content: What is 2 + 2?
|
|
19
|
-
|
|
20
|
-
expected_messages:
|
|
21
|
-
- role: assistant
|
|
22
|
-
content: "4"
|
|
23
|
-
```
|
|
24
|
-
|
|
25
|
-
## Code Review with File References
|
|
26
|
-
|
|
27
|
-
```yaml
|
|
28
|
-
description: Code review with guidelines
|
|
29
|
-
execution:
|
|
30
|
-
target: azure_base
|
|
31
|
-
|
|
32
|
-
evalcases:
|
|
33
|
-
- id: code-review-basic
|
|
34
|
-
expected_outcome: Assistant provides helpful code analysis with security considerations
|
|
35
|
-
|
|
36
|
-
input_messages:
|
|
37
|
-
- role: system
|
|
38
|
-
content: You are an expert code reviewer.
|
|
39
|
-
- role: user
|
|
40
|
-
content:
|
|
41
|
-
- type: text
|
|
42
|
-
value: |-
|
|
43
|
-
Review this function for security issues:
|
|
44
|
-
|
|
45
|
-
```python
|
|
46
|
-
def get_user(user_id):
|
|
47
|
-
query = f"SELECT * FROM users WHERE id = {user_id}"
|
|
48
|
-
return db.execute(query)
|
|
49
|
-
```
|
|
50
|
-
- type: file
|
|
51
|
-
value: /prompts/security-guidelines.md
|
|
52
|
-
|
|
53
|
-
expected_messages:
|
|
54
|
-
- role: assistant
|
|
55
|
-
content: |-
|
|
56
|
-
This code has a critical SQL injection vulnerability. The user_id is directly
|
|
57
|
-
interpolated into the query string without sanitization.
|
|
58
|
-
|
|
59
|
-
Recommended fix:
|
|
60
|
-
```python
|
|
61
|
-
def get_user(user_id):
|
|
62
|
-
query = "SELECT * FROM users WHERE id = ?"
|
|
63
|
-
return db.execute(query, (user_id,))
|
|
64
|
-
```
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
## Multi-Evaluator Configuration
|
|
68
|
-
|
|
69
|
-
```yaml
|
|
70
|
-
description: JSON generation with validation
|
|
71
|
-
execution:
|
|
72
|
-
target: default
|
|
73
|
-
|
|
74
|
-
evalcases:
|
|
75
|
-
- id: json-generation-with-validation
|
|
76
|
-
expected_outcome: Generates valid JSON with required fields
|
|
77
|
-
|
|
78
|
-
execution:
|
|
79
|
-
evaluators:
|
|
80
|
-
- name: json_format_validator
|
|
81
|
-
type: code_judge
|
|
82
|
-
script: uv run validate_json.py
|
|
83
|
-
cwd: ./evaluators
|
|
84
|
-
- name: content_evaluator
|
|
85
|
-
type: llm_judge
|
|
86
|
-
prompt: ./judges/semantic_correctness.md
|
|
87
|
-
|
|
88
|
-
input_messages:
|
|
89
|
-
- role: user
|
|
90
|
-
content: |-
|
|
91
|
-
Generate a JSON object for a user with name "Alice",
|
|
92
|
-
email "alice@example.com", and role "admin".
|
|
93
|
-
|
|
94
|
-
expected_messages:
|
|
95
|
-
- role: assistant
|
|
96
|
-
content: |-
|
|
97
|
-
{
|
|
98
|
-
"name": "Alice",
|
|
99
|
-
"email": "alice@example.com",
|
|
100
|
-
"role": "admin"
|
|
101
|
-
}
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Tool Trajectory Evaluation
|
|
105
|
-
|
|
106
|
-
Validate that an agent uses specific tools during execution.
|
|
107
|
-
|
|
108
|
-
```yaml
|
|
109
|
-
description: Tool usage validation
|
|
110
|
-
execution:
|
|
111
|
-
target: mock_agent
|
|
112
|
-
|
|
113
|
-
evalcases:
|
|
114
|
-
# Validate minimum tool usage (order doesn't matter)
|
|
115
|
-
- id: research-depth
|
|
116
|
-
expected_outcome: Agent researches thoroughly
|
|
117
|
-
input_messages:
|
|
118
|
-
- role: user
|
|
119
|
-
content: Research REST vs GraphQL
|
|
120
|
-
execution:
|
|
121
|
-
evaluators:
|
|
122
|
-
- name: research-check
|
|
123
|
-
type: tool_trajectory
|
|
124
|
-
mode: any_order
|
|
125
|
-
minimums:
|
|
126
|
-
knowledgeSearch: 2
|
|
127
|
-
documentRetrieve: 1
|
|
128
|
-
|
|
129
|
-
# Validate exact tool sequence
|
|
130
|
-
- id: auth-flow
|
|
131
|
-
expected_outcome: Agent follows auth sequence
|
|
132
|
-
input_messages:
|
|
133
|
-
- role: user
|
|
134
|
-
content: Authenticate user
|
|
135
|
-
execution:
|
|
136
|
-
evaluators:
|
|
137
|
-
- name: auth-sequence
|
|
138
|
-
type: tool_trajectory
|
|
139
|
-
mode: exact
|
|
140
|
-
expected:
|
|
141
|
-
- tool: checkCredentials
|
|
142
|
-
- tool: generateToken
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
## Static Trace Evaluation
|
|
146
|
-
|
|
147
|
-
Evaluate pre-existing trace files without running an agent.
|
|
148
|
-
|
|
149
|
-
```yaml
|
|
150
|
-
description: Static trace evaluation
|
|
151
|
-
execution:
|
|
152
|
-
target: static_trace
|
|
153
|
-
|
|
154
|
-
evalcases:
|
|
155
|
-
- id: validate-trace-file
|
|
156
|
-
expected_outcome: Trace contains required steps
|
|
157
|
-
input_messages:
|
|
158
|
-
- role: user
|
|
159
|
-
content: Analyze trace
|
|
160
|
-
execution:
|
|
161
|
-
evaluators:
|
|
162
|
-
- name: trace-check
|
|
163
|
-
type: tool_trajectory
|
|
164
|
-
mode: in_order
|
|
165
|
-
expected:
|
|
166
|
-
- tool: webSearch
|
|
167
|
-
- tool: readFile
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
## Multi-Turn Conversation (Single Eval Case)
|
|
171
|
-
|
|
172
|
-
```yaml
|
|
173
|
-
description: Multi-turn debugging session with clarifying questions
|
|
174
|
-
execution:
|
|
175
|
-
target: default
|
|
176
|
-
|
|
177
|
-
evalcases:
|
|
178
|
-
- id: debug-with-clarification
|
|
179
|
-
expected_outcome: |-
|
|
180
|
-
Assistant conducts a multi-turn debugging session, asking clarification
|
|
181
|
-
questions when needed, correctly diagnosing the bug, and proposing a clear
|
|
182
|
-
fix with rationale.
|
|
183
|
-
|
|
184
|
-
input_messages:
|
|
185
|
-
- role: system
|
|
186
|
-
content: You are an expert debugging assistant who reasons step by step, asks clarifying questions, and explains fixes clearly.
|
|
187
|
-
- role: user
|
|
188
|
-
content: |-
|
|
189
|
-
I'm getting an off-by-one error in this function, but I can't see why:
|
|
190
|
-
|
|
191
|
-
```python
|
|
192
|
-
def get_items(items):
|
|
193
|
-
result = []
|
|
194
|
-
for i in range(len(items) - 1):
|
|
195
|
-
result.append(items[i])
|
|
196
|
-
return result
|
|
197
|
-
```
|
|
198
|
-
|
|
199
|
-
Sometimes the last element is missing. Can you help debug this?
|
|
200
|
-
- role: assistant
|
|
201
|
-
content: |-
|
|
202
|
-
I can help debug this. Before I propose a fix, could you tell me:
|
|
203
|
-
- What output you expect for an example input list
|
|
204
|
-
- What output you actually get
|
|
205
|
-
- role: user
|
|
206
|
-
content: |-
|
|
207
|
-
For `[1, 2, 3, 4]` I expect `[1, 2, 3, 4]`, but I get `[1, 2, 3]`.
|
|
208
|
-
|
|
209
|
-
expected_messages:
|
|
210
|
-
- role: assistant
|
|
211
|
-
content: |-
|
|
212
|
-
You have an off-by-one error in your loop bounds.
|
|
213
|
-
You're iterating with `range(len(items) - 1)`, which stops before the last index.
|
|
214
|
-
To include all items, you can either:
|
|
215
|
-
- Use `range(len(items))`, or
|
|
216
|
-
- Iterate directly over the list: `for item in items:`
|
|
217
|
-
|
|
218
|
-
Here's a corrected version:
|
|
219
|
-
|
|
220
|
-
```python
|
|
221
|
-
def get_items(items):
|
|
222
|
-
result = []
|
|
223
|
-
for item in items:
|
|
224
|
-
result.append(item)
|
|
225
|
-
return result
|
|
226
|
-
```
|
|
227
|
-
```
|
|
228
|
-
|
|
229
|
-
## Batch CLI Evaluation
|
|
230
|
-
|
|
231
|
-
Evaluate external batch runners that process all evalcases in one invocation.
|
|
232
|
-
|
|
233
|
-
```yaml
|
|
234
|
-
description: Batch CLI demo (AML screening)
|
|
235
|
-
execution:
|
|
236
|
-
target: batch_cli
|
|
237
|
-
|
|
238
|
-
evalcases:
|
|
239
|
-
- id: aml-001
|
|
240
|
-
expected_outcome: |-
|
|
241
|
-
Batch runner returns JSON with decision=CLEAR.
|
|
242
|
-
|
|
243
|
-
expected_messages:
|
|
244
|
-
- role: assistant
|
|
245
|
-
content:
|
|
246
|
-
decision: CLEAR
|
|
247
|
-
|
|
248
|
-
input_messages:
|
|
249
|
-
- role: system
|
|
250
|
-
content: You are a deterministic AML screening batch checker.
|
|
251
|
-
- role: user
|
|
252
|
-
content:
|
|
253
|
-
request:
|
|
254
|
-
type: aml_screening_check
|
|
255
|
-
jurisdiction: AU
|
|
256
|
-
effective_date: 2025-01-01
|
|
257
|
-
row:
|
|
258
|
-
id: aml-001
|
|
259
|
-
customer_name: Example Customer A
|
|
260
|
-
origin_country: NZ
|
|
261
|
-
destination_country: AU
|
|
262
|
-
transaction_type: INTERNATIONAL_TRANSFER
|
|
263
|
-
amount: 5000
|
|
264
|
-
currency: USD
|
|
265
|
-
|
|
266
|
-
execution:
|
|
267
|
-
evaluators:
|
|
268
|
-
- name: decision-check
|
|
269
|
-
type: code_judge
|
|
270
|
-
script: bun run ./scripts/check-batch-cli-output.ts
|
|
271
|
-
cwd: .
|
|
272
|
-
|
|
273
|
-
- id: aml-002
|
|
274
|
-
expected_outcome: |-
|
|
275
|
-
Batch runner returns JSON with decision=REVIEW.
|
|
276
|
-
|
|
277
|
-
expected_messages:
|
|
278
|
-
- role: assistant
|
|
279
|
-
content:
|
|
280
|
-
decision: REVIEW
|
|
281
|
-
|
|
282
|
-
input_messages:
|
|
283
|
-
- role: system
|
|
284
|
-
content: You are a deterministic AML screening batch checker.
|
|
285
|
-
- role: user
|
|
286
|
-
content:
|
|
287
|
-
request:
|
|
288
|
-
type: aml_screening_check
|
|
289
|
-
jurisdiction: AU
|
|
290
|
-
effective_date: 2025-01-01
|
|
291
|
-
row:
|
|
292
|
-
id: aml-002
|
|
293
|
-
customer_name: Example Customer B
|
|
294
|
-
origin_country: IR
|
|
295
|
-
destination_country: AU
|
|
296
|
-
transaction_type: INTERNATIONAL_TRANSFER
|
|
297
|
-
amount: 2000
|
|
298
|
-
currency: USD
|
|
299
|
-
|
|
300
|
-
execution:
|
|
301
|
-
evaluators:
|
|
302
|
-
- name: decision-check
|
|
303
|
-
type: code_judge
|
|
304
|
-
script: bun run ./scripts/check-batch-cli-output.ts
|
|
305
|
-
cwd: .
|
|
306
|
-
```
|
|
307
|
-
|
|
308
|
-
### Batch CLI Pattern Notes
|
|
309
|
-
- **execution.target: batch_cli** - Configure CLI provider with `provider_batching: true`
|
|
310
|
-
- **Batch runner** - Reads eval YAML via `--eval` flag, outputs JSONL keyed by `id`
|
|
311
|
-
- **Structured input** - Put data in `user.content` as objects for runner to extract
|
|
312
|
-
- **Structured expected** - Use `expected_messages.content` with object fields
|
|
313
|
-
- **Per-case evaluators** - Each evalcase has its own evaluator to validate output
|
|
314
|
-
|
|
315
|
-
## Notes on Examples
|
|
316
|
-
|
|
317
|
-
### File Path Conventions
|
|
318
|
-
- **Absolute paths** (start with `/`): Resolved from repository root
|
|
319
|
-
- Example: `/prompts/guidelines.md` → `<repo_root>/prompts/guidelines.md`
|
|
320
|
-
- **Relative paths** (start with `./` or `../`): Resolved from eval file directory
|
|
321
|
-
- Example: `../../prompts/file.md` → Two directories up, then into prompts/
|
|
322
|
-
|
|
323
|
-
### expected_outcome Writing Tips
|
|
324
|
-
- Be specific about what success looks like
|
|
325
|
-
- Mention key elements that must be present
|
|
326
|
-
- For classification tasks, specify the expected category
|
|
327
|
-
- For reasoning tasks, describe the thought process expected
|
|
328
|
-
|
|
329
|
-
### Expected Messages
|
|
330
|
-
- Show the pattern, not rigid templates
|
|
331
|
-
- Allow for natural language variation
|
|
332
|
-
- Focus on semantic correctness over exact matching
|
|
333
|
-
- Evaluators will handle the actual validation
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
# Rubric Evaluator
|
|
2
|
-
|
|
3
|
-
## Field Reference
|
|
4
|
-
|
|
5
|
-
| Field | Type | Default | Description |
|
|
6
|
-
|-------|------|---------|-------------|
|
|
7
|
-
| `id` | string | auto-generated | Unique identifier |
|
|
8
|
-
| `expected_outcome` | string | required* | Criterion being evaluated (*optional if `score_ranges` used) |
|
|
9
|
-
| `weight` | number | 1.0 | Relative importance |
|
|
10
|
-
| `required` | boolean | true | Failing forces verdict to 'fail' (checklist mode) |
|
|
11
|
-
| `required_min_score` | integer | - | Minimum 0-10 score to pass (score-range mode) |
|
|
12
|
-
| `score_ranges` | map or array | - | Score range definitions for analytic scoring |
|
|
13
|
-
|
|
14
|
-
`description` is a backward-compatible alias for `expected_outcome`.
|
|
15
|
-
|
|
16
|
-
## Checklist Mode
|
|
17
|
-
|
|
18
|
-
```yaml
|
|
19
|
-
rubrics:
|
|
20
|
-
- Mentions divide-and-conquer approach
|
|
21
|
-
- id: complexity
|
|
22
|
-
expected_outcome: States time complexity correctly
|
|
23
|
-
weight: 2.0
|
|
24
|
-
required: true
|
|
25
|
-
- id: examples
|
|
26
|
-
expected_outcome: Includes code examples
|
|
27
|
-
weight: 1.0
|
|
28
|
-
required: false
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
## Score-Range Mode
|
|
32
|
-
|
|
33
|
-
Shorthand map format (recommended):
|
|
34
|
-
|
|
35
|
-
```yaml
|
|
36
|
-
rubrics:
|
|
37
|
-
- id: correctness
|
|
38
|
-
weight: 2.0
|
|
39
|
-
required_min_score: 7
|
|
40
|
-
score_ranges:
|
|
41
|
-
0: Critical bugs
|
|
42
|
-
3: Minor bugs
|
|
43
|
-
6: Correct with minor issues
|
|
44
|
-
9: Fully correct
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
Map keys are lower bounds (0-10). Each range extends from its key to (next key - 1), with the last range extending to 10. The first key must be 0.
|
|
48
|
-
|
|
49
|
-
Array format is also accepted:
|
|
50
|
-
|
|
51
|
-
```yaml
|
|
52
|
-
score_ranges:
|
|
53
|
-
- score_range: [0, 2]
|
|
54
|
-
expected_outcome: Critical bugs
|
|
55
|
-
- score_range: [3, 5]
|
|
56
|
-
expected_outcome: Minor bugs
|
|
57
|
-
- score_range: [6, 8]
|
|
58
|
-
expected_outcome: Correct with minor issues
|
|
59
|
-
- score_range: [9, 10]
|
|
60
|
-
expected_outcome: Fully correct
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
Ranges must be integers 0-10, non-overlapping, covering all values 0-10.
|
|
64
|
-
|
|
65
|
-
## Scoring
|
|
66
|
-
|
|
67
|
-
**Checklist:** `score = sum(satisfied weights) / sum(all weights)`
|
|
68
|
-
|
|
69
|
-
**Score-range:** `score = weighted_average(raw_score / 10)` per criterion
|
|
70
|
-
|
|
71
|
-
## Verdicts
|
|
72
|
-
|
|
73
|
-
| Verdict | Condition |
|
|
74
|
-
|---------|-----------|
|
|
75
|
-
| `pass` | score >= 0.8 AND all gating criteria satisfied |
|
|
76
|
-
| `borderline` | 0.6 <= score < 0.8 AND all gating criteria satisfied |
|
|
77
|
-
| `fail` | score < 0.6 OR any gating criterion failed |
|
|
78
|
-
|
|
79
|
-
Gating: checklist uses `required: true`, score-range uses `required_min_score: N`.
|