agentv 1.5.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -44
- package/dist/{chunk-3RYQPI4H.js → chunk-6SHT2QS6.js} +4075 -1129
- package/dist/chunk-6SHT2QS6.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +6 -4
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +12 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +137 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +141 -4
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +10 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -7
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +121 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +28 -2
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +77 -77
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/package.json +6 -3
- package/dist/chunk-3RYQPI4H.js.map +0 -1
|
@@ -1,215 +1,215 @@
|
|
|
1
|
-
# Composite Evaluator Guide
|
|
2
|
-
|
|
3
|
-
Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
|
|
4
|
-
|
|
5
|
-
## Basic Structure
|
|
6
|
-
|
|
7
|
-
```yaml
|
|
8
|
-
execution:
|
|
9
|
-
evaluators:
|
|
10
|
-
- name: my_composite
|
|
11
|
-
type: composite
|
|
12
|
-
evaluators:
|
|
13
|
-
- name: evaluator_1
|
|
14
|
-
type: llm_judge
|
|
15
|
-
prompt: ./prompts/check1.md
|
|
16
|
-
- name: evaluator_2
|
|
17
|
-
type: code_judge
|
|
18
|
-
script: uv run check2.py
|
|
19
|
-
aggregator:
|
|
20
|
-
type: weighted_average
|
|
21
|
-
weights:
|
|
22
|
-
evaluator_1: 0.6
|
|
23
|
-
evaluator_2: 0.4
|
|
24
|
-
```
|
|
25
|
-
|
|
26
|
-
## Aggregator Types
|
|
27
|
-
|
|
28
|
-
### 1. Weighted Average (Default)
|
|
29
|
-
|
|
30
|
-
Combines scores using weighted arithmetic mean:
|
|
31
|
-
|
|
32
|
-
```yaml
|
|
33
|
-
aggregator:
|
|
34
|
-
type: weighted_average
|
|
35
|
-
weights:
|
|
36
|
-
safety: 0.3 # 30% weight
|
|
37
|
-
quality: 0.7 # 70% weight
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
If weights are omitted, all evaluators have equal weight (1.0).
|
|
41
|
-
|
|
42
|
-
**Score calculation:**
|
|
43
|
-
```
|
|
44
|
-
final_score = Σ(score_i × weight_i) / Σ(weight_i)
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
### 2. Code Judge Aggregator
|
|
48
|
-
|
|
49
|
-
Run custom code to decide final score based on all evaluator results:
|
|
50
|
-
|
|
51
|
-
```yaml
|
|
52
|
-
aggregator:
|
|
53
|
-
type: code_judge
|
|
54
|
-
path: node ./scripts/safety-gate.js
|
|
55
|
-
cwd: ./evaluators # optional working directory
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
**Input (stdin):**
|
|
59
|
-
```json
|
|
60
|
-
{
|
|
61
|
-
"results": {
|
|
62
|
-
"safety": { "score": 0.9, "hits": [...], "misses": [...] },
|
|
63
|
-
"quality": { "score": 0.85, "hits": [...], "misses": [...] }
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
**Output (stdout):**
|
|
69
|
-
```json
|
|
70
|
-
{
|
|
71
|
-
"score": 0.87,
|
|
72
|
-
"verdict": "pass",
|
|
73
|
-
"hits": ["Combined check passed"],
|
|
74
|
-
"misses": [],
|
|
75
|
-
"reasoning": "Safety gate passed, quality acceptable"
|
|
76
|
-
}
|
|
77
|
-
```
|
|
78
|
-
|
|
79
|
-
### 3. LLM Judge Aggregator
|
|
80
|
-
|
|
81
|
-
Use an LLM to resolve conflicts or make nuanced decisions:
|
|
82
|
-
|
|
83
|
-
```yaml
|
|
84
|
-
aggregator:
|
|
85
|
-
type: llm_judge
|
|
86
|
-
prompt: ./prompts/conflict-resolution.md
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
|
|
90
|
-
|
|
91
|
-
## Example Patterns
|
|
92
|
-
|
|
93
|
-
### Safety Gate Pattern
|
|
94
|
-
|
|
95
|
-
Block outputs that fail safety even if quality is high:
|
|
96
|
-
|
|
97
|
-
```yaml
|
|
98
|
-
evalcases:
|
|
99
|
-
- id: safety-gated-response
|
|
100
|
-
expected_outcome: Safe and accurate response
|
|
101
|
-
|
|
102
|
-
input_messages:
|
|
103
|
-
- role: user
|
|
104
|
-
content: Explain quantum computing
|
|
105
|
-
|
|
106
|
-
execution:
|
|
107
|
-
evaluators:
|
|
108
|
-
- name: safety_gate
|
|
109
|
-
type: composite
|
|
110
|
-
evaluators:
|
|
111
|
-
- name: safety
|
|
112
|
-
type: llm_judge
|
|
113
|
-
prompt: ./prompts/safety-check.md
|
|
114
|
-
- name: quality
|
|
115
|
-
type: llm_judge
|
|
116
|
-
prompt: ./prompts/quality-check.md
|
|
117
|
-
aggregator:
|
|
118
|
-
type: code_judge
|
|
119
|
-
path: ./scripts/safety-gate.js
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
### Multi-Criteria Weighted Evaluation
|
|
123
|
-
|
|
124
|
-
```yaml
|
|
125
|
-
- name: release_readiness
|
|
126
|
-
type: composite
|
|
127
|
-
evaluators:
|
|
128
|
-
- name: correctness
|
|
129
|
-
type: llm_judge
|
|
130
|
-
prompt: ./prompts/correctness.md
|
|
131
|
-
- name: style
|
|
132
|
-
type: code_judge
|
|
133
|
-
script: uv run style_checker.py
|
|
134
|
-
- name: security
|
|
135
|
-
type: llm_judge
|
|
136
|
-
prompt: ./prompts/security.md
|
|
137
|
-
aggregator:
|
|
138
|
-
type: weighted_average
|
|
139
|
-
weights:
|
|
140
|
-
correctness: 0.5
|
|
141
|
-
style: 0.2
|
|
142
|
-
security: 0.3
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
### Nested Composites
|
|
146
|
-
|
|
147
|
-
Composites can contain other composites for complex hierarchies:
|
|
148
|
-
|
|
149
|
-
```yaml
|
|
150
|
-
- name: comprehensive_eval
|
|
151
|
-
type: composite
|
|
152
|
-
evaluators:
|
|
153
|
-
- name: content_quality
|
|
154
|
-
type: composite
|
|
155
|
-
evaluators:
|
|
156
|
-
- name: accuracy
|
|
157
|
-
type: llm_judge
|
|
158
|
-
prompt: ./prompts/accuracy.md
|
|
159
|
-
- name: clarity
|
|
160
|
-
type: llm_judge
|
|
161
|
-
prompt: ./prompts/clarity.md
|
|
162
|
-
aggregator:
|
|
163
|
-
type: weighted_average
|
|
164
|
-
weights:
|
|
165
|
-
accuracy: 0.6
|
|
166
|
-
clarity: 0.4
|
|
167
|
-
- name: safety
|
|
168
|
-
type: llm_judge
|
|
169
|
-
prompt: ./prompts/safety.md
|
|
170
|
-
aggregator:
|
|
171
|
-
type: weighted_average
|
|
172
|
-
weights:
|
|
173
|
-
content_quality: 0.7
|
|
174
|
-
safety: 0.3
|
|
175
|
-
```
|
|
176
|
-
|
|
177
|
-
## Result Structure
|
|
178
|
-
|
|
179
|
-
Composite evaluators return nested `evaluator_results`:
|
|
180
|
-
|
|
181
|
-
```json
|
|
182
|
-
{
|
|
183
|
-
"score": 0.85,
|
|
184
|
-
"verdict": "pass",
|
|
185
|
-
"hits": ["[safety] No harmful content", "[quality] Clear explanation"],
|
|
186
|
-
"misses": ["[quality] Could use more examples"],
|
|
187
|
-
"reasoning": "safety: Passed all checks; quality: Good but could improve",
|
|
188
|
-
"evaluator_results": [
|
|
189
|
-
{
|
|
190
|
-
"name": "safety",
|
|
191
|
-
"type": "llm_judge",
|
|
192
|
-
"score": 0.95,
|
|
193
|
-
"verdict": "pass",
|
|
194
|
-
"hits": ["No harmful content"],
|
|
195
|
-
"misses": []
|
|
196
|
-
},
|
|
197
|
-
{
|
|
198
|
-
"name": "quality",
|
|
199
|
-
"type": "llm_judge",
|
|
200
|
-
"score": 0.8,
|
|
201
|
-
"verdict": "pass",
|
|
202
|
-
"hits": ["Clear explanation"],
|
|
203
|
-
"misses": ["Could use more examples"]
|
|
204
|
-
}
|
|
205
|
-
]
|
|
206
|
-
}
|
|
207
|
-
```
|
|
208
|
-
|
|
209
|
-
## Best Practices
|
|
210
|
-
|
|
211
|
-
1. **Name evaluators clearly** - Names appear in results and debugging output
|
|
212
|
-
2. **Use safety gates for critical checks** - Don't let high quality override safety failures
|
|
213
|
-
3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
|
|
214
|
-
4. **Keep nesting shallow** - Deep nesting makes debugging harder
|
|
215
|
-
5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
|
|
1
|
+
# Composite Evaluator Guide
|
|
2
|
+
|
|
3
|
+
Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
|
|
4
|
+
|
|
5
|
+
## Basic Structure
|
|
6
|
+
|
|
7
|
+
```yaml
|
|
8
|
+
execution:
|
|
9
|
+
evaluators:
|
|
10
|
+
- name: my_composite
|
|
11
|
+
type: composite
|
|
12
|
+
evaluators:
|
|
13
|
+
- name: evaluator_1
|
|
14
|
+
type: llm_judge
|
|
15
|
+
prompt: ./prompts/check1.md
|
|
16
|
+
- name: evaluator_2
|
|
17
|
+
type: code_judge
|
|
18
|
+
script: uv run check2.py
|
|
19
|
+
aggregator:
|
|
20
|
+
type: weighted_average
|
|
21
|
+
weights:
|
|
22
|
+
evaluator_1: 0.6
|
|
23
|
+
evaluator_2: 0.4
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Aggregator Types
|
|
27
|
+
|
|
28
|
+
### 1. Weighted Average (Default)
|
|
29
|
+
|
|
30
|
+
Combines scores using weighted arithmetic mean:
|
|
31
|
+
|
|
32
|
+
```yaml
|
|
33
|
+
aggregator:
|
|
34
|
+
type: weighted_average
|
|
35
|
+
weights:
|
|
36
|
+
safety: 0.3 # 30% weight
|
|
37
|
+
quality: 0.7 # 70% weight
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
If weights are omitted, all evaluators have equal weight (1.0).
|
|
41
|
+
|
|
42
|
+
**Score calculation:**
|
|
43
|
+
```
|
|
44
|
+
final_score = Σ(score_i × weight_i) / Σ(weight_i)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### 2. Code Judge Aggregator
|
|
48
|
+
|
|
49
|
+
Run custom code to decide the final score based on all evaluator results:
|
|
50
|
+
|
|
51
|
+
```yaml
|
|
52
|
+
aggregator:
|
|
53
|
+
type: code_judge
|
|
54
|
+
path: node ./scripts/safety-gate.js
|
|
55
|
+
cwd: ./evaluators # optional working directory
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
**Input (stdin):**
|
|
59
|
+
```json
|
|
60
|
+
{
|
|
61
|
+
"results": {
|
|
62
|
+
"safety": { "score": 0.9, "hits": [...], "misses": [...] },
|
|
63
|
+
"quality": { "score": 0.85, "hits": [...], "misses": [...] }
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Output (stdout):**
|
|
69
|
+
```json
|
|
70
|
+
{
|
|
71
|
+
"score": 0.87,
|
|
72
|
+
"verdict": "pass",
|
|
73
|
+
"hits": ["Combined check passed"],
|
|
74
|
+
"misses": [],
|
|
75
|
+
"reasoning": "Safety gate passed, quality acceptable"
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 3. LLM Judge Aggregator
|
|
80
|
+
|
|
81
|
+
Use an LLM to resolve conflicts or make nuanced decisions:
|
|
82
|
+
|
|
83
|
+
```yaml
|
|
84
|
+
aggregator:
|
|
85
|
+
type: llm_judge
|
|
86
|
+
prompt: ./prompts/conflict-resolution.md
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Within the prompt file, the `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
|
|
90
|
+
|
|
91
|
+
## Example Patterns
|
|
92
|
+
|
|
93
|
+
### Safety Gate Pattern
|
|
94
|
+
|
|
95
|
+
Block outputs that fail safety even if quality is high:
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
evalcases:
|
|
99
|
+
- id: safety-gated-response
|
|
100
|
+
expected_outcome: Safe and accurate response
|
|
101
|
+
|
|
102
|
+
input_messages:
|
|
103
|
+
- role: user
|
|
104
|
+
content: Explain quantum computing
|
|
105
|
+
|
|
106
|
+
execution:
|
|
107
|
+
evaluators:
|
|
108
|
+
- name: safety_gate
|
|
109
|
+
type: composite
|
|
110
|
+
evaluators:
|
|
111
|
+
- name: safety
|
|
112
|
+
type: llm_judge
|
|
113
|
+
prompt: ./prompts/safety-check.md
|
|
114
|
+
- name: quality
|
|
115
|
+
type: llm_judge
|
|
116
|
+
prompt: ./prompts/quality-check.md
|
|
117
|
+
aggregator:
|
|
118
|
+
type: code_judge
|
|
119
|
+
path: ./scripts/safety-gate.js
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Multi-Criteria Weighted Evaluation
|
|
123
|
+
|
|
124
|
+
```yaml
|
|
125
|
+
- name: release_readiness
|
|
126
|
+
type: composite
|
|
127
|
+
evaluators:
|
|
128
|
+
- name: correctness
|
|
129
|
+
type: llm_judge
|
|
130
|
+
prompt: ./prompts/correctness.md
|
|
131
|
+
- name: style
|
|
132
|
+
type: code_judge
|
|
133
|
+
script: uv run style_checker.py
|
|
134
|
+
- name: security
|
|
135
|
+
type: llm_judge
|
|
136
|
+
prompt: ./prompts/security.md
|
|
137
|
+
aggregator:
|
|
138
|
+
type: weighted_average
|
|
139
|
+
weights:
|
|
140
|
+
correctness: 0.5
|
|
141
|
+
style: 0.2
|
|
142
|
+
security: 0.3
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Nested Composites
|
|
146
|
+
|
|
147
|
+
Composites can contain other composites for complex hierarchies:
|
|
148
|
+
|
|
149
|
+
```yaml
|
|
150
|
+
- name: comprehensive_eval
|
|
151
|
+
type: composite
|
|
152
|
+
evaluators:
|
|
153
|
+
- name: content_quality
|
|
154
|
+
type: composite
|
|
155
|
+
evaluators:
|
|
156
|
+
- name: accuracy
|
|
157
|
+
type: llm_judge
|
|
158
|
+
prompt: ./prompts/accuracy.md
|
|
159
|
+
- name: clarity
|
|
160
|
+
type: llm_judge
|
|
161
|
+
prompt: ./prompts/clarity.md
|
|
162
|
+
aggregator:
|
|
163
|
+
type: weighted_average
|
|
164
|
+
weights:
|
|
165
|
+
accuracy: 0.6
|
|
166
|
+
clarity: 0.4
|
|
167
|
+
- name: safety
|
|
168
|
+
type: llm_judge
|
|
169
|
+
prompt: ./prompts/safety.md
|
|
170
|
+
aggregator:
|
|
171
|
+
type: weighted_average
|
|
172
|
+
weights:
|
|
173
|
+
content_quality: 0.7
|
|
174
|
+
safety: 0.3
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Result Structure
|
|
178
|
+
|
|
179
|
+
Composite evaluators return nested `evaluator_results`:
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"score": 0.85,
|
|
184
|
+
"verdict": "pass",
|
|
185
|
+
"hits": ["[safety] No harmful content", "[quality] Clear explanation"],
|
|
186
|
+
"misses": ["[quality] Could use more examples"],
|
|
187
|
+
"reasoning": "safety: Passed all checks; quality: Good but could improve",
|
|
188
|
+
"evaluator_results": [
|
|
189
|
+
{
|
|
190
|
+
"name": "safety",
|
|
191
|
+
"type": "llm_judge",
|
|
192
|
+
"score": 0.95,
|
|
193
|
+
"verdict": "pass",
|
|
194
|
+
"hits": ["No harmful content"],
|
|
195
|
+
"misses": []
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"name": "quality",
|
|
199
|
+
"type": "llm_judge",
|
|
200
|
+
"score": 0.8,
|
|
201
|
+
"verdict": "pass",
|
|
202
|
+
"hits": ["Clear explanation"],
|
|
203
|
+
"misses": ["Could use more examples"]
|
|
204
|
+
}
|
|
205
|
+
]
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Best Practices
|
|
210
|
+
|
|
211
|
+
1. **Name evaluators clearly** - Names appear in results and debugging output
|
|
212
|
+
2. **Use safety gates for critical checks** - Don't let high quality override safety failures
|
|
213
|
+
3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
|
|
214
|
+
4. **Keep nesting shallow** - Deep nesting makes debugging harder
|
|
215
|
+
5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
|
|
@@ -8,20 +8,47 @@ Code evaluators receive input via stdin and write output to stdout, both as JSON
|
|
|
8
8
|
|
|
9
9
|
### Input Format (via stdin)
|
|
10
10
|
|
|
11
|
+
The wire format uses snake_case field names for cross-language compatibility:
|
|
12
|
+
|
|
11
13
|
```json
|
|
12
14
|
{
|
|
13
15
|
"question": "string describing the task/question",
|
|
14
16
|
"expected_outcome": "expected outcome description",
|
|
15
17
|
"reference_answer": "gold standard answer (optional)",
|
|
16
18
|
"candidate_answer": "generated code/text from the agent",
|
|
17
|
-
"
|
|
19
|
+
"guideline_files": ["path1", "path2"],
|
|
18
20
|
"input_files": ["file1", "file2"],
|
|
19
21
|
"input_messages": [{"role": "user", "content": "..."}],
|
|
20
|
-
"output_messages": [
|
|
22
|
+
"output_messages": [
|
|
23
|
+
{
|
|
24
|
+
"role": "assistant",
|
|
25
|
+
"content": "...",
|
|
26
|
+
"tool_calls": [
|
|
27
|
+
{
|
|
28
|
+
"tool": "search",
|
|
29
|
+
"input": { "query": "..." },
|
|
30
|
+
"output": { "results": [...] },
|
|
31
|
+
"id": "call_123",
|
|
32
|
+
"timestamp": "2024-01-15T10:30:00Z"
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
],
|
|
37
|
+
"trace_summary": {
|
|
38
|
+
"event_count": 5,
|
|
39
|
+
"tool_names": ["fetch", "search"],
|
|
40
|
+
"tool_calls_by_name": { "search": 2, "fetch": 1 },
|
|
41
|
+
"error_count": 0,
|
|
42
|
+
"token_usage": { "input": 1000, "output": 500 },
|
|
43
|
+
"cost_usd": 0.0015,
|
|
44
|
+
"duration_ms": 3500
|
|
45
|
+
}
|
|
21
46
|
}
|
|
22
47
|
```
|
|
23
48
|
|
|
24
|
-
|
|
49
|
+
**Key fields:**
|
|
50
|
+
- `output_messages` - Full agent execution trace with tool calls (use `tool_calls[].input` for arguments)
|
|
51
|
+
- `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)
|
|
25
52
|
|
|
26
53
|
### Output Format (to stdout)
|
|
27
54
|
|
|
@@ -125,6 +152,96 @@ if __name__ == "__main__":
|
|
|
125
152
|
main()
|
|
126
153
|
```
|
|
127
154
|
|
|
155
|
+
## TypeScript Code Evaluator Template (with SDK)
|
|
156
|
+
|
|
157
|
+
The optional `@agentv/core` SDK provides type-safe payload parsing with camelCase properties (`candidateAnswer` vs `candidate_answer`).
|
|
158
|
+
|
|
159
|
+
**Execution:** Keep evaluators as `.ts` files and run them through a Node-compatible loader, e.g. `npx --yes tsx ./evaluators/my-check.ts`, so users don't need Bun after `npm install -g agentv`.
|
|
160
|
+
|
|
161
|
+
**Without SDK:** Skip the import and parse JSON from stdin directly (similar to the Python template above).
|
|
162
|
+
|
|
163
|
+
```typescript
|
|
164
|
+
/**
|
|
165
|
+
* Example TypeScript code evaluator using the AgentV SDK
|
|
166
|
+
*
|
|
167
|
+
* Run with: npx --yes tsx ./evaluators/example-check.ts
|
|
168
|
+
*
|
|
169
|
+
* The SDK provides:
|
|
170
|
+
* - Type-safe CodeJudgePayload interface with all fields
|
|
171
|
+
* - camelCase properties (candidateAnswer, expectedOutcome, etc.)
|
|
172
|
+
* - Automatic conversion from snake_case wire format
|
|
173
|
+
*/
|
|
174
|
+
|
|
175
|
+
import { readCodeJudgePayload } from '@agentv/core';
|
|
176
|
+
|
|
177
|
+
try {
|
|
178
|
+
// Read and parse stdin with automatic snake_case → camelCase conversion
|
|
179
|
+
const payload = readCodeJudgePayload();
|
|
180
|
+
|
|
181
|
+
// Type-safe camelCase access to all fields
|
|
182
|
+
const { candidateAnswer, expectedOutcome, inputFiles, guidelineFiles } = payload;
|
|
183
|
+
|
|
184
|
+
// Your validation logic here
|
|
185
|
+
const hits: string[] = [];
|
|
186
|
+
const misses: string[] = [];
|
|
187
|
+
|
|
188
|
+
// Example: Check if answer contains expected outcome
|
|
189
|
+
if (candidateAnswer.includes(expectedOutcome)) {
|
|
190
|
+
hits.push('Answer matches expected outcome');
|
|
191
|
+
} else {
|
|
192
|
+
misses.push('Answer does not match expected outcome');
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// Example: Check attachment mentions
|
|
196
|
+
const attachments = [...guidelineFiles, ...inputFiles];
|
|
197
|
+
for (const filePath of attachments) {
|
|
198
|
+
const fileName = filePath.split('/').pop() ?? filePath;
|
|
199
|
+
if (candidateAnswer.includes(fileName)) {
|
|
200
|
+
hits.push(`Mentions attachment: ${fileName}`);
|
|
201
|
+
} else {
|
|
202
|
+
misses.push(`Missing attachment: ${fileName}`);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
// Calculate score
|
|
207
|
+
const totalChecks = hits.length + misses.length;
|
|
208
|
+
const score = totalChecks === 0 ? 0 : hits.length / totalChecks;
|
|
209
|
+
|
|
210
|
+
// Build result
|
|
211
|
+
const result = {
|
|
212
|
+
score,
|
|
213
|
+
hits,
|
|
214
|
+
misses,
|
|
215
|
+
reasoning: `Passed ${hits.length}/${totalChecks} checks`
|
|
216
|
+
};
|
|
217
|
+
|
|
218
|
+
console.log(JSON.stringify(result, null, 2));
|
|
219
|
+
|
|
220
|
+
} catch (error) {
|
|
221
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
222
|
+
console.log(JSON.stringify({
|
|
223
|
+
score: 0,
|
|
224
|
+
hits: [],
|
|
225
|
+
misses: [`Error: ${message}`],
|
|
226
|
+
reasoning: 'Evaluator error'
|
|
227
|
+
}, null, 2));
|
|
228
|
+
process.exit(1);
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
**TypeScript SDK Benefits:**
|
|
233
|
+
- **Type-safe**: `CodeJudgePayload` interface with all fields typed
|
|
234
|
+
- **camelCase**: Idiomatic TypeScript naming (`candidateAnswer` vs `candidate_answer`)
|
|
235
|
+
- **Automatic conversion**: Handles snake_case wire format → camelCase objects
|
|
236
|
+
- **Compile-time safety**: Catch typos and missing fields before runtime
|
|
237
|
+
|
|
238
|
+
**Available in SDK:**
|
|
239
|
+
- `readCodeJudgePayload()`: Read stdin and convert to camelCase (recommended)
|
|
240
|
+
- `parseCodeJudgePayload(jsonString)`: Parse JSON string and convert to camelCase
|
|
241
|
+
- `CodeJudgePayload`: TypeScript interface for type safety
|
|
242
|
+
|
|
243
|
+
**See also:** `examples/features/code-judge-sdk/` for complete working examples
|
|
244
|
+
|
|
128
245
|
## LLM Judge Prompt Template
|
|
129
246
|
|
|
130
247
|
LLM judges use markdown prompts to guide evaluation. AgentV automatically handles the output format, so focus your prompt on evaluation criteria and guidelines.
|
|
@@ -189,11 +306,22 @@ You can customize this template in your eval file using the `evaluatorTemplate`
|
|
|
189
306
|
execution:
|
|
190
307
|
evaluators:
|
|
191
308
|
- name: my_validator
|
|
192
|
-
type:
|
|
309
|
+
type: code_judge
|
|
193
310
|
script: uv run my_validator.py
|
|
194
311
|
cwd: ./evaluators
|
|
195
312
|
```
|
|
196
313
|
|
|
314
|
+
TypeScript evaluators use the same structure but invoke `tsx` (or another Node-compatible loader), so they run wherever Node.js is available:
|
|
315
|
+
|
|
316
|
+
```yaml
|
|
317
|
+
execution:
|
|
318
|
+
evaluators:
|
|
319
|
+
- name: csv_guardrail
|
|
320
|
+
type: code_judge
|
|
321
|
+
script: npx --yes tsx ./evaluators/check-csv.ts
|
|
322
|
+
cwd: ./evaluators
|
|
323
|
+
```
|
|
324
|
+
|
|
197
325
|
### Command Line Testing
|
|
198
326
|
|
|
199
327
|
Test your evaluator locally:
|
|
@@ -214,3 +342,12 @@ echo '{
|
|
|
214
342
|
# "reasoning": "..."
|
|
215
343
|
# }
|
|
216
344
|
```
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
# TypeScript (uses tsx loader under Node)
|
|
348
|
+
echo '{
|
|
349
|
+
"candidate_answer": "test output here",
|
|
350
|
+
"question": "test task",
|
|
351
|
+
"expected_outcome": "expected result"
|
|
352
|
+
}' | npx --yes tsx ./evaluators/check-csv.ts
|
|
353
|
+
```
|
|
@@ -4,11 +4,6 @@
|
|
|
4
4
|
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
|
|
5
5
|
"type": "object",
|
|
6
6
|
"properties": {
|
|
7
|
-
"$schema": {
|
|
8
|
-
"type": "string",
|
|
9
|
-
"description": "Schema identifier",
|
|
10
|
-
"enum": ["agentv-eval-v2"]
|
|
11
|
-
},
|
|
12
7
|
"description": {
|
|
13
8
|
"type": "string",
|
|
14
9
|
"description": "Description of what this eval suite covers"
|
|
@@ -37,7 +32,16 @@
|
|
|
37
32
|
},
|
|
38
33
|
"type": {
|
|
39
34
|
"type": "string",
|
|
40
|
-
"enum": [
|
|
35
|
+
"enum": [
|
|
36
|
+
"code",
|
|
37
|
+
"llm_judge",
|
|
38
|
+
"composite",
|
|
39
|
+
"tool_trajectory",
|
|
40
|
+
"field_accuracy",
|
|
41
|
+
"latency",
|
|
42
|
+
"cost",
|
|
43
|
+
"token_usage"
|
|
44
|
+
],
|
|
41
45
|
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation, 'composite' for aggregating the results of child evaluators; see the enum for all supported types"
|
|
42
46
|
},
|
|
43
47
|
"script": {
|