agentv 1.5.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. package/README.md +30 -44
  2. package/dist/{chunk-3RYQPI4H.js → chunk-6SHT2QS6.js} +4075 -1129
  3. package/dist/chunk-6SHT2QS6.js.map +1 -0
  4. package/dist/cli.js +1 -1
  5. package/dist/index.js +1 -1
  6. package/dist/templates/.agentv/.env.template +23 -23
  7. package/dist/templates/.agentv/config.yaml +15 -15
  8. package/dist/templates/.agentv/targets.yaml +16 -0
  9. package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +6 -4
  10. package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +12 -2
  11. package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +137 -0
  12. package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
  13. package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +141 -4
  14. package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +10 -6
  15. package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -7
  16. package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -2
  17. package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +121 -0
  18. package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +28 -2
  19. package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +77 -77
  20. package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
  21. package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
  22. package/package.json +6 -3
  23. package/dist/chunk-3RYQPI4H.js.map +0 -1
@@ -1,215 +1,215 @@
+ # Composite Evaluator Guide
+
+ Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
+
+ ## Basic Structure
+
+ ```yaml
+ execution:
+   evaluators:
+     - name: my_composite
+       type: composite
+       evaluators:
+         - name: evaluator_1
+           type: llm_judge
+           prompt: ./prompts/check1.md
+         - name: evaluator_2
+           type: code_judge
+           script: uv run check2.py
+       aggregator:
+         type: weighted_average
+         weights:
+           evaluator_1: 0.6
+           evaluator_2: 0.4
+ ```
+
+ ## Aggregator Types
+
+ ### 1. Weighted Average (Default)
+
+ Combines scores using weighted arithmetic mean:
+
+ ```yaml
+ aggregator:
+   type: weighted_average
+   weights:
+     safety: 0.3   # 30% weight
+     quality: 0.7  # 70% weight
+ ```
+
+ If weights are omitted, all evaluators have equal weight (1.0).
+
+ **Score calculation:**
+ ```
+ final_score = Σ(score_i × weight_i) / Σ(weight_i)
+ ```
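Worked example with the weights above: if safety scores 0.9 and quality scores 0.8, the composite score is (0.9 × 0.3 + 0.8 × 0.7) / (0.3 + 0.7) = 0.83.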
+
+ ### 2. Code Judge Aggregator
+
+ Run custom code to decide final score based on all evaluator results:
+
+ ```yaml
+ aggregator:
+   type: code_judge
+   path: node ./scripts/safety-gate.js
+   cwd: ./evaluators  # optional working directory
+ ```
+
+ **Input (stdin):**
+ ```json
+ {
+   "results": {
+     "safety": { "score": 0.9, "hits": [...], "misses": [...] },
+     "quality": { "score": 0.85, "hits": [...], "misses": [...] }
+   }
+ }
+ ```
+
+ **Output (stdout):**
+ ```json
+ {
+   "score": 0.87,
+   "verdict": "pass",
+   "hits": ["Combined check passed"],
+   "misses": [],
+   "reasoning": "Safety gate passed, quality acceptable"
+ }
+ ```
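To make the contract concrete, here is a minimal sketch of a gate-style aggregator honoring the stdin/stdout format above. It is written in TypeScript for illustration (the example configuration points to a plain Node script, `./scripts/safety-gate.js`); the 0.8 threshold and the pass/fail policy are assumptions, not behavior shipped with the package.

```typescript
// safety-gate.ts - illustrative sketch of a code_judge aggregator (not shipped with agentv).
// Reads {"results": {...}} from stdin and blocks the case when the "safety" score is low.

interface ChildResult {
  score: number;
  hits?: string[];
  misses?: string[];
}

async function readStdin(): Promise<string> {
  const chunks: Buffer[] = [];
  for await (const chunk of process.stdin) {
    chunks.push(chunk as Buffer);
  }
  return Buffer.concat(chunks).toString('utf8');
}

async function main(): Promise<void> {
  const { results } = JSON.parse(await readStdin()) as {
    results: Record<string, ChildResult>;
  };

  const safetyScore = results.safety?.score ?? 0;
  const qualityScore = results.quality?.score ?? 0;

  // Assumed policy: any safety score below 0.8 blocks the case, regardless of quality.
  const blocked = safetyScore < 0.8;
  const score = blocked ? 0 : qualityScore;

  console.log(JSON.stringify({
    score,
    verdict: blocked ? 'fail' : 'pass',
    hits: blocked ? [] : ['Safety gate passed'],
    misses: blocked ? [`Safety score ${safetyScore} below gate threshold 0.8`] : [],
    reasoning: blocked
      ? 'Blocked by safety gate'
      : `Safety gate passed; final score taken from quality (${qualityScore})`,
  }, null, 2));
}

main().catch((error) => {
  console.log(JSON.stringify({
    score: 0,
    verdict: 'fail',
    hits: [],
    misses: [`Aggregator error: ${error instanceof Error ? error.message : String(error)}`],
    reasoning: 'Aggregator error',
  }, null, 2));
  process.exit(1);
});
```

Like any code judge, a script of this shape can be exercised locally by piping the stdin JSON above into it.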
+
+ ### 3. LLM Judge Aggregator
+
+ Use an LLM to resolve conflicts or make nuanced decisions:
+
+ ```yaml
+ aggregator:
+   type: llm_judge
+   prompt: ./prompts/conflict-resolution.md
+ ```
+
+ The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
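An aggregation prompt typically just embeds this placeholder next to the decision criteria. A `conflict-resolution.md` might read roughly as follows (a sketch only, not the template shipped with agentv):

```markdown
You are combining the verdicts of several child evaluators into a single judgment.

Child evaluator results:
{{EVALUATOR_RESULTS_JSON}}

When evaluators disagree, weigh safety findings more heavily than style or
completeness, and explain which child result you favored and why.
```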
+
+ ## Example Patterns
+
+ ### Safety Gate Pattern
+
+ Block outputs that fail safety even if quality is high:
+
+ ```yaml
+ evalcases:
+   - id: safety-gated-response
+     expected_outcome: Safe and accurate response
+
+     input_messages:
+       - role: user
+         content: Explain quantum computing
+
+     execution:
+       evaluators:
+         - name: safety_gate
+           type: composite
+           evaluators:
+             - name: safety
+               type: llm_judge
+               prompt: ./prompts/safety-check.md
+             - name: quality
+               type: llm_judge
+               prompt: ./prompts/quality-check.md
+           aggregator:
+             type: code_judge
+             path: ./scripts/safety-gate.js
+ ```
+
+ ### Multi-Criteria Weighted Evaluation
+
+ ```yaml
+ - name: release_readiness
+   type: composite
+   evaluators:
+     - name: correctness
+       type: llm_judge
+       prompt: ./prompts/correctness.md
+     - name: style
+       type: code_judge
+       script: uv run style_checker.py
+     - name: security
+       type: llm_judge
+       prompt: ./prompts/security.md
+   aggregator:
+     type: weighted_average
+     weights:
+       correctness: 0.5
+       style: 0.2
+       security: 0.3
+ ```
+
+ ### Nested Composites
+
+ Composites can contain other composites for complex hierarchies:
+
+ ```yaml
+ - name: comprehensive_eval
+   type: composite
+   evaluators:
+     - name: content_quality
+       type: composite
+       evaluators:
+         - name: accuracy
+           type: llm_judge
+           prompt: ./prompts/accuracy.md
+         - name: clarity
+           type: llm_judge
+           prompt: ./prompts/clarity.md
+       aggregator:
+         type: weighted_average
+         weights:
+           accuracy: 0.6
+           clarity: 0.4
+     - name: safety
+       type: llm_judge
+       prompt: ./prompts/safety.md
+   aggregator:
+     type: weighted_average
+     weights:
+       content_quality: 0.7
+       safety: 0.3
+ ```
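Scores propagate bottom-up. With the weights above: if accuracy scores 0.9, clarity 0.8, and safety 1.0, then content_quality = 0.6 × 0.9 + 0.4 × 0.8 = 0.86, and the top-level score = 0.7 × 0.86 + 0.3 × 1.0 = 0.902.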
+
+ ## Result Structure
+
+ Composite evaluators return nested `evaluator_results`:
+
+ ```json
+ {
+   "score": 0.85,
+   "verdict": "pass",
+   "hits": ["[safety] No harmful content", "[quality] Clear explanation"],
+   "misses": ["[quality] Could use more examples"],
+   "reasoning": "safety: Passed all checks; quality: Good but could improve",
+   "evaluator_results": [
+     {
+       "name": "safety",
+       "type": "llm_judge",
+       "score": 0.95,
+       "verdict": "pass",
+       "hits": ["No harmful content"],
+       "misses": []
+     },
+     {
+       "name": "quality",
+       "type": "llm_judge",
+       "score": 0.8,
+       "verdict": "pass",
+       "hits": ["Clear explanation"],
+       "misses": ["Could use more examples"]
+     }
+   ]
+ }
+ ```
+
+ ## Best Practices
+
+ 1. **Name evaluators clearly** - Names appear in results and debugging output
+ 2. **Use safety gates for critical checks** - Don't let high quality override safety failures
+ 3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
+ 4. **Keep nesting shallow** - Deep nesting makes debugging harder
+ 5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
@@ -8,20 +8,47 @@ Code evaluators receive input via stdin and write output to stdout, both as JSON
 
 ### Input Format (via stdin)
 
+ Wire format uses snake_case for cross-language compatibility:
+
 ```json
 {
   "question": "string describing the task/question",
   "expected_outcome": "expected outcome description",
   "reference_answer": "gold standard answer (optional)",
   "candidate_answer": "generated code/text from the agent",
-  "guideline_paths": ["path1", "path2"],
+  "guideline_files": ["path1", "path2"],
   "input_files": ["file1", "file2"],
   "input_messages": [{"role": "user", "content": "..."}],
-  "output_messages": [{"role": "assistant", "content": "...", "tool_calls": [...]}]
+  "output_messages": [
+    {
+      "role": "assistant",
+      "content": "...",
+      "tool_calls": [
+        {
+          "tool": "search",
+          "input": { "query": "..." },
+          "output": { "results": [...] },
+          "id": "call_123",
+          "timestamp": "2024-01-15T10:30:00Z"
+        }
+      ]
+    }
+  ],
+  "trace_summary": {
+    "event_count": 5,
+    "tool_names": ["fetch", "search"],
+    "tool_calls_by_name": { "search": 2, "fetch": 1 },
+    "error_count": 0,
+    "token_usage": { "input": 1000, "output": 500 },
+    "cost_usd": 0.0015,
+    "duration_ms": 3500
+  }
 }
 ```
 
- The `output_messages` array contains the full agent execution trace with tool calls, enabling custom validation of agent behavior.
+ **Key fields:**
+ - `output_messages` - Full agent execution trace with tool calls (use `tool_calls[].input` for arguments)
+ - `trace_summary` - Lightweight summary with execution metrics (counts only, no tool arguments)
 
 ### Output Format (to stdout)
 
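As a sketch of how a plain (non-SDK) code evaluator might consume the new `trace_summary` field from this wire format: the required tool name and the zero-error policy below are illustrative assumptions, not checks shipped with agentv.

```typescript
// check-trace.ts - hypothetical example; reads the code_judge payload from stdin
// and scores the run using only the trace_summary metrics.

interface TraceSummary {
  event_count?: number;
  tool_names?: string[];
  tool_calls_by_name?: Record<string, number>;
  error_count?: number;
}

async function main(): Promise<void> {
  const chunks: Buffer[] = [];
  for await (const chunk of process.stdin) {
    chunks.push(chunk as Buffer);
  }
  const payload = JSON.parse(Buffer.concat(chunks).toString('utf8'));
  const trace: TraceSummary = payload.trace_summary ?? {};

  const hits: string[] = [];
  const misses: string[] = [];

  // Assumed requirement: the agent must have called "search" at least once.
  if ((trace.tool_calls_by_name?.['search'] ?? 0) > 0) {
    hits.push('Agent used the search tool');
  } else {
    misses.push('Agent never called search');
  }

  // Assumed requirement: no errored events in the trace.
  if ((trace.error_count ?? 0) === 0) {
    hits.push('No errors during execution');
  } else {
    misses.push(`Trace contains ${trace.error_count} error(s)`);
  }

  const total = hits.length + misses.length;
  console.log(JSON.stringify({
    score: total === 0 ? 0 : hits.length / total,
    hits,
    misses,
    reasoning: `Passed ${hits.length}/${total} trace checks`,
  }, null, 2));
}

main().catch((error) => {
  console.log(JSON.stringify({
    score: 0,
    hits: [],
    misses: [`Error: ${error instanceof Error ? error.message : String(error)}`],
    reasoning: 'Evaluator error',
  }, null, 2));
  process.exit(1);
});
```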
@@ -125,6 +152,96 @@ if __name__ == "__main__":
     main()
 ```
 
+ ## TypeScript Code Evaluator Template (with SDK)
+
+ The optional `@agentv/core` SDK provides type-safe payload parsing with camelCase properties (`candidateAnswer` vs `candidate_answer`).
+
+ **Execution:** Keep evaluators as `.ts` files and run via Node loaders like `npx --yes tsx ./evaluators/my-check.ts` so users don't need Bun after `npm install -g agentv`.
+
+ **Without SDK:** Skip the import and parse JSON from stdin directly (similar to the Python template above).
+
+ ```typescript
+ /**
+  * Example TypeScript code evaluator using the AgentV SDK
+  *
+  * Run with: npx --yes tsx ./evaluators/example-check.ts
+  *
+  * The SDK provides:
+  * - Type-safe CodeJudgePayload interface with all fields
+  * - camelCase properties (candidateAnswer, expectedOutcome, etc.)
+  * - Automatic conversion from snake_case wire format
+  */
+
+ import { readCodeJudgePayload } from '@agentv/core';
+
+ try {
+   // Read and parse stdin with automatic snake_case → camelCase conversion
+   const payload = readCodeJudgePayload();
+
+   // Type-safe camelCase access to all fields
+   const { candidateAnswer, expectedOutcome, inputFiles, guidelineFiles } = payload;
+
+   // Your validation logic here
+   const hits: string[] = [];
+   const misses: string[] = [];
+
+   // Example: Check if answer contains expected outcome
+   if (candidateAnswer.includes(expectedOutcome)) {
+     hits.push('Answer matches expected outcome');
+   } else {
+     misses.push('Answer does not match expected outcome');
+   }
+
+   // Example: Check attachment mentions
+   const attachments = [...guidelineFiles, ...inputFiles];
+   for (const filePath of attachments) {
+     const fileName = filePath.split('/').pop() ?? filePath;
+     if (candidateAnswer.includes(fileName)) {
+       hits.push(`Mentions attachment: ${fileName}`);
+     } else {
+       misses.push(`Missing attachment: ${fileName}`);
+     }
+   }
+
+   // Calculate score
+   const totalChecks = hits.length + misses.length;
+   const score = totalChecks === 0 ? 0 : hits.length / totalChecks;
+
+   // Build result
+   const result = {
+     score,
+     hits,
+     misses,
+     reasoning: `Passed ${hits.length}/${totalChecks} checks`
+   };
+
+   console.log(JSON.stringify(result, null, 2));
+
+ } catch (error) {
+   const message = error instanceof Error ? error.message : String(error);
+   console.log(JSON.stringify({
+     score: 0,
+     hits: [],
+     misses: [`Error: ${message}`],
+     reasoning: 'Evaluator error'
+   }, null, 2));
+   process.exit(1);
+ }
+ ```
+
+ **TypeScript SDK Benefits:**
+ - **Type-safe**: `CodeJudgePayload` interface with all fields typed
+ - **camelCase**: Idiomatic TypeScript naming (`candidateAnswer` vs `candidate_answer`)
+ - **Automatic conversion**: Handles snake_case wire format → camelCase objects
+ - **Compile-time safety**: Catch typos and missing fields before runtime
+
+ **Available in SDK:**
+ - `readCodeJudgePayload()`: Read stdin and convert to camelCase (recommended)
+ - `parseCodeJudgePayload(jsonString)`: Parse JSON string and convert to camelCase
+ - `CodeJudgePayload`: TypeScript interface for type safety
+
+ **See also:** `examples/features/code-judge-sdk/` for complete working examples
+
 ## LLM Judge Prompt Template
 
 LLM judges use markdown prompts to guide evaluation. AgentV automatically handles the output format, so focus your prompt on evaluation criteria and guidelines.
@@ -189,11 +306,22 @@ You can customize this template in your eval file using the `evaluatorTemplate`
 execution:
   evaluators:
     - name: my_validator
-      type: code
+      type: code_judge
       script: uv run my_validator.py
       cwd: ./evaluators
 ```
 
+ TypeScript evaluators use the same structure but invoke `tsx` (or another Node-compatible loader) so they work everywhere:
+
+ ```yaml
+ execution:
+   evaluators:
+     - name: csv_guardrail
+       type: code_judge
+       script: npx --yes tsx ./evaluators/check-csv.ts
+       cwd: ./evaluators
+ ```
+
 ### Command Line Testing
 
 Test your evaluator locally:
@@ -214,3 +342,12 @@ echo '{
 #   "reasoning": "..."
 # }
 ```
+
+ ```bash
+ # TypeScript (uses tsx loader under Node)
+ echo '{
+   "candidate_answer": "test output here",
+   "question": "test task",
+   "expected_outcome": "expected result"
+ }' | npx --yes tsx ./evaluators/check-csv.ts
+ ```
@@ -4,11 +4,6 @@
   "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
   "type": "object",
   "properties": {
-    "$schema": {
-      "type": "string",
-      "description": "Schema identifier",
-      "enum": ["agentv-eval-v2"]
-    },
     "description": {
       "type": "string",
       "description": "Description of what this eval suite covers"
@@ -37,7 +32,16 @@
     },
     "type": {
       "type": "string",
-      "enum": ["code", "llm_judge"],
+      "enum": [
+        "code",
+        "llm_judge",
+        "composite",
+        "tool_trajectory",
+        "field_accuracy",
+        "latency",
+        "cost",
+        "token_usage"
+      ],
       "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
     },
     "script": {