agentv 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -4
- package/dist/{chunk-6SHT2QS6.js → chunk-5AJ7DFUO.js} +211 -7
- package/dist/chunk-5AJ7DFUO.js.map +1 -0
- package/dist/cli.js +4 -2
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +45 -43
- package/package.json +4 -2
- package/dist/chunk-6SHT2QS6.js.map +0 -1
package/dist/cli.js
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
3
|
runCli
|
|
4
|
-
} from "./chunk-6SHT2QS6.js";
|
|
4
|
+
} from "./chunk-5AJ7DFUO.js";
|
|
5
5
|
import "./chunk-UE4GLFVL.js";
|
|
6
6
|
|
|
7
7
|
// src/cli.ts
|
|
8
|
-
runCli().catch((error) => {
|
|
8
|
+
runCli().then(() => {
|
|
9
|
+
process.exit(0);
|
|
10
|
+
}).catch((error) => {
|
|
9
11
|
console.error(error);
|
|
10
12
|
process.exit(1);
|
|
11
13
|
});
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli().catch((error) => {\n
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli()\n .then(() => {\n process.exit(0);\n })\n .catch((error) => {\n console.error(error);\n process.exit(1);\n });\n"],"mappings":";;;;;;;AAGA,OAAO,EACJ,KAAK,MAAM;AACV,UAAQ,KAAK,CAAC;AAChB,CAAC,EACA,MAAM,CAAC,UAAU;AAChB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
|
package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md
CHANGED
|
@@ -154,34 +154,27 @@ if __name__ == "__main__":
|
|
|
154
154
|
|
|
155
155
|
## TypeScript Code Evaluator Template (with SDK)
|
|
156
156
|
|
|
157
|
-
The
|
|
157
|
+
The `@agentv/eval` SDK provides a declarative API for code evaluators with automatic stdin/stdout handling, validation, and error handling.
|
|
158
158
|
|
|
159
|
-
**Execution:** Keep evaluators as `.ts` files and run via Node loaders like `npx --yes tsx ./evaluators/my-check.ts`.
|
|
160
|
-
|
|
161
|
-
**Without SDK:** Skip the import and parse JSON from stdin directly (similar to the Python template above).
|
|
159
|
+
**Execution:** Keep evaluators as `.ts` files and run via `bun run` or Node loaders like `npx --yes tsx ./evaluators/my-check.ts`.
|
|
162
160
|
|
|
163
161
|
```typescript
|
|
162
|
+
#!/usr/bin/env bun
|
|
164
163
|
/**
|
|
165
|
-
* Example TypeScript code evaluator using
|
|
164
|
+
* Example TypeScript code evaluator using defineCodeJudge
|
|
166
165
|
*
|
|
167
|
-
* Run with:
|
|
166
|
+
* Run with: bun run ./evaluators/example-check.ts
|
|
167
|
+
* or: npx --yes tsx ./evaluators/example-check.ts
|
|
168
168
|
*
|
|
169
|
-
* The SDK
|
|
170
|
-
* -
|
|
171
|
-
* -
|
|
172
|
-
* -
|
|
169
|
+
* The SDK handles:
|
|
170
|
+
* - Reading JSON from stdin
|
|
171
|
+
* - Converting snake_case to camelCase
|
|
172
|
+
* - Validating input with Zod
|
|
173
|
+
* - Error handling and output formatting
|
|
173
174
|
*/
|
|
175
|
+
import { defineCodeJudge } from '@agentv/eval';
|
|
174
176
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
try {
|
|
178
|
-
// Read and parse stdin with automatic snake_case → camelCase conversion
|
|
179
|
-
const payload = readCodeJudgePayload();
|
|
180
|
-
|
|
181
|
-
// Type-safe camelCase access to all fields
|
|
182
|
-
const { candidateAnswer, expectedOutcome, inputFiles, guidelineFiles } = payload;
|
|
183
|
-
|
|
184
|
-
// Your validation logic here
|
|
177
|
+
export default defineCodeJudge(({ candidateAnswer, expectedOutcome, inputFiles, guidelineFiles }) => {
|
|
185
178
|
const hits: string[] = [];
|
|
186
179
|
const misses: string[] = [];
|
|
187
180
|
|
|
@@ -207,38 +200,47 @@ try {
|
|
|
207
200
|
const totalChecks = hits.length + misses.length;
|
|
208
201
|
const score = totalChecks === 0 ? 0 : hits.length / totalChecks;
|
|
209
202
|
|
|
210
|
-
|
|
211
|
-
const result = {
|
|
203
|
+
return {
|
|
212
204
|
score,
|
|
213
205
|
hits,
|
|
214
206
|
misses,
|
|
215
|
-
reasoning: `Passed ${hits.length}/${totalChecks} checks`
|
|
207
|
+
reasoning: `Passed ${hits.length}/${totalChecks} checks`,
|
|
216
208
|
};
|
|
217
|
-
|
|
218
|
-
console.log(JSON.stringify(result, null, 2));
|
|
219
|
-
|
|
220
|
-
} catch (error) {
|
|
221
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
222
|
-
console.log(JSON.stringify({
|
|
223
|
-
score: 0,
|
|
224
|
-
hits: [],
|
|
225
|
-
misses: [`Error: ${message}`],
|
|
226
|
-
reasoning: 'Evaluator error'
|
|
227
|
-
}, null, 2));
|
|
228
|
-
process.exit(1);
|
|
229
|
-
}
|
|
209
|
+
});
|
|
230
210
|
```
|
|
231
211
|
|
|
232
212
|
**TypeScript SDK Benefits:**
|
|
233
|
-
- **
|
|
213
|
+
- **Zero boilerplate**: No try/catch, stdin parsing, or JSON.stringify needed
|
|
214
|
+
- **Type-safe**: `CodeJudgeInput` interface with all fields typed
|
|
234
215
|
- **camelCase**: Idiomatic TypeScript naming (`candidateAnswer` vs `candidate_answer`)
|
|
235
|
-
- **
|
|
236
|
-
- **
|
|
216
|
+
- **Validation**: Zod schemas validate input and output at runtime
|
|
217
|
+
- **Error handling**: Exceptions automatically produce valid failure results
|
|
218
|
+
|
|
219
|
+
**Available exports from `@agentv/eval`:**
|
|
220
|
+
- `defineCodeJudge(handler)`: Define a code judge evaluator (recommended)
|
|
221
|
+
- `CodeJudgeInput`: TypeScript type for input payload
|
|
222
|
+
- `CodeJudgeResult`: TypeScript type for result
|
|
223
|
+
- `TraceSummary`, `OutputMessage`: Types for trace data
|
|
224
|
+
- `z`: Re-exported Zod for custom config schemas
|
|
237
225
|
|
|
238
|
-
**
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
226
|
+
**Using execution metrics:**
|
|
227
|
+
|
|
228
|
+
```typescript
|
|
229
|
+
import { defineCodeJudge } from '@agentv/eval';
|
|
230
|
+
|
|
231
|
+
export default defineCodeJudge(({ traceSummary }) => {
|
|
232
|
+
if (!traceSummary) {
|
|
233
|
+
return { score: 0.5, reasoning: 'No trace available' };
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
const efficient = traceSummary.eventCount <= 10;
|
|
237
|
+
return {
|
|
238
|
+
score: efficient ? 1.0 : 0.5,
|
|
239
|
+
hits: efficient ? ['Efficient execution'] : [],
|
|
240
|
+
misses: efficient ? [] : ['Too many tool calls'],
|
|
241
|
+
};
|
|
242
|
+
});
|
|
243
|
+
```
|
|
242
244
|
|
|
243
245
|
**See also:** `examples/features/code-judge-sdk/` for complete working examples
|
|
244
246
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentv",
|
|
3
|
-
"version": "2.0.1",
|
|
3
|
+
"version": "2.0.2",
|
|
4
4
|
"description": "CLI entry point for AgentV",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"repository": {
|
|
@@ -31,7 +31,9 @@
|
|
|
31
31
|
"test:watch": "bun test --watch"
|
|
32
32
|
},
|
|
33
33
|
"dependencies": {
|
|
34
|
-
"@agentv/core": "
|
|
34
|
+
"@agentv/core": "2.0.1",
|
|
35
|
+
"@mariozechner/pi-agent": "^0.9.0",
|
|
36
|
+
"@mariozechner/pi-ai": "^0.37.2",
|
|
35
37
|
"cmd-ts": "^0.14.3",
|
|
36
38
|
"dotenv": "^16.4.5",
|
|
37
39
|
"fast-glob": "^3.3.3",
|