@agentv/eval 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -0
- package/dist/index.cjs +176 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +450 -0
- package/dist/index.d.ts +450 -0
- package/dist/index.js +142 -0
- package/dist/index.js.map +1 -0
- package/package.json +39 -0
package/README.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# @agentv/eval
|
|
2
|
+
|
|
3
|
+
Evaluation SDK for AgentV - build custom code judges with zero boilerplate.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @agentv/eval
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
#!/usr/bin/env bun
|
|
15
|
+
import { defineCodeJudge } from '@agentv/eval';
|
|
16
|
+
|
|
17
|
+
export default defineCodeJudge(({ candidateAnswer, traceSummary }) => ({
|
|
18
|
+
score: candidateAnswer.length > 0 ? 1.0 : 0.0,
|
|
19
|
+
hits: ['Output received'],
|
|
20
|
+
}));
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
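The `defineCodeJudge` function automatically reads the JSON payload from stdin, converts its snake_case keys to camelCase, validates the input and your result with Zod, writes the result to stdout as JSON, and reports any error as a failing result.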
|
|
24
|
+
|
|
25
|
+
## Exports
|
|
26
|
+
|
|
27
|
+
- `defineCodeJudge(handler)` - Define a code judge evaluator
|
|
28
|
+
- `CodeJudgeInput`, `CodeJudgeResult` - TypeScript types
|
|
29
|
+
- `TraceSummary`, `OutputMessage`, `ToolCall` - Trace data types
|
|
30
|
+
- `z` - Re-exported Zod for custom config schemas
|
|
31
|
+
|
|
32
|
+
## Documentation
|
|
33
|
+
|
|
34
|
+
The complete documentation covers:
|
|
35
|
+
- Full input/output schemas
|
|
36
|
+
- Typed config examples
|
|
37
|
+
- Execution metrics usage
|
|
38
|
+
- Best practices
|
|
39
|
+
|
|
40
|
+
See the [Custom Evaluators Guide](https://github.com/EntityProcess/agentv/blob/main/apps/cli/src/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md) or run AgentV's `/agentv-eval-builder` skill.
|
|
41
|
+
|
|
42
|
+
## Repository
|
|
43
|
+
|
|
44
|
+
[https://github.com/EntityProcess/agentv](https://github.com/EntityProcess/agentv)
|
|
45
|
+
|
|
46
|
+
## License
|
|
47
|
+
|
|
48
|
+
MIT License - see [LICENSE](../../LICENSE) for details.
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler-generated CommonJS interop helpers.
// Short aliases for the Object reflection primitives used by the helpers below.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable accessor on `target` for every enumerable key of `all`.
 *
 * @param {object} target - Object that receives the accessors.
 * @param {Record<string, () => unknown>} all - Map of export name to getter function.
 */
var __export = (target, all) => {
  for (var name in all) {
    // The getter is installed as-is, so the exported value is looked up on every read.
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as live getters.
 *
 * Keys that `to` already owns, and the single key named by `except`, are skipped.
 * Each copied key keeps the enumerability of the source property.
 *
 * @param {object} to - Destination object; also the return value.
 * @param {unknown} from - Source; ignored unless it is a non-null object or a function.
 * @param {string} [except] - One key to leave out.
 * @param {PropertyDescriptor} [desc] - Scratch slot for the current descriptor; callers omit it.
 * @returns {object} `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    // `get` closes over `from`, so later changes to the source stay visible on `to`.
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/**
 * Wraps a module-like object in a fresh object carrying a non-enumerable
 * `__esModule: true` marker plus live getters for every own property of `mod`.
 *
 * @param {object} mod - Object whose own properties become the exports.
 * @returns {object} The new exports object.
 */
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
// Public export table of the package. Every entry is a getter, so the names it
// refers to (declared further down in this file) are only read when a consumer
// accesses the corresponding export.
var index_exports = {};
__export(index_exports, {
  CodeJudgeInputSchema: () => CodeJudgeInputSchema,
  CodeJudgeResultSchema: () => CodeJudgeResultSchema,
  OutputMessageSchema: () => OutputMessageSchema,
  TokenUsageSchema: () => TokenUsageSchema,
  ToolCallSchema: () => ToolCallSchema,
  TraceSummarySchema: () => TraceSummarySchema,
  defineCodeJudge: () => defineCodeJudge,
  // Zod is re-exported as `z`.
  z: () => import_zod2.z
});
module.exports = __toCommonJS(index_exports);
|
|
33
|
+
|
|
34
|
+
// src/schemas.ts
|
|
35
|
+
var import_zod = require("zod");
|
|
36
|
+
/**
 * Token usage metrics schema: required numeric `input` and `output`,
 * optional numeric `cached`.
 */
var TokenUsageSchema = import_zod.z.object({
  input: import_zod.z.number(),
  output: import_zod.z.number(),
  cached: import_zod.z.number().optional()
});
|
|
41
|
+
/**
 * Trace summary schema (camelCase keys).
 *
 * `eventCount`, `toolNames`, `toolCallsByName` and `errorCount` are required;
 * the usage, cost, duration and per-tool duration fields are optional.
 */
var TraceSummarySchema = import_zod.z.object({
  eventCount: import_zod.z.number(),
  toolNames: import_zod.z.array(import_zod.z.string()),
  // Record of string keys to numbers.
  toolCallsByName: import_zod.z.record(import_zod.z.string(), import_zod.z.number()),
  errorCount: import_zod.z.number(),
  tokenUsage: TokenUsageSchema.optional(),
  costUsd: import_zod.z.number().optional(),
  durationMs: import_zod.z.number().optional(),
  // Record of string keys to arrays of numbers.
  toolDurations: import_zod.z.record(import_zod.z.string(), import_zod.z.array(import_zod.z.number())).optional()
});
|
|
51
|
+
/**
 * Tool call schema for output messages. Only `tool` (a string) is required;
 * `input` and `output` accept any value.
 */
var ToolCallSchema = import_zod.z.object({
  tool: import_zod.z.string(),
  input: import_zod.z.unknown().optional(),
  output: import_zod.z.unknown().optional(),
  id: import_zod.z.string().optional(),
  timestamp: import_zod.z.string().optional()
});
|
|
58
|
+
/**
 * Output message schema. `role` is required; every other field is optional.
 */
var OutputMessageSchema = import_zod.z.object({
  role: import_zod.z.enum(["assistant", "user", "system", "tool"]),
  // Content is a string, a single record, or an array of records.
  content: import_zod.z.union([import_zod.z.string(), import_zod.z.record(import_zod.z.unknown()), import_zod.z.array(import_zod.z.record(import_zod.z.unknown()))]).optional(),
  toolCalls: import_zod.z.array(ToolCallSchema).optional(),
  timestamp: import_zod.z.string().optional(),
  metadata: import_zod.z.record(import_zod.z.unknown()).optional()
});
|
|
65
|
+
/**
 * Test message schema: a required `role` and a required `content` that is a
 * string, a single record, or an array of records.
 */
var TestMessageSchema = import_zod.z.object({
  role: import_zod.z.enum(["system", "user", "assistant", "tool"]),
  content: import_zod.z.union([import_zod.z.string(), import_zod.z.record(import_zod.z.unknown()), import_zod.z.array(import_zod.z.record(import_zod.z.unknown()))])
});
|
|
69
|
+
/**
 * Code judge input schema (camelCase, converted from the snake_case wire format).
 *
 * `referenceAnswer` may be omitted; `outputMessages`, `traceSummary` and
 * `config` may be omitted or null. All other fields are required.
 */
var CodeJudgeInputSchema = import_zod.z.object({
  question: import_zod.z.string(),
  expectedOutcome: import_zod.z.string(),
  expectedMessages: import_zod.z.array(import_zod.z.record(import_zod.z.unknown())),
  referenceAnswer: import_zod.z.string().optional(),
  candidateAnswer: import_zod.z.string(),
  outputMessages: import_zod.z.array(OutputMessageSchema).nullable().optional(),
  guidelineFiles: import_zod.z.array(import_zod.z.string()),
  inputFiles: import_zod.z.array(import_zod.z.string()),
  inputMessages: import_zod.z.array(TestMessageSchema),
  traceSummary: TraceSummarySchema.nullable().optional(),
  config: import_zod.z.record(import_zod.z.unknown()).nullable().optional()
});
|
|
82
|
+
/**
 * Code judge result schema (validated before output).
 *
 * `score` must lie in [0, 1]; `hits` and `misses` default to empty arrays
 * when absent; `reasoning` is optional.
 */
var CodeJudgeResultSchema = import_zod.z.object({
  score: import_zod.z.number().min(0).max(1),
  hits: import_zod.z.array(import_zod.z.string()).optional().default([]),
  misses: import_zod.z.array(import_zod.z.string()).optional().default([]),
  reasoning: import_zod.z.string().optional()
});
|
|
88
|
+
|
|
89
|
+
// src/index.ts
|
|
90
|
+
var import_zod2 = require("zod");
|
|
91
|
+
|
|
92
|
+
// src/runtime.ts
|
|
93
|
+
var import_node_fs = require("fs");
|
|
94
|
+
|
|
95
|
+
// src/case-conversion.ts
|
|
96
|
+
/**
 * Converts a snake_case key to camelCase.
 *
 * Keys that start with an uppercase ASCII letter are returned unchanged.
 * Otherwise every underscore that is directly followed by a lowercase ASCII
 * letter or digit is removed and that character is upper-cased. This includes
 * an underscore at the very start of the key (`_id` becomes `Id`). Underscores
 * followed by anything else are kept.
 *
 * @param {string} str - Key to convert.
 * @returns {string} The converted key.
 */
function toCamelCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  if (startsUppercase) {
    return str;
  }
  return str.replace(/_([a-z0-9])/g, (_match, next) => next.toUpperCase());
}
|
|
102
|
+
/**
 * Recursively rebuilds a JSON-like value with every object key passed through
 * `toCamelCase`.
 *
 * Arrays are mapped element by element, objects are copied into a new plain
 * object, and every other value (including `null` and `undefined`) is returned
 * as-is. If two keys convert to the same name, the later entry wins.
 *
 * NOTE(review): keys are converted at every depth, so keys of free-form records
 * are rewritten too while string values are left alone. Confirm this is
 * intended for records keyed by tool names.
 *
 * @param {unknown} obj - Value to convert.
 * @returns {unknown} A converted copy for arrays and objects, otherwise `obj`.
 */
function toCamelCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toCamelCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  for (const [key, value] of Object.entries(obj)) {
    converted[toCamelCase(key)] = toCamelCaseDeep(value);
  }
  return converted;
}
|
|
119
|
+
|
|
120
|
+
// src/runtime.ts
|
|
121
|
+
/**
 * Synchronously reads all of file descriptor 0 (stdin) and returns it decoded
 * as UTF-8.
 *
 * @returns {string} The complete stdin contents.
 */
function readStdin() {
  return (0, import_node_fs.readFileSync)(0, "utf8");
}
|
|
124
|
+
/**
 * Clamps a score to the range [0, 1].
 *
 * Anything that is not a finite number (NaN, ±Infinity, or a non-number such
 * as `undefined` or a string) yields 0.
 *
 * @param {number} value - Raw score.
 * @returns {number} A value between 0 and 1 inclusive.
 */
function clampScore(value) {
  // Number.isFinite is already false for NaN and for non-numbers, so a
  // separate NaN check is unnecessary.
  if (!Number.isFinite(value)) {
    return 0;
  }
  return Math.max(0, Math.min(1, value));
}
|
|
130
|
+
/**
 * Turns a caught value into a message string.
 *
 * @param {unknown} error - Whatever was thrown.
 * @returns {string} `error.message` for `Error` instances, otherwise `String(error)`.
 */
function formatError(error) {
  return error instanceof Error ? error.message : String(error);
}
|
|
136
|
+
/**
 * Runs a code judge handler with full stdin/stdout handling.
 *
 * Steps: read stdin, parse it as JSON, convert keys to camelCase, validate
 * against `CodeJudgeInputSchema`, await `handler(input)`, clamp the returned
 * score, validate against `CodeJudgeResultSchema`, and print the result as
 * pretty-printed JSON on stdout.
 *
 * Any failure along the way is reported on stdout as a result with score 0 and
 * the error message in `misses` and `reasoning`, and the process exits with
 * code 1.
 *
 * @param {(input: object) => object | Promise<object>} handler - Evaluator
 *   receiving the validated input and returning the result object.
 * @returns {Promise<void>}
 */
async function runCodeJudge(handler) {
  try {
    const rawInput = JSON.parse(readStdin());
    const input = CodeJudgeInputSchema.parse(toCamelCaseDeep(rawInput));
    const rawResult = await handler(input);

    // A primitive or array would otherwise spread to nothing and be reported
    // as a successful score of 0; report it as a failure instead.
    if (rawResult === null || typeof rawResult !== "object" || Array.isArray(rawResult)) {
      const received = rawResult === null ? "null" : Array.isArray(rawResult) ? "array" : typeof rawResult;
      throw new TypeError(`Code judge handler must return a result object, received ${received}`);
    }

    const result = CodeJudgeResultSchema.parse({
      ...rawResult,
      score: clampScore(rawResult.score)
    });
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    const errorMessage = formatError(error);
    const errorResult = {
      score: 0,
      hits: [],
      misses: [errorMessage],
      reasoning: `Evaluation failed: ${errorMessage}`
    };
    // Set the exit code first so it holds even if the write callback never runs.
    process.exitCode = 1;
    // Exit only once the write has been handed off: calling process.exit()
    // straight after a write can drop output when stdout is asynchronous.
    process.stdout.write(`${JSON.stringify(errorResult, null, 2)}\n`, () => {
      process.exit(1);
    });
  }
}
|
|
160
|
+
|
|
161
|
+
// src/index.ts
|
|
162
|
+
/**
 * Defines a code judge evaluator and starts it immediately.
 *
 * Calling this function kicks off `runCodeJudge(handler)` right away and
 * returns nothing; it does not wait for the run to finish.
 *
 * @param {(input: object) => object | Promise<object>} handler - Function that
 *   evaluates the input and returns a result.
 * @returns {void}
 */
function defineCodeJudge(handler) {
  // Intentionally not awaited: runCodeJudge reports its own failures.
  void runCodeJudge(handler);
}
|
|
165
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
166
|
+
// This expression is never evaluated (`0 &&` short-circuits). It only lists the
// export names statically so that Node can expose them as named exports when
// the file is imported from ESM.
0 && (module.exports = {
  CodeJudgeInputSchema,
  CodeJudgeResultSchema,
  OutputMessageSchema,
  TokenUsageSchema,
  ToolCallSchema,
  TraceSummarySchema,
  defineCodeJudge,
  z
});
|
|
176
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/schemas.ts","../src/runtime.ts","../src/case-conversion.ts"],"sourcesContent":["/**\n * AgentV Evaluation SDK\n *\n * Build custom code judges for evaluating AI agent outputs.\n *\n * @example\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({\n * score: traceSummary?.eventCount <= 5 ? 1.0 : 0.5,\n * hits: ['Efficient tool usage'],\n * misses: [],\n * }));\n * ```\n *\n * @packageDocumentation\n */\n\n// Re-export schemas and types\nexport {\n CodeJudgeInputSchema,\n CodeJudgeResultSchema,\n TraceSummarySchema,\n OutputMessageSchema,\n ToolCallSchema,\n TokenUsageSchema,\n type CodeJudgeInput,\n type CodeJudgeResult,\n type TraceSummary,\n type OutputMessage,\n type ToolCall,\n type TokenUsage,\n} from './schemas.js';\n\n// Re-export Zod for typed config support\nexport { z } from 'zod';\n\n// Import runtime\nimport { type CodeJudgeHandler, runCodeJudge } from './runtime.js';\n\nexport type { CodeJudgeHandler };\n\n/**\n * Define a code judge evaluator with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Validates the result and outputs JSON to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that evaluates the input and returns a result\n *\n * @example\n * ```typescript\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary }) => {\n * if (!traceSummary) {\n * return { score: 0.5, reasoning: 'No trace available' };\n * }\n *\n * const efficient = traceSummary.eventCount <= 10;\n * return {\n * score: efficient ? 1.0 : 0.5,\n * hits: efficient ? ['Efficient execution'] : [],\n * misses: efficient ? 
[] : ['Too many tool calls'],\n * };\n * });\n * ```\n *\n * @example With typed config\n * ```typescript\n * import { defineCodeJudge, z } from '@agentv/eval';\n *\n * const ConfigSchema = z.object({\n * maxToolCalls: z.number().default(10),\n * });\n *\n * export default defineCodeJudge(({ traceSummary, config }) => {\n * const { maxToolCalls } = ConfigSchema.parse(config ?? {});\n * // Use maxToolCalls...\n * });\n * ```\n */\nexport function defineCodeJudge(handler: CodeJudgeHandler): void {\n // Run immediately when module is loaded\n runCodeJudge(handler);\n}\n","/**\n * Zod schemas for code judge input/output validation.\n * Provides both compile-time types and runtime validation.\n */\nimport { z } from 'zod';\n\n/**\n * Token usage metrics schema.\n */\nexport const TokenUsageSchema = z.object({\n input: z.number(),\n output: z.number(),\n cached: z.number().optional(),\n});\n\n/**\n * Trace summary schema (camelCase for TypeScript ergonomics).\n */\nexport const TraceSummarySchema = z.object({\n eventCount: z.number(),\n toolNames: z.array(z.string()),\n toolCallsByName: z.record(z.string(), z.number()),\n errorCount: z.number(),\n tokenUsage: TokenUsageSchema.optional(),\n costUsd: z.number().optional(),\n durationMs: z.number().optional(),\n toolDurations: z.record(z.string(), z.array(z.number())).optional(),\n});\n\n/**\n * Tool call schema for output messages.\n */\nexport const ToolCallSchema = z.object({\n tool: z.string(),\n input: z.unknown().optional(),\n output: z.unknown().optional(),\n id: z.string().optional(),\n timestamp: z.string().optional(),\n});\n\n/**\n * Output message schema.\n */\nexport const OutputMessageSchema = z.object({\n role: z.enum(['assistant', 'user', 'system', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),\n toolCalls: z.array(ToolCallSchema).optional(),\n timestamp: z.string().optional(),\n metadata: z.record(z.unknown()).optional(),\n});\n\n/**\n * Test 
message schema.\n */\nexport const TestMessageSchema = z.object({\n role: z.enum(['system', 'user', 'assistant', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]),\n});\n\n/**\n * Code judge input schema (camelCase, converted from snake_case wire format).\n */\nexport const CodeJudgeInputSchema = z.object({\n question: z.string(),\n expectedOutcome: z.string(),\n expectedMessages: z.array(z.record(z.unknown())),\n referenceAnswer: z.string().optional(),\n candidateAnswer: z.string(),\n outputMessages: z.array(OutputMessageSchema).nullable().optional(),\n guidelineFiles: z.array(z.string()),\n inputFiles: z.array(z.string()),\n inputMessages: z.array(TestMessageSchema),\n traceSummary: TraceSummarySchema.nullable().optional(),\n config: z.record(z.unknown()).nullable().optional(),\n});\n\n/**\n * Code judge result schema (validated before output).\n */\nexport const CodeJudgeResultSchema = z.object({\n score: z.number().min(0).max(1),\n hits: z.array(z.string()).optional().default([]),\n misses: z.array(z.string()).optional().default([]),\n reasoning: z.string().optional(),\n});\n\n/**\n * Inferred types from schemas.\n */\nexport type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;\nexport type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;\nexport type TraceSummary = z.infer<typeof TraceSummarySchema>;\nexport type OutputMessage = z.infer<typeof OutputMessageSchema>;\nexport type ToolCall = z.infer<typeof ToolCallSchema>;\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n","/**\n * Runtime for code judge evaluators.\n * Handles stdin parsing, validation, error handling, and output formatting.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport {\n type CodeJudgeInput,\n CodeJudgeInputSchema,\n type CodeJudgeResult,\n CodeJudgeResultSchema,\n} from './schemas.js';\n\n/**\n * Handler function type for code judges.\n */\nexport type 
CodeJudgeHandler = (\n input: CodeJudgeInput,\n) => CodeJudgeResult | Promise<CodeJudgeResult>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Clamp a value to the range [0, 1].\n */\nfunction clampScore(value: number): number {\n if (Number.isNaN(value) || !Number.isFinite(value)) {\n return 0;\n }\n return Math.max(0, Math.min(1, value));\n}\n\n/**\n * Format an error for output.\n */\nfunction formatError(error: unknown): string {\n if (error instanceof Error) {\n return error.message;\n }\n return String(error);\n}\n\n/**\n * Run a code judge handler with full stdin/stdout handling.\n * This is the internal implementation called by defineCodeJudge.\n */\nexport async function runCodeJudge(handler: CodeJudgeHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = CodeJudgeInputSchema.parse(camelInput);\n\n // 5. Run handler\n const rawResult = await handler(input);\n\n // 6. Validate and normalize output\n const result = CodeJudgeResultSchema.parse({\n ...rawResult,\n score: clampScore(rawResult.score),\n });\n\n // 7. 
Output JSON\n console.log(JSON.stringify(result, null, 2));\n } catch (error) {\n // Output failure result\n const errorMessage = formatError(error);\n const errorResult: CodeJudgeResult = {\n score: 0,\n hits: [],\n misses: [errorMessage],\n reasoning: `Evaluation failed: ${errorMessage}`,\n };\n console.log(JSON.stringify(errorResult, null, 2));\n process.exit(1);\n }\n}\n","/**\n * Case conversion utilities for JSON payloads.\n * Converts between snake_case (wire format) and camelCase (TypeScript).\n */\n\nfunction toCamelCase(str: string): string {\n // Don't convert keys that start with uppercase (proper nouns/tool names)\n if (/^[A-Z]/.test(str)) {\n return str;\n }\n return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());\n}\n\n/**\n * Recursively converts all keys in an object from snake_case to camelCase.\n * Used to map wire payloads into TypeScript-friendly shapes.\n *\n * @param obj - The object to convert (can be any JSON-serializable value)\n * @returns A new object with all keys converted to camelCase\n */\nexport function toCamelCaseDeep(obj: unknown): unknown {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => toCamelCaseDeep(item));\n }\n\n if (typeof obj === 'object') {\n const result: Record<string, unknown> = {};\n for (const [key, value] of Object.entries(obj)) {\n const camelKey = toCamelCase(key);\n result[camelKey] = toCamelCaseDeep(value);\n }\n return result;\n }\n\n return 
obj;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACIA,iBAAkB;AAKX,IAAM,mBAAmB,aAAE,OAAO;AAAA,EACvC,OAAO,aAAE,OAAO;AAAA,EAChB,QAAQ,aAAE,OAAO;AAAA,EACjB,QAAQ,aAAE,OAAO,EAAE,SAAS;AAC9B,CAAC;AAKM,IAAM,qBAAqB,aAAE,OAAO;AAAA,EACzC,YAAY,aAAE,OAAO;AAAA,EACrB,WAAW,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAC7B,iBAAiB,aAAE,OAAO,aAAE,OAAO,GAAG,aAAE,OAAO,CAAC;AAAA,EAChD,YAAY,aAAE,OAAO;AAAA,EACrB,YAAY,iBAAiB,SAAS;AAAA,EACtC,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,YAAY,aAAE,OAAO,EAAE,SAAS;AAAA,EAChC,eAAe,aAAE,OAAO,aAAE,OAAO,GAAG,aAAE,MAAM,aAAE,OAAO,CAAC,CAAC,EAAE,SAAS;AACpE,CAAC;AAKM,IAAM,iBAAiB,aAAE,OAAO;AAAA,EACrC,MAAM,aAAE,OAAO;AAAA,EACf,OAAO,aAAE,QAAQ,EAAE,SAAS;AAAA,EAC5B,QAAQ,aAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,IAAI,aAAE,OAAO,EAAE,SAAS;AAAA,EACxB,WAAW,aAAE,OAAO,EAAE,SAAS;AACjC,CAAC;AAKM,IAAM,sBAAsB,aAAE,OAAO;AAAA,EAC1C,MAAM,aAAE,KAAK,CAAC,aAAa,QAAQ,UAAU,MAAM,CAAC;AAAA,EACpD,SAAS,aAAE,MAAM,CAAC,aAAE,OAAO,GAAG,aAAE,OAAO,aAAE,QAAQ,CAAC,GAAG,aAAE,MAAM,aAAE,OAAO,aAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/F,WAAW,aAAE,MAAM,cAAc,EAAE,SAAS;AAAA,EAC5C,WAAW,aAAE,OAAO,EAAE,SAAS;AAAA,EAC/B,UAAU,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS;AAC3C,CAAC;AAKM,IAAM,oBAAoB,aAAE,OAAO;AAAA,EACxC,MAAM,aAAE,KAAK,CAAC,UAAU,QAAQ,aAAa,MAAM,CAAC;AAAA,EACpD,SAAS,aAAE,MAAM,CAAC,aAAE,OAAO,GAAG,aAAE,OAAO,aAAE,QAAQ,CAAC,GAAG,aAAE,MAAM,aAAE,OAAO,aAAE,QAAQ,CAAC,CAAC,CAAC,CAAC;AACtF,CAAC;AAKM,IAAM,uBAAuB,aAAE,OAAO;AAAA,EAC3C,UAAU,aAAE,OAAO;AAAA,EACnB,iBAAiB,aAAE,OAAO;AAAA,EAC1B,kBAAkB,aAAE,MAAM,aAAE,OAAO,aAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,iBAAiB,aAAE,OAAO,EAAE,SAAS;AAAA,EACrC,iBAAiB,aAAE,OAAO;AAAA,EAC1B,gBAAgB,aAAE,MAAM,mBAAmB,EAAE,SAAS,EAAE,SAAS;AAAA,EACjE,gBAAgB,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAClC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAC9B,eAAe,aAAE,MAAM,iBAAiB;AAAA,EACxC,cAAc,mBAAmB,SAAS,EAAE,SAAS;AAAA,EACrD,QAAQ,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,SAAS;AACpD,CAAC;AAKM,IAAM,wBAAwB,aAAE,OAAO;AAAA,EAC5C,OAAO,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAC9B,MAAM,aAAE,MAAM,aAAE,
OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,QAAQ,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EACjD,WAAW,aAAE,OAAO,EAAE,SAAS;AACjC,CAAC;;;AD/CD,IAAAA,cAAkB;;;AEjClB,qBAA6B;;;ACC7B,SAAS,YAAY,KAAqB;AAExC,MAAI,SAAS,KAAK,GAAG,GAAG;AACtB,WAAO;AAAA,EACT;AACA,SAAO,IAAI,QAAQ,gBAAgB,CAAC,GAAG,WAAW,OAAO,YAAY,CAAC;AACxE;AASO,SAAS,gBAAgB,KAAuB;AACrD,MAAI,QAAQ,QAAQ,QAAQ,QAAW;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,WAAO,IAAI,IAAI,CAAC,SAAS,gBAAgB,IAAI,CAAC;AAAA,EAChD;AAEA,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,SAAkC,CAAC;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC9C,YAAM,WAAW,YAAY,GAAG;AAChC,aAAO,QAAQ,IAAI,gBAAgB,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ADfA,SAAS,YAAoB;AAC3B,aAAO,6BAAa,GAAG,MAAM;AAC/B;AAKA,SAAS,WAAW,OAAuB;AACzC,MAAI,OAAO,MAAM,KAAK,KAAK,CAAC,OAAO,SAAS,KAAK,GAAG;AAClD,WAAO;AAAA,EACT;AACA,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAKA,SAAS,YAAY,OAAwB;AAC3C,MAAI,iBAAiB,OAAO;AAC1B,WAAO,MAAM;AAAA,EACf;AACA,SAAO,OAAO,KAAK;AACrB;AAMA,eAAsB,aAAa,SAA0C;AAC3E,MAAI;AAEF,UAAM,QAAQ,UAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,qBAAqB,MAAM,UAAU;AAGnD,UAAM,YAAY,MAAM,QAAQ,KAAK;AAGrC,UAAM,SAAS,sBAAsB,MAAM;AAAA,MACzC,GAAG;AAAA,MACH,OAAO,WAAW,UAAU,KAAK;AAAA,IACnC,CAAC;AAGD,YAAQ,IAAI,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAAA,EAC7C,SAAS,OAAO;AAEd,UAAM,eAAe,YAAY,KAAK;AACtC,UAAM,cAA+B;AAAA,MACnC,OAAO;AAAA,MACP,MAAM,CAAC;AAAA,MACP,QAAQ,CAAC,YAAY;AAAA,MACrB,WAAW,sBAAsB,YAAY;AAAA,IAC/C;AACA,YAAQ,IAAI,KAAK,UAAU,aAAa,MAAM,CAAC,CAAC;AAChD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;AFDO,SAAS,gBAAgB,SAAiC;AAE/D,eAAa,OAAO;AACtB;","names":["import_zod"]}
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
export { z } from 'zod';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Zod schemas for code judge input/output validation.
|
|
6
|
+
* Provides both compile-time types and runtime validation.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Token usage metrics schema.
|
|
11
|
+
*/
|
|
12
|
+
declare const TokenUsageSchema: z.ZodObject<{
    // Schema shape.
    input: z.ZodNumber;
    output: z.ZodNumber;
    cached: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
    // Parsed (output) type.
    input: number;
    output: number;
    cached?: number | undefined;
}, {
    // Accepted (input) type.
    input: number;
    output: number;
    cached?: number | undefined;
}>;
|
|
25
|
+
/**
|
|
26
|
+
* Trace summary schema (camelCase for TypeScript ergonomics).
|
|
27
|
+
*/
|
|
28
|
+
declare const TraceSummarySchema: z.ZodObject<{
    // Schema shape.
    eventCount: z.ZodNumber;
    toolNames: z.ZodArray<z.ZodString, "many">;
    toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
    errorCount: z.ZodNumber;
    tokenUsage: z.ZodOptional<z.ZodObject<{
        input: z.ZodNumber;
        output: z.ZodNumber;
        cached: z.ZodOptional<z.ZodNumber>;
    }, "strip", z.ZodTypeAny, {
        input: number;
        output: number;
        cached?: number | undefined;
    }, {
        input: number;
        output: number;
        cached?: number | undefined;
    }>>;
    costUsd: z.ZodOptional<z.ZodNumber>;
    durationMs: z.ZodOptional<z.ZodNumber>;
    toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
}, "strip", z.ZodTypeAny, {
    // Parsed (output) type.
    eventCount: number;
    toolNames: string[];
    toolCallsByName: Record<string, number>;
    errorCount: number;
    tokenUsage?: {
        input: number;
        output: number;
        cached?: number | undefined;
    } | undefined;
    costUsd?: number | undefined;
    durationMs?: number | undefined;
    toolDurations?: Record<string, number[]> | undefined;
}, {
    // Accepted (input) type.
    eventCount: number;
    toolNames: string[];
    toolCallsByName: Record<string, number>;
    errorCount: number;
    tokenUsage?: {
        input: number;
        output: number;
        cached?: number | undefined;
    } | undefined;
    costUsd?: number | undefined;
    durationMs?: number | undefined;
    toolDurations?: Record<string, number[]> | undefined;
}>;
|
|
76
|
+
/**
|
|
77
|
+
* Tool call schema for output messages.
|
|
78
|
+
*/
|
|
79
|
+
declare const ToolCallSchema: z.ZodObject<{
    // Schema shape.
    tool: z.ZodString;
    input: z.ZodOptional<z.ZodUnknown>;
    output: z.ZodOptional<z.ZodUnknown>;
    id: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    // Parsed (output) type.
    tool: string;
    input?: unknown;
    output?: unknown;
    id?: string | undefined;
    timestamp?: string | undefined;
}, {
    // Accepted (input) type.
    tool: string;
    input?: unknown;
    output?: unknown;
    id?: string | undefined;
    timestamp?: string | undefined;
}>;
|
|
98
|
+
/**
|
|
99
|
+
* Output message schema.
|
|
100
|
+
*/
|
|
101
|
+
declare const OutputMessageSchema: z.ZodObject<{
    // Schema shape.
    role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
    content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
    toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
        tool: z.ZodString;
        input: z.ZodOptional<z.ZodUnknown>;
        output: z.ZodOptional<z.ZodUnknown>;
        id: z.ZodOptional<z.ZodString>;
        timestamp: z.ZodOptional<z.ZodString>;
    }, "strip", z.ZodTypeAny, {
        tool: string;
        input?: unknown;
        output?: unknown;
        id?: string | undefined;
        timestamp?: string | undefined;
    }, {
        tool: string;
        input?: unknown;
        output?: unknown;
        id?: string | undefined;
        timestamp?: string | undefined;
    }>, "many">>;
    timestamp: z.ZodOptional<z.ZodString>;
    metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
}, "strip", z.ZodTypeAny, {
    // Parsed (output) type.
    role: "tool" | "assistant" | "user" | "system";
    timestamp?: string | undefined;
    content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
    toolCalls?: {
        tool: string;
        input?: unknown;
        output?: unknown;
        id?: string | undefined;
        timestamp?: string | undefined;
    }[] | undefined;
    metadata?: Record<string, unknown> | undefined;
}, {
    // Accepted (input) type.
    role: "tool" | "assistant" | "user" | "system";
    timestamp?: string | undefined;
    content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
    toolCalls?: {
        tool: string;
        input?: unknown;
        output?: unknown;
        id?: string | undefined;
        timestamp?: string | undefined;
    }[] | undefined;
    metadata?: Record<string, unknown> | undefined;
}>;
|
|
150
|
+
/**
|
|
151
|
+
* Code judge input schema (camelCase, converted from snake_case wire format).
|
|
152
|
+
*/
|
|
153
|
+
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
154
|
+
question: z.ZodString;
|
|
155
|
+
expectedOutcome: z.ZodString;
|
|
156
|
+
expectedMessages: z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">;
|
|
157
|
+
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
158
|
+
candidateAnswer: z.ZodString;
|
|
159
|
+
outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
160
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
161
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
162
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
163
|
+
tool: z.ZodString;
|
|
164
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
165
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
166
|
+
id: z.ZodOptional<z.ZodString>;
|
|
167
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
168
|
+
}, "strip", z.ZodTypeAny, {
|
|
169
|
+
tool: string;
|
|
170
|
+
input?: unknown;
|
|
171
|
+
output?: unknown;
|
|
172
|
+
id?: string | undefined;
|
|
173
|
+
timestamp?: string | undefined;
|
|
174
|
+
}, {
|
|
175
|
+
tool: string;
|
|
176
|
+
input?: unknown;
|
|
177
|
+
output?: unknown;
|
|
178
|
+
id?: string | undefined;
|
|
179
|
+
timestamp?: string | undefined;
|
|
180
|
+
}>, "many">>;
|
|
181
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
182
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
183
|
+
}, "strip", z.ZodTypeAny, {
|
|
184
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
185
|
+
timestamp?: string | undefined;
|
|
186
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
187
|
+
toolCalls?: {
|
|
188
|
+
tool: string;
|
|
189
|
+
input?: unknown;
|
|
190
|
+
output?: unknown;
|
|
191
|
+
id?: string | undefined;
|
|
192
|
+
timestamp?: string | undefined;
|
|
193
|
+
}[] | undefined;
|
|
194
|
+
metadata?: Record<string, unknown> | undefined;
|
|
195
|
+
}, {
|
|
196
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
197
|
+
timestamp?: string | undefined;
|
|
198
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
199
|
+
toolCalls?: {
|
|
200
|
+
tool: string;
|
|
201
|
+
input?: unknown;
|
|
202
|
+
output?: unknown;
|
|
203
|
+
id?: string | undefined;
|
|
204
|
+
timestamp?: string | undefined;
|
|
205
|
+
}[] | undefined;
|
|
206
|
+
metadata?: Record<string, unknown> | undefined;
|
|
207
|
+
}>, "many">>>;
|
|
208
|
+
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
209
|
+
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
210
|
+
inputMessages: z.ZodArray<z.ZodObject<{
|
|
211
|
+
role: z.ZodEnum<["system", "user", "assistant", "tool"]>;
|
|
212
|
+
content: z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>;
|
|
213
|
+
}, "strip", z.ZodTypeAny, {
|
|
214
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
215
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
216
|
+
}, {
|
|
217
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
218
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
219
|
+
}>, "many">;
|
|
220
|
+
traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
221
|
+
eventCount: z.ZodNumber;
|
|
222
|
+
toolNames: z.ZodArray<z.ZodString, "many">;
|
|
223
|
+
toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
224
|
+
errorCount: z.ZodNumber;
|
|
225
|
+
tokenUsage: z.ZodOptional<z.ZodObject<{
|
|
226
|
+
input: z.ZodNumber;
|
|
227
|
+
output: z.ZodNumber;
|
|
228
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
229
|
+
}, "strip", z.ZodTypeAny, {
|
|
230
|
+
input: number;
|
|
231
|
+
output: number;
|
|
232
|
+
cached?: number | undefined;
|
|
233
|
+
}, {
|
|
234
|
+
input: number;
|
|
235
|
+
output: number;
|
|
236
|
+
cached?: number | undefined;
|
|
237
|
+
}>>;
|
|
238
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
239
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
240
|
+
toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
|
|
241
|
+
}, "strip", z.ZodTypeAny, {
|
|
242
|
+
eventCount: number;
|
|
243
|
+
toolNames: string[];
|
|
244
|
+
toolCallsByName: Record<string, number>;
|
|
245
|
+
errorCount: number;
|
|
246
|
+
tokenUsage?: {
|
|
247
|
+
input: number;
|
|
248
|
+
output: number;
|
|
249
|
+
cached?: number | undefined;
|
|
250
|
+
} | undefined;
|
|
251
|
+
costUsd?: number | undefined;
|
|
252
|
+
durationMs?: number | undefined;
|
|
253
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
254
|
+
}, {
|
|
255
|
+
eventCount: number;
|
|
256
|
+
toolNames: string[];
|
|
257
|
+
toolCallsByName: Record<string, number>;
|
|
258
|
+
errorCount: number;
|
|
259
|
+
tokenUsage?: {
|
|
260
|
+
input: number;
|
|
261
|
+
output: number;
|
|
262
|
+
cached?: number | undefined;
|
|
263
|
+
} | undefined;
|
|
264
|
+
costUsd?: number | undefined;
|
|
265
|
+
durationMs?: number | undefined;
|
|
266
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
267
|
+
}>>>;
|
|
268
|
+
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
269
|
+
}, "strip", z.ZodTypeAny, {
|
|
270
|
+
question: string;
|
|
271
|
+
expectedOutcome: string;
|
|
272
|
+
expectedMessages: Record<string, unknown>[];
|
|
273
|
+
candidateAnswer: string;
|
|
274
|
+
guidelineFiles: string[];
|
|
275
|
+
inputFiles: string[];
|
|
276
|
+
inputMessages: {
|
|
277
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
278
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
279
|
+
}[];
|
|
280
|
+
referenceAnswer?: string | undefined;
|
|
281
|
+
outputMessages?: {
|
|
282
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
283
|
+
timestamp?: string | undefined;
|
|
284
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
285
|
+
toolCalls?: {
|
|
286
|
+
tool: string;
|
|
287
|
+
input?: unknown;
|
|
288
|
+
output?: unknown;
|
|
289
|
+
id?: string | undefined;
|
|
290
|
+
timestamp?: string | undefined;
|
|
291
|
+
}[] | undefined;
|
|
292
|
+
metadata?: Record<string, unknown> | undefined;
|
|
293
|
+
}[] | null | undefined;
|
|
294
|
+
traceSummary?: {
|
|
295
|
+
eventCount: number;
|
|
296
|
+
toolNames: string[];
|
|
297
|
+
toolCallsByName: Record<string, number>;
|
|
298
|
+
errorCount: number;
|
|
299
|
+
tokenUsage?: {
|
|
300
|
+
input: number;
|
|
301
|
+
output: number;
|
|
302
|
+
cached?: number | undefined;
|
|
303
|
+
} | undefined;
|
|
304
|
+
costUsd?: number | undefined;
|
|
305
|
+
durationMs?: number | undefined;
|
|
306
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
307
|
+
} | null | undefined;
|
|
308
|
+
config?: Record<string, unknown> | null | undefined;
|
|
309
|
+
}, {
|
|
310
|
+
question: string;
|
|
311
|
+
expectedOutcome: string;
|
|
312
|
+
expectedMessages: Record<string, unknown>[];
|
|
313
|
+
candidateAnswer: string;
|
|
314
|
+
guidelineFiles: string[];
|
|
315
|
+
inputFiles: string[];
|
|
316
|
+
inputMessages: {
|
|
317
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
318
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
319
|
+
}[];
|
|
320
|
+
referenceAnswer?: string | undefined;
|
|
321
|
+
outputMessages?: {
|
|
322
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
323
|
+
timestamp?: string | undefined;
|
|
324
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
325
|
+
toolCalls?: {
|
|
326
|
+
tool: string;
|
|
327
|
+
input?: unknown;
|
|
328
|
+
output?: unknown;
|
|
329
|
+
id?: string | undefined;
|
|
330
|
+
timestamp?: string | undefined;
|
|
331
|
+
}[] | undefined;
|
|
332
|
+
metadata?: Record<string, unknown> | undefined;
|
|
333
|
+
}[] | null | undefined;
|
|
334
|
+
traceSummary?: {
|
|
335
|
+
eventCount: number;
|
|
336
|
+
toolNames: string[];
|
|
337
|
+
toolCallsByName: Record<string, number>;
|
|
338
|
+
errorCount: number;
|
|
339
|
+
tokenUsage?: {
|
|
340
|
+
input: number;
|
|
341
|
+
output: number;
|
|
342
|
+
cached?: number | undefined;
|
|
343
|
+
} | undefined;
|
|
344
|
+
costUsd?: number | undefined;
|
|
345
|
+
durationMs?: number | undefined;
|
|
346
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
347
|
+
} | null | undefined;
|
|
348
|
+
config?: Record<string, unknown> | null | undefined;
|
|
349
|
+
}>;
|
|
350
|
+
/**
|
|
351
|
+
* Code judge result schema (validated before output).
|
|
352
|
+
*/
|
|
353
|
+
declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
354
|
+
score: z.ZodNumber;
|
|
355
|
+
hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
356
|
+
misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
357
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
358
|
+
}, "strip", z.ZodTypeAny, {
|
|
359
|
+
score: number;
|
|
360
|
+
hits: string[];
|
|
361
|
+
misses: string[];
|
|
362
|
+
reasoning?: string | undefined;
|
|
363
|
+
}, {
|
|
364
|
+
score: number;
|
|
365
|
+
hits?: string[] | undefined;
|
|
366
|
+
misses?: string[] | undefined;
|
|
367
|
+
reasoning?: string | undefined;
|
|
368
|
+
}>;
|
|
369
|
+
/**
|
|
370
|
+
* Inferred types from schemas.
|
|
371
|
+
*/
|
|
372
|
+
type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
|
|
373
|
+
type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
|
|
374
|
+
type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
375
|
+
type OutputMessage = z.infer<typeof OutputMessageSchema>;
|
|
376
|
+
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
377
|
+
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
378
|
+
|
|
379
|
+
/**
|
|
380
|
+
* Handler function type for code judges.
|
|
381
|
+
*/
|
|
382
|
+
type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<CodeJudgeResult>;
|
|
383
|
+
|
|
384
|
+
/**
|
|
385
|
+
* AgentV Evaluation SDK
|
|
386
|
+
*
|
|
387
|
+
* Build custom code judges for evaluating AI agent outputs.
|
|
388
|
+
*
|
|
389
|
+
* @example
|
|
390
|
+
* ```typescript
|
|
391
|
+
* #!/usr/bin/env bun
|
|
392
|
+
* import { defineCodeJudge } from '@agentv/eval';
|
|
393
|
+
*
|
|
394
|
+
* export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({
|
|
395
|
+
* score: (traceSummary?.eventCount ?? Infinity) <= 5 ? 1.0 : 0.5,
|
|
396
|
+
* hits: ['Efficient tool usage'],
|
|
397
|
+
* misses: [],
|
|
398
|
+
* }));
|
|
399
|
+
* ```
|
|
400
|
+
*
|
|
401
|
+
* @packageDocumentation
|
|
402
|
+
*/
|
|
403
|
+
|
|
404
|
+
/**
|
|
405
|
+
* Define a code judge evaluator with automatic stdin/stdout handling.
|
|
406
|
+
*
|
|
407
|
+
* This function:
|
|
408
|
+
* 1. Reads JSON from stdin (snake_case format)
|
|
409
|
+
* 2. Converts to camelCase and validates with Zod
|
|
410
|
+
* 3. Calls your handler with typed input
|
|
411
|
+
* 4. Validates the result and outputs JSON to stdout
|
|
412
|
+
* 5. Handles errors gracefully with proper exit codes
|
|
413
|
+
*
|
|
414
|
+
* @param handler - Function that evaluates the input and returns a result
|
|
415
|
+
*
|
|
416
|
+
* @example
|
|
417
|
+
* ```typescript
|
|
418
|
+
* import { defineCodeJudge } from '@agentv/eval';
|
|
419
|
+
*
|
|
420
|
+
* export default defineCodeJudge(({ traceSummary }) => {
|
|
421
|
+
* if (!traceSummary) {
|
|
422
|
+
* return { score: 0.5, reasoning: 'No trace available' };
|
|
423
|
+
* }
|
|
424
|
+
*
|
|
425
|
+
* const efficient = traceSummary.eventCount <= 10;
|
|
426
|
+
* return {
|
|
427
|
+
* score: efficient ? 1.0 : 0.5,
|
|
428
|
+
* hits: efficient ? ['Efficient execution'] : [],
|
|
429
|
+
* misses: efficient ? [] : ['Too many tool calls'],
|
|
430
|
+
* };
|
|
431
|
+
* });
|
|
432
|
+
* ```
|
|
433
|
+
*
|
|
434
|
+
* @example With typed config
|
|
435
|
+
* ```typescript
|
|
436
|
+
* import { defineCodeJudge, z } from '@agentv/eval';
|
|
437
|
+
*
|
|
438
|
+
* const ConfigSchema = z.object({
|
|
439
|
+
* maxToolCalls: z.number().default(10),
|
|
440
|
+
* });
|
|
441
|
+
*
|
|
442
|
+
* export default defineCodeJudge(({ traceSummary, config }) => {
|
|
443
|
+
* const { maxToolCalls } = ConfigSchema.parse(config ?? {});
|
|
444
|
+
* // Use maxToolCalls...
|
|
445
|
+
* });
|
|
446
|
+
* ```
|
|
447
|
+
*/
|
|
448
|
+
declare function defineCodeJudge(handler: CodeJudgeHandler): void;
|
|
449
|
+
|
|
450
|
+
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type OutputMessage, OutputMessageSchema, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, defineCodeJudge };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
export { z } from 'zod';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Zod schemas for code judge input/output validation.
|
|
6
|
+
* Provides both compile-time types and runtime validation.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Token usage metrics schema.
|
|
11
|
+
*/
|
|
12
|
+
declare const TokenUsageSchema: z.ZodObject<{
|
|
13
|
+
input: z.ZodNumber;
|
|
14
|
+
output: z.ZodNumber;
|
|
15
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
16
|
+
}, "strip", z.ZodTypeAny, {
|
|
17
|
+
input: number;
|
|
18
|
+
output: number;
|
|
19
|
+
cached?: number | undefined;
|
|
20
|
+
}, {
|
|
21
|
+
input: number;
|
|
22
|
+
output: number;
|
|
23
|
+
cached?: number | undefined;
|
|
24
|
+
}>;
|
|
25
|
+
/**
|
|
26
|
+
* Trace summary schema (camelCase for TypeScript ergonomics).
|
|
27
|
+
*/
|
|
28
|
+
declare const TraceSummarySchema: z.ZodObject<{
|
|
29
|
+
eventCount: z.ZodNumber;
|
|
30
|
+
toolNames: z.ZodArray<z.ZodString, "many">;
|
|
31
|
+
toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
32
|
+
errorCount: z.ZodNumber;
|
|
33
|
+
tokenUsage: z.ZodOptional<z.ZodObject<{
|
|
34
|
+
input: z.ZodNumber;
|
|
35
|
+
output: z.ZodNumber;
|
|
36
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
37
|
+
}, "strip", z.ZodTypeAny, {
|
|
38
|
+
input: number;
|
|
39
|
+
output: number;
|
|
40
|
+
cached?: number | undefined;
|
|
41
|
+
}, {
|
|
42
|
+
input: number;
|
|
43
|
+
output: number;
|
|
44
|
+
cached?: number | undefined;
|
|
45
|
+
}>>;
|
|
46
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
47
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
48
|
+
toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
|
|
49
|
+
}, "strip", z.ZodTypeAny, {
|
|
50
|
+
eventCount: number;
|
|
51
|
+
toolNames: string[];
|
|
52
|
+
toolCallsByName: Record<string, number>;
|
|
53
|
+
errorCount: number;
|
|
54
|
+
tokenUsage?: {
|
|
55
|
+
input: number;
|
|
56
|
+
output: number;
|
|
57
|
+
cached?: number | undefined;
|
|
58
|
+
} | undefined;
|
|
59
|
+
costUsd?: number | undefined;
|
|
60
|
+
durationMs?: number | undefined;
|
|
61
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
62
|
+
}, {
|
|
63
|
+
eventCount: number;
|
|
64
|
+
toolNames: string[];
|
|
65
|
+
toolCallsByName: Record<string, number>;
|
|
66
|
+
errorCount: number;
|
|
67
|
+
tokenUsage?: {
|
|
68
|
+
input: number;
|
|
69
|
+
output: number;
|
|
70
|
+
cached?: number | undefined;
|
|
71
|
+
} | undefined;
|
|
72
|
+
costUsd?: number | undefined;
|
|
73
|
+
durationMs?: number | undefined;
|
|
74
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
75
|
+
}>;
|
|
76
|
+
/**
|
|
77
|
+
* Tool call schema for output messages.
|
|
78
|
+
*/
|
|
79
|
+
declare const ToolCallSchema: z.ZodObject<{
|
|
80
|
+
tool: z.ZodString;
|
|
81
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
82
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
83
|
+
id: z.ZodOptional<z.ZodString>;
|
|
84
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
85
|
+
}, "strip", z.ZodTypeAny, {
|
|
86
|
+
tool: string;
|
|
87
|
+
input?: unknown;
|
|
88
|
+
output?: unknown;
|
|
89
|
+
id?: string | undefined;
|
|
90
|
+
timestamp?: string | undefined;
|
|
91
|
+
}, {
|
|
92
|
+
tool: string;
|
|
93
|
+
input?: unknown;
|
|
94
|
+
output?: unknown;
|
|
95
|
+
id?: string | undefined;
|
|
96
|
+
timestamp?: string | undefined;
|
|
97
|
+
}>;
|
|
98
|
+
/**
|
|
99
|
+
* Output message schema.
|
|
100
|
+
*/
|
|
101
|
+
declare const OutputMessageSchema: z.ZodObject<{
|
|
102
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
103
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
104
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
105
|
+
tool: z.ZodString;
|
|
106
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
107
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
108
|
+
id: z.ZodOptional<z.ZodString>;
|
|
109
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
110
|
+
}, "strip", z.ZodTypeAny, {
|
|
111
|
+
tool: string;
|
|
112
|
+
input?: unknown;
|
|
113
|
+
output?: unknown;
|
|
114
|
+
id?: string | undefined;
|
|
115
|
+
timestamp?: string | undefined;
|
|
116
|
+
}, {
|
|
117
|
+
tool: string;
|
|
118
|
+
input?: unknown;
|
|
119
|
+
output?: unknown;
|
|
120
|
+
id?: string | undefined;
|
|
121
|
+
timestamp?: string | undefined;
|
|
122
|
+
}>, "many">>;
|
|
123
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
124
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
125
|
+
}, "strip", z.ZodTypeAny, {
|
|
126
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
127
|
+
timestamp?: string | undefined;
|
|
128
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
129
|
+
toolCalls?: {
|
|
130
|
+
tool: string;
|
|
131
|
+
input?: unknown;
|
|
132
|
+
output?: unknown;
|
|
133
|
+
id?: string | undefined;
|
|
134
|
+
timestamp?: string | undefined;
|
|
135
|
+
}[] | undefined;
|
|
136
|
+
metadata?: Record<string, unknown> | undefined;
|
|
137
|
+
}, {
|
|
138
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
139
|
+
timestamp?: string | undefined;
|
|
140
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
141
|
+
toolCalls?: {
|
|
142
|
+
tool: string;
|
|
143
|
+
input?: unknown;
|
|
144
|
+
output?: unknown;
|
|
145
|
+
id?: string | undefined;
|
|
146
|
+
timestamp?: string | undefined;
|
|
147
|
+
}[] | undefined;
|
|
148
|
+
metadata?: Record<string, unknown> | undefined;
|
|
149
|
+
}>;
|
|
150
|
+
/**
|
|
151
|
+
* Code judge input schema (camelCase, converted from snake_case wire format).
|
|
152
|
+
*/
|
|
153
|
+
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
154
|
+
question: z.ZodString;
|
|
155
|
+
expectedOutcome: z.ZodString;
|
|
156
|
+
expectedMessages: z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">;
|
|
157
|
+
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
158
|
+
candidateAnswer: z.ZodString;
|
|
159
|
+
outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
160
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
161
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
162
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
163
|
+
tool: z.ZodString;
|
|
164
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
165
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
166
|
+
id: z.ZodOptional<z.ZodString>;
|
|
167
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
168
|
+
}, "strip", z.ZodTypeAny, {
|
|
169
|
+
tool: string;
|
|
170
|
+
input?: unknown;
|
|
171
|
+
output?: unknown;
|
|
172
|
+
id?: string | undefined;
|
|
173
|
+
timestamp?: string | undefined;
|
|
174
|
+
}, {
|
|
175
|
+
tool: string;
|
|
176
|
+
input?: unknown;
|
|
177
|
+
output?: unknown;
|
|
178
|
+
id?: string | undefined;
|
|
179
|
+
timestamp?: string | undefined;
|
|
180
|
+
}>, "many">>;
|
|
181
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
182
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
183
|
+
}, "strip", z.ZodTypeAny, {
|
|
184
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
185
|
+
timestamp?: string | undefined;
|
|
186
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
187
|
+
toolCalls?: {
|
|
188
|
+
tool: string;
|
|
189
|
+
input?: unknown;
|
|
190
|
+
output?: unknown;
|
|
191
|
+
id?: string | undefined;
|
|
192
|
+
timestamp?: string | undefined;
|
|
193
|
+
}[] | undefined;
|
|
194
|
+
metadata?: Record<string, unknown> | undefined;
|
|
195
|
+
}, {
|
|
196
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
197
|
+
timestamp?: string | undefined;
|
|
198
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
199
|
+
toolCalls?: {
|
|
200
|
+
tool: string;
|
|
201
|
+
input?: unknown;
|
|
202
|
+
output?: unknown;
|
|
203
|
+
id?: string | undefined;
|
|
204
|
+
timestamp?: string | undefined;
|
|
205
|
+
}[] | undefined;
|
|
206
|
+
metadata?: Record<string, unknown> | undefined;
|
|
207
|
+
}>, "many">>>;
|
|
208
|
+
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
209
|
+
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
210
|
+
inputMessages: z.ZodArray<z.ZodObject<{
|
|
211
|
+
role: z.ZodEnum<["system", "user", "assistant", "tool"]>;
|
|
212
|
+
content: z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>;
|
|
213
|
+
}, "strip", z.ZodTypeAny, {
|
|
214
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
215
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
216
|
+
}, {
|
|
217
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
218
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
219
|
+
}>, "many">;
|
|
220
|
+
traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
221
|
+
eventCount: z.ZodNumber;
|
|
222
|
+
toolNames: z.ZodArray<z.ZodString, "many">;
|
|
223
|
+
toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
224
|
+
errorCount: z.ZodNumber;
|
|
225
|
+
tokenUsage: z.ZodOptional<z.ZodObject<{
|
|
226
|
+
input: z.ZodNumber;
|
|
227
|
+
output: z.ZodNumber;
|
|
228
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
229
|
+
}, "strip", z.ZodTypeAny, {
|
|
230
|
+
input: number;
|
|
231
|
+
output: number;
|
|
232
|
+
cached?: number | undefined;
|
|
233
|
+
}, {
|
|
234
|
+
input: number;
|
|
235
|
+
output: number;
|
|
236
|
+
cached?: number | undefined;
|
|
237
|
+
}>>;
|
|
238
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
239
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
240
|
+
toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
|
|
241
|
+
}, "strip", z.ZodTypeAny, {
|
|
242
|
+
eventCount: number;
|
|
243
|
+
toolNames: string[];
|
|
244
|
+
toolCallsByName: Record<string, number>;
|
|
245
|
+
errorCount: number;
|
|
246
|
+
tokenUsage?: {
|
|
247
|
+
input: number;
|
|
248
|
+
output: number;
|
|
249
|
+
cached?: number | undefined;
|
|
250
|
+
} | undefined;
|
|
251
|
+
costUsd?: number | undefined;
|
|
252
|
+
durationMs?: number | undefined;
|
|
253
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
254
|
+
}, {
|
|
255
|
+
eventCount: number;
|
|
256
|
+
toolNames: string[];
|
|
257
|
+
toolCallsByName: Record<string, number>;
|
|
258
|
+
errorCount: number;
|
|
259
|
+
tokenUsage?: {
|
|
260
|
+
input: number;
|
|
261
|
+
output: number;
|
|
262
|
+
cached?: number | undefined;
|
|
263
|
+
} | undefined;
|
|
264
|
+
costUsd?: number | undefined;
|
|
265
|
+
durationMs?: number | undefined;
|
|
266
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
267
|
+
}>>>;
|
|
268
|
+
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
269
|
+
}, "strip", z.ZodTypeAny, {
|
|
270
|
+
question: string;
|
|
271
|
+
expectedOutcome: string;
|
|
272
|
+
expectedMessages: Record<string, unknown>[];
|
|
273
|
+
candidateAnswer: string;
|
|
274
|
+
guidelineFiles: string[];
|
|
275
|
+
inputFiles: string[];
|
|
276
|
+
inputMessages: {
|
|
277
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
278
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
279
|
+
}[];
|
|
280
|
+
referenceAnswer?: string | undefined;
|
|
281
|
+
outputMessages?: {
|
|
282
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
283
|
+
timestamp?: string | undefined;
|
|
284
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
285
|
+
toolCalls?: {
|
|
286
|
+
tool: string;
|
|
287
|
+
input?: unknown;
|
|
288
|
+
output?: unknown;
|
|
289
|
+
id?: string | undefined;
|
|
290
|
+
timestamp?: string | undefined;
|
|
291
|
+
}[] | undefined;
|
|
292
|
+
metadata?: Record<string, unknown> | undefined;
|
|
293
|
+
}[] | null | undefined;
|
|
294
|
+
traceSummary?: {
|
|
295
|
+
eventCount: number;
|
|
296
|
+
toolNames: string[];
|
|
297
|
+
toolCallsByName: Record<string, number>;
|
|
298
|
+
errorCount: number;
|
|
299
|
+
tokenUsage?: {
|
|
300
|
+
input: number;
|
|
301
|
+
output: number;
|
|
302
|
+
cached?: number | undefined;
|
|
303
|
+
} | undefined;
|
|
304
|
+
costUsd?: number | undefined;
|
|
305
|
+
durationMs?: number | undefined;
|
|
306
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
307
|
+
} | null | undefined;
|
|
308
|
+
config?: Record<string, unknown> | null | undefined;
|
|
309
|
+
}, {
|
|
310
|
+
question: string;
|
|
311
|
+
expectedOutcome: string;
|
|
312
|
+
expectedMessages: Record<string, unknown>[];
|
|
313
|
+
candidateAnswer: string;
|
|
314
|
+
guidelineFiles: string[];
|
|
315
|
+
inputFiles: string[];
|
|
316
|
+
inputMessages: {
|
|
317
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
318
|
+
content: string | Record<string, unknown> | Record<string, unknown>[];
|
|
319
|
+
}[];
|
|
320
|
+
referenceAnswer?: string | undefined;
|
|
321
|
+
outputMessages?: {
|
|
322
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
323
|
+
timestamp?: string | undefined;
|
|
324
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
325
|
+
toolCalls?: {
|
|
326
|
+
tool: string;
|
|
327
|
+
input?: unknown;
|
|
328
|
+
output?: unknown;
|
|
329
|
+
id?: string | undefined;
|
|
330
|
+
timestamp?: string | undefined;
|
|
331
|
+
}[] | undefined;
|
|
332
|
+
metadata?: Record<string, unknown> | undefined;
|
|
333
|
+
}[] | null | undefined;
|
|
334
|
+
traceSummary?: {
|
|
335
|
+
eventCount: number;
|
|
336
|
+
toolNames: string[];
|
|
337
|
+
toolCallsByName: Record<string, number>;
|
|
338
|
+
errorCount: number;
|
|
339
|
+
tokenUsage?: {
|
|
340
|
+
input: number;
|
|
341
|
+
output: number;
|
|
342
|
+
cached?: number | undefined;
|
|
343
|
+
} | undefined;
|
|
344
|
+
costUsd?: number | undefined;
|
|
345
|
+
durationMs?: number | undefined;
|
|
346
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
347
|
+
} | null | undefined;
|
|
348
|
+
config?: Record<string, unknown> | null | undefined;
|
|
349
|
+
}>;
|
|
350
|
+
/**
|
|
351
|
+
* Code judge result schema (validated before output).
|
|
352
|
+
*/
|
|
353
|
+
declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
354
|
+
score: z.ZodNumber;
|
|
355
|
+
hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
356
|
+
misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
357
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
358
|
+
}, "strip", z.ZodTypeAny, {
|
|
359
|
+
score: number;
|
|
360
|
+
hits: string[];
|
|
361
|
+
misses: string[];
|
|
362
|
+
reasoning?: string | undefined;
|
|
363
|
+
}, {
|
|
364
|
+
score: number;
|
|
365
|
+
hits?: string[] | undefined;
|
|
366
|
+
misses?: string[] | undefined;
|
|
367
|
+
reasoning?: string | undefined;
|
|
368
|
+
}>;
|
|
369
|
+
/**
|
|
370
|
+
* Inferred types from schemas.
|
|
371
|
+
*/
|
|
372
|
+
type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
|
|
373
|
+
type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
|
|
374
|
+
type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
375
|
+
type OutputMessage = z.infer<typeof OutputMessageSchema>;
|
|
376
|
+
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
377
|
+
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
378
|
+
|
|
379
|
+
/**
|
|
380
|
+
* Handler function type for code judges.
|
|
381
|
+
*/
|
|
382
|
+
type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<CodeJudgeResult>;
|
|
383
|
+
|
|
384
|
+
/**
|
|
385
|
+
* AgentV Evaluation SDK
|
|
386
|
+
*
|
|
387
|
+
* Build custom code judges for evaluating AI agent outputs.
|
|
388
|
+
*
|
|
389
|
+
* @example
|
|
390
|
+
* ```typescript
|
|
391
|
+
* #!/usr/bin/env bun
|
|
392
|
+
* import { defineCodeJudge } from '@agentv/eval';
|
|
393
|
+
*
|
|
394
|
+
* export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({
|
|
395
|
+
* score: (traceSummary?.eventCount ?? Infinity) <= 5 ? 1.0 : 0.5,
|
|
396
|
+
* hits: ['Efficient tool usage'],
|
|
397
|
+
* misses: [],
|
|
398
|
+
* }));
|
|
399
|
+
* ```
|
|
400
|
+
*
|
|
401
|
+
* @packageDocumentation
|
|
402
|
+
*/
|
|
403
|
+
|
|
404
|
+
/**
|
|
405
|
+
* Define a code judge evaluator with automatic stdin/stdout handling.
|
|
406
|
+
*
|
|
407
|
+
* This function:
|
|
408
|
+
* 1. Reads JSON from stdin (snake_case format)
|
|
409
|
+
* 2. Converts to camelCase and validates with Zod
|
|
410
|
+
* 3. Calls your handler with typed input
|
|
411
|
+
* 4. Validates the result and outputs JSON to stdout
|
|
412
|
+
* 5. Handles errors gracefully with proper exit codes
|
|
413
|
+
*
|
|
414
|
+
* @param handler - Function that evaluates the input and returns a result
|
|
415
|
+
*
|
|
416
|
+
* @example
|
|
417
|
+
* ```typescript
|
|
418
|
+
* import { defineCodeJudge } from '@agentv/eval';
|
|
419
|
+
*
|
|
420
|
+
* export default defineCodeJudge(({ traceSummary }) => {
|
|
421
|
+
* if (!traceSummary) {
|
|
422
|
+
* return { score: 0.5, reasoning: 'No trace available' };
|
|
423
|
+
* }
|
|
424
|
+
*
|
|
425
|
+
* const efficient = traceSummary.eventCount <= 10;
|
|
426
|
+
* return {
|
|
427
|
+
* score: efficient ? 1.0 : 0.5,
|
|
428
|
+
* hits: efficient ? ['Efficient execution'] : [],
|
|
429
|
+
* misses: efficient ? [] : ['Too many tool calls'],
|
|
430
|
+
* };
|
|
431
|
+
* });
|
|
432
|
+
* ```
|
|
433
|
+
*
|
|
434
|
+
* @example With typed config
|
|
435
|
+
* ```typescript
|
|
436
|
+
* import { defineCodeJudge, z } from '@agentv/eval';
|
|
437
|
+
*
|
|
438
|
+
* const ConfigSchema = z.object({
|
|
439
|
+
* maxToolCalls: z.number().default(10),
|
|
440
|
+
* });
|
|
441
|
+
*
|
|
442
|
+
* export default defineCodeJudge(({ traceSummary, config }) => {
|
|
443
|
+
* const { maxToolCalls } = ConfigSchema.parse(config ?? {});
|
|
444
|
+
* // Use maxToolCalls...
|
|
445
|
+
* });
|
|
446
|
+
* ```
|
|
447
|
+
*/
|
|
448
|
+
declare function defineCodeJudge(handler: CodeJudgeHandler): void;
|
|
449
|
+
|
|
450
|
+
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type OutputMessage, OutputMessageSchema, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, defineCodeJudge };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
// src/schemas.ts
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// Token usage metrics schema: numeric `input` and `output` are required,
// numeric `cached` may be omitted.
var TokenUsageSchema = z.object({
|
|
4
|
+
input: z.number(),
|
|
5
|
+
output: z.number(),
|
|
6
|
+
cached: z.number().optional()
|
|
7
|
+
});
|
|
8
|
+
// Trace summary schema (camelCase keys). `eventCount`, `toolNames`,
// `toolCallsByName` and `errorCount` are required; `tokenUsage`, `costUsd`,
// `durationMs` and `toolDurations` may be omitted.
var TraceSummarySchema = z.object({
|
|
9
|
+
eventCount: z.number(),
|
|
10
|
+
toolNames: z.array(z.string()),
|
|
11
|
+
toolCallsByName: z.record(z.string(), z.number()),
|
|
12
|
+
errorCount: z.number(),
|
|
13
|
+
tokenUsage: TokenUsageSchema.optional(),
|
|
14
|
+
costUsd: z.number().optional(),
|
|
15
|
+
durationMs: z.number().optional(),
|
|
16
|
+
toolDurations: z.record(z.string(), z.array(z.number())).optional()
|
|
17
|
+
});
|
|
18
|
+
// Tool call schema used inside output messages. Only the `tool` name is
// required; `input` and `output` are left unvalidated (`unknown`), and
// `id` / `timestamp` are optional strings.
var ToolCallSchema = z.object({
|
|
19
|
+
tool: z.string(),
|
|
20
|
+
input: z.unknown().optional(),
|
|
21
|
+
output: z.unknown().optional(),
|
|
22
|
+
id: z.string().optional(),
|
|
23
|
+
timestamp: z.string().optional()
|
|
24
|
+
});
|
|
25
|
+
// Output message schema. `role` is one of the four listed literals;
// `content` (string, record, or array of records), `toolCalls`, `timestamp`
// and `metadata` may all be omitted.
var OutputMessageSchema = z.object({
|
|
26
|
+
role: z.enum(["assistant", "user", "system", "tool"]),
|
|
27
|
+
content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),
|
|
28
|
+
toolCalls: z.array(ToolCallSchema).optional(),
|
|
29
|
+
timestamp: z.string().optional(),
|
|
30
|
+
metadata: z.record(z.unknown()).optional()
|
|
31
|
+
});
|
|
32
|
+
// Test (input) message schema. Unlike OutputMessageSchema, `content` is
// required here; it accepts a string, a record, or an array of records.
var TestMessageSchema = z.object({
|
|
33
|
+
role: z.enum(["system", "user", "assistant", "tool"]),
|
|
34
|
+
content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))])
|
|
35
|
+
});
|
|
36
|
+
// Code judge input schema, expressed with camelCase keys.
// `referenceAnswer` may be omitted; `outputMessages`, `traceSummary` and
// `config` may be omitted or null; every other field is required.
var CodeJudgeInputSchema = z.object({
|
|
37
|
+
question: z.string(),
|
|
38
|
+
expectedOutcome: z.string(),
|
|
39
|
+
expectedMessages: z.array(z.record(z.unknown())),
|
|
40
|
+
referenceAnswer: z.string().optional(),
|
|
41
|
+
candidateAnswer: z.string(),
|
|
42
|
+
outputMessages: z.array(OutputMessageSchema).nullable().optional(),
|
|
43
|
+
guidelineFiles: z.array(z.string()),
|
|
44
|
+
inputFiles: z.array(z.string()),
|
|
45
|
+
inputMessages: z.array(TestMessageSchema),
|
|
46
|
+
traceSummary: TraceSummarySchema.nullable().optional(),
|
|
47
|
+
config: z.record(z.unknown()).nullable().optional()
|
|
48
|
+
});
|
|
49
|
+
// Code judge result schema. `score` must be a number between 0 and 1
// inclusive; `hits` and `misses` fall back to an empty array when omitted;
// `reasoning` is an optional string.
var CodeJudgeResultSchema = z.object({
|
|
50
|
+
score: z.number().min(0).max(1),
|
|
51
|
+
hits: z.array(z.string()).optional().default([]),
|
|
52
|
+
misses: z.array(z.string()).optional().default([]),
|
|
53
|
+
reasoning: z.string().optional()
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
// src/index.ts
|
|
57
|
+
import { z as z2 } from "zod";
|
|
58
|
+
|
|
59
|
+
// src/runtime.ts
|
|
60
|
+
import { readFileSync } from "node:fs";
|
|
61
|
+
|
|
62
|
+
// src/case-conversion.ts
|
|
63
|
+
/**
 * Convert a single snake_case key to camelCase.
 *
 * Keys that begin with an uppercase ASCII letter are returned untouched.
 * Otherwise every underscore that is directly followed by a lowercase ASCII
 * letter or a digit is dropped and that following character is upper-cased.
 *
 * @param {string} str - Key to convert.
 * @returns {string} The converted key.
 */
function toCamelCase(str) {
  const startsUppercase = /^[A-Z]/.test(str);
  if (startsUppercase) {
    return str;
  }
  const upperCaseFollower = (_match, follower) => follower.toUpperCase();
  return str.replace(/_([a-z0-9])/g, upperCaseFollower);
}
|
|
69
|
+
/**
 * Recursively convert every object key in a value to camelCase.
 *
 * Arrays are mapped element by element, objects are rebuilt with each key
 * passed through `toCamelCase`, and everything else (primitives, null,
 * undefined) is returned as-is.
 *
 * @param {unknown} obj - Value to convert.
 * @returns {unknown} A converted copy for arrays/objects, otherwise the input.
 */
function toCamelCaseDeep(obj) {
  if (Array.isArray(obj)) {
    return obj.map((element) => toCamelCaseDeep(element));
  }
  // `== null` matches both null and undefined; non-objects pass through too.
  if (obj == null || typeof obj !== "object") {
    return obj;
  }
  const converted = {};
  for (const [originalKey, nested] of Object.entries(obj)) {
    converted[toCamelCase(originalKey)] = toCamelCaseDeep(nested);
  }
  return converted;
}
|
|
86
|
+
|
|
87
|
+
// src/runtime.ts
|
|
88
|
+
/**
 * Synchronously read file descriptor 0 (standard input) and return its
 * contents decoded as UTF-8.
 *
 * @returns {string} Everything read from stdin.
 */
function readStdin() {
  const stdinFd = 0;
  return readFileSync(stdinFd, "utf8");
}
|
|
91
|
+
/**
 * Clamp a score into the inclusive range [0, 1].
 *
 * Anything that is not a finite number (NaN, ±Infinity, non-number values)
 * yields 0.
 *
 * @param {unknown} value - Raw score.
 * @returns {number} A number between 0 and 1.
 */
function clampScore(value) {
  // Number.isFinite is already false for NaN and for non-number values.
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
|
|
97
|
+
/**
 * Turn a thrown value into a message string.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {string} `error.message` for Error instances, `String(error)` otherwise.
 */
function formatError(error) {
  return error instanceof Error ? error.message : String(error);
}
|
|
103
|
+
/**
 * Run a code judge handler with full stdin/stdout handling.
 *
 * Steps:
 *  1. Read stdin and parse it as JSON.
 *  2. Convert keys to camelCase and validate against `CodeJudgeInputSchema`.
 *  3. Await `handler(input)`.
 *  4. Clamp the returned score with `clampScore`, validate the result against
 *     `CodeJudgeResultSchema`, and print it to stdout as pretty-printed JSON.
 *
 * If any step throws, a failure result (`score: 0`, the error message in
 * `misses` and `reasoning`) is printed to stdout and the process exits with
 * code 1.
 *
 * @param {(input: object) => object | Promise<object>} handler - Judge
 *   implementation; must return (or resolve to) a result object.
 * @returns {Promise<void>} Resolves after the result is printed. On failure
 *   the process exits instead of returning.
 */
async function runCodeJudge(handler) {
  try {
    const stdin = readStdin();
    const rawInput = JSON.parse(stdin);
    const camelInput = toCamelCaseDeep(rawInput);
    const input = CodeJudgeInputSchema.parse(camelInput);
    const rawResult = await handler(input);
    // Fail with an explicit message when the handler forgot to return a
    // result, instead of an opaque property-access TypeError (null/undefined)
    // or a silent score of 0 (primitives).
    if (rawResult === null || typeof rawResult !== "object") {
      const received = rawResult === null ? "null" : typeof rawResult;
      throw new TypeError(`Code judge handler must return a result object, received ${received}`);
    }
    const result = CodeJudgeResultSchema.parse({
      ...rawResult,
      // Clamping happens before validation, so an out-of-range or non-numeric
      // score is coerced into [0, 1] rather than rejected by the schema.
      score: clampScore(rawResult.score)
    });
    console.log(JSON.stringify(result, null, 2));
  } catch (error) {
    const errorMessage = formatError(error);
    const errorResult = {
      score: 0,
      hits: [],
      misses: [errorMessage],
      reasoning: `Evaluation failed: ${errorMessage}`
    };
    // process.exit() does not wait for pending asynchronous stdout writes, so
    // wait for the write callback before exiting; otherwise the failure JSON
    // can be lost when stdout is written asynchronously.
    const payload = `${JSON.stringify(errorResult, null, 2)}\n`;
    await new Promise((resolve) => {
      process.stdout.write(payload, () => resolve());
    });
    process.exit(1);
  }
}
|
|
127
|
+
|
|
128
|
+
// src/index.ts
|
|
129
|
+
/**
 * Define a code judge and start running it immediately.
 *
 * Calling this function kicks off `runCodeJudge(handler)` right away; nothing
 * is returned, so the call's value is `undefined`.
 *
 * @param {Function} handler - Judge implementation passed through to
 *   `runCodeJudge`.
 * @returns {void}
 */
function defineCodeJudge(handler) {
  // Intentionally not awaited: runCodeJudge catches its own failures and
  // reports them on stdout.
  void runCodeJudge(handler);
}
|
|
132
|
+
export {
|
|
133
|
+
CodeJudgeInputSchema,
|
|
134
|
+
CodeJudgeResultSchema,
|
|
135
|
+
OutputMessageSchema,
|
|
136
|
+
TokenUsageSchema,
|
|
137
|
+
ToolCallSchema,
|
|
138
|
+
TraceSummarySchema,
|
|
139
|
+
defineCodeJudge,
|
|
140
|
+
z2 as z
|
|
141
|
+
};
|
|
142
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/schemas.ts","../src/index.ts","../src/runtime.ts","../src/case-conversion.ts"],"sourcesContent":["/**\n * Zod schemas for code judge input/output validation.\n * Provides both compile-time types and runtime validation.\n */\nimport { z } from 'zod';\n\n/**\n * Token usage metrics schema.\n */\nexport const TokenUsageSchema = z.object({\n input: z.number(),\n output: z.number(),\n cached: z.number().optional(),\n});\n\n/**\n * Trace summary schema (camelCase for TypeScript ergonomics).\n */\nexport const TraceSummarySchema = z.object({\n eventCount: z.number(),\n toolNames: z.array(z.string()),\n toolCallsByName: z.record(z.string(), z.number()),\n errorCount: z.number(),\n tokenUsage: TokenUsageSchema.optional(),\n costUsd: z.number().optional(),\n durationMs: z.number().optional(),\n toolDurations: z.record(z.string(), z.array(z.number())).optional(),\n});\n\n/**\n * Tool call schema for output messages.\n */\nexport const ToolCallSchema = z.object({\n tool: z.string(),\n input: z.unknown().optional(),\n output: z.unknown().optional(),\n id: z.string().optional(),\n timestamp: z.string().optional(),\n});\n\n/**\n * Output message schema.\n */\nexport const OutputMessageSchema = z.object({\n role: z.enum(['assistant', 'user', 'system', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),\n toolCalls: z.array(ToolCallSchema).optional(),\n timestamp: z.string().optional(),\n metadata: z.record(z.unknown()).optional(),\n});\n\n/**\n * Test message schema.\n */\nexport const TestMessageSchema = z.object({\n role: z.enum(['system', 'user', 'assistant', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]),\n});\n\n/**\n * Code judge input schema (camelCase, converted from snake_case wire format).\n */\nexport const CodeJudgeInputSchema = z.object({\n question: z.string(),\n expectedOutcome: z.string(),\n expectedMessages: 
z.array(z.record(z.unknown())),\n referenceAnswer: z.string().optional(),\n candidateAnswer: z.string(),\n outputMessages: z.array(OutputMessageSchema).nullable().optional(),\n guidelineFiles: z.array(z.string()),\n inputFiles: z.array(z.string()),\n inputMessages: z.array(TestMessageSchema),\n traceSummary: TraceSummarySchema.nullable().optional(),\n config: z.record(z.unknown()).nullable().optional(),\n});\n\n/**\n * Code judge result schema (validated before output).\n */\nexport const CodeJudgeResultSchema = z.object({\n score: z.number().min(0).max(1),\n hits: z.array(z.string()).optional().default([]),\n misses: z.array(z.string()).optional().default([]),\n reasoning: z.string().optional(),\n});\n\n/**\n * Inferred types from schemas.\n */\nexport type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;\nexport type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;\nexport type TraceSummary = z.infer<typeof TraceSummarySchema>;\nexport type OutputMessage = z.infer<typeof OutputMessageSchema>;\nexport type ToolCall = z.infer<typeof ToolCallSchema>;\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n","/**\n * AgentV Evaluation SDK\n *\n * Build custom code judges for evaluating AI agent outputs.\n *\n * @example\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({\n * score: traceSummary?.eventCount <= 5 ? 
1.0 : 0.5,\n * hits: ['Efficient tool usage'],\n * misses: [],\n * }));\n * ```\n *\n * @packageDocumentation\n */\n\n// Re-export schemas and types\nexport {\n CodeJudgeInputSchema,\n CodeJudgeResultSchema,\n TraceSummarySchema,\n OutputMessageSchema,\n ToolCallSchema,\n TokenUsageSchema,\n type CodeJudgeInput,\n type CodeJudgeResult,\n type TraceSummary,\n type OutputMessage,\n type ToolCall,\n type TokenUsage,\n} from './schemas.js';\n\n// Re-export Zod for typed config support\nexport { z } from 'zod';\n\n// Import runtime\nimport { type CodeJudgeHandler, runCodeJudge } from './runtime.js';\n\nexport type { CodeJudgeHandler };\n\n/**\n * Define a code judge evaluator with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Validates the result and outputs JSON to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that evaluates the input and returns a result\n *\n * @example\n * ```typescript\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary }) => {\n * if (!traceSummary) {\n * return { score: 0.5, reasoning: 'No trace available' };\n * }\n *\n * const efficient = traceSummary.eventCount <= 10;\n * return {\n * score: efficient ? 1.0 : 0.5,\n * hits: efficient ? ['Efficient execution'] : [],\n * misses: efficient ? [] : ['Too many tool calls'],\n * };\n * });\n * ```\n *\n * @example With typed config\n * ```typescript\n * import { defineCodeJudge, z } from '@agentv/eval';\n *\n * const ConfigSchema = z.object({\n * maxToolCalls: z.number().default(10),\n * });\n *\n * export default defineCodeJudge(({ traceSummary, config }) => {\n * const { maxToolCalls } = ConfigSchema.parse(config ?? 
{});\n * // Use maxToolCalls...\n * });\n * ```\n */\nexport function defineCodeJudge(handler: CodeJudgeHandler): void {\n // Run immediately when module is loaded\n runCodeJudge(handler);\n}\n","/**\n * Runtime for code judge evaluators.\n * Handles stdin parsing, validation, error handling, and output formatting.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport {\n type CodeJudgeInput,\n CodeJudgeInputSchema,\n type CodeJudgeResult,\n CodeJudgeResultSchema,\n} from './schemas.js';\n\n/**\n * Handler function type for code judges.\n */\nexport type CodeJudgeHandler = (\n input: CodeJudgeInput,\n) => CodeJudgeResult | Promise<CodeJudgeResult>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Clamp a value to the range [0, 1].\n */\nfunction clampScore(value: number): number {\n if (Number.isNaN(value) || !Number.isFinite(value)) {\n return 0;\n }\n return Math.max(0, Math.min(1, value));\n}\n\n/**\n * Format an error for output.\n */\nfunction formatError(error: unknown): string {\n if (error instanceof Error) {\n return error.message;\n }\n return String(error);\n}\n\n/**\n * Run a code judge handler with full stdin/stdout handling.\n * This is the internal implementation called by defineCodeJudge.\n */\nexport async function runCodeJudge(handler: CodeJudgeHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = CodeJudgeInputSchema.parse(camelInput);\n\n // 5. Run handler\n const rawResult = await handler(input);\n\n // 6. 
Validate and normalize output\n const result = CodeJudgeResultSchema.parse({\n ...rawResult,\n score: clampScore(rawResult.score),\n });\n\n // 7. Output JSON\n console.log(JSON.stringify(result, null, 2));\n } catch (error) {\n // Output failure result\n const errorMessage = formatError(error);\n const errorResult: CodeJudgeResult = {\n score: 0,\n hits: [],\n misses: [errorMessage],\n reasoning: `Evaluation failed: ${errorMessage}`,\n };\n console.log(JSON.stringify(errorResult, null, 2));\n process.exit(1);\n }\n}\n","/**\n * Case conversion utilities for JSON payloads.\n * Converts between snake_case (wire format) and camelCase (TypeScript).\n */\n\nfunction toCamelCase(str: string): string {\n // Don't convert keys that start with uppercase (proper nouns/tool names)\n if (/^[A-Z]/.test(str)) {\n return str;\n }\n return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());\n}\n\n/**\n * Recursively converts all keys in an object from snake_case to camelCase.\n * Used to map wire payloads into TypeScript-friendly shapes.\n *\n * @param obj - The object to convert (can be any JSON-serializable value)\n * @returns A new object with all keys converted to camelCase\n */\nexport function toCamelCaseDeep(obj: unknown): unknown {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => toCamelCaseDeep(item));\n }\n\n if (typeof obj === 'object') {\n const result: Record<string, unknown> = {};\n for (const [key, value] of Object.entries(obj)) {\n const camelKey = toCamelCase(key);\n result[camelKey] = toCamelCaseDeep(value);\n }\n return result;\n }\n\n return 
obj;\n}\n"],"mappings":";AAIA,SAAS,SAAS;AAKX,IAAM,mBAAmB,EAAE,OAAO;AAAA,EACvC,OAAO,EAAE,OAAO;AAAA,EAChB,QAAQ,EAAE,OAAO;AAAA,EACjB,QAAQ,EAAE,OAAO,EAAE,SAAS;AAC9B,CAAC;AAKM,IAAM,qBAAqB,EAAE,OAAO;AAAA,EACzC,YAAY,EAAE,OAAO;AAAA,EACrB,WAAW,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC7B,iBAAiB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC;AAAA,EAChD,YAAY,EAAE,OAAO;AAAA,EACrB,YAAY,iBAAiB,SAAS;AAAA,EACtC,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,YAAY,EAAE,OAAO,EAAE,SAAS;AAAA,EAChC,eAAe,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,SAAS;AACpE,CAAC;AAKM,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,MAAM,EAAE,OAAO;AAAA,EACf,OAAO,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC5B,QAAQ,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,IAAI,EAAE,OAAO,EAAE,SAAS;AAAA,EACxB,WAAW,EAAE,OAAO,EAAE,SAAS;AACjC,CAAC;AAKM,IAAM,sBAAsB,EAAE,OAAO;AAAA,EAC1C,MAAM,EAAE,KAAK,CAAC,aAAa,QAAQ,UAAU,MAAM,CAAC;AAAA,EACpD,SAAS,EAAE,MAAM,CAAC,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/F,WAAW,EAAE,MAAM,cAAc,EAAE,SAAS;AAAA,EAC5C,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA,EAC/B,UAAU,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS;AAC3C,CAAC;AAKM,IAAM,oBAAoB,EAAE,OAAO;AAAA,EACxC,MAAM,EAAE,KAAK,CAAC,UAAU,QAAQ,aAAa,MAAM,CAAC;AAAA,EACpD,SAAS,EAAE,MAAM,CAAC,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC;AACtF,CAAC;AAKM,IAAM,uBAAuB,EAAE,OAAO;AAAA,EAC3C,UAAU,EAAE,OAAO;AAAA,EACnB,iBAAiB,EAAE,OAAO;AAAA,EAC1B,kBAAkB,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,iBAAiB,EAAE,OAAO,EAAE,SAAS;AAAA,EACrC,iBAAiB,EAAE,OAAO;AAAA,EAC1B,gBAAgB,EAAE,MAAM,mBAAmB,EAAE,SAAS,EAAE,SAAS;AAAA,EACjE,gBAAgB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAClC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC9B,eAAe,EAAE,MAAM,iBAAiB;AAAA,EACxC,cAAc,mBAAmB,SAAS,EAAE,SAAS;AAAA,EACrD,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,SAAS;AACpD,CAAC;AAKM,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAC9B,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,QAAQ,EAAE,MAAM,EAAE,OAAO,CA
AC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EACjD,WAAW,EAAE,OAAO,EAAE,SAAS;AACjC,CAAC;;;AC/CD,SAAS,KAAAA,UAAS;;;ACjClB,SAAS,oBAAoB;;;ACC7B,SAAS,YAAY,KAAqB;AAExC,MAAI,SAAS,KAAK,GAAG,GAAG;AACtB,WAAO;AAAA,EACT;AACA,SAAO,IAAI,QAAQ,gBAAgB,CAAC,GAAG,WAAW,OAAO,YAAY,CAAC;AACxE;AASO,SAAS,gBAAgB,KAAuB;AACrD,MAAI,QAAQ,QAAQ,QAAQ,QAAW;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,WAAO,IAAI,IAAI,CAAC,SAAS,gBAAgB,IAAI,CAAC;AAAA,EAChD;AAEA,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,SAAkC,CAAC;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC9C,YAAM,WAAW,YAAY,GAAG;AAChC,aAAO,QAAQ,IAAI,gBAAgB,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ADfA,SAAS,YAAoB;AAC3B,SAAO,aAAa,GAAG,MAAM;AAC/B;AAKA,SAAS,WAAW,OAAuB;AACzC,MAAI,OAAO,MAAM,KAAK,KAAK,CAAC,OAAO,SAAS,KAAK,GAAG;AAClD,WAAO;AAAA,EACT;AACA,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAKA,SAAS,YAAY,OAAwB;AAC3C,MAAI,iBAAiB,OAAO;AAC1B,WAAO,MAAM;AAAA,EACf;AACA,SAAO,OAAO,KAAK;AACrB;AAMA,eAAsB,aAAa,SAA0C;AAC3E,MAAI;AAEF,UAAM,QAAQ,UAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,qBAAqB,MAAM,UAAU;AAGnD,UAAM,YAAY,MAAM,QAAQ,KAAK;AAGrC,UAAM,SAAS,sBAAsB,MAAM;AAAA,MACzC,GAAG;AAAA,MACH,OAAO,WAAW,UAAU,KAAK;AAAA,IACnC,CAAC;AAGD,YAAQ,IAAI,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAAA,EAC7C,SAAS,OAAO;AAEd,UAAM,eAAe,YAAY,KAAK;AACtC,UAAM,cAA+B;AAAA,MACnC,OAAO;AAAA,MACP,MAAM,CAAC;AAAA,MACP,QAAQ,CAAC,YAAY;AAAA,MACrB,WAAW,sBAAsB,YAAY;AAAA,IAC/C;AACA,YAAQ,IAAI,KAAK,UAAU,aAAa,MAAM,CAAC,CAAC;AAChD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;ADDO,SAAS,gBAAgB,SAAiC;AAE/D,eAAa,OAAO;AACtB;","names":["z"]}
|
package/package.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@agentv/eval",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Evaluation SDK for AgentV - build custom code judges",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/EntityProcess/agentv.git"
|
|
9
|
+
},
|
|
10
|
+
"homepage": "https://github.com/EntityProcess/agentv#readme",
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/EntityProcess/agentv/issues"
|
|
13
|
+
},
|
|
14
|
+
"main": "./dist/index.js",
|
|
15
|
+
"types": "./dist/index.d.ts",
|
|
16
|
+
"exports": {
|
|
17
|
+
".": {
|
|
18
|
+
"types": "./dist/index.d.ts",
|
|
19
|
+
"import": "./dist/index.js",
|
|
20
|
+
"require": "./dist/index.cjs"
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"scripts": {
|
|
24
|
+
"build": "tsup",
|
|
25
|
+
"dev": "tsup --watch",
|
|
26
|
+
"typecheck": "tsc --noEmit",
|
|
27
|
+
"lint": "biome check .",
|
|
28
|
+
"format": "biome format --write .",
|
|
29
|
+
"fix": "biome check --write .",
|
|
30
|
+
"test": "bun test"
|
|
31
|
+
},
|
|
32
|
+
"files": [
|
|
33
|
+
"dist",
|
|
34
|
+
"README.md"
|
|
35
|
+
],
|
|
36
|
+
"dependencies": {
|
|
37
|
+
"zod": "^3.23.8"
|
|
38
|
+
}
|
|
39
|
+
}
|