@agentv/eval 0.3.0 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +30 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +392 -1
- package/dist/index.d.ts +392 -1
- package/dist/index.js +28 -3
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -23,6 +23,7 @@ __export(index_exports, {
|
|
|
23
23
|
CodeJudgeInputSchema: () => CodeJudgeInputSchema,
|
|
24
24
|
CodeJudgeResultSchema: () => CodeJudgeResultSchema,
|
|
25
25
|
MessageSchema: () => MessageSchema,
|
|
26
|
+
PromptTemplateInputSchema: () => PromptTemplateInputSchema,
|
|
26
27
|
TargetInvocationError: () => TargetInvocationError,
|
|
27
28
|
TargetNotAvailableError: () => TargetNotAvailableError,
|
|
28
29
|
TokenUsageSchema: () => TokenUsageSchema,
|
|
@@ -30,6 +31,7 @@ __export(index_exports, {
|
|
|
30
31
|
TraceSummarySchema: () => TraceSummarySchema,
|
|
31
32
|
createTargetClient: () => createTargetClient,
|
|
32
33
|
defineCodeJudge: () => defineCodeJudge,
|
|
34
|
+
definePromptTemplate: () => definePromptTemplate,
|
|
33
35
|
z: () => import_zod2.z
|
|
34
36
|
});
|
|
35
37
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -87,6 +89,7 @@ var CodeJudgeResultSchema = import_zod.z.object({
|
|
|
87
89
|
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
88
90
|
details: import_zod.z.record(import_zod.z.unknown()).optional()
|
|
89
91
|
});
|
|
92
|
+
var PromptTemplateInputSchema = CodeJudgeInputSchema;
|
|
90
93
|
|
|
91
94
|
// src/target-client.ts
|
|
92
95
|
var TargetNotAvailableError = class extends Error {
|
|
@@ -199,7 +202,7 @@ function createTargetClientInternal(url, token) {
|
|
|
199
202
|
// src/index.ts
|
|
200
203
|
var import_zod2 = require("zod");
|
|
201
204
|
|
|
202
|
-
// src/
|
|
205
|
+
// src/prompt-template.ts
|
|
203
206
|
var import_node_fs = require("fs");
|
|
204
207
|
|
|
205
208
|
// src/case-conversion.ts
|
|
@@ -227,10 +230,29 @@ function toCamelCaseDeep(obj) {
|
|
|
227
230
|
return obj;
|
|
228
231
|
}
|
|
229
232
|
|
|
230
|
-
// src/
|
|
233
|
+
// src/prompt-template.ts
|
|
231
234
|
function readStdin() {
|
|
232
235
|
return (0, import_node_fs.readFileSync)(0, "utf8");
|
|
233
236
|
}
|
|
237
|
+
async function runPromptTemplate(handler) {
|
|
238
|
+
try {
|
|
239
|
+
const stdin = readStdin();
|
|
240
|
+
const rawInput = JSON.parse(stdin);
|
|
241
|
+
const camelInput = toCamelCaseDeep(rawInput);
|
|
242
|
+
const input = PromptTemplateInputSchema.parse(camelInput);
|
|
243
|
+
const prompt = await handler(input);
|
|
244
|
+
console.log(prompt);
|
|
245
|
+
} catch (error) {
|
|
246
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
247
|
+
process.exit(1);
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
// src/runtime.ts
|
|
252
|
+
var import_node_fs2 = require("fs");
|
|
253
|
+
function readStdin2() {
|
|
254
|
+
return (0, import_node_fs2.readFileSync)(0, "utf8");
|
|
255
|
+
}
|
|
234
256
|
function clampScore(value) {
|
|
235
257
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
236
258
|
return 0;
|
|
@@ -245,7 +267,7 @@ function formatError(error) {
|
|
|
245
267
|
}
|
|
246
268
|
async function runCodeJudge(handler) {
|
|
247
269
|
try {
|
|
248
|
-
const stdin =
|
|
270
|
+
const stdin = readStdin2();
|
|
249
271
|
const rawInput = JSON.parse(stdin);
|
|
250
272
|
const camelInput = toCamelCaseDeep(rawInput);
|
|
251
273
|
const input = CodeJudgeInputSchema.parse(camelInput);
|
|
@@ -272,11 +294,15 @@ async function runCodeJudge(handler) {
|
|
|
272
294
|
function defineCodeJudge(handler) {
|
|
273
295
|
runCodeJudge(handler);
|
|
274
296
|
}
|
|
297
|
+
function definePromptTemplate(handler) {
|
|
298
|
+
runPromptTemplate(handler);
|
|
299
|
+
}
|
|
275
300
|
// Annotate the CommonJS export names for ESM import in node:
|
|
276
301
|
0 && (module.exports = {
|
|
277
302
|
CodeJudgeInputSchema,
|
|
278
303
|
CodeJudgeResultSchema,
|
|
279
304
|
MessageSchema,
|
|
305
|
+
PromptTemplateInputSchema,
|
|
280
306
|
TargetInvocationError,
|
|
281
307
|
TargetNotAvailableError,
|
|
282
308
|
TokenUsageSchema,
|
|
@@ -284,6 +310,7 @@ function defineCodeJudge(handler) {
|
|
|
284
310
|
TraceSummarySchema,
|
|
285
311
|
createTargetClient,
|
|
286
312
|
defineCodeJudge,
|
|
313
|
+
definePromptTemplate,
|
|
287
314
|
z
|
|
288
315
|
});
|
|
289
316
|
//# sourceMappingURL=index.cjs.map
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/schemas.ts","../src/target-client.ts","../src/runtime.ts","../src/case-conversion.ts"],"sourcesContent":["/**\n * AgentV Evaluation SDK\n *\n * Build custom code judges for evaluating AI agent outputs.\n *\n * @example Basic code judge\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({\n * score: traceSummary?.eventCount <= 5 ? 1.0 : 0.5,\n * hits: ['Efficient tool usage'],\n * misses: [],\n * }));\n * ```\n *\n * @example Code judge with target access (requires `target` config in YAML)\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge, createTargetClient } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question }) => {\n * const target = createTargetClient();\n * if (!target) {\n * return { score: 0, misses: ['Target not available'] };\n * }\n *\n * const response = await target.invoke({\n * question: `Evaluate: ${question}`,\n * systemPrompt: 'Respond with JSON: { \"score\": 0-1 }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.score ?? 
0 };\n * });\n * ```\n *\n * @packageDocumentation\n */\n\n// Re-export schemas and types\nexport {\n CodeJudgeInputSchema,\n CodeJudgeResultSchema,\n TraceSummarySchema,\n MessageSchema,\n ToolCallSchema,\n TokenUsageSchema,\n type CodeJudgeInput,\n type CodeJudgeResult,\n type TraceSummary,\n type Message,\n type ToolCall,\n type TokenUsage,\n} from './schemas.js';\n\n// Re-export target client\nexport {\n createTargetClient,\n TargetNotAvailableError,\n TargetInvocationError,\n type TargetClient,\n type TargetInfo,\n type TargetInvokeRequest,\n type TargetInvokeResponse,\n} from './target-client.js';\n\n// Re-export Zod for typed config support\nexport { z } from 'zod';\n\n// Import runtime\nimport { type CodeJudgeHandler, runCodeJudge } from './runtime.js';\n\nexport type { CodeJudgeHandler };\n\n/**\n * Define a code judge evaluator with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Validates the result and outputs JSON to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that evaluates the input and returns a result\n *\n * @example\n * ```typescript\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary }) => {\n * if (!traceSummary) {\n * return { score: 0.5, reasoning: 'No trace available' };\n * }\n *\n * const efficient = traceSummary.eventCount <= 10;\n * return {\n * score: efficient ? 1.0 : 0.5,\n * hits: efficient ? ['Efficient execution'] : [],\n * misses: efficient ? 
[] : ['Too many tool calls'],\n * };\n * });\n * ```\n *\n * @example With typed config\n * ```typescript\n * import { defineCodeJudge, z } from '@agentv/eval';\n *\n * const ConfigSchema = z.object({\n * maxToolCalls: z.number().default(10),\n * });\n *\n * export default defineCodeJudge(({ traceSummary, config }) => {\n * const { maxToolCalls } = ConfigSchema.parse(config ?? {});\n * // Use maxToolCalls...\n * });\n * ```\n */\nexport function defineCodeJudge(handler: CodeJudgeHandler): void {\n // Run immediately when module is loaded\n runCodeJudge(handler);\n}\n","/**\n * Zod schemas for code judge input/output validation.\n * Provides both compile-time types and runtime validation.\n */\nimport { z } from 'zod';\n\n/**\n * Token usage metrics schema.\n */\nexport const TokenUsageSchema = z.object({\n input: z.number(),\n output: z.number(),\n cached: z.number().optional(),\n});\n\n/**\n * Trace summary schema (camelCase for TypeScript ergonomics).\n */\nexport const TraceSummarySchema = z.object({\n eventCount: z.number(),\n toolNames: z.array(z.string()),\n toolCallsByName: z.record(z.string(), z.number()),\n errorCount: z.number(),\n tokenUsage: TokenUsageSchema.optional(),\n costUsd: z.number().optional(),\n durationMs: z.number().optional(),\n toolDurations: z.record(z.string(), z.array(z.number())).optional(),\n});\n\n/**\n * Tool call schema.\n */\nexport const ToolCallSchema = z.object({\n tool: z.string(),\n input: z.unknown().optional(),\n output: z.unknown().optional(),\n id: z.string().optional(),\n timestamp: z.string().optional(),\n});\n\n/**\n * Unified message schema for input, expected, and output messages.\n */\nexport const MessageSchema = z.object({\n role: z.enum(['assistant', 'user', 'system', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),\n toolCalls: z.array(ToolCallSchema).optional(),\n name: z.string().optional(),\n timestamp: z.string().optional(),\n metadata: 
z.record(z.unknown()).optional(),\n});\n\n/**\n * Code judge input schema (camelCase, converted from snake_case wire format).\n */\nexport const CodeJudgeInputSchema = z.object({\n question: z.string(),\n expectedOutcome: z.string(),\n expectedMessages: z.array(MessageSchema),\n referenceAnswer: z.string().optional(),\n candidateAnswer: z.string(),\n outputMessages: z.array(MessageSchema).nullable().optional(),\n guidelineFiles: z.array(z.string()),\n inputFiles: z.array(z.string()),\n inputMessages: z.array(MessageSchema),\n traceSummary: TraceSummarySchema.nullable().optional(),\n config: z.record(z.unknown()).nullable().optional(),\n});\n\n/**\n * Code judge result schema (validated before output).\n */\nexport const CodeJudgeResultSchema = z.object({\n score: z.number().min(0).max(1),\n hits: z.array(z.string()).optional().default([]),\n misses: z.array(z.string()).optional().default([]),\n reasoning: z.string().optional(),\n /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). 
*/\n details: z.record(z.unknown()).optional(),\n});\n\n/**\n * Inferred types from schemas.\n */\nexport type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;\nexport type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;\nexport type TraceSummary = z.infer<typeof TraceSummarySchema>;\nexport type Message = z.infer<typeof MessageSchema>;\nexport type ToolCall = z.infer<typeof ToolCallSchema>;\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n","/**\n * Client for invoking configured targets from code_judge scripts.\n *\n * Environment variables (set automatically by AgentV when `target` config is present):\n * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server\n * - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication\n */\n\n/**\n * Request to invoke the target\n */\nexport interface TargetInvokeRequest {\n readonly question: string;\n readonly systemPrompt?: string;\n readonly evalCaseId?: string;\n readonly attempt?: number;\n /** Optional target override - use a different target for this invocation */\n readonly target?: string;\n}\n\n/**\n * Response from a target invocation\n */\nexport interface TargetInvokeResponse {\n readonly outputMessages: readonly unknown[];\n readonly rawText?: string;\n}\n\n/**\n * Information about the target proxy configuration\n */\nexport interface TargetInfo {\n /** Name of the default target being used */\n readonly targetName: string;\n /** Maximum number of calls allowed */\n readonly maxCalls: number;\n /** Current number of calls made */\n readonly callCount: number;\n /** List of all available target names */\n readonly availableTargets: readonly string[];\n}\n\n/**\n * Target client for making target invocations\n */\nexport interface TargetClient {\n /**\n * Invoke the configured target with a prompt.\n * @param request - The question and optional system prompt\n * @returns The target's response with output messages and optional raw text\n */\n invoke(request: 
TargetInvokeRequest): Promise<TargetInvokeResponse>;\n\n /**\n * Invoke the target with multiple requests in sequence.\n * Each request counts toward the max_calls limit.\n * @param requests - Array of target requests\n * @returns Array of target responses\n */\n invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;\n\n /**\n * Get information about the target proxy configuration.\n * Returns the default target name, max calls, current call count, and available targets.\n */\n getInfo(): Promise<TargetInfo>;\n}\n\n/**\n * Error thrown when target proxy is not available\n */\nexport class TargetNotAvailableError extends Error {\n constructor(message: string) {\n super(message);\n this.name = 'TargetNotAvailableError';\n }\n}\n\n/**\n * Error thrown when target invocation fails\n */\nexport class TargetInvocationError extends Error {\n readonly statusCode?: number;\n\n constructor(message: string, statusCode?: number) {\n super(message);\n this.name = 'TargetInvocationError';\n this.statusCode = statusCode;\n }\n}\n\n/**\n * Create a target client from environment variables.\n *\n * This function reads the proxy URL and token from environment variables\n * that are automatically set by AgentV when a `target` config block is present\n * on a `code_judge` evaluator.\n *\n * @returns A target client if environment variables are set, otherwise undefined\n * @throws TargetNotAvailableError if token is missing when URL is present\n *\n * @example\n * ```typescript\n * import { createTargetClient, defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question, expectedOutcome }) => {\n * const target = createTargetClient();\n *\n * if (!target) {\n * // Target not available - no target config on this evaluator\n * return { score: 0.5, reasoning: 'Target not available' };\n * }\n *\n * const response = await target.invoke({\n * question: `Is this answer correct? 
Question: ${question}, Expected: ${expectedOutcome}`,\n * systemPrompt: 'You are an expert evaluator. Respond with JSON: { \"correct\": true/false }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.correct ? 1.0 : 0.0 };\n * });\n * ```\n */\nexport function createTargetClient(): TargetClient | undefined {\n const proxyUrl = process.env.AGENTV_TARGET_PROXY_URL;\n const proxyToken = process.env.AGENTV_TARGET_PROXY_TOKEN;\n\n if (!proxyUrl) {\n return undefined;\n }\n\n if (!proxyToken) {\n throw new TargetNotAvailableError(\n 'AGENTV_TARGET_PROXY_URL is set but AGENTV_TARGET_PROXY_TOKEN is missing',\n );\n }\n\n return createTargetClientInternal(proxyUrl, proxyToken);\n}\n\n/**\n * Internal: Create a target client with explicit URL and token.\n * Exported for testing only - use createTargetClient() in production.\n */\nexport function createTargetClientInternal(url: string, token: string): TargetClient {\n const headers = {\n 'Content-Type': 'application/json',\n Authorization: `Bearer ${token}`,\n };\n\n return {\n async invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse> {\n const response = await fetch(`${url}/invoke`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n question: request.question,\n systemPrompt: request.systemPrompt,\n evalCaseId: request.evalCaseId,\n attempt: request.attempt,\n target: request.target,\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInvokeResponse;\n },\n\n async invokeBatch(\n requests: readonly TargetInvokeRequest[],\n ): Promise<readonly TargetInvokeResponse[]> {\n const response = await fetch(`${url}/invokeBatch`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n requests: requests.map((r) => ({\n question: r.question,\n systemPrompt: r.systemPrompt,\n evalCaseId: r.evalCaseId,\n attempt: r.attempt,\n target: r.target,\n })),\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? `HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n const result = (await response.json()) as { responses: TargetInvokeResponse[] };\n return result.responses;\n },\n\n async getInfo(): Promise<TargetInfo> {\n const response = await fetch(`${url}/info`, {\n method: 'GET',\n headers,\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInfo;\n },\n };\n}\n","/**\n * Runtime for code judge evaluators.\n * Handles stdin parsing, validation, error handling, and output formatting.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport {\n type CodeJudgeInput,\n CodeJudgeInputSchema,\n type CodeJudgeResult,\n CodeJudgeResultSchema,\n} from './schemas.js';\n\n/**\n * Handler function type for code judges.\n */\nexport type CodeJudgeHandler = (\n input: CodeJudgeInput,\n) => CodeJudgeResult | Promise<CodeJudgeResult>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Clamp a value to the range [0, 1].\n */\nfunction clampScore(value: number): number {\n if (Number.isNaN(value) || !Number.isFinite(value)) {\n return 0;\n }\n return Math.max(0, Math.min(1, value));\n}\n\n/**\n * Format an error for output.\n */\nfunction formatError(error: unknown): string {\n if (error instanceof Error) {\n return error.message;\n }\n return String(error);\n}\n\n/**\n * Run a code judge handler with full stdin/stdout handling.\n * This is the internal implementation called by defineCodeJudge.\n */\nexport async function runCodeJudge(handler: CodeJudgeHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = CodeJudgeInputSchema.parse(camelInput);\n\n // 5. Run handler\n const rawResult = await handler(input);\n\n // 6. 
Validate and normalize output\n const result = CodeJudgeResultSchema.parse({\n ...rawResult,\n score: clampScore(rawResult.score),\n });\n\n // 7. Output JSON\n console.log(JSON.stringify(result, null, 2));\n } catch (error) {\n // Output failure result\n const errorMessage = formatError(error);\n const errorResult: CodeJudgeResult = {\n score: 0,\n hits: [],\n misses: [errorMessage],\n reasoning: `Evaluation failed: ${errorMessage}`,\n };\n console.log(JSON.stringify(errorResult, null, 2));\n process.exit(1);\n }\n}\n","/**\n * Case conversion utilities for JSON payloads.\n * Converts between snake_case (wire format) and camelCase (TypeScript).\n */\n\nfunction toCamelCase(str: string): string {\n // Don't convert keys that start with uppercase (proper nouns/tool names)\n if (/^[A-Z]/.test(str)) {\n return str;\n }\n return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());\n}\n\n/**\n * Recursively converts all keys in an object from snake_case to camelCase.\n * Used to map wire payloads into TypeScript-friendly shapes.\n *\n * @param obj - The object to convert (can be any JSON-serializable value)\n * @returns A new object with all keys converted to camelCase\n */\nexport function toCamelCaseDeep(obj: unknown): unknown {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => toCamelCaseDeep(item));\n }\n\n if (typeof obj === 'object') {\n const result: Record<string, unknown> = {};\n for (const [key, value] of Object.entries(obj)) {\n const camelKey = toCamelCase(key);\n result[camelKey] = toCamelCaseDeep(value);\n }\n return result;\n }\n\n return 
obj;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACIA,iBAAkB;AAKX,IAAM,mBAAmB,aAAE,OAAO;AAAA,EACvC,OAAO,aAAE,OAAO;AAAA,EAChB,QAAQ,aAAE,OAAO;AAAA,EACjB,QAAQ,aAAE,OAAO,EAAE,SAAS;AAC9B,CAAC;AAKM,IAAM,qBAAqB,aAAE,OAAO;AAAA,EACzC,YAAY,aAAE,OAAO;AAAA,EACrB,WAAW,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAC7B,iBAAiB,aAAE,OAAO,aAAE,OAAO,GAAG,aAAE,OAAO,CAAC;AAAA,EAChD,YAAY,aAAE,OAAO;AAAA,EACrB,YAAY,iBAAiB,SAAS;AAAA,EACtC,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,YAAY,aAAE,OAAO,EAAE,SAAS;AAAA,EAChC,eAAe,aAAE,OAAO,aAAE,OAAO,GAAG,aAAE,MAAM,aAAE,OAAO,CAAC,CAAC,EAAE,SAAS;AACpE,CAAC;AAKM,IAAM,iBAAiB,aAAE,OAAO;AAAA,EACrC,MAAM,aAAE,OAAO;AAAA,EACf,OAAO,aAAE,QAAQ,EAAE,SAAS;AAAA,EAC5B,QAAQ,aAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,IAAI,aAAE,OAAO,EAAE,SAAS;AAAA,EACxB,WAAW,aAAE,OAAO,EAAE,SAAS;AACjC,CAAC;AAKM,IAAM,gBAAgB,aAAE,OAAO;AAAA,EACpC,MAAM,aAAE,KAAK,CAAC,aAAa,QAAQ,UAAU,MAAM,CAAC;AAAA,EACpD,SAAS,aAAE,MAAM,CAAC,aAAE,OAAO,GAAG,aAAE,OAAO,aAAE,QAAQ,CAAC,GAAG,aAAE,MAAM,aAAE,OAAO,aAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/F,WAAW,aAAE,MAAM,cAAc,EAAE,SAAS;AAAA,EAC5C,MAAM,aAAE,OAAO,EAAE,SAAS;AAAA,EAC1B,WAAW,aAAE,OAAO,EAAE,SAAS;AAAA,EAC/B,UAAU,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS;AAC3C,CAAC;AAKM,IAAM,uBAAuB,aAAE,OAAO;AAAA,EAC3C,UAAU,aAAE,OAAO;AAAA,EACnB,iBAAiB,aAAE,OAAO;AAAA,EAC1B,kBAAkB,aAAE,MAAM,aAAa;AAAA,EACvC,iBAAiB,aAAE,OAAO,EAAE,SAAS;AAAA,EACrC,iBAAiB,aAAE,OAAO;AAAA,EAC1B,gBAAgB,aAAE,MAAM,aAAa,EAAE,SAAS,EAAE,SAAS;AAAA,EAC3D,gBAAgB,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAClC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAC9B,eAAe,aAAE,MAAM,aAAa;AAAA,EACpC,cAAc,mBAAmB,SAAS,EAAE,SAAS;AAAA,EACrD,QAAQ,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,SAAS;AACpD,CAAC;AAKM,IAAM,wBAAwB,aAAE,OAAO;AAAA,EAC5C,OAAO,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAC9B,MAAM,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,QAAQ,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EACjD,WAAW,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE/B,SAAS,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,
SAAS;AAC1C,CAAC;;;ACRM,IAAM,0BAAN,cAAsC,MAAM;AAAA,EACjD,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAKO,IAAM,wBAAN,cAAoC,MAAM;AAAA,EACtC;AAAA,EAET,YAAY,SAAiB,YAAqB;AAChD,UAAM,OAAO;AACb,SAAK,OAAO;AACZ,SAAK,aAAa;AAAA,EACpB;AACF;AAkCO,SAAS,qBAA+C;AAC7D,QAAM,WAAW,QAAQ,IAAI;AAC7B,QAAM,aAAa,QAAQ,IAAI;AAE/B,MAAI,CAAC,UAAU;AACb,WAAO;AAAA,EACT;AAEA,MAAI,CAAC,YAAY;AACf,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,SAAO,2BAA2B,UAAU,UAAU;AACxD;AAMO,SAAS,2BAA2B,KAAa,OAA6B;AACnF,QAAM,UAAU;AAAA,IACd,gBAAgB;AAAA,IAChB,eAAe,UAAU,KAAK;AAAA,EAChC;AAEA,SAAO;AAAA,IACL,MAAM,OAAO,SAA6D;AACxE,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,WAAW;AAAA,QAC5C,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,QAAQ;AAAA,UAClB,cAAc,QAAQ;AAAA,UACtB,YAAY,QAAQ;AAAA,UACpB,SAAS,QAAQ;AAAA,UACjB,QAAQ,QAAQ;AAAA,QAClB,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,IAEA,MAAM,YACJ,UAC0C;AAC1C,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,gBAAgB;AAAA,QACjD,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,SAAS,IAAI,CAAC,OAAO;AAAA,YAC7B,UAAU,EAAE;AAAA,YACZ,cAAc,EAAE;AAAA,YAChB,YAAY,EAAE;AAAA,YACd,SAAS,EAAE;AAAA,YACX,QAAQ,EAAE;AAAA,UACZ,EAAE;AAAA,QACJ,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,YAAM,SAAU,MAAM,SAAS,KAAK;AACpC,aAAO,OAAO;AAAA,IAChB;AAAA,IAEA,MAAM,UAA+B;AACnC,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,SAAS;AAAA,QAC1C,QAAQ;AAAA,QACR;AAAA,MACF,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,M
AAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,EACF;AACF;;;AFpKA,IAAAA,cAAkB;;;AGjElB,qBAA6B;;;ACC7B,SAAS,YAAY,KAAqB;AAExC,MAAI,SAAS,KAAK,GAAG,GAAG;AACtB,WAAO;AAAA,EACT;AACA,SAAO,IAAI,QAAQ,gBAAgB,CAAC,GAAG,WAAW,OAAO,YAAY,CAAC;AACxE;AASO,SAAS,gBAAgB,KAAuB;AACrD,MAAI,QAAQ,QAAQ,QAAQ,QAAW;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,WAAO,IAAI,IAAI,CAAC,SAAS,gBAAgB,IAAI,CAAC;AAAA,EAChD;AAEA,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,SAAkC,CAAC;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC9C,YAAM,WAAW,YAAY,GAAG;AAChC,aAAO,QAAQ,IAAI,gBAAgB,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ADfA,SAAS,YAAoB;AAC3B,aAAO,6BAAa,GAAG,MAAM;AAC/B;AAKA,SAAS,WAAW,OAAuB;AACzC,MAAI,OAAO,MAAM,KAAK,KAAK,CAAC,OAAO,SAAS,KAAK,GAAG;AAClD,WAAO;AAAA,EACT;AACA,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAKA,SAAS,YAAY,OAAwB;AAC3C,MAAI,iBAAiB,OAAO;AAC1B,WAAO,MAAM;AAAA,EACf;AACA,SAAO,OAAO,KAAK;AACrB;AAMA,eAAsB,aAAa,SAA0C;AAC3E,MAAI;AAEF,UAAM,QAAQ,UAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,qBAAqB,MAAM,UAAU;AAGnD,UAAM,YAAY,MAAM,QAAQ,KAAK;AAGrC,UAAM,SAAS,sBAAsB,MAAM;AAAA,MACzC,GAAG;AAAA,MACH,OAAO,WAAW,UAAU,KAAK;AAAA,IACnC,CAAC;AAGD,YAAQ,IAAI,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAAA,EAC7C,SAAS,OAAO;AAEd,UAAM,eAAe,YAAY,KAAK;AACtC,UAAM,cAA+B;AAAA,MACnC,OAAO;AAAA,MACP,MAAM,CAAC;AAAA,MACP,QAAQ,CAAC,YAAY;AAAA,MACrB,WAAW,sBAAsB,YAAY;AAAA,IAC/C;AACA,YAAQ,IAAI,KAAK,UAAU,aAAa,MAAM,CAAC,CAAC;AAChD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;AH+BO,SAAS,gBAAgB,SAAiC;AAE/D,eAAa,OAAO;AACtB;","names":["import_zod"]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/schemas.ts","../src/target-client.ts","../src/prompt-template.ts","../src/case-conversion.ts","../src/runtime.ts"],"sourcesContent":["/**\n * AgentV Evaluation SDK\n *\n * Build custom code judges for evaluating AI agent outputs.\n *\n * @example Basic code judge\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({\n * score: traceSummary?.eventCount <= 5 ? 1.0 : 0.5,\n * hits: ['Efficient tool usage'],\n * misses: [],\n * }));\n * ```\n *\n * @example Code judge with target access (requires `target` config in YAML)\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge, createTargetClient } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question }) => {\n * const target = createTargetClient();\n * if (!target) {\n * return { score: 0, misses: ['Target not available'] };\n * }\n *\n * const response = await target.invoke({\n * question: `Evaluate: ${question}`,\n * systemPrompt: 'Respond with JSON: { \"score\": 0-1 }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.score ?? 
0 };\n * });\n * ```\n *\n * @packageDocumentation\n */\n\n// Re-export schemas and types\nexport {\n CodeJudgeInputSchema,\n CodeJudgeResultSchema,\n TraceSummarySchema,\n MessageSchema,\n ToolCallSchema,\n TokenUsageSchema,\n PromptTemplateInputSchema,\n type CodeJudgeInput,\n type CodeJudgeResult,\n type TraceSummary,\n type Message,\n type ToolCall,\n type TokenUsage,\n type PromptTemplateInput,\n} from './schemas.js';\n\n// Re-export target client\nexport {\n createTargetClient,\n TargetNotAvailableError,\n TargetInvocationError,\n type TargetClient,\n type TargetInfo,\n type TargetInvokeRequest,\n type TargetInvokeResponse,\n} from './target-client.js';\n\n// Re-export Zod for typed config support\nexport { z } from 'zod';\n\nimport { type PromptTemplateHandler, runPromptTemplate } from './prompt-template.js';\n// Import runtime\nimport { type CodeJudgeHandler, runCodeJudge } from './runtime.js';\n\nexport type { CodeJudgeHandler };\nexport type { PromptTemplateHandler };\n\n/**\n * Define a code judge evaluator with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Validates the result and outputs JSON to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that evaluates the input and returns a result\n *\n * @example\n * ```typescript\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary }) => {\n * if (!traceSummary) {\n * return { score: 0.5, reasoning: 'No trace available' };\n * }\n *\n * const efficient = traceSummary.eventCount <= 10;\n * return {\n * score: efficient ? 1.0 : 0.5,\n * hits: efficient ? ['Efficient execution'] : [],\n * misses: efficient ? 
[] : ['Too many tool calls'],\n * };\n * });\n * ```\n *\n * @example With typed config\n * ```typescript\n * import { defineCodeJudge, z } from '@agentv/eval';\n *\n * const ConfigSchema = z.object({\n * maxToolCalls: z.number().default(10),\n * });\n *\n * export default defineCodeJudge(({ traceSummary, config }) => {\n * const { maxToolCalls } = ConfigSchema.parse(config ?? {});\n * // Use maxToolCalls...\n * });\n * ```\n */\nexport function defineCodeJudge(handler: CodeJudgeHandler): void {\n // Run immediately when module is loaded\n runCodeJudge(handler);\n}\n\n/**\n * Define a prompt template with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Outputs the generated prompt string to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that generates the prompt string from input\n *\n * @example\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => `\n * Question: ${ctx.question}\n * Answer: ${ctx.candidateAnswer}\n *\n * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}\n * `);\n * ```\n *\n * @example With conditional logic\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => {\n * const rubric = ctx.config?.rubric as string | undefined;\n * return `\n * Question: ${ctx.question}\n * Candidate Answer: ${ctx.candidateAnswer}\n * ${rubric ? 
`\\nEvaluation Criteria:\\n${rubric}` : ''}\n * `;\n * });\n * ```\n */\nexport function definePromptTemplate(handler: PromptTemplateHandler): void {\n // Run immediately when module is loaded\n runPromptTemplate(handler);\n}\n","/**\n * Zod schemas for code judge input/output validation.\n * Provides both compile-time types and runtime validation.\n */\nimport { z } from 'zod';\n\n/**\n * Token usage metrics schema.\n */\nexport const TokenUsageSchema = z.object({\n input: z.number(),\n output: z.number(),\n cached: z.number().optional(),\n});\n\n/**\n * Trace summary schema (camelCase for TypeScript ergonomics).\n */\nexport const TraceSummarySchema = z.object({\n eventCount: z.number(),\n toolNames: z.array(z.string()),\n toolCallsByName: z.record(z.string(), z.number()),\n errorCount: z.number(),\n tokenUsage: TokenUsageSchema.optional(),\n costUsd: z.number().optional(),\n durationMs: z.number().optional(),\n toolDurations: z.record(z.string(), z.array(z.number())).optional(),\n});\n\n/**\n * Tool call schema.\n */\nexport const ToolCallSchema = z.object({\n tool: z.string(),\n input: z.unknown().optional(),\n output: z.unknown().optional(),\n id: z.string().optional(),\n timestamp: z.string().optional(),\n});\n\n/**\n * Unified message schema for input, expected, and output messages.\n */\nexport const MessageSchema = z.object({\n role: z.enum(['assistant', 'user', 'system', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),\n toolCalls: z.array(ToolCallSchema).optional(),\n name: z.string().optional(),\n timestamp: z.string().optional(),\n metadata: z.record(z.unknown()).optional(),\n});\n\n/**\n * Code judge input schema (camelCase, converted from snake_case wire format).\n */\nexport const CodeJudgeInputSchema = z.object({\n question: z.string(),\n expectedOutcome: z.string(),\n expectedMessages: z.array(MessageSchema),\n referenceAnswer: z.string().optional(),\n candidateAnswer: z.string(),\n 
outputMessages: z.array(MessageSchema).nullable().optional(),\n guidelineFiles: z.array(z.string()),\n inputFiles: z.array(z.string()),\n inputMessages: z.array(MessageSchema),\n traceSummary: TraceSummarySchema.nullable().optional(),\n config: z.record(z.unknown()).nullable().optional(),\n});\n\n/**\n * Code judge result schema (validated before output).\n */\nexport const CodeJudgeResultSchema = z.object({\n score: z.number().min(0).max(1),\n hits: z.array(z.string()).optional().default([]),\n misses: z.array(z.string()).optional().default([]),\n reasoning: z.string().optional(),\n /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */\n details: z.record(z.unknown()).optional(),\n});\n\n/**\n * Inferred types from schemas.\n */\nexport type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;\nexport type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;\nexport type TraceSummary = z.infer<typeof TraceSummarySchema>;\nexport type Message = z.infer<typeof MessageSchema>;\nexport type ToolCall = z.infer<typeof ToolCallSchema>;\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * Prompt template input schema (camelCase, converted from snake_case wire format).\n * Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads.\n */\nexport const PromptTemplateInputSchema = CodeJudgeInputSchema;\n\nexport type PromptTemplateInput = CodeJudgeInput;\n","/**\n * Client for invoking configured targets from code_judge scripts.\n *\n * Environment variables (set automatically by AgentV when `target` config is present):\n * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server\n * - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication\n */\n\n/**\n * Request to invoke the target\n */\nexport interface TargetInvokeRequest {\n readonly question: string;\n readonly systemPrompt?: string;\n readonly evalCaseId?: string;\n readonly attempt?: number;\n /** Optional 
target override - use a different target for this invocation */\n readonly target?: string;\n}\n\n/**\n * Response from a target invocation\n */\nexport interface TargetInvokeResponse {\n readonly outputMessages: readonly unknown[];\n readonly rawText?: string;\n}\n\n/**\n * Information about the target proxy configuration\n */\nexport interface TargetInfo {\n /** Name of the default target being used */\n readonly targetName: string;\n /** Maximum number of calls allowed */\n readonly maxCalls: number;\n /** Current number of calls made */\n readonly callCount: number;\n /** List of all available target names */\n readonly availableTargets: readonly string[];\n}\n\n/**\n * Target client for making target invocations\n */\nexport interface TargetClient {\n /**\n * Invoke the configured target with a prompt.\n * @param request - The question and optional system prompt\n * @returns The target's response with output messages and optional raw text\n */\n invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;\n\n /**\n * Invoke the target with multiple requests in sequence.\n * Each request counts toward the max_calls limit.\n * @param requests - Array of target requests\n * @returns Array of target responses\n */\n invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;\n\n /**\n * Get information about the target proxy configuration.\n * Returns the default target name, max calls, current call count, and available targets.\n */\n getInfo(): Promise<TargetInfo>;\n}\n\n/**\n * Error thrown when target proxy is not available\n */\nexport class TargetNotAvailableError extends Error {\n constructor(message: string) {\n super(message);\n this.name = 'TargetNotAvailableError';\n }\n}\n\n/**\n * Error thrown when target invocation fails\n */\nexport class TargetInvocationError extends Error {\n readonly statusCode?: number;\n\n constructor(message: string, statusCode?: number) {\n super(message);\n this.name = 
'TargetInvocationError';\n this.statusCode = statusCode;\n }\n}\n\n/**\n * Create a target client from environment variables.\n *\n * This function reads the proxy URL and token from environment variables\n * that are automatically set by AgentV when a `target` config block is present\n * on a `code_judge` evaluator.\n *\n * @returns A target client if environment variables are set, otherwise undefined\n * @throws TargetNotAvailableError if token is missing when URL is present\n *\n * @example\n * ```typescript\n * import { createTargetClient, defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question, expectedOutcome }) => {\n * const target = createTargetClient();\n *\n * if (!target) {\n * // Target not available - no target config on this evaluator\n * return { score: 0.5, reasoning: 'Target not available' };\n * }\n *\n * const response = await target.invoke({\n * question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,\n * systemPrompt: 'You are an expert evaluator. Respond with JSON: { \"correct\": true/false }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.correct ? 
1.0 : 0.0 };\n * });\n * ```\n */\nexport function createTargetClient(): TargetClient | undefined {\n const proxyUrl = process.env.AGENTV_TARGET_PROXY_URL;\n const proxyToken = process.env.AGENTV_TARGET_PROXY_TOKEN;\n\n if (!proxyUrl) {\n return undefined;\n }\n\n if (!proxyToken) {\n throw new TargetNotAvailableError(\n 'AGENTV_TARGET_PROXY_URL is set but AGENTV_TARGET_PROXY_TOKEN is missing',\n );\n }\n\n return createTargetClientInternal(proxyUrl, proxyToken);\n}\n\n/**\n * Internal: Create a target client with explicit URL and token.\n * Exported for testing only - use createTargetClient() in production.\n */\nexport function createTargetClientInternal(url: string, token: string): TargetClient {\n const headers = {\n 'Content-Type': 'application/json',\n Authorization: `Bearer ${token}`,\n };\n\n return {\n async invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse> {\n const response = await fetch(`${url}/invoke`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n question: request.question,\n systemPrompt: request.systemPrompt,\n evalCaseId: request.evalCaseId,\n attempt: request.attempt,\n target: request.target,\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInvokeResponse;\n },\n\n async invokeBatch(\n requests: readonly TargetInvokeRequest[],\n ): Promise<readonly TargetInvokeResponse[]> {\n const response = await fetch(`${url}/invokeBatch`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n requests: requests.map((r) => ({\n question: r.question,\n systemPrompt: r.systemPrompt,\n evalCaseId: r.evalCaseId,\n attempt: r.attempt,\n target: r.target,\n })),\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? `HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n const result = (await response.json()) as { responses: TargetInvokeResponse[] };\n return result.responses;\n },\n\n async getInfo(): Promise<TargetInfo> {\n const response = await fetch(`${url}/info`, {\n method: 'GET',\n headers,\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInfo;\n },\n };\n}\n","/**\n * Runtime for prompt template evaluators.\n * Handles stdin parsing, validation, error handling, and string output.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport { type PromptTemplateInput, PromptTemplateInputSchema } from './schemas.js';\n\n/**\n * Handler function type for prompt templates.\n * Returns the prompt string to use for evaluation.\n */\nexport type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise<string>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Run a prompt template handler with full stdin/stdout handling.\n * This is the internal implementation called by definePromptTemplate.\n */\nexport async function runPromptTemplate(handler: PromptTemplateHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = PromptTemplateInputSchema.parse(camelInput);\n\n // 5. Run handler\n const prompt = await handler(input);\n\n // 6. Output raw string (not JSON) - the prompt itself\n console.log(prompt);\n } catch (error) {\n // Output error to stderr and exit with non-zero code\n console.error(error instanceof Error ? error.message : String(error));\n process.exit(1);\n }\n}\n\n/**\n * Define a prompt template with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. 
Calls your handler with typed input\n * 4. Outputs the generated prompt string to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that generates the prompt string from input\n *\n * @example\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => `\n * Question: ${ctx.question}\n * Answer: ${ctx.candidateAnswer}\n *\n * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}\n * `);\n * ```\n *\n * @example With conditional logic\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => {\n * const rubric = ctx.config?.rubric as string | undefined;\n * return `\n * Question: ${ctx.question}\n * Candidate Answer: ${ctx.candidateAnswer}\n * ${rubric ? `\\nEvaluation Criteria:\\n${rubric}` : ''}\n * `;\n * });\n * ```\n *\n * @example Async handler\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate(async (ctx) => {\n * // Async operations are supported\n * return `Question: ${ctx.question}\\nAnswer: ${ctx.candidateAnswer}`;\n * });\n * ```\n */\nexport function definePromptTemplate(handler: PromptTemplateHandler): void {\n // Run immediately when module is loaded\n runPromptTemplate(handler);\n}\n","/**\n * Case conversion utilities for JSON payloads.\n * Converts between snake_case (wire format) and camelCase (TypeScript).\n */\n\nfunction toCamelCase(str: string): string {\n // Don't convert keys that start with uppercase (proper nouns/tool names)\n if (/^[A-Z]/.test(str)) {\n return str;\n }\n return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());\n}\n\n/**\n * Recursively converts all keys in an object from snake_case to camelCase.\n * Used to map wire payloads into TypeScript-friendly shapes.\n *\n * @param obj - The object to convert (can be any JSON-serializable value)\n * 
@returns A new object with all keys converted to camelCase\n */\nexport function toCamelCaseDeep(obj: unknown): unknown {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => toCamelCaseDeep(item));\n }\n\n if (typeof obj === 'object') {\n const result: Record<string, unknown> = {};\n for (const [key, value] of Object.entries(obj)) {\n const camelKey = toCamelCase(key);\n result[camelKey] = toCamelCaseDeep(value);\n }\n return result;\n }\n\n return obj;\n}\n","/**\n * Runtime for code judge evaluators.\n * Handles stdin parsing, validation, error handling, and output formatting.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport {\n type CodeJudgeInput,\n CodeJudgeInputSchema,\n type CodeJudgeResult,\n CodeJudgeResultSchema,\n} from './schemas.js';\n\n/**\n * Handler function type for code judges.\n */\nexport type CodeJudgeHandler = (\n input: CodeJudgeInput,\n) => CodeJudgeResult | Promise<CodeJudgeResult>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Clamp a value to the range [0, 1].\n */\nfunction clampScore(value: number): number {\n if (Number.isNaN(value) || !Number.isFinite(value)) {\n return 0;\n }\n return Math.max(0, Math.min(1, value));\n}\n\n/**\n * Format an error for output.\n */\nfunction formatError(error: unknown): string {\n if (error instanceof Error) {\n return error.message;\n }\n return String(error);\n}\n\n/**\n * Run a code judge handler with full stdin/stdout handling.\n * This is the internal implementation called by defineCodeJudge.\n */\nexport async function runCodeJudge(handler: CodeJudgeHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. 
Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = CodeJudgeInputSchema.parse(camelInput);\n\n // 5. Run handler\n const rawResult = await handler(input);\n\n // 6. Validate and normalize output\n const result = CodeJudgeResultSchema.parse({\n ...rawResult,\n score: clampScore(rawResult.score),\n });\n\n // 7. Output JSON\n console.log(JSON.stringify(result, null, 2));\n } catch (error) {\n // Output failure result\n const errorMessage = formatError(error);\n const errorResult: CodeJudgeResult = {\n score: 0,\n hits: [],\n misses: [errorMessage],\n reasoning: `Evaluation failed: ${errorMessage}`,\n };\n console.log(JSON.stringify(errorResult, null, 2));\n process.exit(1);\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACIA,iBAAkB;AAKX,IAAM,mBAAmB,aAAE,OAAO;AAAA,EACvC,OAAO,aAAE,OAAO;AAAA,EAChB,QAAQ,aAAE,OAAO;AAAA,EACjB,QAAQ,aAAE,OAAO,EAAE,SAAS;AAC9B,CAAC;AAKM,IAAM,qBAAqB,aAAE,OAAO;AAAA,EACzC,YAAY,aAAE,OAAO;AAAA,EACrB,WAAW,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAC7B,iBAAiB,aAAE,OAAO,aAAE,OAAO,GAAG,aAAE,OAAO,CAAC;AAAA,EAChD,YAAY,aAAE,OAAO;AAAA,EACrB,YAAY,iBAAiB,SAAS;AAAA,EACtC,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,YAAY,aAAE,OAAO,EAAE,SAAS;AAAA,EAChC,eAAe,aAAE,OAAO,aAAE,OAAO,GAAG,aAAE,MAAM,aAAE,OAAO,CAAC,CAAC,EAAE,SAAS;AACpE,CAAC;AAKM,IAAM,iBAAiB,aAAE,OAAO;AAAA,EACrC,MAAM,aAAE,OAAO;AAAA,EACf,OAAO,aAAE,QAAQ,EAAE,SAAS;AAAA,EAC5B,QAAQ,aAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,IAAI,aAAE,OAAO,EAAE,SAAS;AAAA,EACxB,WAAW,aAAE,OAAO,EAAE,SAAS;AACjC,CAAC;AAKM,IAAM,gBAAgB,aAAE,OAAO;AAAA,EACpC,MAAM,aAAE,KAAK,CAAC,aAAa,QAAQ,UAAU,MAAM,CAAC;AAAA,EACpD,SAAS,aAAE,MAAM,CAAC,aAAE,OAAO,GAAG,aAAE,OAAO,aAAE,QAAQ,CAAC,GAAG,aAAE,MAAM,aAAE,OAAO,aAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/F,WAAW,aAAE,MAAM,cAAc,EAAE,SAAS;AAAA,EAC5C,MAAM,aAAE,OAAO,EAAE,SAAS;AAAA,EAC1B,WAAW,aAAE,OAAO,EAAE,SAAS;AAAA,EAC/B,UAAU,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS;AAC3C,CAAC;A
AKM,IAAM,uBAAuB,aAAE,OAAO;AAAA,EAC3C,UAAU,aAAE,OAAO;AAAA,EACnB,iBAAiB,aAAE,OAAO;AAAA,EAC1B,kBAAkB,aAAE,MAAM,aAAa;AAAA,EACvC,iBAAiB,aAAE,OAAO,EAAE,SAAS;AAAA,EACrC,iBAAiB,aAAE,OAAO;AAAA,EAC1B,gBAAgB,aAAE,MAAM,aAAa,EAAE,SAAS,EAAE,SAAS;AAAA,EAC3D,gBAAgB,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAClC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC;AAAA,EAC9B,eAAe,aAAE,MAAM,aAAa;AAAA,EACpC,cAAc,mBAAmB,SAAS,EAAE,SAAS;AAAA,EACrD,QAAQ,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,SAAS;AACpD,CAAC;AAKM,IAAM,wBAAwB,aAAE,OAAO;AAAA,EAC5C,OAAO,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAC9B,MAAM,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,QAAQ,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EACjD,WAAW,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE/B,SAAS,aAAE,OAAO,aAAE,QAAQ,CAAC,EAAE,SAAS;AAC1C,CAAC;AAgBM,IAAM,4BAA4B;;;ACxBlC,IAAM,0BAAN,cAAsC,MAAM;AAAA,EACjD,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAKO,IAAM,wBAAN,cAAoC,MAAM;AAAA,EACtC;AAAA,EAET,YAAY,SAAiB,YAAqB;AAChD,UAAM,OAAO;AACb,SAAK,OAAO;AACZ,SAAK,aAAa;AAAA,EACpB;AACF;AAkCO,SAAS,qBAA+C;AAC7D,QAAM,WAAW,QAAQ,IAAI;AAC7B,QAAM,aAAa,QAAQ,IAAI;AAE/B,MAAI,CAAC,UAAU;AACb,WAAO;AAAA,EACT;AAEA,MAAI,CAAC,YAAY;AACf,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,SAAO,2BAA2B,UAAU,UAAU;AACxD;AAMO,SAAS,2BAA2B,KAAa,OAA6B;AACnF,QAAM,UAAU;AAAA,IACd,gBAAgB;AAAA,IAChB,eAAe,UAAU,KAAK;AAAA,EAChC;AAEA,SAAO;AAAA,IACL,MAAM,OAAO,SAA6D;AACxE,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,WAAW;AAAA,QAC5C,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,QAAQ;AAAA,UAClB,cAAc,QAAQ;AAAA,UACtB,YAAY,QAAQ;AAAA,UACpB,SAAS,QAAQ;AAAA,UACjB,QAAQ,QAAQ;AAAA,QAClB,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,IAEA,MAAM,YACJ,UAC0C;AAC1C,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,gBAAgB;AAAA,QACjD,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAA
K,UAAU;AAAA,UACnB,UAAU,SAAS,IAAI,CAAC,OAAO;AAAA,YAC7B,UAAU,EAAE;AAAA,YACZ,cAAc,EAAE;AAAA,YAChB,YAAY,EAAE;AAAA,YACd,SAAS,EAAE;AAAA,YACX,QAAQ,EAAE;AAAA,UACZ,EAAE;AAAA,QACJ,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,YAAM,SAAU,MAAM,SAAS,KAAK;AACpC,aAAO,OAAO;AAAA,IAChB;AAAA,IAEA,MAAM,UAA+B;AACnC,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,SAAS;AAAA,QAC1C,QAAQ;AAAA,QACR;AAAA,MACF,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,EACF;AACF;;;AFlKA,IAAAA,cAAkB;;;AGnElB,qBAA6B;;;ACC7B,SAAS,YAAY,KAAqB;AAExC,MAAI,SAAS,KAAK,GAAG,GAAG;AACtB,WAAO;AAAA,EACT;AACA,SAAO,IAAI,QAAQ,gBAAgB,CAAC,GAAG,WAAW,OAAO,YAAY,CAAC;AACxE;AASO,SAAS,gBAAgB,KAAuB;AACrD,MAAI,QAAQ,QAAQ,QAAQ,QAAW;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,WAAO,IAAI,IAAI,CAAC,SAAS,gBAAgB,IAAI,CAAC;AAAA,EAChD;AAEA,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,SAAkC,CAAC;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC9C,YAAM,WAAW,YAAY,GAAG;AAChC,aAAO,QAAQ,IAAI,gBAAgB,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ADrBA,SAAS,YAAoB;AAC3B,aAAO,6BAAa,GAAG,MAAM;AAC/B;AAMA,eAAsB,kBAAkB,SAA+C;AACrF,MAAI;AAEF,UAAM,QAAQ,UAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,0BAA0B,MAAM,UAAU;AAGxD,UAAM,SAAS,MAAM,QAAQ,KAAK;AAGlC,YAAQ,IAAI,MAAM;AAAA,EACpB,SAAS,OAAO;AAEd,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;AE9CA,IAAAC,kBAA6B;AAoB7B,SAASC,aAAoB;AAC3B,aAAO,8BAAa,GAAG,MAAM;AAC/B;AAKA,SAAS,WAAW,OAAuB;AACzC,MAAI,OAAO,MAAM,KAAK,KAAK,CAAC,OAAO,SAAS,KAAK,GAAG;AAClD,WAAO;AAAA,EACT;AACA,SAAO,KAAK,IAAI,GAA
G,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAKA,SAAS,YAAY,OAAwB;AAC3C,MAAI,iBAAiB,OAAO;AAC1B,WAAO,MAAM;AAAA,EACf;AACA,SAAO,OAAO,KAAK;AACrB;AAMA,eAAsB,aAAa,SAA0C;AAC3E,MAAI;AAEF,UAAM,QAAQA,WAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,qBAAqB,MAAM,UAAU;AAGnD,UAAM,YAAY,MAAM,QAAQ,KAAK;AAGrC,UAAM,SAAS,sBAAsB,MAAM;AAAA,MACzC,GAAG;AAAA,MACH,OAAO,WAAW,UAAU,KAAK;AAAA,IACnC,CAAC;AAGD,YAAQ,IAAI,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAAA,EAC7C,SAAS,OAAO;AAEd,UAAM,eAAe,YAAY,KAAK;AACtC,UAAM,cAA+B;AAAA,MACnC,OAAO;AAAA,MACP,MAAM,CAAC;AAAA,MACP,QAAQ,CAAC,YAAY;AAAA,MACrB,WAAW,sBAAsB,YAAY;AAAA,IAC/C;AACA,YAAQ,IAAI,KAAK,UAAU,aAAa,MAAM,CAAC,CAAC;AAChD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;ALmCO,SAAS,gBAAgB,SAAiC;AAE/D,eAAa,OAAO;AACtB;AAwCO,SAAS,qBAAqB,SAAsC;AAEzE,oBAAkB,OAAO;AAC3B;","names":["import_zod","import_node_fs","readStdin"]}
|
package/dist/index.d.cts
CHANGED
|
@@ -526,6 +526,352 @@ type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
|
526
526
|
type Message = z.infer<typeof MessageSchema>;
|
|
527
527
|
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
528
528
|
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
529
|
+
/**
|
|
530
|
+
* Prompt template input schema (camelCase, converted from snake_case wire format).
|
|
531
|
+
* Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads.
|
|
532
|
+
*/
|
|
533
|
+
declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
534
|
+
question: z.ZodString;
|
|
535
|
+
expectedOutcome: z.ZodString;
|
|
536
|
+
expectedMessages: z.ZodArray<z.ZodObject<{
|
|
537
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
538
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
539
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
540
|
+
tool: z.ZodString;
|
|
541
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
542
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
543
|
+
id: z.ZodOptional<z.ZodString>;
|
|
544
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
545
|
+
}, "strip", z.ZodTypeAny, {
|
|
546
|
+
tool: string;
|
|
547
|
+
input?: unknown;
|
|
548
|
+
output?: unknown;
|
|
549
|
+
id?: string | undefined;
|
|
550
|
+
timestamp?: string | undefined;
|
|
551
|
+
}, {
|
|
552
|
+
tool: string;
|
|
553
|
+
input?: unknown;
|
|
554
|
+
output?: unknown;
|
|
555
|
+
id?: string | undefined;
|
|
556
|
+
timestamp?: string | undefined;
|
|
557
|
+
}>, "many">>;
|
|
558
|
+
name: z.ZodOptional<z.ZodString>;
|
|
559
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
560
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
561
|
+
}, "strip", z.ZodTypeAny, {
|
|
562
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
563
|
+
timestamp?: string | undefined;
|
|
564
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
565
|
+
toolCalls?: {
|
|
566
|
+
tool: string;
|
|
567
|
+
input?: unknown;
|
|
568
|
+
output?: unknown;
|
|
569
|
+
id?: string | undefined;
|
|
570
|
+
timestamp?: string | undefined;
|
|
571
|
+
}[] | undefined;
|
|
572
|
+
name?: string | undefined;
|
|
573
|
+
metadata?: Record<string, unknown> | undefined;
|
|
574
|
+
}, {
|
|
575
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
576
|
+
timestamp?: string | undefined;
|
|
577
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
578
|
+
toolCalls?: {
|
|
579
|
+
tool: string;
|
|
580
|
+
input?: unknown;
|
|
581
|
+
output?: unknown;
|
|
582
|
+
id?: string | undefined;
|
|
583
|
+
timestamp?: string | undefined;
|
|
584
|
+
}[] | undefined;
|
|
585
|
+
name?: string | undefined;
|
|
586
|
+
metadata?: Record<string, unknown> | undefined;
|
|
587
|
+
}>, "many">;
|
|
588
|
+
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
589
|
+
candidateAnswer: z.ZodString;
|
|
590
|
+
outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
591
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
592
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
593
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
594
|
+
tool: z.ZodString;
|
|
595
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
596
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
597
|
+
id: z.ZodOptional<z.ZodString>;
|
|
598
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
599
|
+
}, "strip", z.ZodTypeAny, {
|
|
600
|
+
tool: string;
|
|
601
|
+
input?: unknown;
|
|
602
|
+
output?: unknown;
|
|
603
|
+
id?: string | undefined;
|
|
604
|
+
timestamp?: string | undefined;
|
|
605
|
+
}, {
|
|
606
|
+
tool: string;
|
|
607
|
+
input?: unknown;
|
|
608
|
+
output?: unknown;
|
|
609
|
+
id?: string | undefined;
|
|
610
|
+
timestamp?: string | undefined;
|
|
611
|
+
}>, "many">>;
|
|
612
|
+
name: z.ZodOptional<z.ZodString>;
|
|
613
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
614
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
615
|
+
}, "strip", z.ZodTypeAny, {
|
|
616
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
617
|
+
timestamp?: string | undefined;
|
|
618
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
619
|
+
toolCalls?: {
|
|
620
|
+
tool: string;
|
|
621
|
+
input?: unknown;
|
|
622
|
+
output?: unknown;
|
|
623
|
+
id?: string | undefined;
|
|
624
|
+
timestamp?: string | undefined;
|
|
625
|
+
}[] | undefined;
|
|
626
|
+
name?: string | undefined;
|
|
627
|
+
metadata?: Record<string, unknown> | undefined;
|
|
628
|
+
}, {
|
|
629
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
630
|
+
timestamp?: string | undefined;
|
|
631
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
632
|
+
toolCalls?: {
|
|
633
|
+
tool: string;
|
|
634
|
+
input?: unknown;
|
|
635
|
+
output?: unknown;
|
|
636
|
+
id?: string | undefined;
|
|
637
|
+
timestamp?: string | undefined;
|
|
638
|
+
}[] | undefined;
|
|
639
|
+
name?: string | undefined;
|
|
640
|
+
metadata?: Record<string, unknown> | undefined;
|
|
641
|
+
}>, "many">>>;
|
|
642
|
+
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
643
|
+
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
644
|
+
inputMessages: z.ZodArray<z.ZodObject<{
|
|
645
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
646
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
647
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
648
|
+
tool: z.ZodString;
|
|
649
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
650
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
651
|
+
id: z.ZodOptional<z.ZodString>;
|
|
652
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
653
|
+
}, "strip", z.ZodTypeAny, {
|
|
654
|
+
tool: string;
|
|
655
|
+
input?: unknown;
|
|
656
|
+
output?: unknown;
|
|
657
|
+
id?: string | undefined;
|
|
658
|
+
timestamp?: string | undefined;
|
|
659
|
+
}, {
|
|
660
|
+
tool: string;
|
|
661
|
+
input?: unknown;
|
|
662
|
+
output?: unknown;
|
|
663
|
+
id?: string | undefined;
|
|
664
|
+
timestamp?: string | undefined;
|
|
665
|
+
}>, "many">>;
|
|
666
|
+
name: z.ZodOptional<z.ZodString>;
|
|
667
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
668
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
669
|
+
}, "strip", z.ZodTypeAny, {
|
|
670
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
671
|
+
timestamp?: string | undefined;
|
|
672
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
673
|
+
toolCalls?: {
|
|
674
|
+
tool: string;
|
|
675
|
+
input?: unknown;
|
|
676
|
+
output?: unknown;
|
|
677
|
+
id?: string | undefined;
|
|
678
|
+
timestamp?: string | undefined;
|
|
679
|
+
}[] | undefined;
|
|
680
|
+
name?: string | undefined;
|
|
681
|
+
metadata?: Record<string, unknown> | undefined;
|
|
682
|
+
}, {
|
|
683
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
684
|
+
timestamp?: string | undefined;
|
|
685
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
686
|
+
toolCalls?: {
|
|
687
|
+
tool: string;
|
|
688
|
+
input?: unknown;
|
|
689
|
+
output?: unknown;
|
|
690
|
+
id?: string | undefined;
|
|
691
|
+
timestamp?: string | undefined;
|
|
692
|
+
}[] | undefined;
|
|
693
|
+
name?: string | undefined;
|
|
694
|
+
metadata?: Record<string, unknown> | undefined;
|
|
695
|
+
}>, "many">;
|
|
696
|
+
traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
697
|
+
eventCount: z.ZodNumber;
|
|
698
|
+
toolNames: z.ZodArray<z.ZodString, "many">;
|
|
699
|
+
toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
700
|
+
errorCount: z.ZodNumber;
|
|
701
|
+
tokenUsage: z.ZodOptional<z.ZodObject<{
|
|
702
|
+
input: z.ZodNumber;
|
|
703
|
+
output: z.ZodNumber;
|
|
704
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
705
|
+
}, "strip", z.ZodTypeAny, {
|
|
706
|
+
input: number;
|
|
707
|
+
output: number;
|
|
708
|
+
cached?: number | undefined;
|
|
709
|
+
}, {
|
|
710
|
+
input: number;
|
|
711
|
+
output: number;
|
|
712
|
+
cached?: number | undefined;
|
|
713
|
+
}>>;
|
|
714
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
715
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
716
|
+
toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
|
|
717
|
+
}, "strip", z.ZodTypeAny, {
|
|
718
|
+
eventCount: number;
|
|
719
|
+
toolNames: string[];
|
|
720
|
+
toolCallsByName: Record<string, number>;
|
|
721
|
+
errorCount: number;
|
|
722
|
+
tokenUsage?: {
|
|
723
|
+
input: number;
|
|
724
|
+
output: number;
|
|
725
|
+
cached?: number | undefined;
|
|
726
|
+
} | undefined;
|
|
727
|
+
costUsd?: number | undefined;
|
|
728
|
+
durationMs?: number | undefined;
|
|
729
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
730
|
+
}, {
|
|
731
|
+
eventCount: number;
|
|
732
|
+
toolNames: string[];
|
|
733
|
+
toolCallsByName: Record<string, number>;
|
|
734
|
+
errorCount: number;
|
|
735
|
+
tokenUsage?: {
|
|
736
|
+
input: number;
|
|
737
|
+
output: number;
|
|
738
|
+
cached?: number | undefined;
|
|
739
|
+
} | undefined;
|
|
740
|
+
costUsd?: number | undefined;
|
|
741
|
+
durationMs?: number | undefined;
|
|
742
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
743
|
+
}>>>;
|
|
744
|
+
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
745
|
+
}, "strip", z.ZodTypeAny, {
|
|
746
|
+
question: string;
|
|
747
|
+
expectedOutcome: string;
|
|
748
|
+
expectedMessages: {
|
|
749
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
750
|
+
timestamp?: string | undefined;
|
|
751
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
752
|
+
toolCalls?: {
|
|
753
|
+
tool: string;
|
|
754
|
+
input?: unknown;
|
|
755
|
+
output?: unknown;
|
|
756
|
+
id?: string | undefined;
|
|
757
|
+
timestamp?: string | undefined;
|
|
758
|
+
}[] | undefined;
|
|
759
|
+
name?: string | undefined;
|
|
760
|
+
metadata?: Record<string, unknown> | undefined;
|
|
761
|
+
}[];
|
|
762
|
+
candidateAnswer: string;
|
|
763
|
+
guidelineFiles: string[];
|
|
764
|
+
inputFiles: string[];
|
|
765
|
+
inputMessages: {
|
|
766
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
767
|
+
timestamp?: string | undefined;
|
|
768
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
769
|
+
toolCalls?: {
|
|
770
|
+
tool: string;
|
|
771
|
+
input?: unknown;
|
|
772
|
+
output?: unknown;
|
|
773
|
+
id?: string | undefined;
|
|
774
|
+
timestamp?: string | undefined;
|
|
775
|
+
}[] | undefined;
|
|
776
|
+
name?: string | undefined;
|
|
777
|
+
metadata?: Record<string, unknown> | undefined;
|
|
778
|
+
}[];
|
|
779
|
+
referenceAnswer?: string | undefined;
|
|
780
|
+
outputMessages?: {
|
|
781
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
782
|
+
timestamp?: string | undefined;
|
|
783
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
784
|
+
toolCalls?: {
|
|
785
|
+
tool: string;
|
|
786
|
+
input?: unknown;
|
|
787
|
+
output?: unknown;
|
|
788
|
+
id?: string | undefined;
|
|
789
|
+
timestamp?: string | undefined;
|
|
790
|
+
}[] | undefined;
|
|
791
|
+
name?: string | undefined;
|
|
792
|
+
metadata?: Record<string, unknown> | undefined;
|
|
793
|
+
}[] | null | undefined;
|
|
794
|
+
traceSummary?: {
|
|
795
|
+
eventCount: number;
|
|
796
|
+
toolNames: string[];
|
|
797
|
+
toolCallsByName: Record<string, number>;
|
|
798
|
+
errorCount: number;
|
|
799
|
+
tokenUsage?: {
|
|
800
|
+
input: number;
|
|
801
|
+
output: number;
|
|
802
|
+
cached?: number | undefined;
|
|
803
|
+
} | undefined;
|
|
804
|
+
costUsd?: number | undefined;
|
|
805
|
+
durationMs?: number | undefined;
|
|
806
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
807
|
+
} | null | undefined;
|
|
808
|
+
config?: Record<string, unknown> | null | undefined;
|
|
809
|
+
}, {
|
|
810
|
+
question: string;
|
|
811
|
+
expectedOutcome: string;
|
|
812
|
+
expectedMessages: {
|
|
813
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
814
|
+
timestamp?: string | undefined;
|
|
815
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
816
|
+
toolCalls?: {
|
|
817
|
+
tool: string;
|
|
818
|
+
input?: unknown;
|
|
819
|
+
output?: unknown;
|
|
820
|
+
id?: string | undefined;
|
|
821
|
+
timestamp?: string | undefined;
|
|
822
|
+
}[] | undefined;
|
|
823
|
+
name?: string | undefined;
|
|
824
|
+
metadata?: Record<string, unknown> | undefined;
|
|
825
|
+
}[];
|
|
826
|
+
candidateAnswer: string;
|
|
827
|
+
guidelineFiles: string[];
|
|
828
|
+
inputFiles: string[];
|
|
829
|
+
inputMessages: {
|
|
830
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
831
|
+
timestamp?: string | undefined;
|
|
832
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
833
|
+
toolCalls?: {
|
|
834
|
+
tool: string;
|
|
835
|
+
input?: unknown;
|
|
836
|
+
output?: unknown;
|
|
837
|
+
id?: string | undefined;
|
|
838
|
+
timestamp?: string | undefined;
|
|
839
|
+
}[] | undefined;
|
|
840
|
+
name?: string | undefined;
|
|
841
|
+
metadata?: Record<string, unknown> | undefined;
|
|
842
|
+
}[];
|
|
843
|
+
referenceAnswer?: string | undefined;
|
|
844
|
+
outputMessages?: {
|
|
845
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
846
|
+
timestamp?: string | undefined;
|
|
847
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
848
|
+
toolCalls?: {
|
|
849
|
+
tool: string;
|
|
850
|
+
input?: unknown;
|
|
851
|
+
output?: unknown;
|
|
852
|
+
id?: string | undefined;
|
|
853
|
+
timestamp?: string | undefined;
|
|
854
|
+
}[] | undefined;
|
|
855
|
+
name?: string | undefined;
|
|
856
|
+
metadata?: Record<string, unknown> | undefined;
|
|
857
|
+
}[] | null | undefined;
|
|
858
|
+
traceSummary?: {
|
|
859
|
+
eventCount: number;
|
|
860
|
+
toolNames: string[];
|
|
861
|
+
toolCallsByName: Record<string, number>;
|
|
862
|
+
errorCount: number;
|
|
863
|
+
tokenUsage?: {
|
|
864
|
+
input: number;
|
|
865
|
+
output: number;
|
|
866
|
+
cached?: number | undefined;
|
|
867
|
+
} | undefined;
|
|
868
|
+
costUsd?: number | undefined;
|
|
869
|
+
durationMs?: number | undefined;
|
|
870
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
871
|
+
} | null | undefined;
|
|
872
|
+
config?: Record<string, unknown> | null | undefined;
|
|
873
|
+
}>;
|
|
874
|
+
type PromptTemplateInput = CodeJudgeInput;
|
|
529
875
|
|
|
530
876
|
/**
|
|
531
877
|
* Client for invoking configured targets from code_judge scripts.
|
|
@@ -635,6 +981,12 @@ declare class TargetInvocationError extends Error {
|
|
|
635
981
|
*/
|
|
636
982
|
declare function createTargetClient(): TargetClient | undefined;
|
|
637
983
|
|
|
984
|
+
/**
|
|
985
|
+
* Handler function type for prompt templates.
|
|
986
|
+
* Returns the prompt string to use for evaluation.
|
|
987
|
+
*/
|
|
988
|
+
type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise<string>;
|
|
989
|
+
|
|
638
990
|
/**
|
|
639
991
|
* Handler function type for code judges.
|
|
640
992
|
*/
|
|
@@ -726,5 +1078,44 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
726
1078
|
* ```
|
|
727
1079
|
*/
|
|
728
1080
|
declare function defineCodeJudge(handler: CodeJudgeHandler): void;
|
|
1081
|
+
/**
|
|
1082
|
+
* Define a prompt template with automatic stdin/stdout handling.
|
|
1083
|
+
*
|
|
1084
|
+
* This function:
|
|
1085
|
+
* 1. Reads JSON from stdin (snake_case format)
|
|
1086
|
+
* 2. Converts to camelCase and validates with Zod
|
|
1087
|
+
* 3. Calls your handler with typed input
|
|
1088
|
+
* 4. Outputs the generated prompt string to stdout
|
|
1089
|
+
* 5. Handles errors gracefully with proper exit codes
|
|
1090
|
+
*
|
|
1091
|
+
* @param handler - Function that generates the prompt string from input
|
|
1092
|
+
*
|
|
1093
|
+
* @example
|
|
1094
|
+
* ```typescript
|
|
1095
|
+
* import { definePromptTemplate } from '@agentv/eval';
|
|
1096
|
+
*
|
|
1097
|
+
* export default definePromptTemplate((ctx) => `
|
|
1098
|
+
* Question: ${ctx.question}
|
|
1099
|
+
* Answer: ${ctx.candidateAnswer}
|
|
1100
|
+
*
|
|
1101
|
+
* ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}
|
|
1102
|
+
* `);
|
|
1103
|
+
* ```
|
|
1104
|
+
*
|
|
1105
|
+
* @example With conditional logic
|
|
1106
|
+
* ```typescript
|
|
1107
|
+
* import { definePromptTemplate } from '@agentv/eval';
|
|
1108
|
+
*
|
|
1109
|
+
* export default definePromptTemplate((ctx) => {
|
|
1110
|
+
* const rubric = ctx.config?.rubric as string | undefined;
|
|
1111
|
+
* return `
|
|
1112
|
+
* Question: ${ctx.question}
|
|
1113
|
+
* Candidate Answer: ${ctx.candidateAnswer}
|
|
1114
|
+
* ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''}
|
|
1115
|
+
* `;
|
|
1116
|
+
* });
|
|
1117
|
+
* ```
|
|
1118
|
+
*/
|
|
1119
|
+
declare function definePromptTemplate(handler: PromptTemplateHandler): void;
|
|
729
1120
|
|
|
730
|
-
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge };
|
|
1121
|
+
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge, definePromptTemplate };
|
package/dist/index.d.ts
CHANGED
|
@@ -526,6 +526,352 @@ type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
|
526
526
|
type Message = z.infer<typeof MessageSchema>;
|
|
527
527
|
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
528
528
|
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
529
|
+
/**
|
|
530
|
+
* Prompt template input schema (camelCase, converted from snake_case wire format).
|
|
531
|
+
* Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads.
|
|
532
|
+
*/
|
|
533
|
+
declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
534
|
+
question: z.ZodString;
|
|
535
|
+
expectedOutcome: z.ZodString;
|
|
536
|
+
expectedMessages: z.ZodArray<z.ZodObject<{
|
|
537
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
538
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
539
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
540
|
+
tool: z.ZodString;
|
|
541
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
542
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
543
|
+
id: z.ZodOptional<z.ZodString>;
|
|
544
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
545
|
+
}, "strip", z.ZodTypeAny, {
|
|
546
|
+
tool: string;
|
|
547
|
+
input?: unknown;
|
|
548
|
+
output?: unknown;
|
|
549
|
+
id?: string | undefined;
|
|
550
|
+
timestamp?: string | undefined;
|
|
551
|
+
}, {
|
|
552
|
+
tool: string;
|
|
553
|
+
input?: unknown;
|
|
554
|
+
output?: unknown;
|
|
555
|
+
id?: string | undefined;
|
|
556
|
+
timestamp?: string | undefined;
|
|
557
|
+
}>, "many">>;
|
|
558
|
+
name: z.ZodOptional<z.ZodString>;
|
|
559
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
560
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
561
|
+
}, "strip", z.ZodTypeAny, {
|
|
562
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
563
|
+
timestamp?: string | undefined;
|
|
564
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
565
|
+
toolCalls?: {
|
|
566
|
+
tool: string;
|
|
567
|
+
input?: unknown;
|
|
568
|
+
output?: unknown;
|
|
569
|
+
id?: string | undefined;
|
|
570
|
+
timestamp?: string | undefined;
|
|
571
|
+
}[] | undefined;
|
|
572
|
+
name?: string | undefined;
|
|
573
|
+
metadata?: Record<string, unknown> | undefined;
|
|
574
|
+
}, {
|
|
575
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
576
|
+
timestamp?: string | undefined;
|
|
577
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
578
|
+
toolCalls?: {
|
|
579
|
+
tool: string;
|
|
580
|
+
input?: unknown;
|
|
581
|
+
output?: unknown;
|
|
582
|
+
id?: string | undefined;
|
|
583
|
+
timestamp?: string | undefined;
|
|
584
|
+
}[] | undefined;
|
|
585
|
+
name?: string | undefined;
|
|
586
|
+
metadata?: Record<string, unknown> | undefined;
|
|
587
|
+
}>, "many">;
|
|
588
|
+
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
589
|
+
candidateAnswer: z.ZodString;
|
|
590
|
+
outputMessages: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
591
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
592
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
593
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
594
|
+
tool: z.ZodString;
|
|
595
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
596
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
597
|
+
id: z.ZodOptional<z.ZodString>;
|
|
598
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
599
|
+
}, "strip", z.ZodTypeAny, {
|
|
600
|
+
tool: string;
|
|
601
|
+
input?: unknown;
|
|
602
|
+
output?: unknown;
|
|
603
|
+
id?: string | undefined;
|
|
604
|
+
timestamp?: string | undefined;
|
|
605
|
+
}, {
|
|
606
|
+
tool: string;
|
|
607
|
+
input?: unknown;
|
|
608
|
+
output?: unknown;
|
|
609
|
+
id?: string | undefined;
|
|
610
|
+
timestamp?: string | undefined;
|
|
611
|
+
}>, "many">>;
|
|
612
|
+
name: z.ZodOptional<z.ZodString>;
|
|
613
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
614
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
615
|
+
}, "strip", z.ZodTypeAny, {
|
|
616
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
617
|
+
timestamp?: string | undefined;
|
|
618
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
619
|
+
toolCalls?: {
|
|
620
|
+
tool: string;
|
|
621
|
+
input?: unknown;
|
|
622
|
+
output?: unknown;
|
|
623
|
+
id?: string | undefined;
|
|
624
|
+
timestamp?: string | undefined;
|
|
625
|
+
}[] | undefined;
|
|
626
|
+
name?: string | undefined;
|
|
627
|
+
metadata?: Record<string, unknown> | undefined;
|
|
628
|
+
}, {
|
|
629
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
630
|
+
timestamp?: string | undefined;
|
|
631
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
632
|
+
toolCalls?: {
|
|
633
|
+
tool: string;
|
|
634
|
+
input?: unknown;
|
|
635
|
+
output?: unknown;
|
|
636
|
+
id?: string | undefined;
|
|
637
|
+
timestamp?: string | undefined;
|
|
638
|
+
}[] | undefined;
|
|
639
|
+
name?: string | undefined;
|
|
640
|
+
metadata?: Record<string, unknown> | undefined;
|
|
641
|
+
}>, "many">>>;
|
|
642
|
+
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
643
|
+
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
644
|
+
inputMessages: z.ZodArray<z.ZodObject<{
|
|
645
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
646
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
647
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
648
|
+
tool: z.ZodString;
|
|
649
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
650
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
651
|
+
id: z.ZodOptional<z.ZodString>;
|
|
652
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
653
|
+
}, "strip", z.ZodTypeAny, {
|
|
654
|
+
tool: string;
|
|
655
|
+
input?: unknown;
|
|
656
|
+
output?: unknown;
|
|
657
|
+
id?: string | undefined;
|
|
658
|
+
timestamp?: string | undefined;
|
|
659
|
+
}, {
|
|
660
|
+
tool: string;
|
|
661
|
+
input?: unknown;
|
|
662
|
+
output?: unknown;
|
|
663
|
+
id?: string | undefined;
|
|
664
|
+
timestamp?: string | undefined;
|
|
665
|
+
}>, "many">>;
|
|
666
|
+
name: z.ZodOptional<z.ZodString>;
|
|
667
|
+
timestamp: z.ZodOptional<z.ZodString>;
|
|
668
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
669
|
+
}, "strip", z.ZodTypeAny, {
|
|
670
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
671
|
+
timestamp?: string | undefined;
|
|
672
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
673
|
+
toolCalls?: {
|
|
674
|
+
tool: string;
|
|
675
|
+
input?: unknown;
|
|
676
|
+
output?: unknown;
|
|
677
|
+
id?: string | undefined;
|
|
678
|
+
timestamp?: string | undefined;
|
|
679
|
+
}[] | undefined;
|
|
680
|
+
name?: string | undefined;
|
|
681
|
+
metadata?: Record<string, unknown> | undefined;
|
|
682
|
+
}, {
|
|
683
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
684
|
+
timestamp?: string | undefined;
|
|
685
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
686
|
+
toolCalls?: {
|
|
687
|
+
tool: string;
|
|
688
|
+
input?: unknown;
|
|
689
|
+
output?: unknown;
|
|
690
|
+
id?: string | undefined;
|
|
691
|
+
timestamp?: string | undefined;
|
|
692
|
+
}[] | undefined;
|
|
693
|
+
name?: string | undefined;
|
|
694
|
+
metadata?: Record<string, unknown> | undefined;
|
|
695
|
+
}>, "many">;
|
|
696
|
+
traceSummary: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
697
|
+
eventCount: z.ZodNumber;
|
|
698
|
+
toolNames: z.ZodArray<z.ZodString, "many">;
|
|
699
|
+
toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
700
|
+
errorCount: z.ZodNumber;
|
|
701
|
+
tokenUsage: z.ZodOptional<z.ZodObject<{
|
|
702
|
+
input: z.ZodNumber;
|
|
703
|
+
output: z.ZodNumber;
|
|
704
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
705
|
+
}, "strip", z.ZodTypeAny, {
|
|
706
|
+
input: number;
|
|
707
|
+
output: number;
|
|
708
|
+
cached?: number | undefined;
|
|
709
|
+
}, {
|
|
710
|
+
input: number;
|
|
711
|
+
output: number;
|
|
712
|
+
cached?: number | undefined;
|
|
713
|
+
}>>;
|
|
714
|
+
costUsd: z.ZodOptional<z.ZodNumber>;
|
|
715
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
716
|
+
toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
|
|
717
|
+
}, "strip", z.ZodTypeAny, {
|
|
718
|
+
eventCount: number;
|
|
719
|
+
toolNames: string[];
|
|
720
|
+
toolCallsByName: Record<string, number>;
|
|
721
|
+
errorCount: number;
|
|
722
|
+
tokenUsage?: {
|
|
723
|
+
input: number;
|
|
724
|
+
output: number;
|
|
725
|
+
cached?: number | undefined;
|
|
726
|
+
} | undefined;
|
|
727
|
+
costUsd?: number | undefined;
|
|
728
|
+
durationMs?: number | undefined;
|
|
729
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
730
|
+
}, {
|
|
731
|
+
eventCount: number;
|
|
732
|
+
toolNames: string[];
|
|
733
|
+
toolCallsByName: Record<string, number>;
|
|
734
|
+
errorCount: number;
|
|
735
|
+
tokenUsage?: {
|
|
736
|
+
input: number;
|
|
737
|
+
output: number;
|
|
738
|
+
cached?: number | undefined;
|
|
739
|
+
} | undefined;
|
|
740
|
+
costUsd?: number | undefined;
|
|
741
|
+
durationMs?: number | undefined;
|
|
742
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
743
|
+
}>>>;
|
|
744
|
+
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
745
|
+
}, "strip", z.ZodTypeAny, {
|
|
746
|
+
question: string;
|
|
747
|
+
expectedOutcome: string;
|
|
748
|
+
expectedMessages: {
|
|
749
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
750
|
+
timestamp?: string | undefined;
|
|
751
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
752
|
+
toolCalls?: {
|
|
753
|
+
tool: string;
|
|
754
|
+
input?: unknown;
|
|
755
|
+
output?: unknown;
|
|
756
|
+
id?: string | undefined;
|
|
757
|
+
timestamp?: string | undefined;
|
|
758
|
+
}[] | undefined;
|
|
759
|
+
name?: string | undefined;
|
|
760
|
+
metadata?: Record<string, unknown> | undefined;
|
|
761
|
+
}[];
|
|
762
|
+
candidateAnswer: string;
|
|
763
|
+
guidelineFiles: string[];
|
|
764
|
+
inputFiles: string[];
|
|
765
|
+
inputMessages: {
|
|
766
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
767
|
+
timestamp?: string | undefined;
|
|
768
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
769
|
+
toolCalls?: {
|
|
770
|
+
tool: string;
|
|
771
|
+
input?: unknown;
|
|
772
|
+
output?: unknown;
|
|
773
|
+
id?: string | undefined;
|
|
774
|
+
timestamp?: string | undefined;
|
|
775
|
+
}[] | undefined;
|
|
776
|
+
name?: string | undefined;
|
|
777
|
+
metadata?: Record<string, unknown> | undefined;
|
|
778
|
+
}[];
|
|
779
|
+
referenceAnswer?: string | undefined;
|
|
780
|
+
outputMessages?: {
|
|
781
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
782
|
+
timestamp?: string | undefined;
|
|
783
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
784
|
+
toolCalls?: {
|
|
785
|
+
tool: string;
|
|
786
|
+
input?: unknown;
|
|
787
|
+
output?: unknown;
|
|
788
|
+
id?: string | undefined;
|
|
789
|
+
timestamp?: string | undefined;
|
|
790
|
+
}[] | undefined;
|
|
791
|
+
name?: string | undefined;
|
|
792
|
+
metadata?: Record<string, unknown> | undefined;
|
|
793
|
+
}[] | null | undefined;
|
|
794
|
+
traceSummary?: {
|
|
795
|
+
eventCount: number;
|
|
796
|
+
toolNames: string[];
|
|
797
|
+
toolCallsByName: Record<string, number>;
|
|
798
|
+
errorCount: number;
|
|
799
|
+
tokenUsage?: {
|
|
800
|
+
input: number;
|
|
801
|
+
output: number;
|
|
802
|
+
cached?: number | undefined;
|
|
803
|
+
} | undefined;
|
|
804
|
+
costUsd?: number | undefined;
|
|
805
|
+
durationMs?: number | undefined;
|
|
806
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
807
|
+
} | null | undefined;
|
|
808
|
+
config?: Record<string, unknown> | null | undefined;
|
|
809
|
+
}, {
|
|
810
|
+
question: string;
|
|
811
|
+
expectedOutcome: string;
|
|
812
|
+
expectedMessages: {
|
|
813
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
814
|
+
timestamp?: string | undefined;
|
|
815
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
816
|
+
toolCalls?: {
|
|
817
|
+
tool: string;
|
|
818
|
+
input?: unknown;
|
|
819
|
+
output?: unknown;
|
|
820
|
+
id?: string | undefined;
|
|
821
|
+
timestamp?: string | undefined;
|
|
822
|
+
}[] | undefined;
|
|
823
|
+
name?: string | undefined;
|
|
824
|
+
metadata?: Record<string, unknown> | undefined;
|
|
825
|
+
}[];
|
|
826
|
+
candidateAnswer: string;
|
|
827
|
+
guidelineFiles: string[];
|
|
828
|
+
inputFiles: string[];
|
|
829
|
+
inputMessages: {
|
|
830
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
831
|
+
timestamp?: string | undefined;
|
|
832
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
833
|
+
toolCalls?: {
|
|
834
|
+
tool: string;
|
|
835
|
+
input?: unknown;
|
|
836
|
+
output?: unknown;
|
|
837
|
+
id?: string | undefined;
|
|
838
|
+
timestamp?: string | undefined;
|
|
839
|
+
}[] | undefined;
|
|
840
|
+
name?: string | undefined;
|
|
841
|
+
metadata?: Record<string, unknown> | undefined;
|
|
842
|
+
}[];
|
|
843
|
+
referenceAnswer?: string | undefined;
|
|
844
|
+
outputMessages?: {
|
|
845
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
846
|
+
timestamp?: string | undefined;
|
|
847
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
848
|
+
toolCalls?: {
|
|
849
|
+
tool: string;
|
|
850
|
+
input?: unknown;
|
|
851
|
+
output?: unknown;
|
|
852
|
+
id?: string | undefined;
|
|
853
|
+
timestamp?: string | undefined;
|
|
854
|
+
}[] | undefined;
|
|
855
|
+
name?: string | undefined;
|
|
856
|
+
metadata?: Record<string, unknown> | undefined;
|
|
857
|
+
}[] | null | undefined;
|
|
858
|
+
traceSummary?: {
|
|
859
|
+
eventCount: number;
|
|
860
|
+
toolNames: string[];
|
|
861
|
+
toolCallsByName: Record<string, number>;
|
|
862
|
+
errorCount: number;
|
|
863
|
+
tokenUsage?: {
|
|
864
|
+
input: number;
|
|
865
|
+
output: number;
|
|
866
|
+
cached?: number | undefined;
|
|
867
|
+
} | undefined;
|
|
868
|
+
costUsd?: number | undefined;
|
|
869
|
+
durationMs?: number | undefined;
|
|
870
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
871
|
+
} | null | undefined;
|
|
872
|
+
config?: Record<string, unknown> | null | undefined;
|
|
873
|
+
}>;
|
|
874
|
+
type PromptTemplateInput = CodeJudgeInput;
|
|
529
875
|
|
|
530
876
|
/**
|
|
531
877
|
* Client for invoking configured targets from code_judge scripts.
|
|
@@ -635,6 +981,12 @@ declare class TargetInvocationError extends Error {
|
|
|
635
981
|
*/
|
|
636
982
|
declare function createTargetClient(): TargetClient | undefined;
|
|
637
983
|
|
|
984
|
+
/**
|
|
985
|
+
* Handler function type for prompt templates.
|
|
986
|
+
* Returns the prompt string to use for evaluation.
|
|
987
|
+
*/
|
|
988
|
+
type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise<string>;
|
|
989
|
+
|
|
638
990
|
/**
|
|
639
991
|
* Handler function type for code judges.
|
|
640
992
|
*/
|
|
@@ -726,5 +1078,44 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
726
1078
|
* ```
|
|
727
1079
|
*/
|
|
728
1080
|
declare function defineCodeJudge(handler: CodeJudgeHandler): void;
|
|
1081
|
+
/**
|
|
1082
|
+
* Define a prompt template with automatic stdin/stdout handling.
|
|
1083
|
+
*
|
|
1084
|
+
* This function:
|
|
1085
|
+
* 1. Reads JSON from stdin (snake_case format)
|
|
1086
|
+
* 2. Converts to camelCase and validates with Zod
|
|
1087
|
+
* 3. Calls your handler with typed input
|
|
1088
|
+
* 4. Outputs the generated prompt string to stdout
|
|
1089
|
+
* 5. Handles errors gracefully with proper exit codes
|
|
1090
|
+
*
|
|
1091
|
+
* @param handler - Function that generates the prompt string from input
|
|
1092
|
+
*
|
|
1093
|
+
* @example
|
|
1094
|
+
* ```typescript
|
|
1095
|
+
* import { definePromptTemplate } from '@agentv/eval';
|
|
1096
|
+
*
|
|
1097
|
+
* export default definePromptTemplate((ctx) => `
|
|
1098
|
+
* Question: ${ctx.question}
|
|
1099
|
+
* Answer: ${ctx.candidateAnswer}
|
|
1100
|
+
*
|
|
1101
|
+
* ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}
|
|
1102
|
+
* `);
|
|
1103
|
+
* ```
|
|
1104
|
+
*
|
|
1105
|
+
* @example With conditional logic
|
|
1106
|
+
* ```typescript
|
|
1107
|
+
* import { definePromptTemplate } from '@agentv/eval';
|
|
1108
|
+
*
|
|
1109
|
+
* export default definePromptTemplate((ctx) => {
|
|
1110
|
+
* const rubric = ctx.config?.rubric as string | undefined;
|
|
1111
|
+
* return `
|
|
1112
|
+
* Question: ${ctx.question}
|
|
1113
|
+
* Candidate Answer: ${ctx.candidateAnswer}
|
|
1114
|
+
* ${rubric ? `\nEvaluation Criteria:\n${rubric}` : ''}
|
|
1115
|
+
* `;
|
|
1116
|
+
* });
|
|
1117
|
+
* ```
|
|
1118
|
+
*/
|
|
1119
|
+
declare function definePromptTemplate(handler: PromptTemplateHandler): void;
|
|
729
1120
|
|
|
730
|
-
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge };
|
|
1121
|
+
export { type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineCodeJudge, definePromptTemplate };
|
package/dist/index.js
CHANGED
|
@@ -51,6 +51,7 @@ var CodeJudgeResultSchema = z.object({
|
|
|
51
51
|
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
52
52
|
details: z.record(z.unknown()).optional()
|
|
53
53
|
});
|
|
54
|
+
var PromptTemplateInputSchema = CodeJudgeInputSchema;
|
|
54
55
|
|
|
55
56
|
// src/target-client.ts
|
|
56
57
|
var TargetNotAvailableError = class extends Error {
|
|
@@ -163,7 +164,7 @@ function createTargetClientInternal(url, token) {
|
|
|
163
164
|
// src/index.ts
|
|
164
165
|
import { z as z2 } from "zod";
|
|
165
166
|
|
|
166
|
-
// src/runtime.ts
|
|
167
|
+
// src/prompt-template.ts
|
|
167
168
|
import { readFileSync } from "node:fs";
|
|
168
169
|
|
|
169
170
|
// src/case-conversion.ts
|
|
@@ -191,10 +192,29 @@ function toCamelCaseDeep(obj) {
|
|
|
191
192
|
return obj;
|
|
192
193
|
}
|
|
193
194
|
|
|
194
|
-
// src/runtime.ts
|
|
195
|
+
// src/prompt-template.ts
|
|
195
196
|
function readStdin() {
|
|
196
197
|
return readFileSync(0, "utf8");
|
|
197
198
|
}
|
|
199
|
+
async function runPromptTemplate(handler) {
|
|
200
|
+
try {
|
|
201
|
+
const stdin = readStdin();
|
|
202
|
+
const rawInput = JSON.parse(stdin);
|
|
203
|
+
const camelInput = toCamelCaseDeep(rawInput);
|
|
204
|
+
const input = PromptTemplateInputSchema.parse(camelInput);
|
|
205
|
+
const prompt = await handler(input);
|
|
206
|
+
console.log(prompt);
|
|
207
|
+
} catch (error) {
|
|
208
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
209
|
+
process.exit(1);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// src/runtime.ts
|
|
214
|
+
import { readFileSync as readFileSync2 } from "node:fs";
|
|
215
|
+
function readStdin2() {
|
|
216
|
+
return readFileSync2(0, "utf8");
|
|
217
|
+
}
|
|
198
218
|
function clampScore(value) {
|
|
199
219
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
200
220
|
return 0;
|
|
@@ -209,7 +229,7 @@ function formatError(error) {
|
|
|
209
229
|
}
|
|
210
230
|
async function runCodeJudge(handler) {
|
|
211
231
|
try {
|
|
212
|
-
const stdin = readStdin();
|
|
232
|
+
const stdin = readStdin2();
|
|
213
233
|
const rawInput = JSON.parse(stdin);
|
|
214
234
|
const camelInput = toCamelCaseDeep(rawInput);
|
|
215
235
|
const input = CodeJudgeInputSchema.parse(camelInput);
|
|
@@ -236,10 +256,14 @@ async function runCodeJudge(handler) {
|
|
|
236
256
|
function defineCodeJudge(handler) {
|
|
237
257
|
runCodeJudge(handler);
|
|
238
258
|
}
|
|
259
|
+
function definePromptTemplate(handler) {
|
|
260
|
+
runPromptTemplate(handler);
|
|
261
|
+
}
|
|
239
262
|
export {
|
|
240
263
|
CodeJudgeInputSchema,
|
|
241
264
|
CodeJudgeResultSchema,
|
|
242
265
|
MessageSchema,
|
|
266
|
+
PromptTemplateInputSchema,
|
|
243
267
|
TargetInvocationError,
|
|
244
268
|
TargetNotAvailableError,
|
|
245
269
|
TokenUsageSchema,
|
|
@@ -247,6 +271,7 @@ export {
|
|
|
247
271
|
TraceSummarySchema,
|
|
248
272
|
createTargetClient,
|
|
249
273
|
defineCodeJudge,
|
|
274
|
+
definePromptTemplate,
|
|
250
275
|
z2 as z
|
|
251
276
|
};
|
|
252
277
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/schemas.ts","../src/target-client.ts","../src/index.ts","../src/runtime.ts","../src/case-conversion.ts"],"sourcesContent":["/**\n * Zod schemas for code judge input/output validation.\n * Provides both compile-time types and runtime validation.\n */\nimport { z } from 'zod';\n\n/**\n * Token usage metrics schema.\n */\nexport const TokenUsageSchema = z.object({\n input: z.number(),\n output: z.number(),\n cached: z.number().optional(),\n});\n\n/**\n * Trace summary schema (camelCase for TypeScript ergonomics).\n */\nexport const TraceSummarySchema = z.object({\n eventCount: z.number(),\n toolNames: z.array(z.string()),\n toolCallsByName: z.record(z.string(), z.number()),\n errorCount: z.number(),\n tokenUsage: TokenUsageSchema.optional(),\n costUsd: z.number().optional(),\n durationMs: z.number().optional(),\n toolDurations: z.record(z.string(), z.array(z.number())).optional(),\n});\n\n/**\n * Tool call schema.\n */\nexport const ToolCallSchema = z.object({\n tool: z.string(),\n input: z.unknown().optional(),\n output: z.unknown().optional(),\n id: z.string().optional(),\n timestamp: z.string().optional(),\n});\n\n/**\n * Unified message schema for input, expected, and output messages.\n */\nexport const MessageSchema = z.object({\n role: z.enum(['assistant', 'user', 'system', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),\n toolCalls: z.array(ToolCallSchema).optional(),\n name: z.string().optional(),\n timestamp: z.string().optional(),\n metadata: z.record(z.unknown()).optional(),\n});\n\n/**\n * Code judge input schema (camelCase, converted from snake_case wire format).\n */\nexport const CodeJudgeInputSchema = z.object({\n question: z.string(),\n expectedOutcome: z.string(),\n expectedMessages: z.array(MessageSchema),\n referenceAnswer: z.string().optional(),\n candidateAnswer: z.string(),\n outputMessages: z.array(MessageSchema).nullable().optional(),\n 
guidelineFiles: z.array(z.string()),\n inputFiles: z.array(z.string()),\n inputMessages: z.array(MessageSchema),\n traceSummary: TraceSummarySchema.nullable().optional(),\n config: z.record(z.unknown()).nullable().optional(),\n});\n\n/**\n * Code judge result schema (validated before output).\n */\nexport const CodeJudgeResultSchema = z.object({\n score: z.number().min(0).max(1),\n hits: z.array(z.string()).optional().default([]),\n misses: z.array(z.string()).optional().default([]),\n reasoning: z.string().optional(),\n /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */\n details: z.record(z.unknown()).optional(),\n});\n\n/**\n * Inferred types from schemas.\n */\nexport type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;\nexport type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;\nexport type TraceSummary = z.infer<typeof TraceSummarySchema>;\nexport type Message = z.infer<typeof MessageSchema>;\nexport type ToolCall = z.infer<typeof ToolCallSchema>;\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n","/**\n * Client for invoking configured targets from code_judge scripts.\n *\n * Environment variables (set automatically by AgentV when `target` config is present):\n * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server\n * - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication\n */\n\n/**\n * Request to invoke the target\n */\nexport interface TargetInvokeRequest {\n readonly question: string;\n readonly systemPrompt?: string;\n readonly evalCaseId?: string;\n readonly attempt?: number;\n /** Optional target override - use a different target for this invocation */\n readonly target?: string;\n}\n\n/**\n * Response from a target invocation\n */\nexport interface TargetInvokeResponse {\n readonly outputMessages: readonly unknown[];\n readonly rawText?: string;\n}\n\n/**\n * Information about the target proxy configuration\n */\nexport interface TargetInfo {\n /** Name of 
the default target being used */\n readonly targetName: string;\n /** Maximum number of calls allowed */\n readonly maxCalls: number;\n /** Current number of calls made */\n readonly callCount: number;\n /** List of all available target names */\n readonly availableTargets: readonly string[];\n}\n\n/**\n * Target client for making target invocations\n */\nexport interface TargetClient {\n /**\n * Invoke the configured target with a prompt.\n * @param request - The question and optional system prompt\n * @returns The target's response with output messages and optional raw text\n */\n invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;\n\n /**\n * Invoke the target with multiple requests in sequence.\n * Each request counts toward the max_calls limit.\n * @param requests - Array of target requests\n * @returns Array of target responses\n */\n invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;\n\n /**\n * Get information about the target proxy configuration.\n * Returns the default target name, max calls, current call count, and available targets.\n */\n getInfo(): Promise<TargetInfo>;\n}\n\n/**\n * Error thrown when target proxy is not available\n */\nexport class TargetNotAvailableError extends Error {\n constructor(message: string) {\n super(message);\n this.name = 'TargetNotAvailableError';\n }\n}\n\n/**\n * Error thrown when target invocation fails\n */\nexport class TargetInvocationError extends Error {\n readonly statusCode?: number;\n\n constructor(message: string, statusCode?: number) {\n super(message);\n this.name = 'TargetInvocationError';\n this.statusCode = statusCode;\n }\n}\n\n/**\n * Create a target client from environment variables.\n *\n * This function reads the proxy URL and token from environment variables\n * that are automatically set by AgentV when a `target` config block is present\n * on a `code_judge` evaluator.\n *\n * @returns A target client if environment variables are set, 
otherwise undefined\n * @throws TargetNotAvailableError if token is missing when URL is present\n *\n * @example\n * ```typescript\n * import { createTargetClient, defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question, expectedOutcome }) => {\n * const target = createTargetClient();\n *\n * if (!target) {\n * // Target not available - no target config on this evaluator\n * return { score: 0.5, reasoning: 'Target not available' };\n * }\n *\n * const response = await target.invoke({\n * question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,\n * systemPrompt: 'You are an expert evaluator. Respond with JSON: { \"correct\": true/false }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.correct ? 1.0 : 0.0 };\n * });\n * ```\n */\nexport function createTargetClient(): TargetClient | undefined {\n const proxyUrl = process.env.AGENTV_TARGET_PROXY_URL;\n const proxyToken = process.env.AGENTV_TARGET_PROXY_TOKEN;\n\n if (!proxyUrl) {\n return undefined;\n }\n\n if (!proxyToken) {\n throw new TargetNotAvailableError(\n 'AGENTV_TARGET_PROXY_URL is set but AGENTV_TARGET_PROXY_TOKEN is missing',\n );\n }\n\n return createTargetClientInternal(proxyUrl, proxyToken);\n}\n\n/**\n * Internal: Create a target client with explicit URL and token.\n * Exported for testing only - use createTargetClient() in production.\n */\nexport function createTargetClientInternal(url: string, token: string): TargetClient {\n const headers = {\n 'Content-Type': 'application/json',\n Authorization: `Bearer ${token}`,\n };\n\n return {\n async invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse> {\n const response = await fetch(`${url}/invoke`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n question: request.question,\n systemPrompt: request.systemPrompt,\n evalCaseId: request.evalCaseId,\n attempt: request.attempt,\n target: request.target,\n }),\n });\n\n 
if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? `HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInvokeResponse;\n },\n\n async invokeBatch(\n requests: readonly TargetInvokeRequest[],\n ): Promise<readonly TargetInvokeResponse[]> {\n const response = await fetch(`${url}/invokeBatch`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n requests: requests.map((r) => ({\n question: r.question,\n systemPrompt: r.systemPrompt,\n evalCaseId: r.evalCaseId,\n attempt: r.attempt,\n target: r.target,\n })),\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? `HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n const result = (await response.json()) as { responses: TargetInvokeResponse[] };\n return result.responses;\n },\n\n async getInfo(): Promise<TargetInfo> {\n const response = await fetch(`${url}/info`, {\n method: 'GET',\n headers,\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInfo;\n },\n };\n}\n","/**\n * AgentV Evaluation SDK\n *\n * Build custom code judges for evaluating AI agent outputs.\n *\n * @example Basic code judge\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({\n * score: traceSummary?.eventCount <= 5 ? 1.0 : 0.5,\n * hits: ['Efficient tool usage'],\n * misses: [],\n * }));\n * ```\n *\n * @example Code judge with target access (requires `target` config in YAML)\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge, createTargetClient } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question }) => {\n * const target = createTargetClient();\n * if (!target) {\n * return { score: 0, misses: ['Target not available'] };\n * }\n *\n * const response = await target.invoke({\n * question: `Evaluate: ${question}`,\n * systemPrompt: 'Respond with JSON: { \"score\": 0-1 }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.score ?? 
0 };\n * });\n * ```\n *\n * @packageDocumentation\n */\n\n// Re-export schemas and types\nexport {\n CodeJudgeInputSchema,\n CodeJudgeResultSchema,\n TraceSummarySchema,\n MessageSchema,\n ToolCallSchema,\n TokenUsageSchema,\n type CodeJudgeInput,\n type CodeJudgeResult,\n type TraceSummary,\n type Message,\n type ToolCall,\n type TokenUsage,\n} from './schemas.js';\n\n// Re-export target client\nexport {\n createTargetClient,\n TargetNotAvailableError,\n TargetInvocationError,\n type TargetClient,\n type TargetInfo,\n type TargetInvokeRequest,\n type TargetInvokeResponse,\n} from './target-client.js';\n\n// Re-export Zod for typed config support\nexport { z } from 'zod';\n\n// Import runtime\nimport { type CodeJudgeHandler, runCodeJudge } from './runtime.js';\n\nexport type { CodeJudgeHandler };\n\n/**\n * Define a code judge evaluator with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Validates the result and outputs JSON to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that evaluates the input and returns a result\n *\n * @example\n * ```typescript\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary }) => {\n * if (!traceSummary) {\n * return { score: 0.5, reasoning: 'No trace available' };\n * }\n *\n * const efficient = traceSummary.eventCount <= 10;\n * return {\n * score: efficient ? 1.0 : 0.5,\n * hits: efficient ? ['Efficient execution'] : [],\n * misses: efficient ? 
[] : ['Too many tool calls'],\n * };\n * });\n * ```\n *\n * @example With typed config\n * ```typescript\n * import { defineCodeJudge, z } from '@agentv/eval';\n *\n * const ConfigSchema = z.object({\n * maxToolCalls: z.number().default(10),\n * });\n *\n * export default defineCodeJudge(({ traceSummary, config }) => {\n * const { maxToolCalls } = ConfigSchema.parse(config ?? {});\n * // Use maxToolCalls...\n * });\n * ```\n */\nexport function defineCodeJudge(handler: CodeJudgeHandler): void {\n // Run immediately when module is loaded\n runCodeJudge(handler);\n}\n","/**\n * Runtime for code judge evaluators.\n * Handles stdin parsing, validation, error handling, and output formatting.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport {\n type CodeJudgeInput,\n CodeJudgeInputSchema,\n type CodeJudgeResult,\n CodeJudgeResultSchema,\n} from './schemas.js';\n\n/**\n * Handler function type for code judges.\n */\nexport type CodeJudgeHandler = (\n input: CodeJudgeInput,\n) => CodeJudgeResult | Promise<CodeJudgeResult>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Clamp a value to the range [0, 1].\n */\nfunction clampScore(value: number): number {\n if (Number.isNaN(value) || !Number.isFinite(value)) {\n return 0;\n }\n return Math.max(0, Math.min(1, value));\n}\n\n/**\n * Format an error for output.\n */\nfunction formatError(error: unknown): string {\n if (error instanceof Error) {\n return error.message;\n }\n return String(error);\n}\n\n/**\n * Run a code judge handler with full stdin/stdout handling.\n * This is the internal implementation called by defineCodeJudge.\n */\nexport async function runCodeJudge(handler: CodeJudgeHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. 
Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = CodeJudgeInputSchema.parse(camelInput);\n\n // 5. Run handler\n const rawResult = await handler(input);\n\n // 6. Validate and normalize output\n const result = CodeJudgeResultSchema.parse({\n ...rawResult,\n score: clampScore(rawResult.score),\n });\n\n // 7. Output JSON\n console.log(JSON.stringify(result, null, 2));\n } catch (error) {\n // Output failure result\n const errorMessage = formatError(error);\n const errorResult: CodeJudgeResult = {\n score: 0,\n hits: [],\n misses: [errorMessage],\n reasoning: `Evaluation failed: ${errorMessage}`,\n };\n console.log(JSON.stringify(errorResult, null, 2));\n process.exit(1);\n }\n}\n","/**\n * Case conversion utilities for JSON payloads.\n * Converts between snake_case (wire format) and camelCase (TypeScript).\n */\n\nfunction toCamelCase(str: string): string {\n // Don't convert keys that start with uppercase (proper nouns/tool names)\n if (/^[A-Z]/.test(str)) {\n return str;\n }\n return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());\n}\n\n/**\n * Recursively converts all keys in an object from snake_case to camelCase.\n * Used to map wire payloads into TypeScript-friendly shapes.\n *\n * @param obj - The object to convert (can be any JSON-serializable value)\n * @returns A new object with all keys converted to camelCase\n */\nexport function toCamelCaseDeep(obj: unknown): unknown {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => toCamelCaseDeep(item));\n }\n\n if (typeof obj === 'object') {\n const result: Record<string, unknown> = {};\n for (const [key, value] of Object.entries(obj)) {\n const camelKey = toCamelCase(key);\n result[camelKey] = toCamelCaseDeep(value);\n }\n return result;\n }\n\n return 
obj;\n}\n"],"mappings":";AAIA,SAAS,SAAS;AAKX,IAAM,mBAAmB,EAAE,OAAO;AAAA,EACvC,OAAO,EAAE,OAAO;AAAA,EAChB,QAAQ,EAAE,OAAO;AAAA,EACjB,QAAQ,EAAE,OAAO,EAAE,SAAS;AAC9B,CAAC;AAKM,IAAM,qBAAqB,EAAE,OAAO;AAAA,EACzC,YAAY,EAAE,OAAO;AAAA,EACrB,WAAW,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC7B,iBAAiB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC;AAAA,EAChD,YAAY,EAAE,OAAO;AAAA,EACrB,YAAY,iBAAiB,SAAS;AAAA,EACtC,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,YAAY,EAAE,OAAO,EAAE,SAAS;AAAA,EAChC,eAAe,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,SAAS;AACpE,CAAC;AAKM,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,MAAM,EAAE,OAAO;AAAA,EACf,OAAO,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC5B,QAAQ,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,IAAI,EAAE,OAAO,EAAE,SAAS;AAAA,EACxB,WAAW,EAAE,OAAO,EAAE,SAAS;AACjC,CAAC;AAKM,IAAM,gBAAgB,EAAE,OAAO;AAAA,EACpC,MAAM,EAAE,KAAK,CAAC,aAAa,QAAQ,UAAU,MAAM,CAAC;AAAA,EACpD,SAAS,EAAE,MAAM,CAAC,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/F,WAAW,EAAE,MAAM,cAAc,EAAE,SAAS;AAAA,EAC5C,MAAM,EAAE,OAAO,EAAE,SAAS;AAAA,EAC1B,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA,EAC/B,UAAU,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS;AAC3C,CAAC;AAKM,IAAM,uBAAuB,EAAE,OAAO;AAAA,EAC3C,UAAU,EAAE,OAAO;AAAA,EACnB,iBAAiB,EAAE,OAAO;AAAA,EAC1B,kBAAkB,EAAE,MAAM,aAAa;AAAA,EACvC,iBAAiB,EAAE,OAAO,EAAE,SAAS;AAAA,EACrC,iBAAiB,EAAE,OAAO;AAAA,EAC1B,gBAAgB,EAAE,MAAM,aAAa,EAAE,SAAS,EAAE,SAAS;AAAA,EAC3D,gBAAgB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAClC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC9B,eAAe,EAAE,MAAM,aAAa;AAAA,EACpC,cAAc,mBAAmB,SAAS,EAAE,SAAS;AAAA,EACrD,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,SAAS;AACpD,CAAC;AAKM,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAC9B,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EACjD,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE/B,SAAS,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS;AAC1C,CAAC;;;ACRM,IAAM,0BAAN,cAAsC,MAAM;AAAA,EACjD,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,
OAAO;AAAA,EACd;AACF;AAKO,IAAM,wBAAN,cAAoC,MAAM;AAAA,EACtC;AAAA,EAET,YAAY,SAAiB,YAAqB;AAChD,UAAM,OAAO;AACb,SAAK,OAAO;AACZ,SAAK,aAAa;AAAA,EACpB;AACF;AAkCO,SAAS,qBAA+C;AAC7D,QAAM,WAAW,QAAQ,IAAI;AAC7B,QAAM,aAAa,QAAQ,IAAI;AAE/B,MAAI,CAAC,UAAU;AACb,WAAO;AAAA,EACT;AAEA,MAAI,CAAC,YAAY;AACf,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,SAAO,2BAA2B,UAAU,UAAU;AACxD;AAMO,SAAS,2BAA2B,KAAa,OAA6B;AACnF,QAAM,UAAU;AAAA,IACd,gBAAgB;AAAA,IAChB,eAAe,UAAU,KAAK;AAAA,EAChC;AAEA,SAAO;AAAA,IACL,MAAM,OAAO,SAA6D;AACxE,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,WAAW;AAAA,QAC5C,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,QAAQ;AAAA,UAClB,cAAc,QAAQ;AAAA,UACtB,YAAY,QAAQ;AAAA,UACpB,SAAS,QAAQ;AAAA,UACjB,QAAQ,QAAQ;AAAA,QAClB,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,IAEA,MAAM,YACJ,UAC0C;AAC1C,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,gBAAgB;AAAA,QACjD,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,SAAS,IAAI,CAAC,OAAO;AAAA,YAC7B,UAAU,EAAE;AAAA,YACZ,cAAc,EAAE;AAAA,YAChB,YAAY,EAAE;AAAA,YACd,SAAS,EAAE;AAAA,YACX,QAAQ,EAAE;AAAA,UACZ,EAAE;AAAA,QACJ,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,YAAM,SAAU,MAAM,SAAS,KAAK;AACpC,aAAO,OAAO;AAAA,IAChB;AAAA,IAEA,MAAM,UAA+B;AACnC,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,SAAS;AAAA,QAC1C,QAAQ;AAAA,QACR;AAAA,MACF,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,
IAC9B;AAAA,EACF;AACF;;;ACpKA,SAAS,KAAAA,UAAS;;;ACjElB,SAAS,oBAAoB;;;ACC7B,SAAS,YAAY,KAAqB;AAExC,MAAI,SAAS,KAAK,GAAG,GAAG;AACtB,WAAO;AAAA,EACT;AACA,SAAO,IAAI,QAAQ,gBAAgB,CAAC,GAAG,WAAW,OAAO,YAAY,CAAC;AACxE;AASO,SAAS,gBAAgB,KAAuB;AACrD,MAAI,QAAQ,QAAQ,QAAQ,QAAW;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,WAAO,IAAI,IAAI,CAAC,SAAS,gBAAgB,IAAI,CAAC;AAAA,EAChD;AAEA,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,SAAkC,CAAC;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC9C,YAAM,WAAW,YAAY,GAAG;AAChC,aAAO,QAAQ,IAAI,gBAAgB,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ADfA,SAAS,YAAoB;AAC3B,SAAO,aAAa,GAAG,MAAM;AAC/B;AAKA,SAAS,WAAW,OAAuB;AACzC,MAAI,OAAO,MAAM,KAAK,KAAK,CAAC,OAAO,SAAS,KAAK,GAAG;AAClD,WAAO;AAAA,EACT;AACA,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAKA,SAAS,YAAY,OAAwB;AAC3C,MAAI,iBAAiB,OAAO;AAC1B,WAAO,MAAM;AAAA,EACf;AACA,SAAO,OAAO,KAAK;AACrB;AAMA,eAAsB,aAAa,SAA0C;AAC3E,MAAI;AAEF,UAAM,QAAQ,UAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,qBAAqB,MAAM,UAAU;AAGnD,UAAM,YAAY,MAAM,QAAQ,KAAK;AAGrC,UAAM,SAAS,sBAAsB,MAAM;AAAA,MACzC,GAAG;AAAA,MACH,OAAO,WAAW,UAAU,KAAK;AAAA,IACnC,CAAC;AAGD,YAAQ,IAAI,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAAA,EAC7C,SAAS,OAAO;AAEd,UAAM,eAAe,YAAY,KAAK;AACtC,UAAM,cAA+B;AAAA,MACnC,OAAO;AAAA,MACP,MAAM,CAAC;AAAA,MACP,QAAQ,CAAC,YAAY;AAAA,MACrB,WAAW,sBAAsB,YAAY;AAAA,IAC/C;AACA,YAAQ,IAAI,KAAK,UAAU,aAAa,MAAM,CAAC,CAAC;AAChD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;AD+BO,SAAS,gBAAgB,SAAiC;AAE/D,eAAa,OAAO;AACtB;","names":["z"]}
|
|
1
|
+
{"version":3,"sources":["../src/schemas.ts","../src/target-client.ts","../src/index.ts","../src/prompt-template.ts","../src/case-conversion.ts","../src/runtime.ts"],"sourcesContent":["/**\n * Zod schemas for code judge input/output validation.\n * Provides both compile-time types and runtime validation.\n */\nimport { z } from 'zod';\n\n/**\n * Token usage metrics schema.\n */\nexport const TokenUsageSchema = z.object({\n input: z.number(),\n output: z.number(),\n cached: z.number().optional(),\n});\n\n/**\n * Trace summary schema (camelCase for TypeScript ergonomics).\n */\nexport const TraceSummarySchema = z.object({\n eventCount: z.number(),\n toolNames: z.array(z.string()),\n toolCallsByName: z.record(z.string(), z.number()),\n errorCount: z.number(),\n tokenUsage: TokenUsageSchema.optional(),\n costUsd: z.number().optional(),\n durationMs: z.number().optional(),\n toolDurations: z.record(z.string(), z.array(z.number())).optional(),\n});\n\n/**\n * Tool call schema.\n */\nexport const ToolCallSchema = z.object({\n tool: z.string(),\n input: z.unknown().optional(),\n output: z.unknown().optional(),\n id: z.string().optional(),\n timestamp: z.string().optional(),\n});\n\n/**\n * Unified message schema for input, expected, and output messages.\n */\nexport const MessageSchema = z.object({\n role: z.enum(['assistant', 'user', 'system', 'tool']),\n content: z.union([z.string(), z.record(z.unknown()), z.array(z.record(z.unknown()))]).optional(),\n toolCalls: z.array(ToolCallSchema).optional(),\n name: z.string().optional(),\n timestamp: z.string().optional(),\n metadata: z.record(z.unknown()).optional(),\n});\n\n/**\n * Code judge input schema (camelCase, converted from snake_case wire format).\n */\nexport const CodeJudgeInputSchema = z.object({\n question: z.string(),\n expectedOutcome: z.string(),\n expectedMessages: z.array(MessageSchema),\n referenceAnswer: z.string().optional(),\n candidateAnswer: z.string(),\n outputMessages: 
z.array(MessageSchema).nullable().optional(),\n guidelineFiles: z.array(z.string()),\n inputFiles: z.array(z.string()),\n inputMessages: z.array(MessageSchema),\n traceSummary: TraceSummarySchema.nullable().optional(),\n config: z.record(z.unknown()).nullable().optional(),\n});\n\n/**\n * Code judge result schema (validated before output).\n */\nexport const CodeJudgeResultSchema = z.object({\n score: z.number().min(0).max(1),\n hits: z.array(z.string()).optional().default([]),\n misses: z.array(z.string()).optional().default([]),\n reasoning: z.string().optional(),\n /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */\n details: z.record(z.unknown()).optional(),\n});\n\n/**\n * Inferred types from schemas.\n */\nexport type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;\nexport type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;\nexport type TraceSummary = z.infer<typeof TraceSummarySchema>;\nexport type Message = z.infer<typeof MessageSchema>;\nexport type ToolCall = z.infer<typeof ToolCallSchema>;\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * Prompt template input schema (camelCase, converted from snake_case wire format).\n * Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads.\n */\nexport const PromptTemplateInputSchema = CodeJudgeInputSchema;\n\nexport type PromptTemplateInput = CodeJudgeInput;\n","/**\n * Client for invoking configured targets from code_judge scripts.\n *\n * Environment variables (set automatically by AgentV when `target` config is present):\n * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server\n * - AGENTV_TARGET_PROXY_TOKEN: Bearer token for authentication\n */\n\n/**\n * Request to invoke the target\n */\nexport interface TargetInvokeRequest {\n readonly question: string;\n readonly systemPrompt?: string;\n readonly evalCaseId?: string;\n readonly attempt?: number;\n /** Optional target override - 
use a different target for this invocation */\n readonly target?: string;\n}\n\n/**\n * Response from a target invocation\n */\nexport interface TargetInvokeResponse {\n readonly outputMessages: readonly unknown[];\n readonly rawText?: string;\n}\n\n/**\n * Information about the target proxy configuration\n */\nexport interface TargetInfo {\n /** Name of the default target being used */\n readonly targetName: string;\n /** Maximum number of calls allowed */\n readonly maxCalls: number;\n /** Current number of calls made */\n readonly callCount: number;\n /** List of all available target names */\n readonly availableTargets: readonly string[];\n}\n\n/**\n * Target client for making target invocations\n */\nexport interface TargetClient {\n /**\n * Invoke the configured target with a prompt.\n * @param request - The question and optional system prompt\n * @returns The target's response with output messages and optional raw text\n */\n invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse>;\n\n /**\n * Invoke the target with multiple requests in sequence.\n * Each request counts toward the max_calls limit.\n * @param requests - Array of target requests\n * @returns Array of target responses\n */\n invokeBatch(requests: readonly TargetInvokeRequest[]): Promise<readonly TargetInvokeResponse[]>;\n\n /**\n * Get information about the target proxy configuration.\n * Returns the default target name, max calls, current call count, and available targets.\n */\n getInfo(): Promise<TargetInfo>;\n}\n\n/**\n * Error thrown when target proxy is not available\n */\nexport class TargetNotAvailableError extends Error {\n constructor(message: string) {\n super(message);\n this.name = 'TargetNotAvailableError';\n }\n}\n\n/**\n * Error thrown when target invocation fails\n */\nexport class TargetInvocationError extends Error {\n readonly statusCode?: number;\n\n constructor(message: string, statusCode?: number) {\n super(message);\n this.name = 'TargetInvocationError';\n 
this.statusCode = statusCode;\n }\n}\n\n/**\n * Create a target client from environment variables.\n *\n * This function reads the proxy URL and token from environment variables\n * that are automatically set by AgentV when a `target` config block is present\n * on a `code_judge` evaluator.\n *\n * @returns A target client if environment variables are set, otherwise undefined\n * @throws TargetNotAvailableError if token is missing when URL is present\n *\n * @example\n * ```typescript\n * import { createTargetClient, defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question, expectedOutcome }) => {\n * const target = createTargetClient();\n *\n * if (!target) {\n * // Target not available - no target config on this evaluator\n * return { score: 0.5, reasoning: 'Target not available' };\n * }\n *\n * const response = await target.invoke({\n * question: `Is this answer correct? Question: ${question}, Expected: ${expectedOutcome}`,\n * systemPrompt: 'You are an expert evaluator. Respond with JSON: { \"correct\": true/false }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.correct ? 
1.0 : 0.0 };\n * });\n * ```\n */\nexport function createTargetClient(): TargetClient | undefined {\n const proxyUrl = process.env.AGENTV_TARGET_PROXY_URL;\n const proxyToken = process.env.AGENTV_TARGET_PROXY_TOKEN;\n\n if (!proxyUrl) {\n return undefined;\n }\n\n if (!proxyToken) {\n throw new TargetNotAvailableError(\n 'AGENTV_TARGET_PROXY_URL is set but AGENTV_TARGET_PROXY_TOKEN is missing',\n );\n }\n\n return createTargetClientInternal(proxyUrl, proxyToken);\n}\n\n/**\n * Internal: Create a target client with explicit URL and token.\n * Exported for testing only - use createTargetClient() in production.\n */\nexport function createTargetClientInternal(url: string, token: string): TargetClient {\n const headers = {\n 'Content-Type': 'application/json',\n Authorization: `Bearer ${token}`,\n };\n\n return {\n async invoke(request: TargetInvokeRequest): Promise<TargetInvokeResponse> {\n const response = await fetch(`${url}/invoke`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n question: request.question,\n systemPrompt: request.systemPrompt,\n evalCaseId: request.evalCaseId,\n attempt: request.attempt,\n target: request.target,\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInvokeResponse;\n },\n\n async invokeBatch(\n requests: readonly TargetInvokeRequest[],\n ): Promise<readonly TargetInvokeResponse[]> {\n const response = await fetch(`${url}/invokeBatch`, {\n method: 'POST',\n headers,\n body: JSON.stringify({\n requests: requests.map((r) => ({\n question: r.question,\n systemPrompt: r.systemPrompt,\n evalCaseId: r.evalCaseId,\n attempt: r.attempt,\n target: r.target,\n })),\n }),\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? `HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n const result = (await response.json()) as { responses: TargetInvokeResponse[] };\n return result.responses;\n },\n\n async getInfo(): Promise<TargetInfo> {\n const response = await fetch(`${url}/info`, {\n method: 'GET',\n headers,\n });\n\n if (!response.ok) {\n const errorBody = await response.text();\n let errorMessage: string;\n try {\n const errorJson = JSON.parse(errorBody) as { error?: string };\n errorMessage = errorJson.error ?? 
`HTTP ${response.status}`;\n } catch {\n errorMessage = errorBody || `HTTP ${response.status}`;\n }\n throw new TargetInvocationError(errorMessage, response.status);\n }\n\n return (await response.json()) as TargetInfo;\n },\n };\n}\n","/**\n * AgentV Evaluation SDK\n *\n * Build custom code judges for evaluating AI agent outputs.\n *\n * @example Basic code judge\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary, candidateAnswer }) => ({\n * score: traceSummary?.eventCount <= 5 ? 1.0 : 0.5,\n * hits: ['Efficient tool usage'],\n * misses: [],\n * }));\n * ```\n *\n * @example Code judge with target access (requires `target` config in YAML)\n * ```typescript\n * #!/usr/bin/env bun\n * import { defineCodeJudge, createTargetClient } from '@agentv/eval';\n *\n * export default defineCodeJudge(async ({ question }) => {\n * const target = createTargetClient();\n * if (!target) {\n * return { score: 0, misses: ['Target not available'] };\n * }\n *\n * const response = await target.invoke({\n * question: `Evaluate: ${question}`,\n * systemPrompt: 'Respond with JSON: { \"score\": 0-1 }'\n * });\n *\n * const result = JSON.parse(response.rawText ?? '{}');\n * return { score: result.score ?? 
0 };\n * });\n * ```\n *\n * @packageDocumentation\n */\n\n// Re-export schemas and types\nexport {\n CodeJudgeInputSchema,\n CodeJudgeResultSchema,\n TraceSummarySchema,\n MessageSchema,\n ToolCallSchema,\n TokenUsageSchema,\n PromptTemplateInputSchema,\n type CodeJudgeInput,\n type CodeJudgeResult,\n type TraceSummary,\n type Message,\n type ToolCall,\n type TokenUsage,\n type PromptTemplateInput,\n} from './schemas.js';\n\n// Re-export target client\nexport {\n createTargetClient,\n TargetNotAvailableError,\n TargetInvocationError,\n type TargetClient,\n type TargetInfo,\n type TargetInvokeRequest,\n type TargetInvokeResponse,\n} from './target-client.js';\n\n// Re-export Zod for typed config support\nexport { z } from 'zod';\n\nimport { type PromptTemplateHandler, runPromptTemplate } from './prompt-template.js';\n// Import runtime\nimport { type CodeJudgeHandler, runCodeJudge } from './runtime.js';\n\nexport type { CodeJudgeHandler };\nexport type { PromptTemplateHandler };\n\n/**\n * Define a code judge evaluator with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Validates the result and outputs JSON to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that evaluates the input and returns a result\n *\n * @example\n * ```typescript\n * import { defineCodeJudge } from '@agentv/eval';\n *\n * export default defineCodeJudge(({ traceSummary }) => {\n * if (!traceSummary) {\n * return { score: 0.5, reasoning: 'No trace available' };\n * }\n *\n * const efficient = traceSummary.eventCount <= 10;\n * return {\n * score: efficient ? 1.0 : 0.5,\n * hits: efficient ? ['Efficient execution'] : [],\n * misses: efficient ? 
[] : ['Too many tool calls'],\n * };\n * });\n * ```\n *\n * @example With typed config\n * ```typescript\n * import { defineCodeJudge, z } from '@agentv/eval';\n *\n * const ConfigSchema = z.object({\n * maxToolCalls: z.number().default(10),\n * });\n *\n * export default defineCodeJudge(({ traceSummary, config }) => {\n * const { maxToolCalls } = ConfigSchema.parse(config ?? {});\n * // Use maxToolCalls...\n * });\n * ```\n */\nexport function defineCodeJudge(handler: CodeJudgeHandler): void {\n // Run immediately when module is loaded\n runCodeJudge(handler);\n}\n\n/**\n * Define a prompt template with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. Calls your handler with typed input\n * 4. Outputs the generated prompt string to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that generates the prompt string from input\n *\n * @example\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => `\n * Question: ${ctx.question}\n * Answer: ${ctx.candidateAnswer}\n *\n * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}\n * `);\n * ```\n *\n * @example With conditional logic\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => {\n * const rubric = ctx.config?.rubric as string | undefined;\n * return `\n * Question: ${ctx.question}\n * Candidate Answer: ${ctx.candidateAnswer}\n * ${rubric ? 
`\\nEvaluation Criteria:\\n${rubric}` : ''}\n * `;\n * });\n * ```\n */\nexport function definePromptTemplate(handler: PromptTemplateHandler): void {\n // Run immediately when module is loaded\n runPromptTemplate(handler);\n}\n","/**\n * Runtime for prompt template evaluators.\n * Handles stdin parsing, validation, error handling, and string output.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport { type PromptTemplateInput, PromptTemplateInputSchema } from './schemas.js';\n\n/**\n * Handler function type for prompt templates.\n * Returns the prompt string to use for evaluation.\n */\nexport type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise<string>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Run a prompt template handler with full stdin/stdout handling.\n * This is the internal implementation called by definePromptTemplate.\n */\nexport async function runPromptTemplate(handler: PromptTemplateHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = PromptTemplateInputSchema.parse(camelInput);\n\n // 5. Run handler\n const prompt = await handler(input);\n\n // 6. Output raw string (not JSON) - the prompt itself\n console.log(prompt);\n } catch (error) {\n // Output error to stderr and exit with non-zero code\n console.error(error instanceof Error ? error.message : String(error));\n process.exit(1);\n }\n}\n\n/**\n * Define a prompt template with automatic stdin/stdout handling.\n *\n * This function:\n * 1. Reads JSON from stdin (snake_case format)\n * 2. Converts to camelCase and validates with Zod\n * 3. 
Calls your handler with typed input\n * 4. Outputs the generated prompt string to stdout\n * 5. Handles errors gracefully with proper exit codes\n *\n * @param handler - Function that generates the prompt string from input\n *\n * @example\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => `\n * Question: ${ctx.question}\n * Answer: ${ctx.candidateAnswer}\n *\n * ${ctx.referenceAnswer ? `Reference: ${ctx.referenceAnswer}` : ''}\n * `);\n * ```\n *\n * @example With conditional logic\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate((ctx) => {\n * const rubric = ctx.config?.rubric as string | undefined;\n * return `\n * Question: ${ctx.question}\n * Candidate Answer: ${ctx.candidateAnswer}\n * ${rubric ? `\\nEvaluation Criteria:\\n${rubric}` : ''}\n * `;\n * });\n * ```\n *\n * @example Async handler\n * ```typescript\n * import { definePromptTemplate } from '@agentv/eval';\n *\n * export default definePromptTemplate(async (ctx) => {\n * // Async operations are supported\n * return `Question: ${ctx.question}\\nAnswer: ${ctx.candidateAnswer}`;\n * });\n * ```\n */\nexport function definePromptTemplate(handler: PromptTemplateHandler): void {\n // Run immediately when module is loaded\n runPromptTemplate(handler);\n}\n","/**\n * Case conversion utilities for JSON payloads.\n * Converts between snake_case (wire format) and camelCase (TypeScript).\n */\n\nfunction toCamelCase(str: string): string {\n // Don't convert keys that start with uppercase (proper nouns/tool names)\n if (/^[A-Z]/.test(str)) {\n return str;\n }\n return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());\n}\n\n/**\n * Recursively converts all keys in an object from snake_case to camelCase.\n * Used to map wire payloads into TypeScript-friendly shapes.\n *\n * @param obj - The object to convert (can be any JSON-serializable value)\n * 
@returns A new object with all keys converted to camelCase\n */\nexport function toCamelCaseDeep(obj: unknown): unknown {\n if (obj === null || obj === undefined) {\n return obj;\n }\n\n if (Array.isArray(obj)) {\n return obj.map((item) => toCamelCaseDeep(item));\n }\n\n if (typeof obj === 'object') {\n const result: Record<string, unknown> = {};\n for (const [key, value] of Object.entries(obj)) {\n const camelKey = toCamelCase(key);\n result[camelKey] = toCamelCaseDeep(value);\n }\n return result;\n }\n\n return obj;\n}\n","/**\n * Runtime for code judge evaluators.\n * Handles stdin parsing, validation, error handling, and output formatting.\n */\nimport { readFileSync } from 'node:fs';\n\nimport { toCamelCaseDeep } from './case-conversion.js';\nimport {\n type CodeJudgeInput,\n CodeJudgeInputSchema,\n type CodeJudgeResult,\n CodeJudgeResultSchema,\n} from './schemas.js';\n\n/**\n * Handler function type for code judges.\n */\nexport type CodeJudgeHandler = (\n input: CodeJudgeInput,\n) => CodeJudgeResult | Promise<CodeJudgeResult>;\n\n/**\n * Read stdin synchronously (works in both Node.js and Bun).\n */\nfunction readStdin(): string {\n return readFileSync(0, 'utf8');\n}\n\n/**\n * Clamp a value to the range [0, 1].\n */\nfunction clampScore(value: number): number {\n if (Number.isNaN(value) || !Number.isFinite(value)) {\n return 0;\n }\n return Math.max(0, Math.min(1, value));\n}\n\n/**\n * Format an error for output.\n */\nfunction formatError(error: unknown): string {\n if (error instanceof Error) {\n return error.message;\n }\n return String(error);\n}\n\n/**\n * Run a code judge handler with full stdin/stdout handling.\n * This is the internal implementation called by defineCodeJudge.\n */\nexport async function runCodeJudge(handler: CodeJudgeHandler): Promise<void> {\n try {\n // 1. Read stdin\n const stdin = readStdin();\n\n // 2. Parse JSON\n const rawInput = JSON.parse(stdin) as Record<string, unknown>;\n\n // 3. 
Convert snake_case to camelCase\n const camelInput = toCamelCaseDeep(rawInput);\n\n // 4. Validate input with Zod\n const input = CodeJudgeInputSchema.parse(camelInput);\n\n // 5. Run handler\n const rawResult = await handler(input);\n\n // 6. Validate and normalize output\n const result = CodeJudgeResultSchema.parse({\n ...rawResult,\n score: clampScore(rawResult.score),\n });\n\n // 7. Output JSON\n console.log(JSON.stringify(result, null, 2));\n } catch (error) {\n // Output failure result\n const errorMessage = formatError(error);\n const errorResult: CodeJudgeResult = {\n score: 0,\n hits: [],\n misses: [errorMessage],\n reasoning: `Evaluation failed: ${errorMessage}`,\n };\n console.log(JSON.stringify(errorResult, null, 2));\n process.exit(1);\n }\n}\n"],"mappings":";AAIA,SAAS,SAAS;AAKX,IAAM,mBAAmB,EAAE,OAAO;AAAA,EACvC,OAAO,EAAE,OAAO;AAAA,EAChB,QAAQ,EAAE,OAAO;AAAA,EACjB,QAAQ,EAAE,OAAO,EAAE,SAAS;AAC9B,CAAC;AAKM,IAAM,qBAAqB,EAAE,OAAO;AAAA,EACzC,YAAY,EAAE,OAAO;AAAA,EACrB,WAAW,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC7B,iBAAiB,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,OAAO,CAAC;AAAA,EAChD,YAAY,EAAE,OAAO;AAAA,EACrB,YAAY,iBAAiB,SAAS;AAAA,EACtC,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,YAAY,EAAE,OAAO,EAAE,SAAS;AAAA,EAChC,eAAe,EAAE,OAAO,EAAE,OAAO,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,SAAS;AACpE,CAAC;AAKM,IAAM,iBAAiB,EAAE,OAAO;AAAA,EACrC,MAAM,EAAE,OAAO;AAAA,EACf,OAAO,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC5B,QAAQ,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,IAAI,EAAE,OAAO,EAAE,SAAS;AAAA,EACxB,WAAW,EAAE,OAAO,EAAE,SAAS;AACjC,CAAC;AAKM,IAAM,gBAAgB,EAAE,OAAO;AAAA,EACpC,MAAM,EAAE,KAAK,CAAC,aAAa,QAAQ,UAAU,MAAM,CAAC;AAAA,EACpD,SAAS,EAAE,MAAM,CAAC,EAAE,OAAO,GAAG,EAAE,OAAO,EAAE,QAAQ,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/F,WAAW,EAAE,MAAM,cAAc,EAAE,SAAS;AAAA,EAC5C,MAAM,EAAE,OAAO,EAAE,SAAS;AAAA,EAC1B,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA,EAC/B,UAAU,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS;AAC3C,CAAC;AAKM,IAAM,uBAAuB,EAAE,OAAO;AAAA,EAC3C,UAAU,EAAE,OAAO;AAAA,EACnB,iBAAiB,EAAE,OAAO;AAAA,EAC1B,kBAAkB,EAAE,
MAAM,aAAa;AAAA,EACvC,iBAAiB,EAAE,OAAO,EAAE,SAAS;AAAA,EACrC,iBAAiB,EAAE,OAAO;AAAA,EAC1B,gBAAgB,EAAE,MAAM,aAAa,EAAE,SAAS,EAAE,SAAS;AAAA,EAC3D,gBAAgB,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAClC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC;AAAA,EAC9B,eAAe,EAAE,MAAM,aAAa;AAAA,EACpC,cAAc,mBAAmB,SAAS,EAAE,SAAS;AAAA,EACrD,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS,EAAE,SAAS;AACpD,CAAC;AAKM,IAAM,wBAAwB,EAAE,OAAO;AAAA,EAC5C,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC;AAAA,EAC9B,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EAC/C,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;AAAA,EACjD,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE/B,SAAS,EAAE,OAAO,EAAE,QAAQ,CAAC,EAAE,SAAS;AAC1C,CAAC;AAgBM,IAAM,4BAA4B;;;ACxBlC,IAAM,0BAAN,cAAsC,MAAM;AAAA,EACjD,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAKO,IAAM,wBAAN,cAAoC,MAAM;AAAA,EACtC;AAAA,EAET,YAAY,SAAiB,YAAqB;AAChD,UAAM,OAAO;AACb,SAAK,OAAO;AACZ,SAAK,aAAa;AAAA,EACpB;AACF;AAkCO,SAAS,qBAA+C;AAC7D,QAAM,WAAW,QAAQ,IAAI;AAC7B,QAAM,aAAa,QAAQ,IAAI;AAE/B,MAAI,CAAC,UAAU;AACb,WAAO;AAAA,EACT;AAEA,MAAI,CAAC,YAAY;AACf,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,SAAO,2BAA2B,UAAU,UAAU;AACxD;AAMO,SAAS,2BAA2B,KAAa,OAA6B;AACnF,QAAM,UAAU;AAAA,IACd,gBAAgB;AAAA,IAChB,eAAe,UAAU,KAAK;AAAA,EAChC;AAEA,SAAO;AAAA,IACL,MAAM,OAAO,SAA6D;AACxE,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,WAAW;AAAA,QAC5C,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,QAAQ;AAAA,UAClB,cAAc,QAAQ;AAAA,UACtB,YAAY,QAAQ;AAAA,UACpB,SAAS,QAAQ;AAAA,UACjB,QAAQ,QAAQ;AAAA,QAClB,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,IAEA,MAAM,YACJ,UAC0C;AAC1C,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,gBAAgB;AAAA,QACjD,QAAQ;AAAA,QACR;AAAA,QACA,MAAM,KAAK,UAAU;AAAA,UACnB,UAAU,SAAS,IAAI,CAAC,OAAO;AAAA,YAC7B,UAAU,EAAE;AAAA,YACZ,cAAc,EAAE;AAAA,YAChB,YAAY,EAA
E;AAAA,YACd,SAAS,EAAE;AAAA,YACX,QAAQ,EAAE;AAAA,UACZ,EAAE;AAAA,QACJ,CAAC;AAAA,MACH,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,YAAM,SAAU,MAAM,SAAS,KAAK;AACpC,aAAO,OAAO;AAAA,IAChB;AAAA,IAEA,MAAM,UAA+B;AACnC,YAAM,WAAW,MAAM,MAAM,GAAG,GAAG,SAAS;AAAA,QAC1C,QAAQ;AAAA,QACR;AAAA,MACF,CAAC;AAED,UAAI,CAAC,SAAS,IAAI;AAChB,cAAM,YAAY,MAAM,SAAS,KAAK;AACtC,YAAI;AACJ,YAAI;AACF,gBAAM,YAAY,KAAK,MAAM,SAAS;AACtC,yBAAe,UAAU,SAAS,QAAQ,SAAS,MAAM;AAAA,QAC3D,QAAQ;AACN,yBAAe,aAAa,QAAQ,SAAS,MAAM;AAAA,QACrD;AACA,cAAM,IAAI,sBAAsB,cAAc,SAAS,MAAM;AAAA,MAC/D;AAEA,aAAQ,MAAM,SAAS,KAAK;AAAA,IAC9B;AAAA,EACF;AACF;;;AClKA,SAAS,KAAAA,UAAS;;;ACnElB,SAAS,oBAAoB;;;ACC7B,SAAS,YAAY,KAAqB;AAExC,MAAI,SAAS,KAAK,GAAG,GAAG;AACtB,WAAO;AAAA,EACT;AACA,SAAO,IAAI,QAAQ,gBAAgB,CAAC,GAAG,WAAW,OAAO,YAAY,CAAC;AACxE;AASO,SAAS,gBAAgB,KAAuB;AACrD,MAAI,QAAQ,QAAQ,QAAQ,QAAW;AACrC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,WAAO,IAAI,IAAI,CAAC,SAAS,gBAAgB,IAAI,CAAC;AAAA,EAChD;AAEA,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,SAAkC,CAAC;AACzC,eAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC9C,YAAM,WAAW,YAAY,GAAG;AAChC,aAAO,QAAQ,IAAI,gBAAgB,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;;;ADrBA,SAAS,YAAoB;AAC3B,SAAO,aAAa,GAAG,MAAM;AAC/B;AAMA,eAAsB,kBAAkB,SAA+C;AACrF,MAAI;AAEF,UAAM,QAAQ,UAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,0BAA0B,MAAM,UAAU;AAGxD,UAAM,SAAS,MAAM,QAAQ,KAAK;AAGlC,YAAQ,IAAI,MAAM;AAAA,EACpB,SAAS,OAAO;AAEd,YAAQ,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AACpE,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;AE9CA,SAAS,gBAAAC,qBAAoB;AAoB7B,SAASC,aAAoB;AAC3B,SAAOC,cAAa,GAAG,MAAM;AAC/B;AAKA,SAAS,WAAW,OAAuB;AACzC,MAAI,OAAO,MAAM,KAAK,KAAK,CAAC,OAAO,SAAS,KAAK,GAAG;AAClD,WAAO;AAAA,EACT;AACA,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACvC;AAKA,SAAS,YAAY,OAAwB;AAC3C,MAAI,iBAAiB,OAAO;AAC1B,WAAO,M
AAM;AAAA,EACf;AACA,SAAO,OAAO,KAAK;AACrB;AAMA,eAAsB,aAAa,SAA0C;AAC3E,MAAI;AAEF,UAAM,QAAQD,WAAU;AAGxB,UAAM,WAAW,KAAK,MAAM,KAAK;AAGjC,UAAM,aAAa,gBAAgB,QAAQ;AAG3C,UAAM,QAAQ,qBAAqB,MAAM,UAAU;AAGnD,UAAM,YAAY,MAAM,QAAQ,KAAK;AAGrC,UAAM,SAAS,sBAAsB,MAAM;AAAA,MACzC,GAAG;AAAA,MACH,OAAO,WAAW,UAAU,KAAK;AAAA,IACnC,CAAC;AAGD,YAAQ,IAAI,KAAK,UAAU,QAAQ,MAAM,CAAC,CAAC;AAAA,EAC7C,SAAS,OAAO;AAEd,UAAM,eAAe,YAAY,KAAK;AACtC,UAAM,cAA+B;AAAA,MACnC,OAAO;AAAA,MACP,MAAM,CAAC;AAAA,MACP,QAAQ,CAAC,YAAY;AAAA,MACrB,WAAW,sBAAsB,YAAY;AAAA,IAC/C;AACA,YAAQ,IAAI,KAAK,UAAU,aAAa,MAAM,CAAC,CAAC;AAChD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF;;;AHmCO,SAAS,gBAAgB,SAAiC;AAE/D,eAAa,OAAO;AACtB;AAwCO,SAAS,qBAAqB,SAAsC;AAEzE,oBAAkB,OAAO;AAC3B;","names":["z","readFileSync","readStdin","readFileSync"]}
|