@agentv/eval 2.18.4 → 3.0.0-next.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -2,7 +2,7 @@ import { z } from 'zod';
2
2
  export { z } from 'zod';
3
3
 
4
4
  /**
5
- * Zod schemas for code judge input/output validation.
5
+ * Zod schemas for code grader input/output validation.
6
6
  * Provides both compile-time types and runtime validation.
7
7
  */
8
8
 
@@ -147,9 +147,9 @@ declare const MessageSchema: z.ZodObject<{
147
147
  metadata?: Record<string, unknown> | undefined;
148
148
  }>;
149
149
  /**
150
- * Code judge input schema (camelCase, converted from snake_case wire format).
150
+ * Code grader input schema (camelCase, converted from snake_case wire format).
151
151
  */
152
- declare const CodeJudgeInputSchema: z.ZodObject<{
152
+ declare const CodeGraderInputSchema: z.ZodObject<{
153
153
  question: z.ZodString;
154
154
  criteria: z.ZodString;
155
155
  expectedOutput: z.ZodArray<z.ZodObject<{
@@ -570,9 +570,9 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
570
570
  config?: Record<string, unknown> | null | undefined;
571
571
  }>;
572
572
  /**
573
- * Code judge result schema (validated before output).
573
+ * Code grader result schema (validated before output).
574
574
  */
575
- declare const CodeJudgeResultSchema: z.ZodObject<{
575
+ declare const CodeGraderResultSchema: z.ZodObject<{
576
576
  score: z.ZodNumber;
577
577
  hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
578
578
  misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
@@ -595,15 +595,15 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
595
595
  /**
596
596
  * Inferred types from schemas.
597
597
  */
598
- type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
599
- type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
598
+ type CodeGraderInput = z.infer<typeof CodeGraderInputSchema>;
599
+ type CodeGraderResult = z.infer<typeof CodeGraderResultSchema>;
600
600
  type TraceSummary = z.infer<typeof TraceSummarySchema>;
601
601
  type Message = z.infer<typeof MessageSchema>;
602
602
  type ToolCall = z.infer<typeof ToolCallSchema>;
603
603
  type TokenUsage = z.infer<typeof TokenUsageSchema>;
604
604
  /**
605
605
  * Prompt template input schema (camelCase, converted from snake_case wire format).
606
- * Uses the same schema as CodeJudgeInput since the orchestrator sends identical payloads.
606
+ * Uses the same schema as CodeGraderInput since the orchestrator sends identical payloads.
607
607
  */
608
608
  declare const PromptTemplateInputSchema: z.ZodObject<{
609
609
  question: z.ZodString;
@@ -1025,10 +1025,456 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
1025
1025
  workspacePath?: string | null | undefined;
1026
1026
  config?: Record<string, unknown> | null | undefined;
1027
1027
  }>;
1028
- type PromptTemplateInput = CodeJudgeInput;
1028
+ type PromptTemplateInput = CodeGraderInput;
1029
+ /** @deprecated Use CodeGraderInputSchema */
1030
+ declare const CodeJudgeInputSchema: z.ZodObject<{
1031
+ question: z.ZodString;
1032
+ criteria: z.ZodString;
1033
+ expectedOutput: z.ZodArray<z.ZodObject<{
1034
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
1035
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
1036
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
1037
+ tool: z.ZodString;
1038
+ input: z.ZodOptional<z.ZodUnknown>;
1039
+ output: z.ZodOptional<z.ZodUnknown>;
1040
+ id: z.ZodOptional<z.ZodString>;
1041
+ startTime: z.ZodOptional<z.ZodString>;
1042
+ endTime: z.ZodOptional<z.ZodString>;
1043
+ durationMs: z.ZodOptional<z.ZodNumber>;
1044
+ }, "strip", z.ZodTypeAny, {
1045
+ tool: string;
1046
+ input?: unknown;
1047
+ output?: unknown;
1048
+ id?: string | undefined;
1049
+ startTime?: string | undefined;
1050
+ endTime?: string | undefined;
1051
+ durationMs?: number | undefined;
1052
+ }, {
1053
+ tool: string;
1054
+ input?: unknown;
1055
+ output?: unknown;
1056
+ id?: string | undefined;
1057
+ startTime?: string | undefined;
1058
+ endTime?: string | undefined;
1059
+ durationMs?: number | undefined;
1060
+ }>, "many">>;
1061
+ name: z.ZodOptional<z.ZodString>;
1062
+ startTime: z.ZodOptional<z.ZodString>;
1063
+ endTime: z.ZodOptional<z.ZodString>;
1064
+ durationMs: z.ZodOptional<z.ZodNumber>;
1065
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
1066
+ }, "strip", z.ZodTypeAny, {
1067
+ role: "tool" | "assistant" | "user" | "system";
1068
+ startTime?: string | undefined;
1069
+ endTime?: string | undefined;
1070
+ durationMs?: number | undefined;
1071
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1072
+ toolCalls?: {
1073
+ tool: string;
1074
+ input?: unknown;
1075
+ output?: unknown;
1076
+ id?: string | undefined;
1077
+ startTime?: string | undefined;
1078
+ endTime?: string | undefined;
1079
+ durationMs?: number | undefined;
1080
+ }[] | undefined;
1081
+ name?: string | undefined;
1082
+ metadata?: Record<string, unknown> | undefined;
1083
+ }, {
1084
+ role: "tool" | "assistant" | "user" | "system";
1085
+ startTime?: string | undefined;
1086
+ endTime?: string | undefined;
1087
+ durationMs?: number | undefined;
1088
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1089
+ toolCalls?: {
1090
+ tool: string;
1091
+ input?: unknown;
1092
+ output?: unknown;
1093
+ id?: string | undefined;
1094
+ startTime?: string | undefined;
1095
+ endTime?: string | undefined;
1096
+ durationMs?: number | undefined;
1097
+ }[] | undefined;
1098
+ name?: string | undefined;
1099
+ metadata?: Record<string, unknown> | undefined;
1100
+ }>, "many">;
1101
+ referenceAnswer: z.ZodOptional<z.ZodString>;
1102
+ answer: z.ZodString;
1103
+ output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
1104
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
1105
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
1106
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
1107
+ tool: z.ZodString;
1108
+ input: z.ZodOptional<z.ZodUnknown>;
1109
+ output: z.ZodOptional<z.ZodUnknown>;
1110
+ id: z.ZodOptional<z.ZodString>;
1111
+ startTime: z.ZodOptional<z.ZodString>;
1112
+ endTime: z.ZodOptional<z.ZodString>;
1113
+ durationMs: z.ZodOptional<z.ZodNumber>;
1114
+ }, "strip", z.ZodTypeAny, {
1115
+ tool: string;
1116
+ input?: unknown;
1117
+ output?: unknown;
1118
+ id?: string | undefined;
1119
+ startTime?: string | undefined;
1120
+ endTime?: string | undefined;
1121
+ durationMs?: number | undefined;
1122
+ }, {
1123
+ tool: string;
1124
+ input?: unknown;
1125
+ output?: unknown;
1126
+ id?: string | undefined;
1127
+ startTime?: string | undefined;
1128
+ endTime?: string | undefined;
1129
+ durationMs?: number | undefined;
1130
+ }>, "many">>;
1131
+ name: z.ZodOptional<z.ZodString>;
1132
+ startTime: z.ZodOptional<z.ZodString>;
1133
+ endTime: z.ZodOptional<z.ZodString>;
1134
+ durationMs: z.ZodOptional<z.ZodNumber>;
1135
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
1136
+ }, "strip", z.ZodTypeAny, {
1137
+ role: "tool" | "assistant" | "user" | "system";
1138
+ startTime?: string | undefined;
1139
+ endTime?: string | undefined;
1140
+ durationMs?: number | undefined;
1141
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1142
+ toolCalls?: {
1143
+ tool: string;
1144
+ input?: unknown;
1145
+ output?: unknown;
1146
+ id?: string | undefined;
1147
+ startTime?: string | undefined;
1148
+ endTime?: string | undefined;
1149
+ durationMs?: number | undefined;
1150
+ }[] | undefined;
1151
+ name?: string | undefined;
1152
+ metadata?: Record<string, unknown> | undefined;
1153
+ }, {
1154
+ role: "tool" | "assistant" | "user" | "system";
1155
+ startTime?: string | undefined;
1156
+ endTime?: string | undefined;
1157
+ durationMs?: number | undefined;
1158
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1159
+ toolCalls?: {
1160
+ tool: string;
1161
+ input?: unknown;
1162
+ output?: unknown;
1163
+ id?: string | undefined;
1164
+ startTime?: string | undefined;
1165
+ endTime?: string | undefined;
1166
+ durationMs?: number | undefined;
1167
+ }[] | undefined;
1168
+ name?: string | undefined;
1169
+ metadata?: Record<string, unknown> | undefined;
1170
+ }>, "many">>>;
1171
+ /** Path to a temp file containing the output JSON (used for large payloads). */
1172
+ outputPath: z.ZodOptional<z.ZodString>;
1173
+ guidelineFiles: z.ZodArray<z.ZodString, "many">;
1174
+ inputFiles: z.ZodArray<z.ZodString, "many">;
1175
+ input: z.ZodArray<z.ZodObject<{
1176
+ role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
1177
+ content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
1178
+ toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
1179
+ tool: z.ZodString;
1180
+ input: z.ZodOptional<z.ZodUnknown>;
1181
+ output: z.ZodOptional<z.ZodUnknown>;
1182
+ id: z.ZodOptional<z.ZodString>;
1183
+ startTime: z.ZodOptional<z.ZodString>;
1184
+ endTime: z.ZodOptional<z.ZodString>;
1185
+ durationMs: z.ZodOptional<z.ZodNumber>;
1186
+ }, "strip", z.ZodTypeAny, {
1187
+ tool: string;
1188
+ input?: unknown;
1189
+ output?: unknown;
1190
+ id?: string | undefined;
1191
+ startTime?: string | undefined;
1192
+ endTime?: string | undefined;
1193
+ durationMs?: number | undefined;
1194
+ }, {
1195
+ tool: string;
1196
+ input?: unknown;
1197
+ output?: unknown;
1198
+ id?: string | undefined;
1199
+ startTime?: string | undefined;
1200
+ endTime?: string | undefined;
1201
+ durationMs?: number | undefined;
1202
+ }>, "many">>;
1203
+ name: z.ZodOptional<z.ZodString>;
1204
+ startTime: z.ZodOptional<z.ZodString>;
1205
+ endTime: z.ZodOptional<z.ZodString>;
1206
+ durationMs: z.ZodOptional<z.ZodNumber>;
1207
+ metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
1208
+ }, "strip", z.ZodTypeAny, {
1209
+ role: "tool" | "assistant" | "user" | "system";
1210
+ startTime?: string | undefined;
1211
+ endTime?: string | undefined;
1212
+ durationMs?: number | undefined;
1213
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1214
+ toolCalls?: {
1215
+ tool: string;
1216
+ input?: unknown;
1217
+ output?: unknown;
1218
+ id?: string | undefined;
1219
+ startTime?: string | undefined;
1220
+ endTime?: string | undefined;
1221
+ durationMs?: number | undefined;
1222
+ }[] | undefined;
1223
+ name?: string | undefined;
1224
+ metadata?: Record<string, unknown> | undefined;
1225
+ }, {
1226
+ role: "tool" | "assistant" | "user" | "system";
1227
+ startTime?: string | undefined;
1228
+ endTime?: string | undefined;
1229
+ durationMs?: number | undefined;
1230
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1231
+ toolCalls?: {
1232
+ tool: string;
1233
+ input?: unknown;
1234
+ output?: unknown;
1235
+ id?: string | undefined;
1236
+ startTime?: string | undefined;
1237
+ endTime?: string | undefined;
1238
+ durationMs?: number | undefined;
1239
+ }[] | undefined;
1240
+ name?: string | undefined;
1241
+ metadata?: Record<string, unknown> | undefined;
1242
+ }>, "many">;
1243
+ trace: z.ZodOptional<z.ZodNullable<z.ZodObject<{
1244
+ eventCount: z.ZodNumber;
1245
+ toolNames: z.ZodArray<z.ZodString, "many">;
1246
+ toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
1247
+ errorCount: z.ZodNumber;
1248
+ toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
1249
+ llmCallCount: z.ZodOptional<z.ZodNumber>;
1250
+ }, "strip", z.ZodTypeAny, {
1251
+ eventCount: number;
1252
+ toolNames: string[];
1253
+ toolCallsByName: Record<string, number>;
1254
+ errorCount: number;
1255
+ toolDurations?: Record<string, number[]> | undefined;
1256
+ llmCallCount?: number | undefined;
1257
+ }, {
1258
+ eventCount: number;
1259
+ toolNames: string[];
1260
+ toolCallsByName: Record<string, number>;
1261
+ errorCount: number;
1262
+ toolDurations?: Record<string, number[]> | undefined;
1263
+ llmCallCount?: number | undefined;
1264
+ }>>>;
1265
+ tokenUsage: z.ZodOptional<z.ZodNullable<z.ZodObject<{
1266
+ input: z.ZodNumber;
1267
+ output: z.ZodNumber;
1268
+ cached: z.ZodOptional<z.ZodNumber>;
1269
+ }, "strip", z.ZodTypeAny, {
1270
+ input: number;
1271
+ output: number;
1272
+ cached?: number | undefined;
1273
+ }, {
1274
+ input: number;
1275
+ output: number;
1276
+ cached?: number | undefined;
1277
+ }>>>;
1278
+ costUsd: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
1279
+ durationMs: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
1280
+ startTime: z.ZodOptional<z.ZodNullable<z.ZodString>>;
1281
+ endTime: z.ZodOptional<z.ZodNullable<z.ZodString>>;
1282
+ fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
1283
+ workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
1284
+ config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
1285
+ }, "strip", z.ZodTypeAny, {
1286
+ input: {
1287
+ role: "tool" | "assistant" | "user" | "system";
1288
+ startTime?: string | undefined;
1289
+ endTime?: string | undefined;
1290
+ durationMs?: number | undefined;
1291
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1292
+ toolCalls?: {
1293
+ tool: string;
1294
+ input?: unknown;
1295
+ output?: unknown;
1296
+ id?: string | undefined;
1297
+ startTime?: string | undefined;
1298
+ endTime?: string | undefined;
1299
+ durationMs?: number | undefined;
1300
+ }[] | undefined;
1301
+ name?: string | undefined;
1302
+ metadata?: Record<string, unknown> | undefined;
1303
+ }[];
1304
+ question: string;
1305
+ criteria: string;
1306
+ expectedOutput: {
1307
+ role: "tool" | "assistant" | "user" | "system";
1308
+ startTime?: string | undefined;
1309
+ endTime?: string | undefined;
1310
+ durationMs?: number | undefined;
1311
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1312
+ toolCalls?: {
1313
+ tool: string;
1314
+ input?: unknown;
1315
+ output?: unknown;
1316
+ id?: string | undefined;
1317
+ startTime?: string | undefined;
1318
+ endTime?: string | undefined;
1319
+ durationMs?: number | undefined;
1320
+ }[] | undefined;
1321
+ name?: string | undefined;
1322
+ metadata?: Record<string, unknown> | undefined;
1323
+ }[];
1324
+ answer: string;
1325
+ guidelineFiles: string[];
1326
+ inputFiles: string[];
1327
+ output?: {
1328
+ role: "tool" | "assistant" | "user" | "system";
1329
+ startTime?: string | undefined;
1330
+ endTime?: string | undefined;
1331
+ durationMs?: number | undefined;
1332
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1333
+ toolCalls?: {
1334
+ tool: string;
1335
+ input?: unknown;
1336
+ output?: unknown;
1337
+ id?: string | undefined;
1338
+ startTime?: string | undefined;
1339
+ endTime?: string | undefined;
1340
+ durationMs?: number | undefined;
1341
+ }[] | undefined;
1342
+ name?: string | undefined;
1343
+ metadata?: Record<string, unknown> | undefined;
1344
+ }[] | null | undefined;
1345
+ startTime?: string | null | undefined;
1346
+ endTime?: string | null | undefined;
1347
+ durationMs?: number | null | undefined;
1348
+ referenceAnswer?: string | undefined;
1349
+ outputPath?: string | undefined;
1350
+ trace?: {
1351
+ eventCount: number;
1352
+ toolNames: string[];
1353
+ toolCallsByName: Record<string, number>;
1354
+ errorCount: number;
1355
+ toolDurations?: Record<string, number[]> | undefined;
1356
+ llmCallCount?: number | undefined;
1357
+ } | null | undefined;
1358
+ tokenUsage?: {
1359
+ input: number;
1360
+ output: number;
1361
+ cached?: number | undefined;
1362
+ } | null | undefined;
1363
+ costUsd?: number | null | undefined;
1364
+ fileChanges?: string | null | undefined;
1365
+ workspacePath?: string | null | undefined;
1366
+ config?: Record<string, unknown> | null | undefined;
1367
+ }, {
1368
+ input: {
1369
+ role: "tool" | "assistant" | "user" | "system";
1370
+ startTime?: string | undefined;
1371
+ endTime?: string | undefined;
1372
+ durationMs?: number | undefined;
1373
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1374
+ toolCalls?: {
1375
+ tool: string;
1376
+ input?: unknown;
1377
+ output?: unknown;
1378
+ id?: string | undefined;
1379
+ startTime?: string | undefined;
1380
+ endTime?: string | undefined;
1381
+ durationMs?: number | undefined;
1382
+ }[] | undefined;
1383
+ name?: string | undefined;
1384
+ metadata?: Record<string, unknown> | undefined;
1385
+ }[];
1386
+ question: string;
1387
+ criteria: string;
1388
+ expectedOutput: {
1389
+ role: "tool" | "assistant" | "user" | "system";
1390
+ startTime?: string | undefined;
1391
+ endTime?: string | undefined;
1392
+ durationMs?: number | undefined;
1393
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1394
+ toolCalls?: {
1395
+ tool: string;
1396
+ input?: unknown;
1397
+ output?: unknown;
1398
+ id?: string | undefined;
1399
+ startTime?: string | undefined;
1400
+ endTime?: string | undefined;
1401
+ durationMs?: number | undefined;
1402
+ }[] | undefined;
1403
+ name?: string | undefined;
1404
+ metadata?: Record<string, unknown> | undefined;
1405
+ }[];
1406
+ answer: string;
1407
+ guidelineFiles: string[];
1408
+ inputFiles: string[];
1409
+ output?: {
1410
+ role: "tool" | "assistant" | "user" | "system";
1411
+ startTime?: string | undefined;
1412
+ endTime?: string | undefined;
1413
+ durationMs?: number | undefined;
1414
+ content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
1415
+ toolCalls?: {
1416
+ tool: string;
1417
+ input?: unknown;
1418
+ output?: unknown;
1419
+ id?: string | undefined;
1420
+ startTime?: string | undefined;
1421
+ endTime?: string | undefined;
1422
+ durationMs?: number | undefined;
1423
+ }[] | undefined;
1424
+ name?: string | undefined;
1425
+ metadata?: Record<string, unknown> | undefined;
1426
+ }[] | null | undefined;
1427
+ startTime?: string | null | undefined;
1428
+ endTime?: string | null | undefined;
1429
+ durationMs?: number | null | undefined;
1430
+ referenceAnswer?: string | undefined;
1431
+ outputPath?: string | undefined;
1432
+ trace?: {
1433
+ eventCount: number;
1434
+ toolNames: string[];
1435
+ toolCallsByName: Record<string, number>;
1436
+ errorCount: number;
1437
+ toolDurations?: Record<string, number[]> | undefined;
1438
+ llmCallCount?: number | undefined;
1439
+ } | null | undefined;
1440
+ tokenUsage?: {
1441
+ input: number;
1442
+ output: number;
1443
+ cached?: number | undefined;
1444
+ } | null | undefined;
1445
+ costUsd?: number | null | undefined;
1446
+ fileChanges?: string | null | undefined;
1447
+ workspacePath?: string | null | undefined;
1448
+ config?: Record<string, unknown> | null | undefined;
1449
+ }>;
1450
+ /** @deprecated Use CodeGraderResultSchema */
1451
+ declare const CodeJudgeResultSchema: z.ZodObject<{
1452
+ score: z.ZodNumber;
1453
+ hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
1454
+ misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
1455
+ reasoning: z.ZodOptional<z.ZodString>;
1456
+ /** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
1457
+ details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
1458
+ }, "strip", z.ZodTypeAny, {
1459
+ score: number;
1460
+ hits: string[];
1461
+ misses: string[];
1462
+ reasoning?: string | undefined;
1463
+ details?: Record<string, unknown> | undefined;
1464
+ }, {
1465
+ score: number;
1466
+ hits?: string[] | undefined;
1467
+ misses?: string[] | undefined;
1468
+ reasoning?: string | undefined;
1469
+ details?: Record<string, unknown> | undefined;
1470
+ }>;
1471
+ /** @deprecated Use CodeGraderInput */
1472
+ type CodeJudgeInput = CodeGraderInput;
1473
+ /** @deprecated Use CodeGraderResult */
1474
+ type CodeJudgeResult = CodeGraderResult;
1029
1475
 
1030
1476
  /**
1031
- * Client for invoking configured targets from code-judge scripts.
1477
+ * Client for invoking configured targets from code-grader scripts.
1032
1478
  *
1033
1479
  * Environment variables (set automatically by AgentV when `target` config is present):
1034
1480
  * - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server
@@ -1108,16 +1554,16 @@ declare class TargetInvocationError extends Error {
1108
1554
  *
1109
1555
  * This function reads the proxy URL and token from environment variables
1110
1556
  * that are automatically set by AgentV when a `target` config block is present
1111
- * on a `code_judge` evaluator.
1557
+ * on a `code_grader` (or `code_judge`) evaluator.
1112
1558
  *
1113
1559
  * @returns A target client if environment variables are set, otherwise undefined
1114
1560
  * @throws TargetNotAvailableError if token is missing when URL is present
1115
1561
  *
1116
1562
  * @example
1117
1563
  * ```typescript
1118
- * import { createTargetClient, defineCodeJudge } from '@agentv/eval';
1564
+ * import { createTargetClient, defineCodeGrader } from '@agentv/eval';
1119
1565
  *
1120
- * export default defineCodeJudge(async ({ question, criteria }) => {
1566
+ * export default defineCodeGrader(async ({ question, criteria }) => {
1121
1567
  * const target = createTargetClient();
1122
1568
  *
1123
1569
  * if (!target) {
@@ -1139,15 +1585,15 @@ declare function createTargetClient(): TargetClient | undefined;
1139
1585
 
1140
1586
  /**
1141
1587
  * Context provided to assertion handlers.
1142
- * Same shape as CodeJudgeInput — assertions receive full evaluation context.
1588
+ * Same shape as CodeGraderInput — assertions receive full evaluation context.
1143
1589
  */
1144
- type AssertionContext = CodeJudgeInput;
1590
+ type AssertionContext = CodeGraderInput;
1145
1591
  /**
1146
1592
  * Known built-in assertion types. Custom types are extensible via string.
1147
1593
  *
1148
- * Use in EVAL.yaml `assert` blocks:
1594
+ * Use in EVAL.yaml `assertions` blocks:
1149
1595
  * ```yaml
1150
- * assert:
1596
+ * assertions:
1151
1597
  * - type: contains
1152
1598
  * value: "Paris"
1153
1599
  * ```
@@ -1156,7 +1602,7 @@ type AssertionContext = CodeJudgeInput;
1156
1602
  * are also valid — the `string & {}` escape hatch provides autocomplete
1157
1603
  * for known types while accepting any string.
1158
1604
  */
1159
- type AssertionType = 'llm-judge' | 'code-judge' | 'rubrics' | 'composite' | 'tool-trajectory' | 'field-accuracy' | 'latency' | 'cost' | 'token-usage' | 'execution-metrics' | 'agent-judge' | 'contains' | 'contains-any' | 'contains-all' | 'icontains' | 'icontains-any' | 'icontains-all' | 'starts-with' | 'ends-with' | 'equals' | 'regex' | 'is-json' | 'llm_judge' | 'code_judge' | 'tool_trajectory' | 'field_accuracy' | 'token_usage' | 'execution_metrics' | 'agent_judge' | 'contains_any' | 'contains_all' | 'icontains_any' | 'icontains_all' | 'starts_with' | 'ends_with' | 'is_json' | (string & {});
1605
+ type AssertionType = 'llm-grader' | 'code-grader' | 'rubrics' | 'composite' | 'tool-trajectory' | 'field-accuracy' | 'latency' | 'cost' | 'token-usage' | 'execution-metrics' | 'skill-trigger' | 'contains' | 'contains-any' | 'contains-all' | 'icontains' | 'icontains-any' | 'icontains-all' | 'starts-with' | 'ends-with' | 'equals' | 'regex' | 'is-json' | 'llm-judge' | 'code-judge' | 'llm_judge' | 'code_judge' | 'llm_grader' | 'code_grader' | 'tool_trajectory' | 'field_accuracy' | 'token_usage' | 'execution_metrics' | 'contains_any' | 'contains_all' | 'icontains_any' | 'icontains_all' | 'starts_with' | 'ends_with' | 'is_json' | (string & {});
1160
1606
  /**
1161
1607
  * Result returned from an assertion handler.
1162
1608
  *
@@ -1201,9 +1647,11 @@ type AssertionHandler = (ctx: AssertionContext) => AssertionScore | Promise<Asse
1201
1647
  type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise<string>;
1202
1648
 
1203
1649
  /**
1204
- * Handler function type for code judges.
1650
+ * Handler function type for code graders.
1205
1651
  */
1206
- type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<CodeJudgeResult>;
1652
+ type CodeGraderHandler = (input: CodeGraderInput) => CodeGraderResult | Promise<CodeGraderResult>;
1653
+ /** @deprecated Use CodeGraderHandler */
1654
+ type CodeJudgeHandler = CodeGraderHandler;
1207
1655
 
1208
1656
  /**
1209
1657
  * AgentV Evaluation SDK
@@ -1221,24 +1669,24 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
1221
1669
  * }));
1222
1670
  * ```
1223
1671
  *
1224
- * @example Code judge (full control)
1672
+ * @example Code grader (full control)
1225
1673
  * ```typescript
1226
1674
  * #!/usr/bin/env bun
1227
- * import { defineCodeJudge } from '@agentv/eval';
1675
+ * import { defineCodeGrader } from '@agentv/eval';
1228
1676
  *
1229
- * export default defineCodeJudge(({ trace, answer }) => ({
1677
+ * export default defineCodeGrader(({ trace, answer }) => ({
1230
1678
  * score: trace?.eventCount <= 5 ? 1.0 : 0.5,
1231
1679
  * hits: ['Efficient tool usage'],
1232
1680
  * misses: [],
1233
1681
  * }));
1234
1682
  * ```
1235
1683
  *
1236
- * @example Code judge with target access (requires `target` config in YAML)
1684
+ * @example Code grader with target access (requires `target` config in YAML)
1237
1685
  * ```typescript
1238
1686
  * #!/usr/bin/env bun
1239
- * import { defineCodeJudge, createTargetClient } from '@agentv/eval';
1687
+ * import { defineCodeGrader, createTargetClient } from '@agentv/eval';
1240
1688
  *
1241
- * export default defineCodeJudge(async ({ question }) => {
1689
+ * export default defineCodeGrader(async ({ question }) => {
1242
1690
  * const target = createTargetClient();
1243
1691
  * if (!target) {
1244
1692
  * return { score: 0, misses: ['Target not available'] };
@@ -1258,7 +1706,7 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
1258
1706
  */
1259
1707
 
1260
1708
  /**
1261
- * Define a code judge evaluator with automatic stdin/stdout handling.
1709
+ * Define a code grader evaluator with automatic stdin/stdout handling.
1262
1710
  *
1263
1711
  * This function:
1264
1712
  * 1. Reads JSON from stdin (snake_case format)
@@ -1271,9 +1719,9 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
1271
1719
  *
1272
1720
  * @example
1273
1721
  * ```typescript
1274
- * import { defineCodeJudge } from '@agentv/eval';
1722
+ * import { defineCodeGrader } from '@agentv/eval';
1275
1723
  *
1276
- * export default defineCodeJudge(({ trace }) => {
1724
+ * export default defineCodeGrader(({ trace }) => {
1277
1725
  * if (!trace) {
1278
1726
  * return { score: 0.5, reasoning: 'No trace available' };
1279
1727
  * }
@@ -1289,19 +1737,21 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
1289
1737
  *
1290
1738
  * @example With typed config
1291
1739
  * ```typescript
1292
- * import { defineCodeJudge, z } from '@agentv/eval';
1740
+ * import { defineCodeGrader, z } from '@agentv/eval';
1293
1741
  *
1294
1742
  * const ConfigSchema = z.object({
1295
1743
  * maxToolCalls: z.number().default(10),
1296
1744
  * });
1297
1745
  *
1298
- * export default defineCodeJudge(({ trace, config }) => {
1746
+ * export default defineCodeGrader(({ trace, config }) => {
1299
1747
  * const { maxToolCalls } = ConfigSchema.parse(config ?? {});
1300
1748
  * // Use maxToolCalls...
1301
1749
  * });
1302
1750
  * ```
1303
1751
  */
1304
- declare function defineCodeJudge(handler: CodeJudgeHandler): void;
1752
+ declare function defineCodeGrader(handler: CodeGraderHandler): void;
1753
+ /** @deprecated Use defineCodeGrader */
1754
+ declare const defineCodeJudge: typeof defineCodeGrader;
1305
1755
  /**
1306
1756
  * Define a prompt template with automatic stdin/stdout handling.
1307
1757
  *
@@ -1387,4 +1837,4 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
1387
1837
  */
1388
1838
  declare function defineAssertion(handler: AssertionHandler): void;
1389
1839
 
1390
- export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineAssertion, defineCodeJudge, definePromptTemplate };
1840
+ export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeGraderHandler, type CodeGraderInput, CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineAssertion, defineCodeGrader, defineCodeJudge, definePromptTemplate };