@agentv/eval 2.19.0 → 3.0.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/index.cjs +19 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +483 -33
- package/dist/index.d.ts +483 -33
- package/dist/index.js +16 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { z } from 'zod';
|
|
|
2
2
|
export { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
* Zod schemas for code
|
|
5
|
+
* Zod schemas for code grader input/output validation.
|
|
6
6
|
* Provides both compile-time types and runtime validation.
|
|
7
7
|
*/
|
|
8
8
|
|
|
@@ -147,9 +147,9 @@ declare const MessageSchema: z.ZodObject<{
|
|
|
147
147
|
metadata?: Record<string, unknown> | undefined;
|
|
148
148
|
}>;
|
|
149
149
|
/**
|
|
150
|
-
* Code
|
|
150
|
+
* Code grader input schema (camelCase, converted from snake_case wire format).
|
|
151
151
|
*/
|
|
152
|
-
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
152
|
+
declare const CodeGraderInputSchema: z.ZodObject<{
|
|
153
153
|
question: z.ZodString;
|
|
154
154
|
criteria: z.ZodString;
|
|
155
155
|
expectedOutput: z.ZodArray<z.ZodObject<{
|
|
@@ -570,9 +570,9 @@ declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
|
570
570
|
config?: Record<string, unknown> | null | undefined;
|
|
571
571
|
}>;
|
|
572
572
|
/**
|
|
573
|
-
* Code
|
|
573
|
+
* Code grader result schema (validated before output).
|
|
574
574
|
*/
|
|
575
|
-
declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
575
|
+
declare const CodeGraderResultSchema: z.ZodObject<{
|
|
576
576
|
score: z.ZodNumber;
|
|
577
577
|
hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
578
578
|
misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
@@ -595,15 +595,15 @@ declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
|
595
595
|
/**
|
|
596
596
|
* Inferred types from schemas.
|
|
597
597
|
*/
|
|
598
|
-
type CodeJudgeInput = z.infer<typeof CodeJudgeInputSchema>;
|
|
599
|
-
type CodeJudgeResult = z.infer<typeof CodeJudgeResultSchema>;
|
|
598
|
+
type CodeGraderInput = z.infer<typeof CodeGraderInputSchema>;
|
|
599
|
+
type CodeGraderResult = z.infer<typeof CodeGraderResultSchema>;
|
|
600
600
|
type TraceSummary = z.infer<typeof TraceSummarySchema>;
|
|
601
601
|
type Message = z.infer<typeof MessageSchema>;
|
|
602
602
|
type ToolCall = z.infer<typeof ToolCallSchema>;
|
|
603
603
|
type TokenUsage = z.infer<typeof TokenUsageSchema>;
|
|
604
604
|
/**
|
|
605
605
|
* Prompt template input schema (camelCase, converted from snake_case wire format).
|
|
606
|
-
* Uses the same schema as
|
|
606
|
+
* Uses the same schema as CodeGraderInput since the orchestrator sends identical payloads.
|
|
607
607
|
*/
|
|
608
608
|
declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
609
609
|
question: z.ZodString;
|
|
@@ -1025,10 +1025,456 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
|
|
|
1025
1025
|
workspacePath?: string | null | undefined;
|
|
1026
1026
|
config?: Record<string, unknown> | null | undefined;
|
|
1027
1027
|
}>;
|
|
1028
|
-
type PromptTemplateInput =
|
|
1028
|
+
type PromptTemplateInput = CodeGraderInput;
|
|
1029
|
+
/** @deprecated Use CodeGraderInputSchema */
|
|
1030
|
+
declare const CodeJudgeInputSchema: z.ZodObject<{
|
|
1031
|
+
question: z.ZodString;
|
|
1032
|
+
criteria: z.ZodString;
|
|
1033
|
+
expectedOutput: z.ZodArray<z.ZodObject<{
|
|
1034
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
1035
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
1036
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
1037
|
+
tool: z.ZodString;
|
|
1038
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
1039
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
1040
|
+
id: z.ZodOptional<z.ZodString>;
|
|
1041
|
+
startTime: z.ZodOptional<z.ZodString>;
|
|
1042
|
+
endTime: z.ZodOptional<z.ZodString>;
|
|
1043
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
1044
|
+
}, "strip", z.ZodTypeAny, {
|
|
1045
|
+
tool: string;
|
|
1046
|
+
input?: unknown;
|
|
1047
|
+
output?: unknown;
|
|
1048
|
+
id?: string | undefined;
|
|
1049
|
+
startTime?: string | undefined;
|
|
1050
|
+
endTime?: string | undefined;
|
|
1051
|
+
durationMs?: number | undefined;
|
|
1052
|
+
}, {
|
|
1053
|
+
tool: string;
|
|
1054
|
+
input?: unknown;
|
|
1055
|
+
output?: unknown;
|
|
1056
|
+
id?: string | undefined;
|
|
1057
|
+
startTime?: string | undefined;
|
|
1058
|
+
endTime?: string | undefined;
|
|
1059
|
+
durationMs?: number | undefined;
|
|
1060
|
+
}>, "many">>;
|
|
1061
|
+
name: z.ZodOptional<z.ZodString>;
|
|
1062
|
+
startTime: z.ZodOptional<z.ZodString>;
|
|
1063
|
+
endTime: z.ZodOptional<z.ZodString>;
|
|
1064
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
1065
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
1066
|
+
}, "strip", z.ZodTypeAny, {
|
|
1067
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1068
|
+
startTime?: string | undefined;
|
|
1069
|
+
endTime?: string | undefined;
|
|
1070
|
+
durationMs?: number | undefined;
|
|
1071
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1072
|
+
toolCalls?: {
|
|
1073
|
+
tool: string;
|
|
1074
|
+
input?: unknown;
|
|
1075
|
+
output?: unknown;
|
|
1076
|
+
id?: string | undefined;
|
|
1077
|
+
startTime?: string | undefined;
|
|
1078
|
+
endTime?: string | undefined;
|
|
1079
|
+
durationMs?: number | undefined;
|
|
1080
|
+
}[] | undefined;
|
|
1081
|
+
name?: string | undefined;
|
|
1082
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1083
|
+
}, {
|
|
1084
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1085
|
+
startTime?: string | undefined;
|
|
1086
|
+
endTime?: string | undefined;
|
|
1087
|
+
durationMs?: number | undefined;
|
|
1088
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1089
|
+
toolCalls?: {
|
|
1090
|
+
tool: string;
|
|
1091
|
+
input?: unknown;
|
|
1092
|
+
output?: unknown;
|
|
1093
|
+
id?: string | undefined;
|
|
1094
|
+
startTime?: string | undefined;
|
|
1095
|
+
endTime?: string | undefined;
|
|
1096
|
+
durationMs?: number | undefined;
|
|
1097
|
+
}[] | undefined;
|
|
1098
|
+
name?: string | undefined;
|
|
1099
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1100
|
+
}>, "many">;
|
|
1101
|
+
referenceAnswer: z.ZodOptional<z.ZodString>;
|
|
1102
|
+
answer: z.ZodString;
|
|
1103
|
+
output: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodObject<{
|
|
1104
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
1105
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
1106
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
1107
|
+
tool: z.ZodString;
|
|
1108
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
1109
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
1110
|
+
id: z.ZodOptional<z.ZodString>;
|
|
1111
|
+
startTime: z.ZodOptional<z.ZodString>;
|
|
1112
|
+
endTime: z.ZodOptional<z.ZodString>;
|
|
1113
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
1114
|
+
}, "strip", z.ZodTypeAny, {
|
|
1115
|
+
tool: string;
|
|
1116
|
+
input?: unknown;
|
|
1117
|
+
output?: unknown;
|
|
1118
|
+
id?: string | undefined;
|
|
1119
|
+
startTime?: string | undefined;
|
|
1120
|
+
endTime?: string | undefined;
|
|
1121
|
+
durationMs?: number | undefined;
|
|
1122
|
+
}, {
|
|
1123
|
+
tool: string;
|
|
1124
|
+
input?: unknown;
|
|
1125
|
+
output?: unknown;
|
|
1126
|
+
id?: string | undefined;
|
|
1127
|
+
startTime?: string | undefined;
|
|
1128
|
+
endTime?: string | undefined;
|
|
1129
|
+
durationMs?: number | undefined;
|
|
1130
|
+
}>, "many">>;
|
|
1131
|
+
name: z.ZodOptional<z.ZodString>;
|
|
1132
|
+
startTime: z.ZodOptional<z.ZodString>;
|
|
1133
|
+
endTime: z.ZodOptional<z.ZodString>;
|
|
1134
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
1135
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
1136
|
+
}, "strip", z.ZodTypeAny, {
|
|
1137
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1138
|
+
startTime?: string | undefined;
|
|
1139
|
+
endTime?: string | undefined;
|
|
1140
|
+
durationMs?: number | undefined;
|
|
1141
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1142
|
+
toolCalls?: {
|
|
1143
|
+
tool: string;
|
|
1144
|
+
input?: unknown;
|
|
1145
|
+
output?: unknown;
|
|
1146
|
+
id?: string | undefined;
|
|
1147
|
+
startTime?: string | undefined;
|
|
1148
|
+
endTime?: string | undefined;
|
|
1149
|
+
durationMs?: number | undefined;
|
|
1150
|
+
}[] | undefined;
|
|
1151
|
+
name?: string | undefined;
|
|
1152
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1153
|
+
}, {
|
|
1154
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1155
|
+
startTime?: string | undefined;
|
|
1156
|
+
endTime?: string | undefined;
|
|
1157
|
+
durationMs?: number | undefined;
|
|
1158
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1159
|
+
toolCalls?: {
|
|
1160
|
+
tool: string;
|
|
1161
|
+
input?: unknown;
|
|
1162
|
+
output?: unknown;
|
|
1163
|
+
id?: string | undefined;
|
|
1164
|
+
startTime?: string | undefined;
|
|
1165
|
+
endTime?: string | undefined;
|
|
1166
|
+
durationMs?: number | undefined;
|
|
1167
|
+
}[] | undefined;
|
|
1168
|
+
name?: string | undefined;
|
|
1169
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1170
|
+
}>, "many">>>;
|
|
1171
|
+
/** Path to a temp file containing the output JSON (used for large payloads). */
|
|
1172
|
+
outputPath: z.ZodOptional<z.ZodString>;
|
|
1173
|
+
guidelineFiles: z.ZodArray<z.ZodString, "many">;
|
|
1174
|
+
inputFiles: z.ZodArray<z.ZodString, "many">;
|
|
1175
|
+
input: z.ZodArray<z.ZodObject<{
|
|
1176
|
+
role: z.ZodEnum<["assistant", "user", "system", "tool"]>;
|
|
1177
|
+
content: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnknown>, z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>, "many">]>>;
|
|
1178
|
+
toolCalls: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
1179
|
+
tool: z.ZodString;
|
|
1180
|
+
input: z.ZodOptional<z.ZodUnknown>;
|
|
1181
|
+
output: z.ZodOptional<z.ZodUnknown>;
|
|
1182
|
+
id: z.ZodOptional<z.ZodString>;
|
|
1183
|
+
startTime: z.ZodOptional<z.ZodString>;
|
|
1184
|
+
endTime: z.ZodOptional<z.ZodString>;
|
|
1185
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
1186
|
+
}, "strip", z.ZodTypeAny, {
|
|
1187
|
+
tool: string;
|
|
1188
|
+
input?: unknown;
|
|
1189
|
+
output?: unknown;
|
|
1190
|
+
id?: string | undefined;
|
|
1191
|
+
startTime?: string | undefined;
|
|
1192
|
+
endTime?: string | undefined;
|
|
1193
|
+
durationMs?: number | undefined;
|
|
1194
|
+
}, {
|
|
1195
|
+
tool: string;
|
|
1196
|
+
input?: unknown;
|
|
1197
|
+
output?: unknown;
|
|
1198
|
+
id?: string | undefined;
|
|
1199
|
+
startTime?: string | undefined;
|
|
1200
|
+
endTime?: string | undefined;
|
|
1201
|
+
durationMs?: number | undefined;
|
|
1202
|
+
}>, "many">>;
|
|
1203
|
+
name: z.ZodOptional<z.ZodString>;
|
|
1204
|
+
startTime: z.ZodOptional<z.ZodString>;
|
|
1205
|
+
endTime: z.ZodOptional<z.ZodString>;
|
|
1206
|
+
durationMs: z.ZodOptional<z.ZodNumber>;
|
|
1207
|
+
metadata: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
1208
|
+
}, "strip", z.ZodTypeAny, {
|
|
1209
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1210
|
+
startTime?: string | undefined;
|
|
1211
|
+
endTime?: string | undefined;
|
|
1212
|
+
durationMs?: number | undefined;
|
|
1213
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1214
|
+
toolCalls?: {
|
|
1215
|
+
tool: string;
|
|
1216
|
+
input?: unknown;
|
|
1217
|
+
output?: unknown;
|
|
1218
|
+
id?: string | undefined;
|
|
1219
|
+
startTime?: string | undefined;
|
|
1220
|
+
endTime?: string | undefined;
|
|
1221
|
+
durationMs?: number | undefined;
|
|
1222
|
+
}[] | undefined;
|
|
1223
|
+
name?: string | undefined;
|
|
1224
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1225
|
+
}, {
|
|
1226
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1227
|
+
startTime?: string | undefined;
|
|
1228
|
+
endTime?: string | undefined;
|
|
1229
|
+
durationMs?: number | undefined;
|
|
1230
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1231
|
+
toolCalls?: {
|
|
1232
|
+
tool: string;
|
|
1233
|
+
input?: unknown;
|
|
1234
|
+
output?: unknown;
|
|
1235
|
+
id?: string | undefined;
|
|
1236
|
+
startTime?: string | undefined;
|
|
1237
|
+
endTime?: string | undefined;
|
|
1238
|
+
durationMs?: number | undefined;
|
|
1239
|
+
}[] | undefined;
|
|
1240
|
+
name?: string | undefined;
|
|
1241
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1242
|
+
}>, "many">;
|
|
1243
|
+
trace: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
1244
|
+
eventCount: z.ZodNumber;
|
|
1245
|
+
toolNames: z.ZodArray<z.ZodString, "many">;
|
|
1246
|
+
toolCallsByName: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
1247
|
+
errorCount: z.ZodNumber;
|
|
1248
|
+
toolDurations: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodNumber, "many">>>;
|
|
1249
|
+
llmCallCount: z.ZodOptional<z.ZodNumber>;
|
|
1250
|
+
}, "strip", z.ZodTypeAny, {
|
|
1251
|
+
eventCount: number;
|
|
1252
|
+
toolNames: string[];
|
|
1253
|
+
toolCallsByName: Record<string, number>;
|
|
1254
|
+
errorCount: number;
|
|
1255
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
1256
|
+
llmCallCount?: number | undefined;
|
|
1257
|
+
}, {
|
|
1258
|
+
eventCount: number;
|
|
1259
|
+
toolNames: string[];
|
|
1260
|
+
toolCallsByName: Record<string, number>;
|
|
1261
|
+
errorCount: number;
|
|
1262
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
1263
|
+
llmCallCount?: number | undefined;
|
|
1264
|
+
}>>>;
|
|
1265
|
+
tokenUsage: z.ZodOptional<z.ZodNullable<z.ZodObject<{
|
|
1266
|
+
input: z.ZodNumber;
|
|
1267
|
+
output: z.ZodNumber;
|
|
1268
|
+
cached: z.ZodOptional<z.ZodNumber>;
|
|
1269
|
+
}, "strip", z.ZodTypeAny, {
|
|
1270
|
+
input: number;
|
|
1271
|
+
output: number;
|
|
1272
|
+
cached?: number | undefined;
|
|
1273
|
+
}, {
|
|
1274
|
+
input: number;
|
|
1275
|
+
output: number;
|
|
1276
|
+
cached?: number | undefined;
|
|
1277
|
+
}>>>;
|
|
1278
|
+
costUsd: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
1279
|
+
durationMs: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
1280
|
+
startTime: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
1281
|
+
endTime: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
1282
|
+
fileChanges: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
1283
|
+
workspacePath: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
1284
|
+
config: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
1285
|
+
}, "strip", z.ZodTypeAny, {
|
|
1286
|
+
input: {
|
|
1287
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1288
|
+
startTime?: string | undefined;
|
|
1289
|
+
endTime?: string | undefined;
|
|
1290
|
+
durationMs?: number | undefined;
|
|
1291
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1292
|
+
toolCalls?: {
|
|
1293
|
+
tool: string;
|
|
1294
|
+
input?: unknown;
|
|
1295
|
+
output?: unknown;
|
|
1296
|
+
id?: string | undefined;
|
|
1297
|
+
startTime?: string | undefined;
|
|
1298
|
+
endTime?: string | undefined;
|
|
1299
|
+
durationMs?: number | undefined;
|
|
1300
|
+
}[] | undefined;
|
|
1301
|
+
name?: string | undefined;
|
|
1302
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1303
|
+
}[];
|
|
1304
|
+
question: string;
|
|
1305
|
+
criteria: string;
|
|
1306
|
+
expectedOutput: {
|
|
1307
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1308
|
+
startTime?: string | undefined;
|
|
1309
|
+
endTime?: string | undefined;
|
|
1310
|
+
durationMs?: number | undefined;
|
|
1311
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1312
|
+
toolCalls?: {
|
|
1313
|
+
tool: string;
|
|
1314
|
+
input?: unknown;
|
|
1315
|
+
output?: unknown;
|
|
1316
|
+
id?: string | undefined;
|
|
1317
|
+
startTime?: string | undefined;
|
|
1318
|
+
endTime?: string | undefined;
|
|
1319
|
+
durationMs?: number | undefined;
|
|
1320
|
+
}[] | undefined;
|
|
1321
|
+
name?: string | undefined;
|
|
1322
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1323
|
+
}[];
|
|
1324
|
+
answer: string;
|
|
1325
|
+
guidelineFiles: string[];
|
|
1326
|
+
inputFiles: string[];
|
|
1327
|
+
output?: {
|
|
1328
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1329
|
+
startTime?: string | undefined;
|
|
1330
|
+
endTime?: string | undefined;
|
|
1331
|
+
durationMs?: number | undefined;
|
|
1332
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1333
|
+
toolCalls?: {
|
|
1334
|
+
tool: string;
|
|
1335
|
+
input?: unknown;
|
|
1336
|
+
output?: unknown;
|
|
1337
|
+
id?: string | undefined;
|
|
1338
|
+
startTime?: string | undefined;
|
|
1339
|
+
endTime?: string | undefined;
|
|
1340
|
+
durationMs?: number | undefined;
|
|
1341
|
+
}[] | undefined;
|
|
1342
|
+
name?: string | undefined;
|
|
1343
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1344
|
+
}[] | null | undefined;
|
|
1345
|
+
startTime?: string | null | undefined;
|
|
1346
|
+
endTime?: string | null | undefined;
|
|
1347
|
+
durationMs?: number | null | undefined;
|
|
1348
|
+
referenceAnswer?: string | undefined;
|
|
1349
|
+
outputPath?: string | undefined;
|
|
1350
|
+
trace?: {
|
|
1351
|
+
eventCount: number;
|
|
1352
|
+
toolNames: string[];
|
|
1353
|
+
toolCallsByName: Record<string, number>;
|
|
1354
|
+
errorCount: number;
|
|
1355
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
1356
|
+
llmCallCount?: number | undefined;
|
|
1357
|
+
} | null | undefined;
|
|
1358
|
+
tokenUsage?: {
|
|
1359
|
+
input: number;
|
|
1360
|
+
output: number;
|
|
1361
|
+
cached?: number | undefined;
|
|
1362
|
+
} | null | undefined;
|
|
1363
|
+
costUsd?: number | null | undefined;
|
|
1364
|
+
fileChanges?: string | null | undefined;
|
|
1365
|
+
workspacePath?: string | null | undefined;
|
|
1366
|
+
config?: Record<string, unknown> | null | undefined;
|
|
1367
|
+
}, {
|
|
1368
|
+
input: {
|
|
1369
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1370
|
+
startTime?: string | undefined;
|
|
1371
|
+
endTime?: string | undefined;
|
|
1372
|
+
durationMs?: number | undefined;
|
|
1373
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1374
|
+
toolCalls?: {
|
|
1375
|
+
tool: string;
|
|
1376
|
+
input?: unknown;
|
|
1377
|
+
output?: unknown;
|
|
1378
|
+
id?: string | undefined;
|
|
1379
|
+
startTime?: string | undefined;
|
|
1380
|
+
endTime?: string | undefined;
|
|
1381
|
+
durationMs?: number | undefined;
|
|
1382
|
+
}[] | undefined;
|
|
1383
|
+
name?: string | undefined;
|
|
1384
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1385
|
+
}[];
|
|
1386
|
+
question: string;
|
|
1387
|
+
criteria: string;
|
|
1388
|
+
expectedOutput: {
|
|
1389
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1390
|
+
startTime?: string | undefined;
|
|
1391
|
+
endTime?: string | undefined;
|
|
1392
|
+
durationMs?: number | undefined;
|
|
1393
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1394
|
+
toolCalls?: {
|
|
1395
|
+
tool: string;
|
|
1396
|
+
input?: unknown;
|
|
1397
|
+
output?: unknown;
|
|
1398
|
+
id?: string | undefined;
|
|
1399
|
+
startTime?: string | undefined;
|
|
1400
|
+
endTime?: string | undefined;
|
|
1401
|
+
durationMs?: number | undefined;
|
|
1402
|
+
}[] | undefined;
|
|
1403
|
+
name?: string | undefined;
|
|
1404
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1405
|
+
}[];
|
|
1406
|
+
answer: string;
|
|
1407
|
+
guidelineFiles: string[];
|
|
1408
|
+
inputFiles: string[];
|
|
1409
|
+
output?: {
|
|
1410
|
+
role: "tool" | "assistant" | "user" | "system";
|
|
1411
|
+
startTime?: string | undefined;
|
|
1412
|
+
endTime?: string | undefined;
|
|
1413
|
+
durationMs?: number | undefined;
|
|
1414
|
+
content?: string | Record<string, unknown> | Record<string, unknown>[] | undefined;
|
|
1415
|
+
toolCalls?: {
|
|
1416
|
+
tool: string;
|
|
1417
|
+
input?: unknown;
|
|
1418
|
+
output?: unknown;
|
|
1419
|
+
id?: string | undefined;
|
|
1420
|
+
startTime?: string | undefined;
|
|
1421
|
+
endTime?: string | undefined;
|
|
1422
|
+
durationMs?: number | undefined;
|
|
1423
|
+
}[] | undefined;
|
|
1424
|
+
name?: string | undefined;
|
|
1425
|
+
metadata?: Record<string, unknown> | undefined;
|
|
1426
|
+
}[] | null | undefined;
|
|
1427
|
+
startTime?: string | null | undefined;
|
|
1428
|
+
endTime?: string | null | undefined;
|
|
1429
|
+
durationMs?: number | null | undefined;
|
|
1430
|
+
referenceAnswer?: string | undefined;
|
|
1431
|
+
outputPath?: string | undefined;
|
|
1432
|
+
trace?: {
|
|
1433
|
+
eventCount: number;
|
|
1434
|
+
toolNames: string[];
|
|
1435
|
+
toolCallsByName: Record<string, number>;
|
|
1436
|
+
errorCount: number;
|
|
1437
|
+
toolDurations?: Record<string, number[]> | undefined;
|
|
1438
|
+
llmCallCount?: number | undefined;
|
|
1439
|
+
} | null | undefined;
|
|
1440
|
+
tokenUsage?: {
|
|
1441
|
+
input: number;
|
|
1442
|
+
output: number;
|
|
1443
|
+
cached?: number | undefined;
|
|
1444
|
+
} | null | undefined;
|
|
1445
|
+
costUsd?: number | null | undefined;
|
|
1446
|
+
fileChanges?: string | null | undefined;
|
|
1447
|
+
workspacePath?: string | null | undefined;
|
|
1448
|
+
config?: Record<string, unknown> | null | undefined;
|
|
1449
|
+
}>;
|
|
1450
|
+
/** @deprecated Use CodeGraderResultSchema */
|
|
1451
|
+
declare const CodeJudgeResultSchema: z.ZodObject<{
|
|
1452
|
+
score: z.ZodNumber;
|
|
1453
|
+
hits: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
1454
|
+
misses: z.ZodDefault<z.ZodOptional<z.ZodArray<z.ZodString, "many">>>;
|
|
1455
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
1456
|
+
/** Optional structured details for domain-specific metrics (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1457
|
+
details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
|
|
1458
|
+
}, "strip", z.ZodTypeAny, {
|
|
1459
|
+
score: number;
|
|
1460
|
+
hits: string[];
|
|
1461
|
+
misses: string[];
|
|
1462
|
+
reasoning?: string | undefined;
|
|
1463
|
+
details?: Record<string, unknown> | undefined;
|
|
1464
|
+
}, {
|
|
1465
|
+
score: number;
|
|
1466
|
+
hits?: string[] | undefined;
|
|
1467
|
+
misses?: string[] | undefined;
|
|
1468
|
+
reasoning?: string | undefined;
|
|
1469
|
+
details?: Record<string, unknown> | undefined;
|
|
1470
|
+
}>;
|
|
1471
|
+
/** @deprecated Use CodeGraderInput */
|
|
1472
|
+
type CodeJudgeInput = CodeGraderInput;
|
|
1473
|
+
/** @deprecated Use CodeGraderResult */
|
|
1474
|
+
type CodeJudgeResult = CodeGraderResult;
|
|
1029
1475
|
|
|
1030
1476
|
/**
|
|
1031
|
-
* Client for invoking configured targets from code-
|
|
1477
|
+
* Client for invoking configured targets from code-grader scripts.
|
|
1032
1478
|
*
|
|
1033
1479
|
* Environment variables (set automatically by AgentV when `target` config is present):
|
|
1034
1480
|
* - AGENTV_TARGET_PROXY_URL: The URL of the local proxy server
|
|
@@ -1108,16 +1554,16 @@ declare class TargetInvocationError extends Error {
|
|
|
1108
1554
|
*
|
|
1109
1555
|
* This function reads the proxy URL and token from environment variables
|
|
1110
1556
|
* that are automatically set by AgentV when a `target` config block is present
|
|
1111
|
-
* on a `code_judge` evaluator.
|
|
1557
|
+
* on a `code_grader` (or `code_judge`) evaluator.
|
|
1112
1558
|
*
|
|
1113
1559
|
* @returns A target client if environment variables are set, otherwise undefined
|
|
1114
1560
|
* @throws TargetNotAvailableError if token is missing when URL is present
|
|
1115
1561
|
*
|
|
1116
1562
|
* @example
|
|
1117
1563
|
* ```typescript
|
|
1118
|
-
* import { createTargetClient,
|
|
1564
|
+
* import { createTargetClient, defineCodeGrader } from '@agentv/eval';
|
|
1119
1565
|
*
|
|
1120
|
-
* export default
|
|
1566
|
+
* export default defineCodeGrader(async ({ question, criteria }) => {
|
|
1121
1567
|
* const target = createTargetClient();
|
|
1122
1568
|
*
|
|
1123
1569
|
* if (!target) {
|
|
@@ -1139,15 +1585,15 @@ declare function createTargetClient(): TargetClient | undefined;
|
|
|
1139
1585
|
|
|
1140
1586
|
/**
|
|
1141
1587
|
* Context provided to assertion handlers.
|
|
1142
|
-
* Same shape as
|
|
1588
|
+
* Same shape as CodeGraderInput — assertions receive full evaluation context.
|
|
1143
1589
|
*/
|
|
1144
|
-
type AssertionContext = CodeJudgeInput;
|
|
1590
|
+
type AssertionContext = CodeGraderInput;
|
|
1145
1591
|
/**
|
|
1146
1592
|
* Known built-in assertion types. Custom types are extensible via string.
|
|
1147
1593
|
*
|
|
1148
|
-
* Use in EVAL.yaml `
|
|
1594
|
+
* Use in EVAL.yaml `assertions` blocks:
|
|
1149
1595
|
* ```yaml
|
|
1150
|
-
*
|
|
1596
|
+
* assertions:
|
|
1151
1597
|
* - type: contains
|
|
1152
1598
|
* value: "Paris"
|
|
1153
1599
|
* ```
|
|
@@ -1156,7 +1602,7 @@ type AssertionContext = CodeJudgeInput;
|
|
|
1156
1602
|
* are also valid — the `string & {}` escape hatch provides autocomplete
|
|
1157
1603
|
* for known types while accepting any string.
|
|
1158
1604
|
*/
|
|
1159
|
-
type AssertionType = 'llm-
|
|
1605
|
+
type AssertionType = 'llm-grader' | 'code-grader' | 'rubrics' | 'composite' | 'tool-trajectory' | 'field-accuracy' | 'latency' | 'cost' | 'token-usage' | 'execution-metrics' | 'skill-trigger' | 'contains' | 'contains-any' | 'contains-all' | 'icontains' | 'icontains-any' | 'icontains-all' | 'starts-with' | 'ends-with' | 'equals' | 'regex' | 'is-json' | 'llm-judge' | 'code-judge' | 'llm_judge' | 'code_judge' | 'llm_grader' | 'code_grader' | 'tool_trajectory' | 'field_accuracy' | 'token_usage' | 'execution_metrics' | 'contains_any' | 'contains_all' | 'icontains_any' | 'icontains_all' | 'starts_with' | 'ends_with' | 'is_json' | (string & {});
|
|
1160
1606
|
/**
|
|
1161
1607
|
* Result returned from an assertion handler.
|
|
1162
1608
|
*
|
|
@@ -1201,9 +1647,11 @@ type AssertionHandler = (ctx: AssertionContext) => AssertionScore | Promise<Asse
|
|
|
1201
1647
|
type PromptTemplateHandler = (input: PromptTemplateInput) => string | Promise<string>;
|
|
1202
1648
|
|
|
1203
1649
|
/**
|
|
1204
|
-
* Handler function type for code
|
|
1650
|
+
* Handler function type for code graders.
|
|
1205
1651
|
*/
|
|
1206
|
-
type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<CodeJudgeResult>;
|
|
1652
|
+
type CodeGraderHandler = (input: CodeGraderInput) => CodeGraderResult | Promise<CodeGraderResult>;
|
|
1653
|
+
/** @deprecated Use CodeGraderHandler */
|
|
1654
|
+
type CodeJudgeHandler = CodeGraderHandler;
|
|
1207
1655
|
|
|
1208
1656
|
/**
|
|
1209
1657
|
* AgentV Evaluation SDK
|
|
@@ -1221,24 +1669,24 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
1221
1669
|
* }));
|
|
1222
1670
|
* ```
|
|
1223
1671
|
*
|
|
1224
|
-
* @example Code
|
|
1672
|
+
* @example Code grader (full control)
|
|
1225
1673
|
* ```typescript
|
|
1226
1674
|
* #!/usr/bin/env bun
|
|
1227
|
-
* import {
|
|
1675
|
+
* import { defineCodeGrader } from '@agentv/eval';
|
|
1228
1676
|
*
|
|
1229
|
-
* export default
|
|
1677
|
+
* export default defineCodeGrader(({ trace, answer }) => ({
|
|
1230
1678
|
* score: trace?.eventCount <= 5 ? 1.0 : 0.5,
|
|
1231
1679
|
* hits: ['Efficient tool usage'],
|
|
1232
1680
|
* misses: [],
|
|
1233
1681
|
* }));
|
|
1234
1682
|
* ```
|
|
1235
1683
|
*
|
|
1236
|
-
* @example Code
|
|
1684
|
+
* @example Code grader with target access (requires `target` config in YAML)
|
|
1237
1685
|
* ```typescript
|
|
1238
1686
|
* #!/usr/bin/env bun
|
|
1239
|
-
* import {
|
|
1687
|
+
* import { defineCodeGrader, createTargetClient } from '@agentv/eval';
|
|
1240
1688
|
*
|
|
1241
|
-
* export default
|
|
1689
|
+
* export default defineCodeGrader(async ({ question }) => {
|
|
1242
1690
|
* const target = createTargetClient();
|
|
1243
1691
|
* if (!target) {
|
|
1244
1692
|
* return { score: 0, misses: ['Target not available'] };
|
|
@@ -1258,7 +1706,7 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
1258
1706
|
*/
|
|
1259
1707
|
|
|
1260
1708
|
/**
|
|
1261
|
-
* Define a code
|
|
1709
|
+
* Define a code grader evaluator with automatic stdin/stdout handling.
|
|
1262
1710
|
*
|
|
1263
1711
|
* This function:
|
|
1264
1712
|
* 1. Reads JSON from stdin (snake_case format)
|
|
@@ -1271,9 +1719,9 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
1271
1719
|
*
|
|
1272
1720
|
* @example
|
|
1273
1721
|
* ```typescript
|
|
1274
|
-
* import {
|
|
1722
|
+
* import { defineCodeGrader } from '@agentv/eval';
|
|
1275
1723
|
*
|
|
1276
|
-
* export default
|
|
1724
|
+
* export default defineCodeGrader(({ trace }) => {
|
|
1277
1725
|
* if (!trace) {
|
|
1278
1726
|
* return { score: 0.5, reasoning: 'No trace available' };
|
|
1279
1727
|
* }
|
|
@@ -1289,19 +1737,21 @@ type CodeJudgeHandler = (input: CodeJudgeInput) => CodeJudgeResult | Promise<Cod
|
|
|
1289
1737
|
*
|
|
1290
1738
|
* @example With typed config
|
|
1291
1739
|
* ```typescript
|
|
1292
|
-
* import {
|
|
1740
|
+
* import { defineCodeGrader, z } from '@agentv/eval';
|
|
1293
1741
|
*
|
|
1294
1742
|
* const ConfigSchema = z.object({
|
|
1295
1743
|
* maxToolCalls: z.number().default(10),
|
|
1296
1744
|
* });
|
|
1297
1745
|
*
|
|
1298
|
-
* export default
|
|
1746
|
+
* export default defineCodeGrader(({ trace, config }) => {
|
|
1299
1747
|
* const { maxToolCalls } = ConfigSchema.parse(config ?? {});
|
|
1300
1748
|
* // Use maxToolCalls...
|
|
1301
1749
|
* });
|
|
1302
1750
|
* ```
|
|
1303
1751
|
*/
|
|
1304
|
-
declare function
|
|
1752
|
+
declare function defineCodeGrader(handler: CodeGraderHandler): void;
|
|
1753
|
+
/** @deprecated Use defineCodeGrader */
|
|
1754
|
+
declare const defineCodeJudge: typeof defineCodeGrader;
|
|
1305
1755
|
/**
|
|
1306
1756
|
* Define a prompt template with automatic stdin/stdout handling.
|
|
1307
1757
|
*
|
|
@@ -1387,4 +1837,4 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
|
|
|
1387
1837
|
*/
|
|
1388
1838
|
declare function defineAssertion(handler: AssertionHandler): void;
|
|
1389
1839
|
|
|
1390
|
-
export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineAssertion, defineCodeJudge, definePromptTemplate };
|
|
1840
|
+
export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeGraderHandler, type CodeGraderInput, CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, type CodeJudgeHandler, type CodeJudgeInput, CodeJudgeInputSchema, type CodeJudgeResult, CodeJudgeResultSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type TraceSummary, TraceSummarySchema, createTargetClient, defineAssertion, defineCodeGrader, defineCodeJudge, definePromptTemplate };
|