@agentv/core 4.17.1 → 4.18.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6VZY3B6M.js → chunk-PYDBJOAO.js} +6 -6
- package/dist/chunk-PYDBJOAO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -5
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -3
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +229 -238
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +156 -158
- package/dist/index.d.ts +156 -158
- package/dist/index.js +210 -216
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-6VZY3B6M.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -13,7 +13,7 @@ import {
|
|
|
13
13
|
isAgentProvider,
|
|
14
14
|
isContent,
|
|
15
15
|
isContentArray,
|
|
16
|
-
|
|
16
|
+
isGraderKind,
|
|
17
17
|
isJsonObject,
|
|
18
18
|
isJsonValue,
|
|
19
19
|
isTestMessage,
|
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
resolveDelegatedTargetDefinition,
|
|
26
26
|
resolveFileReference,
|
|
27
27
|
resolveTargetDefinition
|
|
28
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-PYDBJOAO.js";
|
|
29
29
|
import {
|
|
30
30
|
execFileWithStdin,
|
|
31
31
|
execShellWithStdin
|
|
@@ -647,22 +647,25 @@ function extractCacheConfig(suite) {
|
|
|
647
647
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
648
648
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
649
649
|
}
|
|
650
|
-
function
|
|
650
|
+
function extractBudgetUsd(suite) {
|
|
651
651
|
const execution = suite.execution;
|
|
652
652
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
653
653
|
return void 0;
|
|
654
654
|
}
|
|
655
655
|
const executionObj = execution;
|
|
656
|
-
|
|
656
|
+
if ("total_budget_usd" in executionObj || "totalBudgetUsd" in executionObj) {
|
|
657
|
+
throw new Error(
|
|
658
|
+
"execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."
|
|
659
|
+
);
|
|
660
|
+
}
|
|
661
|
+
const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
|
|
657
662
|
if (rawBudget === void 0 || rawBudget === null) {
|
|
658
663
|
return void 0;
|
|
659
664
|
}
|
|
660
665
|
if (typeof rawBudget === "number" && rawBudget > 0) {
|
|
661
666
|
return rawBudget;
|
|
662
667
|
}
|
|
663
|
-
logWarning(
|
|
664
|
-
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
|
|
665
|
-
);
|
|
668
|
+
logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
|
|
666
669
|
return void 0;
|
|
667
670
|
}
|
|
668
671
|
function extractFailOnError(suite) {
|
|
@@ -812,7 +815,7 @@ function logWarning(message) {
|
|
|
812
815
|
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET2}`);
|
|
813
816
|
}
|
|
814
817
|
|
|
815
|
-
// src/evaluation/loaders/
|
|
818
|
+
// src/evaluation/loaders/grader-parser.ts
|
|
816
819
|
import { readFile as readFile5 } from "node:fs/promises";
|
|
817
820
|
import path5 from "node:path";
|
|
818
821
|
import { parse as parse2 } from "yaml";
|
|
@@ -1051,38 +1054,38 @@ function validateTemplateVariables(content, source) {
|
|
|
1051
1054
|
);
|
|
1052
1055
|
}
|
|
1053
1056
|
if (invalidVariables.length > 0) {
|
|
1054
|
-
const warningMessage = `${ANSI_YELLOW2}Warning: Custom
|
|
1057
|
+
const warningMessage = `${ANSI_YELLOW2}Warning: Custom grader template at ${source}
|
|
1055
1058
|
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
1056
1059
|
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET3}`;
|
|
1057
1060
|
console.warn(warningMessage);
|
|
1058
1061
|
}
|
|
1059
1062
|
}
|
|
1060
1063
|
|
|
1061
|
-
// src/evaluation/loaders/
|
|
1064
|
+
// src/evaluation/loaders/grader-parser.ts
|
|
1062
1065
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
1063
1066
|
var ANSI_RESET4 = "\x1B[0m";
|
|
1064
1067
|
var MAX_ASSERTION_INCLUDE_DEPTH = 3;
|
|
1065
1068
|
var PROMPT_FILE_PREFIX = "file://";
|
|
1066
|
-
function
|
|
1069
|
+
function normalizeGraderType(type) {
|
|
1067
1070
|
return type.replace(/_/g, "-");
|
|
1068
1071
|
}
|
|
1069
1072
|
function isDeprecatedJudgeType(type) {
|
|
1070
1073
|
return type === "code-judge" || type === "llm-judge";
|
|
1071
1074
|
}
|
|
1072
|
-
async function
|
|
1075
|
+
async function parseGraders(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
|
|
1073
1076
|
const execution = rawEvalCase.execution;
|
|
1074
1077
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
1075
1078
|
const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? // deprecated: use assertions
|
|
1076
1079
|
rawEvalCase.evaluators;
|
|
1077
1080
|
const skipDefaults = executionObject?.skip_defaults === true;
|
|
1078
1081
|
const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
|
|
1079
|
-
const parsedCase = await
|
|
1082
|
+
const parsedCase = await parseGraderList(
|
|
1080
1083
|
caseEvaluators,
|
|
1081
1084
|
searchRoots,
|
|
1082
1085
|
evalId,
|
|
1083
1086
|
defaultPreprocessors
|
|
1084
1087
|
);
|
|
1085
|
-
const parsedRoot = await
|
|
1088
|
+
const parsedRoot = await parseGraderList(
|
|
1086
1089
|
rootEvaluators,
|
|
1087
1090
|
searchRoots,
|
|
1088
1091
|
evalId,
|
|
@@ -1161,12 +1164,12 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
1161
1164
|
templateDir,
|
|
1162
1165
|
...searchRoots.filter((root) => path5.resolve(root) !== templateDir)
|
|
1163
1166
|
];
|
|
1164
|
-
return await
|
|
1167
|
+
return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
|
|
1165
1168
|
depth: nextDepth,
|
|
1166
1169
|
chain: [...includeContext.chain, resolved.resolvedPath]
|
|
1167
1170
|
}) ?? [];
|
|
1168
1171
|
}
|
|
1169
|
-
async function
|
|
1172
|
+
async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
|
|
1170
1173
|
if (candidateEvaluators === void 0) {
|
|
1171
1174
|
return void 0;
|
|
1172
1175
|
}
|
|
@@ -1190,8 +1193,8 @@ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId,
|
|
|
1190
1193
|
}
|
|
1191
1194
|
return expanded;
|
|
1192
1195
|
}
|
|
1193
|
-
async function
|
|
1194
|
-
const expandedEvaluators = await
|
|
1196
|
+
async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
|
|
1197
|
+
const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
|
|
1195
1198
|
if (!expandedEvaluators) {
|
|
1196
1199
|
return void 0;
|
|
1197
1200
|
}
|
|
@@ -1237,14 +1240,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
1237
1240
|
}
|
|
1238
1241
|
const rawName = asString(rawEvaluator.name);
|
|
1239
1242
|
const rawType = rawEvaluator.type;
|
|
1240
|
-
const typeValue = typeof rawType === "string" ?
|
|
1243
|
+
const typeValue = typeof rawType === "string" ? normalizeGraderType(rawType) : rawType;
|
|
1241
1244
|
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
1242
1245
|
logWarning2(
|
|
1243
1246
|
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
1244
1247
|
);
|
|
1245
1248
|
continue;
|
|
1246
1249
|
}
|
|
1247
|
-
const isCustomType = typeof typeValue === "string" && !
|
|
1250
|
+
const isCustomType = typeof typeValue === "string" && !isGraderKind(typeValue);
|
|
1248
1251
|
if (typeof typeValue !== "string") {
|
|
1249
1252
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
1250
1253
|
continue;
|
|
@@ -1407,7 +1410,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
1407
1410
|
continue;
|
|
1408
1411
|
}
|
|
1409
1412
|
const aggregatorType = asString(rawAggregator.type);
|
|
1410
|
-
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType :
|
|
1413
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeGraderType(aggregatorType) : aggregatorType;
|
|
1411
1414
|
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
1412
1415
|
logWarning2(
|
|
1413
1416
|
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
@@ -1420,7 +1423,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
1420
1423
|
);
|
|
1421
1424
|
continue;
|
|
1422
1425
|
}
|
|
1423
|
-
const expandedMembers = await
|
|
1426
|
+
const expandedMembers = await expandGraderEntries(
|
|
1424
1427
|
rawMembers,
|
|
1425
1428
|
searchRoots,
|
|
1426
1429
|
`${evalId}:${name}`
|
|
@@ -1436,11 +1439,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
1436
1439
|
}
|
|
1437
1440
|
const memberName = asString(rawMember.name);
|
|
1438
1441
|
const memberType = rawMember.type;
|
|
1439
|
-
if (!memberName || !
|
|
1442
|
+
if (!memberName || !isGraderKind(memberType)) {
|
|
1440
1443
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
1441
1444
|
continue;
|
|
1442
1445
|
}
|
|
1443
|
-
const memberConfigs = await
|
|
1446
|
+
const memberConfigs = await parseGraders(
|
|
1444
1447
|
{ evaluators: [rawMember] },
|
|
1445
1448
|
void 0,
|
|
1446
1449
|
searchRoots,
|
|
@@ -2181,7 +2184,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
2181
2184
|
`prompt.command for evaluator '${name}' in '${evalId}'`
|
|
2182
2185
|
);
|
|
2183
2186
|
if (!commandArray) {
|
|
2184
|
-
throw new Error(`
|
|
2187
|
+
throw new Error(`Grader '${name}' in '${evalId}': prompt object requires command array`);
|
|
2185
2188
|
}
|
|
2186
2189
|
const commandPath = commandArray[commandArray.length - 1];
|
|
2187
2190
|
const resolved = await resolveFileReference2(commandPath, searchRoots);
|
|
@@ -2189,7 +2192,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
2189
2192
|
resolvedPromptScript = [...commandArray.slice(0, -1), path5.resolve(resolved.resolvedPath)];
|
|
2190
2193
|
} else {
|
|
2191
2194
|
throw new Error(
|
|
2192
|
-
`
|
|
2195
|
+
`Grader '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
|
|
2193
2196
|
);
|
|
2194
2197
|
}
|
|
2195
2198
|
if (isJsonObject2(rawPrompt.config)) {
|
|
@@ -2206,11 +2209,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
2206
2209
|
await validateCustomPromptContent(promptPath);
|
|
2207
2210
|
} catch (error) {
|
|
2208
2211
|
const message = error instanceof Error ? error.message : String(error);
|
|
2209
|
-
throw new Error(`
|
|
2212
|
+
throw new Error(`Grader '${name}' template (${promptPath}): ${message}`);
|
|
2210
2213
|
}
|
|
2211
2214
|
} else {
|
|
2212
2215
|
throw new Error(
|
|
2213
|
-
`
|
|
2216
|
+
`Grader '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
2214
2217
|
);
|
|
2215
2218
|
}
|
|
2216
2219
|
} else {
|
|
@@ -2327,18 +2330,18 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
2327
2330
|
return void 0;
|
|
2328
2331
|
}
|
|
2329
2332
|
if (!Array.isArray(rawValue)) {
|
|
2330
|
-
throw new Error(`
|
|
2333
|
+
throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
|
|
2331
2334
|
}
|
|
2332
2335
|
const preprocessors = [];
|
|
2333
2336
|
for (const rawEntry of rawValue) {
|
|
2334
2337
|
if (!isJsonObject2(rawEntry)) {
|
|
2335
2338
|
throw new Error(
|
|
2336
|
-
`
|
|
2339
|
+
`Grader '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
|
|
2337
2340
|
);
|
|
2338
2341
|
}
|
|
2339
2342
|
const type = asString(rawEntry.type)?.trim();
|
|
2340
2343
|
if (!type) {
|
|
2341
|
-
throw new Error(`
|
|
2344
|
+
throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
|
|
2342
2345
|
}
|
|
2343
2346
|
const command = asStringArray(
|
|
2344
2347
|
rawEntry.command,
|
|
@@ -2346,14 +2349,14 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
2346
2349
|
);
|
|
2347
2350
|
if (!command || command.length === 0) {
|
|
2348
2351
|
throw new Error(
|
|
2349
|
-
`
|
|
2352
|
+
`Grader '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
|
|
2350
2353
|
);
|
|
2351
2354
|
}
|
|
2352
2355
|
const commandPath = command[command.length - 1];
|
|
2353
2356
|
const resolved = await resolveFileReference2(commandPath, searchRoots);
|
|
2354
2357
|
if (!resolved.resolvedPath) {
|
|
2355
2358
|
throw new Error(
|
|
2356
|
-
`
|
|
2359
|
+
`Grader '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
|
|
2357
2360
|
);
|
|
2358
2361
|
}
|
|
2359
2362
|
preprocessors.push({
|
|
@@ -2404,13 +2407,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
2404
2407
|
if (typeof candidate !== "string") {
|
|
2405
2408
|
return void 0;
|
|
2406
2409
|
}
|
|
2407
|
-
const normalized =
|
|
2410
|
+
const normalized = normalizeGraderType(candidate);
|
|
2408
2411
|
if (isDeprecatedJudgeType(normalized)) {
|
|
2409
2412
|
throw new Error(
|
|
2410
2413
|
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
2411
2414
|
);
|
|
2412
2415
|
}
|
|
2413
|
-
if (
|
|
2416
|
+
if (isGraderKind(normalized)) {
|
|
2414
2417
|
return normalized;
|
|
2415
2418
|
}
|
|
2416
2419
|
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
@@ -2482,7 +2485,7 @@ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalI
|
|
|
2482
2485
|
}
|
|
2483
2486
|
result.required = rawRequired;
|
|
2484
2487
|
logWarning2(
|
|
2485
|
-
`
|
|
2488
|
+
`Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
2486
2489
|
);
|
|
2487
2490
|
}
|
|
2488
2491
|
return result;
|
|
@@ -3302,7 +3305,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
3302
3305
|
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
3303
3306
|
let evaluators;
|
|
3304
3307
|
try {
|
|
3305
|
-
evaluators = await
|
|
3308
|
+
evaluators = await parseGraders(
|
|
3306
3309
|
testCaseConfig,
|
|
3307
3310
|
mergedExecution,
|
|
3308
3311
|
searchRoots,
|
|
@@ -3648,7 +3651,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
3648
3651
|
targetRefs: extractTargetRefsFromSuite(parsed),
|
|
3649
3652
|
workers: extractWorkersFromSuite(parsed),
|
|
3650
3653
|
cacheConfig: extractCacheConfig(parsed),
|
|
3651
|
-
|
|
3654
|
+
budgetUsd: extractBudgetUsd(parsed),
|
|
3652
3655
|
...metadata !== void 0 && { metadata },
|
|
3653
3656
|
...failOnError !== void 0 && { failOnError },
|
|
3654
3657
|
...threshold !== void 0 && { threshold },
|
|
@@ -3789,7 +3792,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3789
3792
|
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
3790
3793
|
let evaluators;
|
|
3791
3794
|
try {
|
|
3792
|
-
evaluators = await
|
|
3795
|
+
evaluators = await parseGraders(
|
|
3793
3796
|
testCaseConfig,
|
|
3794
3797
|
globalExecution,
|
|
3795
3798
|
searchRoots,
|
|
@@ -11600,7 +11603,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
11600
11603
|
return createProvider(resolved);
|
|
11601
11604
|
}
|
|
11602
11605
|
|
|
11603
|
-
// src/evaluation/
|
|
11606
|
+
// src/evaluation/graders/scoring.ts
|
|
11604
11607
|
var DEFAULT_THRESHOLD = 0.8;
|
|
11605
11608
|
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
11606
11609
|
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
@@ -11688,7 +11691,7 @@ function negateScore(score) {
|
|
|
11688
11691
|
};
|
|
11689
11692
|
}
|
|
11690
11693
|
|
|
11691
|
-
// src/evaluation/
|
|
11694
|
+
// src/evaluation/graders/code-grader.ts
|
|
11692
11695
|
import { mkdtemp as mkdtemp2, rm as rm3, writeFile as writeFile6 } from "node:fs/promises";
|
|
11693
11696
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
11694
11697
|
import { dirname, join } from "node:path";
|
|
@@ -11981,7 +11984,7 @@ function getRepoCheckoutTargets(repos) {
|
|
|
11981
11984
|
}));
|
|
11982
11985
|
}
|
|
11983
11986
|
|
|
11984
|
-
// src/evaluation/
|
|
11987
|
+
// src/evaluation/graders/code-grader.ts
|
|
11985
11988
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
11986
11989
|
var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
|
|
11987
11990
|
async function materializeContentForGrader(messages, getWorkDir) {
|
|
@@ -12033,7 +12036,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
|
|
|
12033
12036
|
}
|
|
12034
12037
|
return result;
|
|
12035
12038
|
}
|
|
12036
|
-
var
|
|
12039
|
+
var CodeGrader = class {
|
|
12037
12040
|
kind = "code-grader";
|
|
12038
12041
|
command;
|
|
12039
12042
|
cwd;
|
|
@@ -12151,7 +12154,7 @@ var CodeEvaluator = class {
|
|
|
12151
12154
|
})) : [];
|
|
12152
12155
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
12153
12156
|
const proxyUsage = getProxyUsage?.();
|
|
12154
|
-
const
|
|
12157
|
+
const graderRawRequest = {
|
|
12155
12158
|
command: this.command,
|
|
12156
12159
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
12157
12160
|
...proxyUsage ? {
|
|
@@ -12166,7 +12169,7 @@ var CodeEvaluator = class {
|
|
|
12166
12169
|
verdict: scoreToVerdict(score),
|
|
12167
12170
|
assertions,
|
|
12168
12171
|
expectedAspectCount: assertions.length || 1,
|
|
12169
|
-
|
|
12172
|
+
graderRawRequest,
|
|
12170
12173
|
...details ? { details } : {},
|
|
12171
12174
|
tokenUsage: proxyUsage?.tokenUsage
|
|
12172
12175
|
};
|
|
@@ -12178,7 +12181,7 @@ var CodeEvaluator = class {
|
|
|
12178
12181
|
verdict: "fail",
|
|
12179
12182
|
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
12180
12183
|
expectedAspectCount: 1,
|
|
12181
|
-
|
|
12184
|
+
graderRawRequest: {
|
|
12182
12185
|
command: this.command,
|
|
12183
12186
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
12184
12187
|
...proxyUsage ? {
|
|
@@ -12227,10 +12230,10 @@ function formatStderr(stderr) {
|
|
|
12227
12230
|
${tail}`;
|
|
12228
12231
|
}
|
|
12229
12232
|
|
|
12230
|
-
// src/evaluation/
|
|
12233
|
+
// src/evaluation/graders/composite.ts
|
|
12231
12234
|
import { generateText as generateText3 } from "ai";
|
|
12232
12235
|
|
|
12233
|
-
// src/evaluation/
|
|
12236
|
+
// src/evaluation/graders/llm-grader.ts
|
|
12234
12237
|
import fs2 from "node:fs/promises";
|
|
12235
12238
|
import path37 from "node:path";
|
|
12236
12239
|
import { generateText as generateText2, stepCountIs, tool } from "ai";
|
|
@@ -12270,7 +12273,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
12270
12273
|
".so",
|
|
12271
12274
|
".dylib"
|
|
12272
12275
|
]);
|
|
12273
|
-
var
|
|
12276
|
+
var DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
|
|
12274
12277
|
|
|
12275
12278
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
12276
12279
|
|
|
@@ -12325,19 +12328,19 @@ function resolveContentBasePath(context) {
|
|
|
12325
12328
|
}
|
|
12326
12329
|
return void 0;
|
|
12327
12330
|
}
|
|
12328
|
-
var
|
|
12331
|
+
var LlmGrader = class {
|
|
12329
12332
|
kind = "llm-grader";
|
|
12330
12333
|
resolveGraderProvider;
|
|
12331
12334
|
maxOutputTokens;
|
|
12332
12335
|
temperature;
|
|
12333
|
-
|
|
12336
|
+
graderTemplate;
|
|
12334
12337
|
maxSteps;
|
|
12335
12338
|
graderTargetProvider;
|
|
12336
12339
|
constructor(options) {
|
|
12337
12340
|
this.resolveGraderProvider = options.resolveGraderProvider ?? options.resolveJudgeProvider;
|
|
12338
12341
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
12339
12342
|
this.temperature = options.temperature;
|
|
12340
|
-
this.
|
|
12343
|
+
this.graderTemplate = options.graderTemplate;
|
|
12341
12344
|
this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT);
|
|
12342
12345
|
this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider;
|
|
12343
12346
|
}
|
|
@@ -12400,16 +12403,16 @@ var LlmGraderEvaluator = class {
|
|
|
12400
12403
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
12401
12404
|
};
|
|
12402
12405
|
const systemPrompt = buildOutputSchema();
|
|
12403
|
-
const
|
|
12404
|
-
warnDeprecatedTemplateVars(
|
|
12405
|
-
let userPrompt = substituteVariables(
|
|
12406
|
-
if (context.fileChanges && !context.
|
|
12406
|
+
const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
12407
|
+
warnDeprecatedTemplateVars(graderTemplate);
|
|
12408
|
+
let userPrompt = substituteVariables(graderTemplate, variables);
|
|
12409
|
+
if (context.fileChanges && !context.graderTemplateOverride && !this.graderTemplate) {
|
|
12407
12410
|
userPrompt += `
|
|
12408
12411
|
|
|
12409
12412
|
[[ ## file_changes ## ]]
|
|
12410
12413
|
${context.fileChanges}`;
|
|
12411
12414
|
}
|
|
12412
|
-
const
|
|
12415
|
+
const graderRawRequest = {
|
|
12413
12416
|
userPrompt,
|
|
12414
12417
|
systemPrompt
|
|
12415
12418
|
};
|
|
@@ -12430,7 +12433,7 @@ ${context.fileChanges}`;
|
|
|
12430
12433
|
verdict: scoreToVerdict(score),
|
|
12431
12434
|
assertions,
|
|
12432
12435
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
12433
|
-
|
|
12436
|
+
graderRawRequest,
|
|
12434
12437
|
graderTarget: graderProvider.targetName,
|
|
12435
12438
|
details: data.details,
|
|
12436
12439
|
tokenUsage
|
|
@@ -12444,7 +12447,7 @@ ${context.fileChanges}`;
|
|
|
12444
12447
|
verdict: "skip",
|
|
12445
12448
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
12446
12449
|
expectedAspectCount: 1,
|
|
12447
|
-
|
|
12450
|
+
graderRawRequest,
|
|
12448
12451
|
graderTarget: graderProvider.targetName
|
|
12449
12452
|
};
|
|
12450
12453
|
}
|
|
@@ -12461,7 +12464,7 @@ ${context.fileChanges}`;
|
|
|
12461
12464
|
}
|
|
12462
12465
|
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
12463
12466
|
const systemPrompt = buildRubricOutputSchema();
|
|
12464
|
-
const
|
|
12467
|
+
const graderRawRequest = {
|
|
12465
12468
|
userPrompt: prompt,
|
|
12466
12469
|
systemPrompt
|
|
12467
12470
|
};
|
|
@@ -12481,7 +12484,7 @@ ${context.fileChanges}`;
|
|
|
12481
12484
|
verdict,
|
|
12482
12485
|
assertions,
|
|
12483
12486
|
expectedAspectCount: rubrics.length,
|
|
12484
|
-
|
|
12487
|
+
graderRawRequest,
|
|
12485
12488
|
graderTarget: graderProvider.targetName,
|
|
12486
12489
|
tokenUsage
|
|
12487
12490
|
};
|
|
@@ -12494,7 +12497,7 @@ ${context.fileChanges}`;
|
|
|
12494
12497
|
verdict: "skip",
|
|
12495
12498
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
12496
12499
|
expectedAspectCount: rubrics.length,
|
|
12497
|
-
|
|
12500
|
+
graderRawRequest,
|
|
12498
12501
|
graderTarget: graderProvider.targetName
|
|
12499
12502
|
};
|
|
12500
12503
|
}
|
|
@@ -12506,7 +12509,7 @@ ${context.fileChanges}`;
|
|
|
12506
12509
|
async evaluateWithScoreRanges(context, graderProvider, rubrics) {
|
|
12507
12510
|
const prompt = this.buildScoreRangePrompt(context, rubrics);
|
|
12508
12511
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
12509
|
-
const
|
|
12512
|
+
const graderRawRequest = {
|
|
12510
12513
|
userPrompt: prompt,
|
|
12511
12514
|
systemPrompt
|
|
12512
12515
|
};
|
|
@@ -12526,7 +12529,7 @@ ${context.fileChanges}`;
|
|
|
12526
12529
|
verdict,
|
|
12527
12530
|
assertions,
|
|
12528
12531
|
expectedAspectCount: rubrics.length,
|
|
12529
|
-
|
|
12532
|
+
graderRawRequest,
|
|
12530
12533
|
graderTarget: graderProvider.targetName,
|
|
12531
12534
|
details,
|
|
12532
12535
|
tokenUsage
|
|
@@ -12540,7 +12543,7 @@ ${context.fileChanges}`;
|
|
|
12540
12543
|
verdict: "skip",
|
|
12541
12544
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
12542
12545
|
expectedAspectCount: rubrics.length,
|
|
12543
|
-
|
|
12546
|
+
graderRawRequest,
|
|
12544
12547
|
graderTarget: graderProvider.targetName
|
|
12545
12548
|
};
|
|
12546
12549
|
}
|
|
@@ -12569,7 +12572,7 @@ ${context.fileChanges}`;
|
|
|
12569
12572
|
const config = context.evaluator;
|
|
12570
12573
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
12571
12574
|
const fsTools = createFilesystemTools(workspacePath);
|
|
12572
|
-
const
|
|
12575
|
+
const graderRawRequest = {
|
|
12573
12576
|
mode: "built-in",
|
|
12574
12577
|
systemPrompt,
|
|
12575
12578
|
userPrompt,
|
|
@@ -12593,7 +12596,7 @@ ${context.fileChanges}`;
|
|
|
12593
12596
|
return this.parseAgentResult(
|
|
12594
12597
|
text,
|
|
12595
12598
|
rubrics,
|
|
12596
|
-
|
|
12599
|
+
graderRawRequest,
|
|
12597
12600
|
details,
|
|
12598
12601
|
graderProvider.targetName
|
|
12599
12602
|
);
|
|
@@ -12604,7 +12607,7 @@ ${context.fileChanges}`;
|
|
|
12604
12607
|
verdict: "fail",
|
|
12605
12608
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
12606
12609
|
expectedAspectCount: 1,
|
|
12607
|
-
|
|
12610
|
+
graderRawRequest,
|
|
12608
12611
|
graderTarget: graderProvider.targetName,
|
|
12609
12612
|
details: { mode: "built-in", error: message }
|
|
12610
12613
|
};
|
|
@@ -12636,7 +12639,7 @@ ${context.fileChanges}`;
|
|
|
12636
12639
|
async evaluateWithDelegate(context, provider, modeLabel) {
|
|
12637
12640
|
const workspacePath = context.workspacePath;
|
|
12638
12641
|
const prompt = this.buildDelegatedPrompt(context);
|
|
12639
|
-
const
|
|
12642
|
+
const graderRawRequest = {
|
|
12640
12643
|
mode: modeLabel,
|
|
12641
12644
|
grader_target: provider.targetName,
|
|
12642
12645
|
prompt
|
|
@@ -12657,7 +12660,7 @@ ${context.fileChanges}`;
|
|
|
12657
12660
|
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
12658
12661
|
],
|
|
12659
12662
|
expectedAspectCount: 1,
|
|
12660
|
-
|
|
12663
|
+
graderRawRequest,
|
|
12661
12664
|
graderTarget: provider.targetName,
|
|
12662
12665
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
12663
12666
|
};
|
|
@@ -12671,7 +12674,7 @@ ${context.fileChanges}`;
|
|
|
12671
12674
|
return this.parseAgentResult(
|
|
12672
12675
|
assistantContent,
|
|
12673
12676
|
rubrics,
|
|
12674
|
-
|
|
12677
|
+
graderRawRequest,
|
|
12675
12678
|
details,
|
|
12676
12679
|
provider.targetName
|
|
12677
12680
|
);
|
|
@@ -12684,7 +12687,7 @@ ${context.fileChanges}`;
|
|
|
12684
12687
|
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
12685
12688
|
],
|
|
12686
12689
|
expectedAspectCount: 1,
|
|
12687
|
-
|
|
12690
|
+
graderRawRequest,
|
|
12688
12691
|
graderTarget: provider.targetName,
|
|
12689
12692
|
details: {
|
|
12690
12693
|
mode: modeLabel,
|
|
@@ -12705,7 +12708,7 @@ ${context.fileChanges}`;
|
|
|
12705
12708
|
const config = context.evaluator;
|
|
12706
12709
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
12707
12710
|
const parts = [
|
|
12708
|
-
"You are an expert
|
|
12711
|
+
"You are an expert grader with access to the workspace filesystem.",
|
|
12709
12712
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
12710
12713
|
"Thoroughly examine relevant files before making your assessment.",
|
|
12711
12714
|
""
|
|
@@ -12734,9 +12737,9 @@ ${context.fileChanges}`;
|
|
|
12734
12737
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
12735
12738
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
12736
12739
|
};
|
|
12737
|
-
if (this.
|
|
12738
|
-
warnDeprecatedTemplateVars(this.
|
|
12739
|
-
return substituteVariables(this.
|
|
12740
|
+
if (this.graderTemplate) {
|
|
12741
|
+
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
12742
|
+
return substituteVariables(this.graderTemplate, variables);
|
|
12740
12743
|
}
|
|
12741
12744
|
const config = context.evaluator;
|
|
12742
12745
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
@@ -12783,7 +12786,7 @@ ${context.fileChanges}`;
|
|
|
12783
12786
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
12784
12787
|
const config = context.evaluator;
|
|
12785
12788
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
12786
|
-
if (this.
|
|
12789
|
+
if (this.graderTemplate) {
|
|
12787
12790
|
const variables = {
|
|
12788
12791
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
12789
12792
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -12795,15 +12798,15 @@ ${context.fileChanges}`;
|
|
|
12795
12798
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
12796
12799
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
12797
12800
|
};
|
|
12798
|
-
warnDeprecatedTemplateVars(this.
|
|
12799
|
-
const customPrompt = substituteVariables(this.
|
|
12801
|
+
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
12802
|
+
const customPrompt = substituteVariables(this.graderTemplate, variables);
|
|
12800
12803
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
12801
12804
|
return `${customPrompt}
|
|
12802
12805
|
|
|
12803
12806
|
${outputSchema}`;
|
|
12804
12807
|
}
|
|
12805
12808
|
const parts = [
|
|
12806
|
-
"You are an expert
|
|
12809
|
+
"You are an expert grader. Investigate the workspace to verify the criteria are met.",
|
|
12807
12810
|
"",
|
|
12808
12811
|
"[[ ## question ## ]]",
|
|
12809
12812
|
formattedQuestion,
|
|
@@ -12840,7 +12843,7 @@ ${outputSchema}`;
|
|
|
12840
12843
|
* Parse the agent's response text into an EvaluationScore.
|
|
12841
12844
|
* Supports both freeform and rubric modes.
|
|
12842
12845
|
*/
|
|
12843
|
-
parseAgentResult(text, rubrics,
|
|
12846
|
+
parseAgentResult(text, rubrics, graderRawRequest, details, graderTarget) {
|
|
12844
12847
|
try {
|
|
12845
12848
|
const parsed = parseJsonFromText(text);
|
|
12846
12849
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -12851,7 +12854,7 @@ ${outputSchema}`;
|
|
|
12851
12854
|
verdict,
|
|
12852
12855
|
assertions: assertions2,
|
|
12853
12856
|
expectedAspectCount: rubrics.length,
|
|
12854
|
-
|
|
12857
|
+
graderRawRequest,
|
|
12855
12858
|
graderTarget,
|
|
12856
12859
|
details
|
|
12857
12860
|
};
|
|
@@ -12864,7 +12867,7 @@ ${outputSchema}`;
|
|
|
12864
12867
|
verdict: scoreToVerdict(score),
|
|
12865
12868
|
assertions,
|
|
12866
12869
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
12867
|
-
|
|
12870
|
+
graderRawRequest,
|
|
12868
12871
|
graderTarget,
|
|
12869
12872
|
details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
|
|
12870
12873
|
};
|
|
@@ -12879,7 +12882,7 @@ ${outputSchema}`;
|
|
|
12879
12882
|
}
|
|
12880
12883
|
],
|
|
12881
12884
|
expectedAspectCount: 1,
|
|
12882
|
-
|
|
12885
|
+
graderRawRequest,
|
|
12883
12886
|
graderTarget,
|
|
12884
12887
|
details
|
|
12885
12888
|
};
|
|
@@ -12894,7 +12897,7 @@ ${outputSchema}`;
|
|
|
12894
12897
|
buildScoreRangePrompt(context, rubrics) {
|
|
12895
12898
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
12896
12899
|
const parts = [
|
|
12897
|
-
"You are an expert
|
|
12900
|
+
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
12898
12901
|
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
12899
12902
|
"",
|
|
12900
12903
|
"[[ ## question ## ]]",
|
|
@@ -12937,7 +12940,7 @@ ${outputSchema}`;
|
|
|
12937
12940
|
buildRubricPrompt(context, rubrics) {
|
|
12938
12941
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
12939
12942
|
const parts = [
|
|
12940
|
-
"You are an expert
|
|
12943
|
+
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
12941
12944
|
"",
|
|
12942
12945
|
"[[ ## question ## ]]",
|
|
12943
12946
|
formattedQuestion,
|
|
@@ -13111,7 +13114,7 @@ function sumTokenUsage(first, second) {
|
|
|
13111
13114
|
};
|
|
13112
13115
|
}
|
|
13113
13116
|
function buildRubricOutputSchema() {
|
|
13114
|
-
return `You are an expert
|
|
13117
|
+
return `You are an expert grader. Evaluate the candidate answer against each rubric item.
|
|
13115
13118
|
You must return a valid JSON object matching this schema:
|
|
13116
13119
|
{
|
|
13117
13120
|
"checks": [
|
|
@@ -13145,7 +13148,7 @@ function warnDeprecatedTemplateVars(template) {
|
|
|
13145
13148
|
console.warn(
|
|
13146
13149
|
`${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
|
|
13147
13150
|
${used.join("\n ")}
|
|
13148
|
-
Update your custom
|
|
13151
|
+
Update your custom grader template to use the new names.${ANSI_RESET8}`
|
|
13149
13152
|
);
|
|
13150
13153
|
}
|
|
13151
13154
|
}
|
|
@@ -13177,7 +13180,7 @@ function calculateRubricScore(result, rubrics) {
|
|
|
13177
13180
|
return { score, verdict, assertions };
|
|
13178
13181
|
}
|
|
13179
13182
|
function buildScoreRangeOutputSchema() {
|
|
13180
|
-
return `You are an expert
|
|
13183
|
+
return `You are an expert grader. Score the candidate answer on each criterion.
|
|
13181
13184
|
You must return a valid JSON object matching this schema:
|
|
13182
13185
|
{
|
|
13183
13186
|
"checks": [
|
|
@@ -13385,13 +13388,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
13385
13388
|
}
|
|
13386
13389
|
}
|
|
13387
13390
|
|
|
13388
|
-
// src/evaluation/
|
|
13391
|
+
// src/evaluation/graders/composite.ts
|
|
13389
13392
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
13390
13393
|
{{EVALUATOR_RESULTS_JSON}}
|
|
13391
13394
|
|
|
13392
|
-
Decide the final score and verdict based on all
|
|
13395
|
+
Decide the final score and verdict based on all grader results.
|
|
13393
13396
|
Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
|
|
13394
|
-
var
|
|
13397
|
+
var CompositeGrader = class {
|
|
13395
13398
|
kind = "composite";
|
|
13396
13399
|
config;
|
|
13397
13400
|
evaluatorFactory;
|
|
@@ -13442,7 +13445,7 @@ var CompositeEvaluator = class {
|
|
|
13442
13445
|
weight,
|
|
13443
13446
|
verdict: member.result.verdict,
|
|
13444
13447
|
assertions: [...member.result.assertions],
|
|
13445
|
-
|
|
13448
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
13446
13449
|
scores: member.result.scores,
|
|
13447
13450
|
details: member.result.details,
|
|
13448
13451
|
tokenUsage: member.result.tokenUsage
|
|
@@ -13463,7 +13466,7 @@ var CompositeEvaluator = class {
|
|
|
13463
13466
|
verdict: "skip",
|
|
13464
13467
|
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
13465
13468
|
expectedAspectCount: 1,
|
|
13466
|
-
|
|
13469
|
+
graderRawRequest: {
|
|
13467
13470
|
aggregator: "weighted_average",
|
|
13468
13471
|
...weights ? { weights } : {}
|
|
13469
13472
|
},
|
|
@@ -13476,7 +13479,7 @@ var CompositeEvaluator = class {
|
|
|
13476
13479
|
verdict: scoreToVerdict(finalScore),
|
|
13477
13480
|
assertions: allAssertions,
|
|
13478
13481
|
expectedAspectCount: allAssertions.length || 1,
|
|
13479
|
-
|
|
13482
|
+
graderRawRequest: {
|
|
13480
13483
|
aggregator: "weighted_average",
|
|
13481
13484
|
...weights ? { weights } : {}
|
|
13482
13485
|
},
|
|
@@ -13495,7 +13498,7 @@ var CompositeEvaluator = class {
|
|
|
13495
13498
|
score: member.result.score,
|
|
13496
13499
|
verdict: member.result.verdict,
|
|
13497
13500
|
assertions: [...member.result.assertions],
|
|
13498
|
-
|
|
13501
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
13499
13502
|
scores: member.result.scores,
|
|
13500
13503
|
details: member.result.details,
|
|
13501
13504
|
tokenUsage: member.result.tokenUsage
|
|
@@ -13518,7 +13521,7 @@ var CompositeEvaluator = class {
|
|
|
13518
13521
|
verdict: "skip",
|
|
13519
13522
|
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
13520
13523
|
expectedAspectCount: 1,
|
|
13521
|
-
|
|
13524
|
+
graderRawRequest: {
|
|
13522
13525
|
aggregator: "threshold",
|
|
13523
13526
|
threshold
|
|
13524
13527
|
},
|
|
@@ -13537,7 +13540,7 @@ var CompositeEvaluator = class {
|
|
|
13537
13540
|
verdict: pass ? "pass" : "fail",
|
|
13538
13541
|
assertions: allAssertions,
|
|
13539
13542
|
expectedAspectCount: allAssertions.length || 1,
|
|
13540
|
-
|
|
13543
|
+
graderRawRequest: {
|
|
13541
13544
|
aggregator: "threshold",
|
|
13542
13545
|
threshold
|
|
13543
13546
|
},
|
|
@@ -13554,7 +13557,7 @@ var CompositeEvaluator = class {
|
|
|
13554
13557
|
weight: weights?.[member.id] ?? 1,
|
|
13555
13558
|
verdict: member.result.verdict,
|
|
13556
13559
|
assertions: [...member.result.assertions],
|
|
13557
|
-
|
|
13560
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
13558
13561
|
scores: member.result.scores,
|
|
13559
13562
|
details: member.result.details
|
|
13560
13563
|
}));
|
|
@@ -13575,7 +13578,7 @@ var CompositeEvaluator = class {
|
|
|
13575
13578
|
verdict,
|
|
13576
13579
|
assertions,
|
|
13577
13580
|
expectedAspectCount: assertions.length || 1,
|
|
13578
|
-
|
|
13581
|
+
graderRawRequest: {
|
|
13579
13582
|
aggregator: "code-grader",
|
|
13580
13583
|
script: scriptPath
|
|
13581
13584
|
},
|
|
@@ -13588,7 +13591,7 @@ var CompositeEvaluator = class {
|
|
|
13588
13591
|
verdict: "fail",
|
|
13589
13592
|
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
13590
13593
|
expectedAspectCount: 1,
|
|
13591
|
-
|
|
13594
|
+
graderRawRequest: {
|
|
13592
13595
|
aggregator: "code-grader",
|
|
13593
13596
|
script: scriptPath,
|
|
13594
13597
|
error: message
|
|
@@ -13610,14 +13613,14 @@ var CompositeEvaluator = class {
|
|
|
13610
13613
|
score: member.result.score,
|
|
13611
13614
|
verdict: member.result.verdict,
|
|
13612
13615
|
assertions: [...member.result.assertions],
|
|
13613
|
-
|
|
13616
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
13614
13617
|
scores: member.result.scores,
|
|
13615
13618
|
details: member.result.details
|
|
13616
13619
|
}));
|
|
13617
13620
|
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
13618
13621
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
13619
13622
|
const systemPrompt = buildOutputSchema();
|
|
13620
|
-
const
|
|
13623
|
+
const graderRawRequest = {
|
|
13621
13624
|
aggregator: "llm-grader",
|
|
13622
13625
|
userPrompt,
|
|
13623
13626
|
systemPrompt,
|
|
@@ -13639,7 +13642,7 @@ var CompositeEvaluator = class {
|
|
|
13639
13642
|
verdict: scoreToVerdict(score2),
|
|
13640
13643
|
assertions: assertions2,
|
|
13641
13644
|
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
13642
|
-
|
|
13645
|
+
graderRawRequest,
|
|
13643
13646
|
scores
|
|
13644
13647
|
};
|
|
13645
13648
|
}
|
|
@@ -13659,7 +13662,7 @@ var CompositeEvaluator = class {
|
|
|
13659
13662
|
verdict: scoreToVerdict(score),
|
|
13660
13663
|
assertions,
|
|
13661
13664
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13662
|
-
|
|
13665
|
+
graderRawRequest,
|
|
13663
13666
|
scores
|
|
13664
13667
|
};
|
|
13665
13668
|
} catch {
|
|
@@ -13668,15 +13671,15 @@ var CompositeEvaluator = class {
|
|
|
13668
13671
|
verdict: "fail",
|
|
13669
13672
|
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
13670
13673
|
expectedAspectCount: 1,
|
|
13671
|
-
|
|
13674
|
+
graderRawRequest,
|
|
13672
13675
|
scores
|
|
13673
13676
|
};
|
|
13674
13677
|
}
|
|
13675
13678
|
}
|
|
13676
13679
|
};
|
|
13677
13680
|
|
|
13678
|
-
// src/evaluation/
|
|
13679
|
-
var
|
|
13681
|
+
// src/evaluation/graders/cost.ts
|
|
13682
|
+
var CostGrader = class {
|
|
13680
13683
|
kind = "cost";
|
|
13681
13684
|
config;
|
|
13682
13685
|
constructor(options) {
|
|
@@ -13691,7 +13694,7 @@ var CostEvaluator = class {
|
|
|
13691
13694
|
verdict: "fail",
|
|
13692
13695
|
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
13693
13696
|
expectedAspectCount: 1,
|
|
13694
|
-
|
|
13697
|
+
graderRawRequest: {
|
|
13695
13698
|
type: "cost",
|
|
13696
13699
|
budget,
|
|
13697
13700
|
costUsd: null
|
|
@@ -13708,7 +13711,7 @@ var CostEvaluator = class {
|
|
|
13708
13711
|
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
13709
13712
|
],
|
|
13710
13713
|
expectedAspectCount: 1,
|
|
13711
|
-
|
|
13714
|
+
graderRawRequest: {
|
|
13712
13715
|
type: "cost",
|
|
13713
13716
|
budget,
|
|
13714
13717
|
costUsd
|
|
@@ -13717,8 +13720,8 @@ var CostEvaluator = class {
|
|
|
13717
13720
|
}
|
|
13718
13721
|
};
|
|
13719
13722
|
|
|
13720
|
-
// src/evaluation/
|
|
13721
|
-
var
|
|
13723
|
+
// src/evaluation/graders/execution-metrics.ts
|
|
13724
|
+
var ExecutionMetricsGrader = class {
|
|
13722
13725
|
kind = "execution-metrics";
|
|
13723
13726
|
config;
|
|
13724
13727
|
constructor(options) {
|
|
@@ -13742,7 +13745,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
13742
13745
|
verdict: "fail",
|
|
13743
13746
|
assertions: [{ text: "No trace summary available", passed: false }],
|
|
13744
13747
|
expectedAspectCount: 1,
|
|
13745
|
-
|
|
13748
|
+
graderRawRequest: {
|
|
13746
13749
|
type: "execution-metrics",
|
|
13747
13750
|
config: this.extractConfiguredThresholds(),
|
|
13748
13751
|
actual: null
|
|
@@ -13858,7 +13861,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
13858
13861
|
verdict: scoreToVerdict(score),
|
|
13859
13862
|
assertions,
|
|
13860
13863
|
expectedAspectCount: totalChecks || 1,
|
|
13861
|
-
|
|
13864
|
+
graderRawRequest: {
|
|
13862
13865
|
type: "execution-metrics",
|
|
13863
13866
|
config: this.extractConfiguredThresholds(),
|
|
13864
13867
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
@@ -13901,7 +13904,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
13901
13904
|
}
|
|
13902
13905
|
};
|
|
13903
13906
|
|
|
13904
|
-
// src/evaluation/
|
|
13907
|
+
// src/evaluation/graders/field-accuracy.ts
|
|
13905
13908
|
var DEFAULT_DATE_FORMATS = [
|
|
13906
13909
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
13907
13910
|
// ISO with timezone
|
|
@@ -13946,7 +13949,7 @@ var MONTH_NAMES = {
|
|
|
13946
13949
|
dec: 11,
|
|
13947
13950
|
december: 11
|
|
13948
13951
|
};
|
|
13949
|
-
var
|
|
13952
|
+
var FieldAccuracyGrader = class {
|
|
13950
13953
|
kind = "field-accuracy";
|
|
13951
13954
|
config;
|
|
13952
13955
|
constructor(options) {
|
|
@@ -14305,8 +14308,8 @@ function parseJsonFromTextSafe(text) {
|
|
|
14305
14308
|
return parseJsonFromText(text);
|
|
14306
14309
|
}
|
|
14307
14310
|
|
|
14308
|
-
// src/evaluation/
|
|
14309
|
-
var
|
|
14311
|
+
// src/evaluation/graders/latency.ts
|
|
14312
|
+
var LatencyGrader = class {
|
|
14310
14313
|
kind = "latency";
|
|
14311
14314
|
config;
|
|
14312
14315
|
constructor(options) {
|
|
@@ -14321,7 +14324,7 @@ var LatencyEvaluator = class {
|
|
|
14321
14324
|
verdict: "fail",
|
|
14322
14325
|
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
14323
14326
|
expectedAspectCount: 1,
|
|
14324
|
-
|
|
14327
|
+
graderRawRequest: {
|
|
14325
14328
|
type: "latency",
|
|
14326
14329
|
threshold,
|
|
14327
14330
|
durationMs: null
|
|
@@ -14337,7 +14340,7 @@ var LatencyEvaluator = class {
|
|
|
14337
14340
|
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
14338
14341
|
],
|
|
14339
14342
|
expectedAspectCount: 1,
|
|
14340
|
-
|
|
14343
|
+
graderRawRequest: {
|
|
14341
14344
|
type: "latency",
|
|
14342
14345
|
threshold,
|
|
14343
14346
|
durationMs
|
|
@@ -14346,8 +14349,8 @@ var LatencyEvaluator = class {
|
|
|
14346
14349
|
}
|
|
14347
14350
|
};
|
|
14348
14351
|
|
|
14349
|
-
// src/evaluation/
|
|
14350
|
-
var
|
|
14352
|
+
// src/evaluation/graders/skill-trigger.ts
|
|
14353
|
+
var SkillTriggerGrader = class {
|
|
14351
14354
|
kind = "skill-trigger";
|
|
14352
14355
|
config;
|
|
14353
14356
|
constructor(config) {
|
|
@@ -14414,7 +14417,7 @@ var SkillTriggerEvaluator = class {
|
|
|
14414
14417
|
}
|
|
14415
14418
|
};
|
|
14416
14419
|
|
|
14417
|
-
// src/evaluation/
|
|
14420
|
+
// src/evaluation/graders/llm-grader-prompt.ts
|
|
14418
14421
|
function assembleLlmGraderPrompt(input) {
|
|
14419
14422
|
const {
|
|
14420
14423
|
evalCase,
|
|
@@ -14422,7 +14425,7 @@ function assembleLlmGraderPrompt(input) {
|
|
|
14422
14425
|
promptInputs,
|
|
14423
14426
|
evaluatorConfig,
|
|
14424
14427
|
fileChanges,
|
|
14425
|
-
|
|
14428
|
+
graderTemplateOverride
|
|
14426
14429
|
} = input;
|
|
14427
14430
|
const rubrics = evaluatorConfig?.rubrics;
|
|
14428
14431
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -14432,15 +14435,9 @@ function assembleLlmGraderPrompt(input) {
|
|
|
14432
14435
|
}
|
|
14433
14436
|
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
14434
14437
|
}
|
|
14435
|
-
return assembleFreeform(
|
|
14436
|
-
evalCase,
|
|
14437
|
-
candidate,
|
|
14438
|
-
promptInputs,
|
|
14439
|
-
fileChanges,
|
|
14440
|
-
evaluatorTemplateOverride
|
|
14441
|
-
);
|
|
14438
|
+
return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
|
|
14442
14439
|
}
|
|
14443
|
-
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges,
|
|
14440
|
+
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
|
|
14444
14441
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
14445
14442
|
const variables = {
|
|
14446
14443
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -14454,9 +14451,9 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
14454
14451
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
14455
14452
|
};
|
|
14456
14453
|
const systemPrompt = buildOutputSchema();
|
|
14457
|
-
const template =
|
|
14454
|
+
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
|
|
14458
14455
|
let userPrompt = substituteVariables(template, variables);
|
|
14459
|
-
if (fileChanges && !
|
|
14456
|
+
if (fileChanges && !graderTemplateOverride) {
|
|
14460
14457
|
userPrompt += `
|
|
14461
14458
|
|
|
14462
14459
|
[[ ## file_changes ## ]]
|
|
@@ -14472,7 +14469,7 @@ ${fileChanges}`;
|
|
|
14472
14469
|
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
14473
14470
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
14474
14471
|
const parts = [
|
|
14475
|
-
"You are an expert
|
|
14472
|
+
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
14476
14473
|
"",
|
|
14477
14474
|
"[[ ## question ## ]]",
|
|
14478
14475
|
formattedQuestion,
|
|
@@ -14507,7 +14504,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
14507
14504
|
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
14508
14505
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
14509
14506
|
const parts = [
|
|
14510
|
-
"You are an expert
|
|
14507
|
+
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
14511
14508
|
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
14512
14509
|
"",
|
|
14513
14510
|
"[[ ## question ## ]]",
|
|
@@ -14555,8 +14552,8 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
14555
14552
|
};
|
|
14556
14553
|
}
|
|
14557
14554
|
|
|
14558
|
-
// src/evaluation/
|
|
14559
|
-
var
|
|
14555
|
+
// src/evaluation/graders/token-usage.ts
|
|
14556
|
+
var TokenUsageGrader = class {
|
|
14560
14557
|
kind = "token-usage";
|
|
14561
14558
|
config;
|
|
14562
14559
|
constructor(options) {
|
|
@@ -14577,7 +14574,7 @@ var TokenUsageEvaluator = class {
|
|
|
14577
14574
|
verdict: "fail",
|
|
14578
14575
|
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
14579
14576
|
expectedAspectCount,
|
|
14580
|
-
|
|
14577
|
+
graderRawRequest: {
|
|
14581
14578
|
type: "token-usage",
|
|
14582
14579
|
max_total: maxTotal ?? null,
|
|
14583
14580
|
max_input: maxInput ?? null,
|
|
@@ -14618,7 +14615,7 @@ var TokenUsageEvaluator = class {
|
|
|
14618
14615
|
verdict: passed ? "pass" : "fail",
|
|
14619
14616
|
assertions,
|
|
14620
14617
|
expectedAspectCount,
|
|
14621
|
-
|
|
14618
|
+
graderRawRequest: {
|
|
14622
14619
|
type: "token-usage",
|
|
14623
14620
|
max_total: maxTotal ?? null,
|
|
14624
14621
|
max_input: maxInput ?? null,
|
|
@@ -14634,7 +14631,7 @@ var TokenUsageEvaluator = class {
|
|
|
14634
14631
|
}
|
|
14635
14632
|
};
|
|
14636
14633
|
|
|
14637
|
-
// src/evaluation/
|
|
14634
|
+
// src/evaluation/graders/tool-trajectory.ts
|
|
14638
14635
|
function getNestedValue(obj, path53) {
|
|
14639
14636
|
const parts = path53.split(".");
|
|
14640
14637
|
let current = obj;
|
|
@@ -14703,7 +14700,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
14703
14700
|
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
14704
14701
|
};
|
|
14705
14702
|
}
|
|
14706
|
-
var
|
|
14703
|
+
var ToolTrajectoryGrader = class {
|
|
14707
14704
|
kind = "tool-trajectory";
|
|
14708
14705
|
config;
|
|
14709
14706
|
constructor(options) {
|
|
@@ -15108,7 +15105,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15108
15105
|
}
|
|
15109
15106
|
};
|
|
15110
15107
|
|
|
15111
|
-
// src/evaluation/
|
|
15108
|
+
// src/evaluation/graders/assertions.ts
|
|
15112
15109
|
function runContainsAssertion(output, value) {
|
|
15113
15110
|
const passed = output.includes(value);
|
|
15114
15111
|
return {
|
|
@@ -15403,15 +15400,15 @@ function validateConcurrency(concurrency) {
|
|
|
15403
15400
|
}
|
|
15404
15401
|
}
|
|
15405
15402
|
|
|
15406
|
-
// src/evaluation/registry/
|
|
15407
|
-
var
|
|
15403
|
+
// src/evaluation/registry/grader-registry.ts
|
|
15404
|
+
var GraderRegistry = class {
|
|
15408
15405
|
factories = /* @__PURE__ */ new Map();
|
|
15409
|
-
/** Register a factory function for an
|
|
15406
|
+
/** Register a factory function for an grader type. */
|
|
15410
15407
|
register(type, factory) {
|
|
15411
15408
|
this.factories.set(type, factory);
|
|
15412
15409
|
return this;
|
|
15413
15410
|
}
|
|
15414
|
-
/** Get the factory function for an
|
|
15411
|
+
/** Get the factory function for an grader type. */
|
|
15415
15412
|
get(type) {
|
|
15416
15413
|
return this.factories.get(type);
|
|
15417
15414
|
}
|
|
@@ -15419,25 +15416,25 @@ var EvaluatorRegistry = class {
|
|
|
15419
15416
|
has(type) {
|
|
15420
15417
|
return this.factories.has(type);
|
|
15421
15418
|
}
|
|
15422
|
-
/** List all registered
|
|
15419
|
+
/** List all registered grader type names. */
|
|
15423
15420
|
list() {
|
|
15424
15421
|
return [...this.factories.keys()];
|
|
15425
15422
|
}
|
|
15426
15423
|
/**
|
|
15427
15424
|
* Create an evaluator instance from a config, using the registered factory.
|
|
15428
|
-
* Throws if no factory is registered for the
|
|
15425
|
+
* Throws if no factory is registered for the grader type.
|
|
15429
15426
|
*/
|
|
15430
15427
|
async create(config, context) {
|
|
15431
15428
|
const factory = this.factories.get(config.type);
|
|
15432
15429
|
if (!factory) {
|
|
15433
15430
|
throw new Error(
|
|
15434
|
-
`Unknown
|
|
15431
|
+
`Unknown grader type: "${config.type}". Registered types: ${this.list().join(", ")}`
|
|
15435
15432
|
);
|
|
15436
15433
|
}
|
|
15437
15434
|
return factory(config, context);
|
|
15438
15435
|
}
|
|
15439
15436
|
};
|
|
15440
|
-
var
|
|
15437
|
+
var DeterministicAssertionGrader = class {
|
|
15441
15438
|
constructor(kind, assertFn) {
|
|
15442
15439
|
this.assertFn = assertFn;
|
|
15443
15440
|
this.kind = kind;
|
|
@@ -15448,8 +15445,8 @@ var DeterministicAssertionEvaluator = class {
|
|
|
15448
15445
|
}
|
|
15449
15446
|
};
|
|
15450
15447
|
|
|
15451
|
-
// src/evaluation/
|
|
15452
|
-
var
|
|
15448
|
+
// src/evaluation/graders/inline-assert.ts
|
|
15449
|
+
var InlineAssertGrader = class {
|
|
15453
15450
|
constructor(fn, name) {
|
|
15454
15451
|
this.fn = fn;
|
|
15455
15452
|
this.name = name;
|
|
@@ -15474,7 +15471,7 @@ var InlineAssertEvaluator = class {
|
|
|
15474
15471
|
}
|
|
15475
15472
|
};
|
|
15476
15473
|
|
|
15477
|
-
// src/evaluation/
|
|
15474
|
+
// src/evaluation/graders/prompt-resolution.ts
|
|
15478
15475
|
import path38 from "node:path";
|
|
15479
15476
|
async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
15480
15477
|
if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
|
|
@@ -15541,7 +15538,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
|
15541
15538
|
}
|
|
15542
15539
|
}
|
|
15543
15540
|
|
|
15544
|
-
// src/evaluation/registry/builtin-
|
|
15541
|
+
// src/evaluation/registry/builtin-graders.ts
|
|
15545
15542
|
var INLINE_ASSERT_FN = Symbol.for("agentv.inline-assert-fn");
|
|
15546
15543
|
var llmGraderFactory = (config, context) => {
|
|
15547
15544
|
const c = config;
|
|
@@ -15558,7 +15555,7 @@ var llmGraderFactory = (config, context) => {
|
|
|
15558
15555
|
);
|
|
15559
15556
|
}
|
|
15560
15557
|
const isAgent = isAgentProvider(graderTargetProvider) || graderTargetProvider.kind === "agentv";
|
|
15561
|
-
evaluator = new
|
|
15558
|
+
evaluator = new LlmGrader({
|
|
15562
15559
|
resolveGraderProvider: async (evalContext) => {
|
|
15563
15560
|
if (graderTargetProvider) return graderTargetProvider;
|
|
15564
15561
|
if (evalContext.graderProvider) return evalContext.graderProvider;
|
|
@@ -15586,11 +15583,11 @@ var llmGraderFactory = (config, context) => {
|
|
|
15586
15583
|
agentTimeoutMs
|
|
15587
15584
|
);
|
|
15588
15585
|
const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
|
|
15589
|
-
let
|
|
15586
|
+
let graderTemplateOverride;
|
|
15590
15587
|
let evalCase = evalContext.evalCase;
|
|
15591
15588
|
if (customPrompt) {
|
|
15592
15589
|
if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
|
|
15593
|
-
|
|
15590
|
+
graderTemplateOverride = customPrompt;
|
|
15594
15591
|
} else {
|
|
15595
15592
|
evalCase = { ...evalCase, criteria: customPrompt };
|
|
15596
15593
|
}
|
|
@@ -15598,7 +15595,7 @@ var llmGraderFactory = (config, context) => {
|
|
|
15598
15595
|
return evaluator.evaluate({
|
|
15599
15596
|
...evalContext,
|
|
15600
15597
|
evalCase,
|
|
15601
|
-
|
|
15598
|
+
graderTemplateOverride,
|
|
15602
15599
|
evaluator: c
|
|
15603
15600
|
});
|
|
15604
15601
|
}
|
|
@@ -15606,7 +15603,7 @@ var llmGraderFactory = (config, context) => {
|
|
|
15606
15603
|
};
|
|
15607
15604
|
var codeFactory = (config, context) => {
|
|
15608
15605
|
const c = config;
|
|
15609
|
-
return new
|
|
15606
|
+
return new CodeGrader({
|
|
15610
15607
|
command: c.command ?? c.script ?? [],
|
|
15611
15608
|
cwd: c.resolvedCwd ?? c.cwd,
|
|
15612
15609
|
agentTimeoutMs: context.agentTimeoutMs,
|
|
@@ -15617,19 +15614,19 @@ var codeFactory = (config, context) => {
|
|
|
15617
15614
|
var compositeFactory = (config, context) => {
|
|
15618
15615
|
const c = config;
|
|
15619
15616
|
const evalFileDir = context.evalFileDir ?? process.cwd();
|
|
15620
|
-
return new
|
|
15617
|
+
return new CompositeGrader({
|
|
15621
15618
|
config: c,
|
|
15622
15619
|
cwd: evalFileDir,
|
|
15623
15620
|
evaluatorFactory: {
|
|
15624
15621
|
create: (memberConfig) => {
|
|
15625
15622
|
const factory = context.registry.get(memberConfig.type);
|
|
15626
15623
|
if (!factory) {
|
|
15627
|
-
throw new Error(`Unsupported
|
|
15624
|
+
throw new Error(`Unsupported grader type in composite: ${memberConfig.type}`);
|
|
15628
15625
|
}
|
|
15629
15626
|
const result = factory(memberConfig, context);
|
|
15630
15627
|
if (result instanceof Promise) {
|
|
15631
15628
|
throw new Error(
|
|
15632
|
-
`
|
|
15629
|
+
`Grader factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
|
|
15633
15630
|
);
|
|
15634
15631
|
}
|
|
15635
15632
|
return result;
|
|
@@ -15638,35 +15635,35 @@ var compositeFactory = (config, context) => {
|
|
|
15638
15635
|
});
|
|
15639
15636
|
};
|
|
15640
15637
|
var toolTrajectoryFactory = (config) => {
|
|
15641
|
-
return new
|
|
15638
|
+
return new ToolTrajectoryGrader({
|
|
15642
15639
|
config
|
|
15643
15640
|
});
|
|
15644
15641
|
};
|
|
15645
15642
|
var fieldAccuracyFactory = (config) => {
|
|
15646
|
-
return new
|
|
15643
|
+
return new FieldAccuracyGrader({
|
|
15647
15644
|
config
|
|
15648
15645
|
});
|
|
15649
15646
|
};
|
|
15650
15647
|
var latencyFactory = (config) => {
|
|
15651
|
-
return new
|
|
15648
|
+
return new LatencyGrader({ config });
|
|
15652
15649
|
};
|
|
15653
15650
|
var costFactory = (config) => {
|
|
15654
|
-
return new
|
|
15651
|
+
return new CostGrader({ config });
|
|
15655
15652
|
};
|
|
15656
15653
|
var tokenUsageFactory = (config) => {
|
|
15657
|
-
return new
|
|
15654
|
+
return new TokenUsageGrader({ config });
|
|
15658
15655
|
};
|
|
15659
15656
|
var executionMetricsFactory = (config) => {
|
|
15660
|
-
return new
|
|
15657
|
+
return new ExecutionMetricsGrader({
|
|
15661
15658
|
config
|
|
15662
15659
|
});
|
|
15663
15660
|
};
|
|
15664
15661
|
var skillTriggerFactory = (config) => {
|
|
15665
|
-
return new
|
|
15662
|
+
return new SkillTriggerGrader(config);
|
|
15666
15663
|
};
|
|
15667
15664
|
var containsFactory = (config) => {
|
|
15668
15665
|
const c = config;
|
|
15669
|
-
return new
|
|
15666
|
+
return new DeterministicAssertionGrader("contains", (ctx) => {
|
|
15670
15667
|
const result = runContainsAssertion(ctx.candidate, c.value);
|
|
15671
15668
|
return {
|
|
15672
15669
|
score: result.score,
|
|
@@ -15678,7 +15675,7 @@ var containsFactory = (config) => {
|
|
|
15678
15675
|
};
|
|
15679
15676
|
var regexFactory = (config) => {
|
|
15680
15677
|
const c = config;
|
|
15681
|
-
return new
|
|
15678
|
+
return new DeterministicAssertionGrader("regex", (ctx) => {
|
|
15682
15679
|
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
15683
15680
|
return {
|
|
15684
15681
|
score: result.score,
|
|
@@ -15689,7 +15686,7 @@ var regexFactory = (config) => {
|
|
|
15689
15686
|
});
|
|
15690
15687
|
};
|
|
15691
15688
|
var isJsonFactory = () => {
|
|
15692
|
-
return new
|
|
15689
|
+
return new DeterministicAssertionGrader("is-json", (ctx) => {
|
|
15693
15690
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
15694
15691
|
return {
|
|
15695
15692
|
score: result.score,
|
|
@@ -15701,7 +15698,7 @@ var isJsonFactory = () => {
|
|
|
15701
15698
|
};
|
|
15702
15699
|
var equalsFactory = (config) => {
|
|
15703
15700
|
const c = config;
|
|
15704
|
-
return new
|
|
15701
|
+
return new DeterministicAssertionGrader("equals", (ctx) => {
|
|
15705
15702
|
const result = runEqualsAssertion(ctx.candidate, c.value);
|
|
15706
15703
|
return {
|
|
15707
15704
|
score: result.score,
|
|
@@ -15713,7 +15710,7 @@ var equalsFactory = (config) => {
|
|
|
15713
15710
|
};
|
|
15714
15711
|
var containsAnyFactory = (config) => {
|
|
15715
15712
|
const c = config;
|
|
15716
|
-
return new
|
|
15713
|
+
return new DeterministicAssertionGrader("contains-any", (ctx) => {
|
|
15717
15714
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
15718
15715
|
return {
|
|
15719
15716
|
score: result.score,
|
|
@@ -15725,7 +15722,7 @@ var containsAnyFactory = (config) => {
|
|
|
15725
15722
|
};
|
|
15726
15723
|
var containsAllFactory = (config) => {
|
|
15727
15724
|
const c = config;
|
|
15728
|
-
return new
|
|
15725
|
+
return new DeterministicAssertionGrader("contains-all", (ctx) => {
|
|
15729
15726
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
15730
15727
|
return {
|
|
15731
15728
|
score: result.score,
|
|
@@ -15737,7 +15734,7 @@ var containsAllFactory = (config) => {
|
|
|
15737
15734
|
};
|
|
15738
15735
|
var icontainsFactory = (config) => {
|
|
15739
15736
|
const c = config;
|
|
15740
|
-
return new
|
|
15737
|
+
return new DeterministicAssertionGrader("icontains", (ctx) => {
|
|
15741
15738
|
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
15742
15739
|
return {
|
|
15743
15740
|
score: result.score,
|
|
@@ -15749,7 +15746,7 @@ var icontainsFactory = (config) => {
|
|
|
15749
15746
|
};
|
|
15750
15747
|
var icontainsAnyFactory = (config) => {
|
|
15751
15748
|
const c = config;
|
|
15752
|
-
return new
|
|
15749
|
+
return new DeterministicAssertionGrader("icontains-any", (ctx) => {
|
|
15753
15750
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
15754
15751
|
return {
|
|
15755
15752
|
score: result.score,
|
|
@@ -15761,7 +15758,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
15761
15758
|
};
|
|
15762
15759
|
var icontainsAllFactory = (config) => {
|
|
15763
15760
|
const c = config;
|
|
15764
|
-
return new
|
|
15761
|
+
return new DeterministicAssertionGrader("icontains-all", (ctx) => {
|
|
15765
15762
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
15766
15763
|
return {
|
|
15767
15764
|
score: result.score,
|
|
@@ -15773,7 +15770,7 @@ var icontainsAllFactory = (config) => {
|
|
|
15773
15770
|
};
|
|
15774
15771
|
var startsWithFactory = (config) => {
|
|
15775
15772
|
const c = config;
|
|
15776
|
-
return new
|
|
15773
|
+
return new DeterministicAssertionGrader("starts-with", (ctx) => {
|
|
15777
15774
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
15778
15775
|
return {
|
|
15779
15776
|
score: result.score,
|
|
@@ -15785,7 +15782,7 @@ var startsWithFactory = (config) => {
|
|
|
15785
15782
|
};
|
|
15786
15783
|
var endsWithFactory = (config) => {
|
|
15787
15784
|
const c = config;
|
|
15788
|
-
return new
|
|
15785
|
+
return new DeterministicAssertionGrader("ends-with", (ctx) => {
|
|
15789
15786
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
15790
15787
|
return {
|
|
15791
15788
|
score: result.score,
|
|
@@ -15796,7 +15793,7 @@ var endsWithFactory = (config) => {
|
|
|
15796
15793
|
});
|
|
15797
15794
|
};
|
|
15798
15795
|
function createBuiltinRegistry() {
|
|
15799
|
-
const registry = new
|
|
15796
|
+
const registry = new GraderRegistry();
|
|
15800
15797
|
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
15801
15798
|
const fn = config[INLINE_ASSERT_FN];
|
|
15802
15799
|
if (!fn) {
|
|
@@ -15804,7 +15801,7 @@ function createBuiltinRegistry() {
|
|
|
15804
15801
|
`No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`
|
|
15805
15802
|
);
|
|
15806
15803
|
}
|
|
15807
|
-
return new
|
|
15804
|
+
return new InlineAssertGrader(fn, config.name ?? "inline-assert");
|
|
15808
15805
|
});
|
|
15809
15806
|
return registry;
|
|
15810
15807
|
}
|
|
@@ -15841,7 +15838,7 @@ async function discoverAssertions(registry, baseDir) {
|
|
|
15841
15838
|
continue;
|
|
15842
15839
|
}
|
|
15843
15840
|
const factory = (_config, context) => {
|
|
15844
|
-
return new
|
|
15841
|
+
return new CodeGrader({
|
|
15845
15842
|
command: ["bun", "run", filePath],
|
|
15846
15843
|
agentTimeoutMs: context.agentTimeoutMs
|
|
15847
15844
|
});
|
|
@@ -15885,7 +15882,7 @@ async function discoverGraders(registry, baseDir) {
|
|
|
15885
15882
|
continue;
|
|
15886
15883
|
}
|
|
15887
15884
|
const factory = (_config, context) => {
|
|
15888
|
-
return new
|
|
15885
|
+
return new CodeGrader({
|
|
15889
15886
|
command: ["bun", "run", filePath],
|
|
15890
15887
|
agentTimeoutMs: context.agentTimeoutMs
|
|
15891
15888
|
});
|
|
@@ -16727,10 +16724,10 @@ function buildSkippedEvaluatorError(scores) {
|
|
|
16727
16724
|
}
|
|
16728
16725
|
const messages = skippedScores.map((score) => {
|
|
16729
16726
|
const label = score.name || score.type;
|
|
16730
|
-
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "
|
|
16727
|
+
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Grader skipped";
|
|
16731
16728
|
return `${label}: ${assertionMessage}`;
|
|
16732
16729
|
});
|
|
16733
|
-
return messages.length === 1 ? messages[0] : `
|
|
16730
|
+
return messages.length === 1 ? messages[0] : `Graders skipped: ${messages.join(" | ")}`;
|
|
16734
16731
|
}
|
|
16735
16732
|
function usesFileReferencePrompt(provider) {
|
|
16736
16733
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
@@ -16899,7 +16896,7 @@ async function runEvaluation(options) {
|
|
|
16899
16896
|
cleanupWorkspaces,
|
|
16900
16897
|
trials,
|
|
16901
16898
|
streamCallbacks,
|
|
16902
|
-
|
|
16899
|
+
budgetUsd,
|
|
16903
16900
|
failOnError,
|
|
16904
16901
|
poolWorkspaces,
|
|
16905
16902
|
poolMaxSlots: configPoolMaxSlots,
|
|
@@ -17428,7 +17425,7 @@ async function runEvaluation(options) {
|
|
|
17428
17425
|
async function dispatchTest(evalCase, depResults) {
|
|
17429
17426
|
const workerId = nextWorkerId++;
|
|
17430
17427
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
17431
|
-
if (
|
|
17428
|
+
if (budgetUsd !== void 0 && budgetExhausted) {
|
|
17432
17429
|
const budgetResult = {
|
|
17433
17430
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
17434
17431
|
testId: evalCase.id,
|
|
@@ -17438,13 +17435,13 @@ async function runEvaluation(options) {
|
|
|
17438
17435
|
assertions: [],
|
|
17439
17436
|
output: [],
|
|
17440
17437
|
target: target.name,
|
|
17441
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${
|
|
17438
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
|
|
17442
17439
|
budgetExceeded: true,
|
|
17443
17440
|
executionStatus: "execution_error",
|
|
17444
17441
|
failureStage: "setup",
|
|
17445
17442
|
failureReasonCode: "budget_exceeded",
|
|
17446
17443
|
executionError: {
|
|
17447
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${
|
|
17444
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
|
|
17448
17445
|
stage: "setup"
|
|
17449
17446
|
}
|
|
17450
17447
|
};
|
|
@@ -17541,7 +17538,7 @@ async function runEvaluation(options) {
|
|
|
17541
17538
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
17542
17539
|
};
|
|
17543
17540
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
17544
|
-
if (
|
|
17541
|
+
if (budgetUsd !== void 0) {
|
|
17545
17542
|
let caseCost;
|
|
17546
17543
|
if (result.trials && result.trials.length > 0) {
|
|
17547
17544
|
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
@@ -17553,7 +17550,7 @@ async function runEvaluation(options) {
|
|
|
17553
17550
|
}
|
|
17554
17551
|
if (caseCost !== void 0) {
|
|
17555
17552
|
cumulativeBudgetCost += caseCost;
|
|
17556
|
-
if (cumulativeBudgetCost >=
|
|
17553
|
+
if (cumulativeBudgetCost >= budgetUsd) {
|
|
17557
17554
|
budgetExhausted = true;
|
|
17558
17555
|
}
|
|
17559
17556
|
}
|
|
@@ -18695,7 +18692,7 @@ async function evaluateCandidate(options) {
|
|
|
18695
18692
|
};
|
|
18696
18693
|
}
|
|
18697
18694
|
}
|
|
18698
|
-
const evaluatorRequest = scores ? void 0 : score.
|
|
18695
|
+
const evaluatorRequest = scores ? void 0 : score.graderRawRequest;
|
|
18699
18696
|
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
18700
18697
|
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
18701
18698
|
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
@@ -18911,7 +18908,7 @@ async function runEvaluatorList(options) {
|
|
|
18911
18908
|
weight,
|
|
18912
18909
|
verdict: score2.verdict,
|
|
18913
18910
|
assertions: score2.assertions,
|
|
18914
|
-
input: score2.
|
|
18911
|
+
input: score2.graderRawRequest,
|
|
18915
18912
|
target: score2.graderTarget,
|
|
18916
18913
|
details: score2.details,
|
|
18917
18914
|
scores: mapChildResults(score2.scores),
|
|
@@ -18927,7 +18924,7 @@ async function runEvaluatorList(options) {
|
|
|
18927
18924
|
score: 0,
|
|
18928
18925
|
verdict: "fail",
|
|
18929
18926
|
assertions: [
|
|
18930
|
-
{ text: `
|
|
18927
|
+
{ text: `Grader '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
18931
18928
|
],
|
|
18932
18929
|
expectedAspectCount: 1
|
|
18933
18930
|
};
|
|
@@ -18948,7 +18945,7 @@ async function runEvaluatorList(options) {
|
|
|
18948
18945
|
verdict: "fail",
|
|
18949
18946
|
assertions: [
|
|
18950
18947
|
{
|
|
18951
|
-
text: `
|
|
18948
|
+
text: `Grader '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
18952
18949
|
passed: false
|
|
18953
18950
|
}
|
|
18954
18951
|
],
|
|
@@ -19005,7 +19002,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
19005
19002
|
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
19006
19003
|
}
|
|
19007
19004
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
19008
|
-
const llmGrader = overrides?.["llm-grader"] ?? new
|
|
19005
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGrader({
|
|
19009
19006
|
resolveGraderProvider: async (context) => {
|
|
19010
19007
|
if (context.graderProvider) {
|
|
19011
19008
|
return context.graderProvider;
|
|
@@ -19496,7 +19493,7 @@ function mapChildResults(children) {
|
|
|
19496
19493
|
weight: child.weight,
|
|
19497
19494
|
verdict: child.verdict,
|
|
19498
19495
|
assertions: child.assertions,
|
|
19499
|
-
input: child.
|
|
19496
|
+
input: child.graderRawRequest,
|
|
19500
19497
|
scores: mapChildResults(child.scores),
|
|
19501
19498
|
details: child.details,
|
|
19502
19499
|
tokenUsage: child.tokenUsage
|
|
@@ -21599,22 +21596,21 @@ function createAgentKernel() {
|
|
|
21599
21596
|
}
|
|
21600
21597
|
export {
|
|
21601
21598
|
COMMON_TARGET_SETTINGS,
|
|
21602
|
-
|
|
21603
|
-
|
|
21604
|
-
|
|
21599
|
+
CodeGrader,
|
|
21600
|
+
CompositeGrader,
|
|
21601
|
+
CostGrader,
|
|
21605
21602
|
DEFAULT_CATEGORY,
|
|
21606
|
-
DEFAULT_EVALUATOR_TEMPLATE,
|
|
21607
21603
|
DEFAULT_EVAL_PATTERNS,
|
|
21608
21604
|
DEFAULT_EXPLORATION_TOOLS,
|
|
21605
|
+
DEFAULT_GRADER_TEMPLATE,
|
|
21609
21606
|
DEFAULT_THRESHOLD,
|
|
21610
|
-
|
|
21607
|
+
DeterministicAssertionGrader,
|
|
21611
21608
|
DockerWorkspaceProvider,
|
|
21612
|
-
|
|
21613
|
-
|
|
21614
|
-
|
|
21615
|
-
|
|
21616
|
-
|
|
21617
|
-
LlmGraderEvaluator as LlmJudgeEvaluator,
|
|
21609
|
+
ExecutionMetricsGrader,
|
|
21610
|
+
FieldAccuracyGrader,
|
|
21611
|
+
GraderRegistry,
|
|
21612
|
+
LatencyGrader,
|
|
21613
|
+
LlmGrader,
|
|
21618
21614
|
OTEL_BACKEND_PRESETS,
|
|
21619
21615
|
OtelStreamingObserver,
|
|
21620
21616
|
OtelTraceExporter,
|
|
@@ -21623,18 +21619,17 @@ export {
|
|
|
21623
21619
|
ProviderRegistry,
|
|
21624
21620
|
RepoManager,
|
|
21625
21621
|
ResponseCache,
|
|
21626
|
-
|
|
21622
|
+
SkillTriggerGrader,
|
|
21627
21623
|
TEST_MESSAGE_ROLES,
|
|
21628
21624
|
TemplateNotDirectoryError,
|
|
21629
21625
|
TemplateNotFoundError,
|
|
21630
|
-
|
|
21631
|
-
|
|
21626
|
+
TokenUsageGrader,
|
|
21627
|
+
ToolTrajectoryGrader,
|
|
21632
21628
|
TranscriptProvider,
|
|
21633
21629
|
WorkspaceCreationError,
|
|
21634
21630
|
WorkspacePoolManager,
|
|
21635
21631
|
addBenchmark,
|
|
21636
21632
|
assembleLlmGraderPrompt,
|
|
21637
|
-
assembleLlmGraderPrompt as assembleLlmJudgePrompt,
|
|
21638
21633
|
avgToolDurationMs,
|
|
21639
21634
|
buildDirectoryChain,
|
|
21640
21635
|
buildOutputSchema,
|
|
@@ -21674,7 +21669,6 @@ export {
|
|
|
21674
21669
|
discoverCodexSessions,
|
|
21675
21670
|
discoverCopilotSessions,
|
|
21676
21671
|
discoverGraders,
|
|
21677
|
-
discoverGraders as discoverJudges,
|
|
21678
21672
|
discoverProviders,
|
|
21679
21673
|
ensureResultsRepoClone,
|
|
21680
21674
|
ensureVSCodeSubagents,
|
|
@@ -21716,7 +21710,7 @@ export {
|
|
|
21716
21710
|
isAgentSkillsFormat,
|
|
21717
21711
|
isContent,
|
|
21718
21712
|
isContentArray,
|
|
21719
|
-
|
|
21713
|
+
isGraderKind,
|
|
21720
21714
|
isJsonObject,
|
|
21721
21715
|
isJsonValue,
|
|
21722
21716
|
isNonEmptyString,
|