agentv 4.17.1 → 4.18.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/artifact-writer-WH3OE42V.js +40 -0
- package/dist/chunk-HBDOJJFY.js +689 -0
- package/dist/chunk-HBDOJJFY.js.map +1 -0
- package/dist/{chunk-ILIM6IIX.js → chunk-MCBERRMC.js} +196 -161
- package/dist/chunk-MCBERRMC.js.map +1 -0
- package/dist/{chunk-IRU2UOWN.js → chunk-RCOAXXHP.js} +194 -197
- package/dist/chunk-RCOAXXHP.js.map +1 -0
- package/dist/{chunk-ZUNYOUFO.js → chunk-VRPCMCLQ.js} +255 -621
- package/dist/chunk-VRPCMCLQ.js.map +1 -0
- package/dist/cli.js +4 -3
- package/dist/cli.js.map +1 -1
- package/dist/{dist-U5EXNMON.js → dist-7W4OI3X2.js} +30 -33
- package/dist/dist-7W4OI3X2.js.map +1 -0
- package/dist/index.js +4 -3
- package/dist/{interactive-LFCOVXPQ.js → interactive-J4QEU5FG.js} +4 -3
- package/dist/{interactive-LFCOVXPQ.js.map → interactive-J4QEU5FG.js.map} +1 -1
- package/dist/studio/assets/{index-Bhv1TEO2.js → index-BTsTcivx.js} +1 -1
- package/dist/studio/assets/{index-vZYHIvCH.js → index-KfPHd-QM.js} +1 -1
- package/dist/studio/index.html +1 -1
- package/package.json +1 -1
- package/dist/chunk-ILIM6IIX.js.map +0 -1
- package/dist/chunk-IRU2UOWN.js.map +0 -1
- package/dist/chunk-ZUNYOUFO.js.map +0 -1
- /package/dist/{dist-U5EXNMON.js.map → artifact-writer-WH3OE42V.js.map} +0 -0
|
@@ -305,7 +305,7 @@ var require_dist = __commonJS({
|
|
|
305
305
|
}
|
|
306
306
|
});
|
|
307
307
|
|
|
308
|
-
// ../../packages/core/dist/chunk-
|
|
308
|
+
// ../../packages/core/dist/chunk-PYDBJOAO.js
|
|
309
309
|
import { constants } from "node:fs";
|
|
310
310
|
import { access, readFile } from "node:fs/promises";
|
|
311
311
|
import path from "node:path";
|
|
@@ -425,7 +425,7 @@ __export(external_exports2, {
|
|
|
425
425
|
void: () => voidType
|
|
426
426
|
});
|
|
427
427
|
|
|
428
|
-
// ../../packages/core/dist/chunk-
|
|
428
|
+
// ../../packages/core/dist/chunk-PYDBJOAO.js
|
|
429
429
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
430
430
|
import path3 from "node:path";
|
|
431
431
|
import fg from "fast-glob";
|
|
@@ -497,7 +497,7 @@ function isTestMessage(value) {
|
|
|
497
497
|
}
|
|
498
498
|
return false;
|
|
499
499
|
}
|
|
500
|
-
var
|
|
500
|
+
var GRADER_KIND_VALUES = [
|
|
501
501
|
"code-grader",
|
|
502
502
|
"llm-grader",
|
|
503
503
|
"rubric",
|
|
@@ -523,9 +523,9 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
523
523
|
"rubrics",
|
|
524
524
|
"inline-assert"
|
|
525
525
|
];
|
|
526
|
-
var
|
|
527
|
-
function
|
|
528
|
-
return typeof value === "string" &&
|
|
526
|
+
var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
|
|
527
|
+
function isGraderKind(value) {
|
|
528
|
+
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
529
529
|
}
|
|
530
530
|
async function fileExists(filePath) {
|
|
531
531
|
try {
|
|
@@ -15138,22 +15138,25 @@ function extractCacheConfig(suite) {
|
|
|
15138
15138
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
15139
15139
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
15140
15140
|
}
|
|
15141
|
-
function
|
|
15141
|
+
function extractBudgetUsd(suite) {
|
|
15142
15142
|
const execution = suite.execution;
|
|
15143
15143
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
15144
15144
|
return void 0;
|
|
15145
15145
|
}
|
|
15146
15146
|
const executionObj = execution;
|
|
15147
|
-
|
|
15147
|
+
if ("total_budget_usd" in executionObj || "totalBudgetUsd" in executionObj) {
|
|
15148
|
+
throw new Error(
|
|
15149
|
+
"execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."
|
|
15150
|
+
);
|
|
15151
|
+
}
|
|
15152
|
+
const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
|
|
15148
15153
|
if (rawBudget === void 0 || rawBudget === null) {
|
|
15149
15154
|
return void 0;
|
|
15150
15155
|
}
|
|
15151
15156
|
if (typeof rawBudget === "number" && rawBudget > 0) {
|
|
15152
15157
|
return rawBudget;
|
|
15153
15158
|
}
|
|
15154
|
-
logWarning(
|
|
15155
|
-
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
|
|
15156
|
-
);
|
|
15159
|
+
logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
|
|
15157
15160
|
return void 0;
|
|
15158
15161
|
}
|
|
15159
15162
|
function extractFailOnError(suite) {
|
|
@@ -15525,7 +15528,7 @@ function validateTemplateVariables(content, source) {
|
|
|
15525
15528
|
);
|
|
15526
15529
|
}
|
|
15527
15530
|
if (invalidVariables.length > 0) {
|
|
15528
|
-
const warningMessage = `${ANSI_YELLOW22}Warning: Custom
|
|
15531
|
+
const warningMessage = `${ANSI_YELLOW22}Warning: Custom grader template at ${source}
|
|
15529
15532
|
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
15530
15533
|
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET3}`;
|
|
15531
15534
|
console.warn(warningMessage);
|
|
@@ -15535,26 +15538,26 @@ var ANSI_YELLOW3 = "\x1B[33m";
|
|
|
15535
15538
|
var ANSI_RESET4 = "\x1B[0m";
|
|
15536
15539
|
var MAX_ASSERTION_INCLUDE_DEPTH = 3;
|
|
15537
15540
|
var PROMPT_FILE_PREFIX = "file://";
|
|
15538
|
-
function
|
|
15541
|
+
function normalizeGraderType(type) {
|
|
15539
15542
|
return type.replace(/_/g, "-");
|
|
15540
15543
|
}
|
|
15541
15544
|
function isDeprecatedJudgeType(type) {
|
|
15542
15545
|
return type === "code-judge" || type === "llm-judge";
|
|
15543
15546
|
}
|
|
15544
|
-
async function
|
|
15547
|
+
async function parseGraders(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
|
|
15545
15548
|
const execution = rawEvalCase.execution;
|
|
15546
15549
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
15547
15550
|
const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? // deprecated: use assertions
|
|
15548
15551
|
rawEvalCase.evaluators;
|
|
15549
15552
|
const skipDefaults = executionObject?.skip_defaults === true;
|
|
15550
15553
|
const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
|
|
15551
|
-
const parsedCase = await
|
|
15554
|
+
const parsedCase = await parseGraderList(
|
|
15552
15555
|
caseEvaluators,
|
|
15553
15556
|
searchRoots,
|
|
15554
15557
|
evalId,
|
|
15555
15558
|
defaultPreprocessors
|
|
15556
15559
|
);
|
|
15557
|
-
const parsedRoot = await
|
|
15560
|
+
const parsedRoot = await parseGraderList(
|
|
15558
15561
|
rootEvaluators,
|
|
15559
15562
|
searchRoots,
|
|
15560
15563
|
evalId,
|
|
@@ -15633,12 +15636,12 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
15633
15636
|
templateDir,
|
|
15634
15637
|
...searchRoots.filter((root) => path5.resolve(root) !== templateDir)
|
|
15635
15638
|
];
|
|
15636
|
-
return await
|
|
15639
|
+
return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
|
|
15637
15640
|
depth: nextDepth,
|
|
15638
15641
|
chain: [...includeContext.chain, resolved.resolvedPath]
|
|
15639
15642
|
}) ?? [];
|
|
15640
15643
|
}
|
|
15641
|
-
async function
|
|
15644
|
+
async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
|
|
15642
15645
|
if (candidateEvaluators === void 0) {
|
|
15643
15646
|
return void 0;
|
|
15644
15647
|
}
|
|
@@ -15662,8 +15665,8 @@ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId,
|
|
|
15662
15665
|
}
|
|
15663
15666
|
return expanded;
|
|
15664
15667
|
}
|
|
15665
|
-
async function
|
|
15666
|
-
const expandedEvaluators = await
|
|
15668
|
+
async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
|
|
15669
|
+
const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
|
|
15667
15670
|
if (!expandedEvaluators) {
|
|
15668
15671
|
return void 0;
|
|
15669
15672
|
}
|
|
@@ -15709,14 +15712,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
15709
15712
|
}
|
|
15710
15713
|
const rawName = asString(rawEvaluator.name);
|
|
15711
15714
|
const rawType = rawEvaluator.type;
|
|
15712
|
-
const typeValue = typeof rawType === "string" ?
|
|
15715
|
+
const typeValue = typeof rawType === "string" ? normalizeGraderType(rawType) : rawType;
|
|
15713
15716
|
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
15714
15717
|
logWarning2(
|
|
15715
15718
|
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
15716
15719
|
);
|
|
15717
15720
|
continue;
|
|
15718
15721
|
}
|
|
15719
|
-
const isCustomType = typeof typeValue === "string" && !
|
|
15722
|
+
const isCustomType = typeof typeValue === "string" && !isGraderKind(typeValue);
|
|
15720
15723
|
if (typeof typeValue !== "string") {
|
|
15721
15724
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
15722
15725
|
continue;
|
|
@@ -15879,7 +15882,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
15879
15882
|
continue;
|
|
15880
15883
|
}
|
|
15881
15884
|
const aggregatorType = asString(rawAggregator.type);
|
|
15882
|
-
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType :
|
|
15885
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeGraderType(aggregatorType) : aggregatorType;
|
|
15883
15886
|
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
15884
15887
|
logWarning2(
|
|
15885
15888
|
`Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
@@ -15892,7 +15895,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
15892
15895
|
);
|
|
15893
15896
|
continue;
|
|
15894
15897
|
}
|
|
15895
|
-
const expandedMembers = await
|
|
15898
|
+
const expandedMembers = await expandGraderEntries(
|
|
15896
15899
|
rawMembers,
|
|
15897
15900
|
searchRoots,
|
|
15898
15901
|
`${evalId}:${name21}`
|
|
@@ -15908,11 +15911,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
15908
15911
|
}
|
|
15909
15912
|
const memberName = asString(rawMember.name);
|
|
15910
15913
|
const memberType = rawMember.type;
|
|
15911
|
-
if (!memberName || !
|
|
15914
|
+
if (!memberName || !isGraderKind(memberType)) {
|
|
15912
15915
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name21}'`);
|
|
15913
15916
|
continue;
|
|
15914
15917
|
}
|
|
15915
|
-
const memberConfigs = await
|
|
15918
|
+
const memberConfigs = await parseGraders(
|
|
15916
15919
|
{ evaluators: [rawMember] },
|
|
15917
15920
|
void 0,
|
|
15918
15921
|
searchRoots,
|
|
@@ -16653,7 +16656,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
16653
16656
|
`prompt.command for evaluator '${name21}' in '${evalId}'`
|
|
16654
16657
|
);
|
|
16655
16658
|
if (!commandArray) {
|
|
16656
|
-
throw new Error(`
|
|
16659
|
+
throw new Error(`Grader '${name21}' in '${evalId}': prompt object requires command array`);
|
|
16657
16660
|
}
|
|
16658
16661
|
const commandPath = commandArray[commandArray.length - 1];
|
|
16659
16662
|
const resolved = await resolveFileReference22(commandPath, searchRoots);
|
|
@@ -16661,7 +16664,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
16661
16664
|
resolvedPromptScript = [...commandArray.slice(0, -1), path5.resolve(resolved.resolvedPath)];
|
|
16662
16665
|
} else {
|
|
16663
16666
|
throw new Error(
|
|
16664
|
-
`
|
|
16667
|
+
`Grader '${name21}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
|
|
16665
16668
|
);
|
|
16666
16669
|
}
|
|
16667
16670
|
if (isJsonObject2(rawPrompt.config)) {
|
|
@@ -16678,11 +16681,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
16678
16681
|
await validateCustomPromptContent(promptPath);
|
|
16679
16682
|
} catch (error) {
|
|
16680
16683
|
const message = error instanceof Error ? error.message : String(error);
|
|
16681
|
-
throw new Error(`
|
|
16684
|
+
throw new Error(`Grader '${name21}' template (${promptPath}): ${message}`);
|
|
16682
16685
|
}
|
|
16683
16686
|
} else {
|
|
16684
16687
|
throw new Error(
|
|
16685
|
-
`
|
|
16688
|
+
`Grader '${name21}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
16686
16689
|
);
|
|
16687
16690
|
}
|
|
16688
16691
|
} else {
|
|
@@ -16799,18 +16802,18 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
16799
16802
|
return void 0;
|
|
16800
16803
|
}
|
|
16801
16804
|
if (!Array.isArray(rawValue)) {
|
|
16802
|
-
throw new Error(`
|
|
16805
|
+
throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
|
|
16803
16806
|
}
|
|
16804
16807
|
const preprocessors = [];
|
|
16805
16808
|
for (const rawEntry of rawValue) {
|
|
16806
16809
|
if (!isJsonObject2(rawEntry)) {
|
|
16807
16810
|
throw new Error(
|
|
16808
|
-
`
|
|
16811
|
+
`Grader '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
|
|
16809
16812
|
);
|
|
16810
16813
|
}
|
|
16811
16814
|
const type = asString(rawEntry.type)?.trim();
|
|
16812
16815
|
if (!type) {
|
|
16813
|
-
throw new Error(`
|
|
16816
|
+
throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
|
|
16814
16817
|
}
|
|
16815
16818
|
const command = asStringArray(
|
|
16816
16819
|
rawEntry.command,
|
|
@@ -16818,14 +16821,14 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
16818
16821
|
);
|
|
16819
16822
|
if (!command || command.length === 0) {
|
|
16820
16823
|
throw new Error(
|
|
16821
|
-
`
|
|
16824
|
+
`Grader '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
|
|
16822
16825
|
);
|
|
16823
16826
|
}
|
|
16824
16827
|
const commandPath = command[command.length - 1];
|
|
16825
16828
|
const resolved = await resolveFileReference22(commandPath, searchRoots);
|
|
16826
16829
|
if (!resolved.resolvedPath) {
|
|
16827
16830
|
throw new Error(
|
|
16828
|
-
`
|
|
16831
|
+
`Grader '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
|
|
16829
16832
|
);
|
|
16830
16833
|
}
|
|
16831
16834
|
preprocessors.push({
|
|
@@ -16876,13 +16879,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
16876
16879
|
if (typeof candidate !== "string") {
|
|
16877
16880
|
return void 0;
|
|
16878
16881
|
}
|
|
16879
|
-
const normalized =
|
|
16882
|
+
const normalized = normalizeGraderType(candidate);
|
|
16880
16883
|
if (isDeprecatedJudgeType(normalized)) {
|
|
16881
16884
|
throw new Error(
|
|
16882
16885
|
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
16883
16886
|
);
|
|
16884
16887
|
}
|
|
16885
|
-
if (
|
|
16888
|
+
if (isGraderKind(normalized)) {
|
|
16886
16889
|
return normalized;
|
|
16887
16890
|
}
|
|
16888
16891
|
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
@@ -16954,7 +16957,7 @@ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalI
|
|
|
16954
16957
|
}
|
|
16955
16958
|
result.required = rawRequired;
|
|
16956
16959
|
logWarning2(
|
|
16957
|
-
`
|
|
16960
|
+
`Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
16958
16961
|
);
|
|
16959
16962
|
}
|
|
16960
16963
|
return result;
|
|
@@ -17756,7 +17759,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
17756
17759
|
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
17757
17760
|
let evaluators;
|
|
17758
17761
|
try {
|
|
17759
|
-
evaluators = await
|
|
17762
|
+
evaluators = await parseGraders(
|
|
17760
17763
|
testCaseConfig,
|
|
17761
17764
|
mergedExecution,
|
|
17762
17765
|
searchRoots,
|
|
@@ -18093,7 +18096,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
18093
18096
|
targetRefs: extractTargetRefsFromSuite(parsed),
|
|
18094
18097
|
workers: extractWorkersFromSuite(parsed),
|
|
18095
18098
|
cacheConfig: extractCacheConfig(parsed),
|
|
18096
|
-
|
|
18099
|
+
budgetUsd: extractBudgetUsd(parsed),
|
|
18097
18100
|
...metadata !== void 0 && { metadata },
|
|
18098
18101
|
...failOnError !== void 0 && { failOnError },
|
|
18099
18102
|
...threshold !== void 0 && { threshold },
|
|
@@ -18234,7 +18237,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
18234
18237
|
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
18235
18238
|
let evaluators;
|
|
18236
18239
|
try {
|
|
18237
|
-
evaluators = await
|
|
18240
|
+
evaluators = await parseGraders(
|
|
18238
18241
|
testCaseConfig,
|
|
18239
18242
|
globalExecution,
|
|
18240
18243
|
searchRoots,
|
|
@@ -26241,7 +26244,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
|
|
|
26241
26244
|
}
|
|
26242
26245
|
return result;
|
|
26243
26246
|
}
|
|
26244
|
-
var
|
|
26247
|
+
var CodeGrader = class {
|
|
26245
26248
|
kind = "code-grader";
|
|
26246
26249
|
command;
|
|
26247
26250
|
cwd;
|
|
@@ -26359,7 +26362,7 @@ var CodeEvaluator = class {
|
|
|
26359
26362
|
})) : [];
|
|
26360
26363
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
26361
26364
|
const proxyUsage = getProxyUsage?.();
|
|
26362
|
-
const
|
|
26365
|
+
const graderRawRequest = {
|
|
26363
26366
|
command: this.command,
|
|
26364
26367
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
26365
26368
|
...proxyUsage ? {
|
|
@@ -26374,7 +26377,7 @@ var CodeEvaluator = class {
|
|
|
26374
26377
|
verdict: scoreToVerdict(score),
|
|
26375
26378
|
assertions,
|
|
26376
26379
|
expectedAspectCount: assertions.length || 1,
|
|
26377
|
-
|
|
26380
|
+
graderRawRequest,
|
|
26378
26381
|
...details ? { details } : {},
|
|
26379
26382
|
tokenUsage: proxyUsage?.tokenUsage
|
|
26380
26383
|
};
|
|
@@ -26386,7 +26389,7 @@ var CodeEvaluator = class {
|
|
|
26386
26389
|
verdict: "fail",
|
|
26387
26390
|
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
26388
26391
|
expectedAspectCount: 1,
|
|
26389
|
-
|
|
26392
|
+
graderRawRequest: {
|
|
26390
26393
|
command: this.command,
|
|
26391
26394
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
26392
26395
|
...proxyUsage ? {
|
|
@@ -26469,7 +26472,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
26469
26472
|
".so",
|
|
26470
26473
|
".dylib"
|
|
26471
26474
|
]);
|
|
26472
|
-
var
|
|
26475
|
+
var DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
|
|
26473
26476
|
|
|
26474
26477
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
26475
26478
|
|
|
@@ -26524,19 +26527,19 @@ function resolveContentBasePath(context2) {
|
|
|
26524
26527
|
}
|
|
26525
26528
|
return void 0;
|
|
26526
26529
|
}
|
|
26527
|
-
var
|
|
26530
|
+
var LlmGrader = class {
|
|
26528
26531
|
kind = "llm-grader";
|
|
26529
26532
|
resolveGraderProvider;
|
|
26530
26533
|
maxOutputTokens;
|
|
26531
26534
|
temperature;
|
|
26532
|
-
|
|
26535
|
+
graderTemplate;
|
|
26533
26536
|
maxSteps;
|
|
26534
26537
|
graderTargetProvider;
|
|
26535
26538
|
constructor(options) {
|
|
26536
26539
|
this.resolveGraderProvider = options.resolveGraderProvider ?? options.resolveJudgeProvider;
|
|
26537
26540
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
26538
26541
|
this.temperature = options.temperature;
|
|
26539
|
-
this.
|
|
26542
|
+
this.graderTemplate = options.graderTemplate;
|
|
26540
26543
|
this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT);
|
|
26541
26544
|
this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider;
|
|
26542
26545
|
}
|
|
@@ -26599,16 +26602,16 @@ var LlmGraderEvaluator = class {
|
|
|
26599
26602
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
26600
26603
|
};
|
|
26601
26604
|
const systemPrompt = buildOutputSchema();
|
|
26602
|
-
const
|
|
26603
|
-
warnDeprecatedTemplateVars(
|
|
26604
|
-
let userPrompt = substituteVariables(
|
|
26605
|
-
if (context2.fileChanges && !context2.
|
|
26605
|
+
const graderTemplate = context2.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
26606
|
+
warnDeprecatedTemplateVars(graderTemplate);
|
|
26607
|
+
let userPrompt = substituteVariables(graderTemplate, variables);
|
|
26608
|
+
if (context2.fileChanges && !context2.graderTemplateOverride && !this.graderTemplate) {
|
|
26606
26609
|
userPrompt += `
|
|
26607
26610
|
|
|
26608
26611
|
[[ ## file_changes ## ]]
|
|
26609
26612
|
${context2.fileChanges}`;
|
|
26610
26613
|
}
|
|
26611
|
-
const
|
|
26614
|
+
const graderRawRequest = {
|
|
26612
26615
|
userPrompt,
|
|
26613
26616
|
systemPrompt
|
|
26614
26617
|
};
|
|
@@ -26629,7 +26632,7 @@ ${context2.fileChanges}`;
|
|
|
26629
26632
|
verdict: scoreToVerdict(score),
|
|
26630
26633
|
assertions,
|
|
26631
26634
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
26632
|
-
|
|
26635
|
+
graderRawRequest,
|
|
26633
26636
|
graderTarget: graderProvider.targetName,
|
|
26634
26637
|
details: data.details,
|
|
26635
26638
|
tokenUsage
|
|
@@ -26643,7 +26646,7 @@ ${context2.fileChanges}`;
|
|
|
26643
26646
|
verdict: "skip",
|
|
26644
26647
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
26645
26648
|
expectedAspectCount: 1,
|
|
26646
|
-
|
|
26649
|
+
graderRawRequest,
|
|
26647
26650
|
graderTarget: graderProvider.targetName
|
|
26648
26651
|
};
|
|
26649
26652
|
}
|
|
@@ -26660,7 +26663,7 @@ ${context2.fileChanges}`;
|
|
|
26660
26663
|
}
|
|
26661
26664
|
const prompt = this.buildRubricPrompt(context2, rubrics);
|
|
26662
26665
|
const systemPrompt = buildRubricOutputSchema();
|
|
26663
|
-
const
|
|
26666
|
+
const graderRawRequest = {
|
|
26664
26667
|
userPrompt: prompt,
|
|
26665
26668
|
systemPrompt
|
|
26666
26669
|
};
|
|
@@ -26680,7 +26683,7 @@ ${context2.fileChanges}`;
|
|
|
26680
26683
|
verdict,
|
|
26681
26684
|
assertions,
|
|
26682
26685
|
expectedAspectCount: rubrics.length,
|
|
26683
|
-
|
|
26686
|
+
graderRawRequest,
|
|
26684
26687
|
graderTarget: graderProvider.targetName,
|
|
26685
26688
|
tokenUsage
|
|
26686
26689
|
};
|
|
@@ -26693,7 +26696,7 @@ ${context2.fileChanges}`;
|
|
|
26693
26696
|
verdict: "skip",
|
|
26694
26697
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
26695
26698
|
expectedAspectCount: rubrics.length,
|
|
26696
|
-
|
|
26699
|
+
graderRawRequest,
|
|
26697
26700
|
graderTarget: graderProvider.targetName
|
|
26698
26701
|
};
|
|
26699
26702
|
}
|
|
@@ -26705,7 +26708,7 @@ ${context2.fileChanges}`;
|
|
|
26705
26708
|
async evaluateWithScoreRanges(context2, graderProvider, rubrics) {
|
|
26706
26709
|
const prompt = this.buildScoreRangePrompt(context2, rubrics);
|
|
26707
26710
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
26708
|
-
const
|
|
26711
|
+
const graderRawRequest = {
|
|
26709
26712
|
userPrompt: prompt,
|
|
26710
26713
|
systemPrompt
|
|
26711
26714
|
};
|
|
@@ -26725,7 +26728,7 @@ ${context2.fileChanges}`;
|
|
|
26725
26728
|
verdict,
|
|
26726
26729
|
assertions,
|
|
26727
26730
|
expectedAspectCount: rubrics.length,
|
|
26728
|
-
|
|
26731
|
+
graderRawRequest,
|
|
26729
26732
|
graderTarget: graderProvider.targetName,
|
|
26730
26733
|
details,
|
|
26731
26734
|
tokenUsage
|
|
@@ -26739,7 +26742,7 @@ ${context2.fileChanges}`;
|
|
|
26739
26742
|
verdict: "skip",
|
|
26740
26743
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
26741
26744
|
expectedAspectCount: rubrics.length,
|
|
26742
|
-
|
|
26745
|
+
graderRawRequest,
|
|
26743
26746
|
graderTarget: graderProvider.targetName
|
|
26744
26747
|
};
|
|
26745
26748
|
}
|
|
@@ -26768,7 +26771,7 @@ ${context2.fileChanges}`;
|
|
|
26768
26771
|
const config = context2.evaluator;
|
|
26769
26772
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
26770
26773
|
const fsTools = createFilesystemTools(workspacePath);
|
|
26771
|
-
const
|
|
26774
|
+
const graderRawRequest = {
|
|
26772
26775
|
mode: "built-in",
|
|
26773
26776
|
systemPrompt,
|
|
26774
26777
|
userPrompt,
|
|
@@ -26792,7 +26795,7 @@ ${context2.fileChanges}`;
|
|
|
26792
26795
|
return this.parseAgentResult(
|
|
26793
26796
|
text2,
|
|
26794
26797
|
rubrics,
|
|
26795
|
-
|
|
26798
|
+
graderRawRequest,
|
|
26796
26799
|
details,
|
|
26797
26800
|
graderProvider.targetName
|
|
26798
26801
|
);
|
|
@@ -26803,7 +26806,7 @@ ${context2.fileChanges}`;
|
|
|
26803
26806
|
verdict: "fail",
|
|
26804
26807
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
26805
26808
|
expectedAspectCount: 1,
|
|
26806
|
-
|
|
26809
|
+
graderRawRequest,
|
|
26807
26810
|
graderTarget: graderProvider.targetName,
|
|
26808
26811
|
details: { mode: "built-in", error: message }
|
|
26809
26812
|
};
|
|
@@ -26835,7 +26838,7 @@ ${context2.fileChanges}`;
|
|
|
26835
26838
|
async evaluateWithDelegate(context2, provider, modeLabel) {
|
|
26836
26839
|
const workspacePath = context2.workspacePath;
|
|
26837
26840
|
const prompt = this.buildDelegatedPrompt(context2);
|
|
26838
|
-
const
|
|
26841
|
+
const graderRawRequest = {
|
|
26839
26842
|
mode: modeLabel,
|
|
26840
26843
|
grader_target: provider.targetName,
|
|
26841
26844
|
prompt
|
|
@@ -26856,7 +26859,7 @@ ${context2.fileChanges}`;
|
|
|
26856
26859
|
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
26857
26860
|
],
|
|
26858
26861
|
expectedAspectCount: 1,
|
|
26859
|
-
|
|
26862
|
+
graderRawRequest,
|
|
26860
26863
|
graderTarget: provider.targetName,
|
|
26861
26864
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
26862
26865
|
};
|
|
@@ -26870,7 +26873,7 @@ ${context2.fileChanges}`;
|
|
|
26870
26873
|
return this.parseAgentResult(
|
|
26871
26874
|
assistantContent,
|
|
26872
26875
|
rubrics,
|
|
26873
|
-
|
|
26876
|
+
graderRawRequest,
|
|
26874
26877
|
details,
|
|
26875
26878
|
provider.targetName
|
|
26876
26879
|
);
|
|
@@ -26883,7 +26886,7 @@ ${context2.fileChanges}`;
|
|
|
26883
26886
|
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
26884
26887
|
],
|
|
26885
26888
|
expectedAspectCount: 1,
|
|
26886
|
-
|
|
26889
|
+
graderRawRequest,
|
|
26887
26890
|
graderTarget: provider.targetName,
|
|
26888
26891
|
details: {
|
|
26889
26892
|
mode: modeLabel,
|
|
@@ -26904,7 +26907,7 @@ ${context2.fileChanges}`;
|
|
|
26904
26907
|
const config = context2.evaluator;
|
|
26905
26908
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
26906
26909
|
const parts = [
|
|
26907
|
-
"You are an expert
|
|
26910
|
+
"You are an expert grader with access to the workspace filesystem.",
|
|
26908
26911
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
26909
26912
|
"Thoroughly examine relevant files before making your assessment.",
|
|
26910
26913
|
""
|
|
@@ -26933,9 +26936,9 @@ ${context2.fileChanges}`;
|
|
|
26933
26936
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
26934
26937
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
26935
26938
|
};
|
|
26936
|
-
if (this.
|
|
26937
|
-
warnDeprecatedTemplateVars(this.
|
|
26938
|
-
return substituteVariables(this.
|
|
26939
|
+
if (this.graderTemplate) {
|
|
26940
|
+
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
26941
|
+
return substituteVariables(this.graderTemplate, variables);
|
|
26939
26942
|
}
|
|
26940
26943
|
const config = context2.evaluator;
|
|
26941
26944
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
@@ -26982,7 +26985,7 @@ ${context2.fileChanges}`;
|
|
|
26982
26985
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
26983
26986
|
const config = context2.evaluator;
|
|
26984
26987
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
26985
|
-
if (this.
|
|
26988
|
+
if (this.graderTemplate) {
|
|
26986
26989
|
const variables = {
|
|
26987
26990
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
26988
26991
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -26994,15 +26997,15 @@ ${context2.fileChanges}`;
|
|
|
26994
26997
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
26995
26998
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
26996
26999
|
};
|
|
26997
|
-
warnDeprecatedTemplateVars(this.
|
|
26998
|
-
const customPrompt = substituteVariables(this.
|
|
27000
|
+
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
27001
|
+
const customPrompt = substituteVariables(this.graderTemplate, variables);
|
|
26999
27002
|
const outputSchema2 = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
27000
27003
|
return `${customPrompt}
|
|
27001
27004
|
|
|
27002
27005
|
${outputSchema2}`;
|
|
27003
27006
|
}
|
|
27004
27007
|
const parts = [
|
|
27005
|
-
"You are an expert
|
|
27008
|
+
"You are an expert grader. Investigate the workspace to verify the criteria are met.",
|
|
27006
27009
|
"",
|
|
27007
27010
|
"[[ ## question ## ]]",
|
|
27008
27011
|
formattedQuestion,
|
|
@@ -27039,7 +27042,7 @@ ${outputSchema2}`;
|
|
|
27039
27042
|
* Parse the agent's response text into an EvaluationScore.
|
|
27040
27043
|
* Supports both freeform and rubric modes.
|
|
27041
27044
|
*/
|
|
27042
|
-
parseAgentResult(text2, rubrics,
|
|
27045
|
+
parseAgentResult(text2, rubrics, graderRawRequest, details, graderTarget) {
|
|
27043
27046
|
try {
|
|
27044
27047
|
const parsed = parseJsonFromText(text2);
|
|
27045
27048
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -27050,7 +27053,7 @@ ${outputSchema2}`;
|
|
|
27050
27053
|
verdict,
|
|
27051
27054
|
assertions: assertions2,
|
|
27052
27055
|
expectedAspectCount: rubrics.length,
|
|
27053
|
-
|
|
27056
|
+
graderRawRequest,
|
|
27054
27057
|
graderTarget,
|
|
27055
27058
|
details
|
|
27056
27059
|
};
|
|
@@ -27063,7 +27066,7 @@ ${outputSchema2}`;
|
|
|
27063
27066
|
verdict: scoreToVerdict(score),
|
|
27064
27067
|
assertions,
|
|
27065
27068
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
27066
|
-
|
|
27069
|
+
graderRawRequest,
|
|
27067
27070
|
graderTarget,
|
|
27068
27071
|
details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
|
|
27069
27072
|
};
|
|
@@ -27078,7 +27081,7 @@ ${outputSchema2}`;
|
|
|
27078
27081
|
}
|
|
27079
27082
|
],
|
|
27080
27083
|
expectedAspectCount: 1,
|
|
27081
|
-
|
|
27084
|
+
graderRawRequest,
|
|
27082
27085
|
graderTarget,
|
|
27083
27086
|
details
|
|
27084
27087
|
};
|
|
@@ -27093,7 +27096,7 @@ ${outputSchema2}`;
|
|
|
27093
27096
|
buildScoreRangePrompt(context2, rubrics) {
|
|
27094
27097
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
27095
27098
|
const parts = [
|
|
27096
|
-
"You are an expert
|
|
27099
|
+
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
27097
27100
|
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
27098
27101
|
"",
|
|
27099
27102
|
"[[ ## question ## ]]",
|
|
@@ -27136,7 +27139,7 @@ ${outputSchema2}`;
|
|
|
27136
27139
|
buildRubricPrompt(context2, rubrics) {
|
|
27137
27140
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
27138
27141
|
const parts = [
|
|
27139
|
-
"You are an expert
|
|
27142
|
+
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
27140
27143
|
"",
|
|
27141
27144
|
"[[ ## question ## ]]",
|
|
27142
27145
|
formattedQuestion,
|
|
@@ -27310,7 +27313,7 @@ function sumTokenUsage(first, second) {
|
|
|
27310
27313
|
};
|
|
27311
27314
|
}
|
|
27312
27315
|
function buildRubricOutputSchema() {
|
|
27313
|
-
return `You are an expert
|
|
27316
|
+
return `You are an expert grader. Evaluate the candidate answer against each rubric item.
|
|
27314
27317
|
You must return a valid JSON object matching this schema:
|
|
27315
27318
|
{
|
|
27316
27319
|
"checks": [
|
|
@@ -27344,7 +27347,7 @@ function warnDeprecatedTemplateVars(template) {
|
|
|
27344
27347
|
console.warn(
|
|
27345
27348
|
`${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
|
|
27346
27349
|
${used.join("\n ")}
|
|
27347
|
-
Update your custom
|
|
27350
|
+
Update your custom grader template to use the new names.${ANSI_RESET8}`
|
|
27348
27351
|
);
|
|
27349
27352
|
}
|
|
27350
27353
|
}
|
|
@@ -27376,7 +27379,7 @@ function calculateRubricScore(result, rubrics) {
|
|
|
27376
27379
|
return { score, verdict, assertions };
|
|
27377
27380
|
}
|
|
27378
27381
|
function buildScoreRangeOutputSchema() {
|
|
27379
|
-
return `You are an expert
|
|
27382
|
+
return `You are an expert grader. Score the candidate answer on each criterion.
|
|
27380
27383
|
You must return a valid JSON object matching this schema:
|
|
27381
27384
|
{
|
|
27382
27385
|
"checks": [
|
|
@@ -27586,9 +27589,9 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
27586
27589
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
27587
27590
|
{{EVALUATOR_RESULTS_JSON}}
|
|
27588
27591
|
|
|
27589
|
-
Decide the final score and verdict based on all
|
|
27592
|
+
Decide the final score and verdict based on all grader results.
|
|
27590
27593
|
Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
|
|
27591
|
-
var
|
|
27594
|
+
var CompositeGrader = class {
|
|
27592
27595
|
kind = "composite";
|
|
27593
27596
|
config;
|
|
27594
27597
|
evaluatorFactory;
|
|
@@ -27639,7 +27642,7 @@ var CompositeEvaluator = class {
|
|
|
27639
27642
|
weight,
|
|
27640
27643
|
verdict: member.result.verdict,
|
|
27641
27644
|
assertions: [...member.result.assertions],
|
|
27642
|
-
|
|
27645
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
27643
27646
|
scores: member.result.scores,
|
|
27644
27647
|
details: member.result.details,
|
|
27645
27648
|
tokenUsage: member.result.tokenUsage
|
|
@@ -27660,7 +27663,7 @@ var CompositeEvaluator = class {
|
|
|
27660
27663
|
verdict: "skip",
|
|
27661
27664
|
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
27662
27665
|
expectedAspectCount: 1,
|
|
27663
|
-
|
|
27666
|
+
graderRawRequest: {
|
|
27664
27667
|
aggregator: "weighted_average",
|
|
27665
27668
|
...weights ? { weights } : {}
|
|
27666
27669
|
},
|
|
@@ -27673,7 +27676,7 @@ var CompositeEvaluator = class {
|
|
|
27673
27676
|
verdict: scoreToVerdict(finalScore),
|
|
27674
27677
|
assertions: allAssertions,
|
|
27675
27678
|
expectedAspectCount: allAssertions.length || 1,
|
|
27676
|
-
|
|
27679
|
+
graderRawRequest: {
|
|
27677
27680
|
aggregator: "weighted_average",
|
|
27678
27681
|
...weights ? { weights } : {}
|
|
27679
27682
|
},
|
|
@@ -27692,7 +27695,7 @@ var CompositeEvaluator = class {
|
|
|
27692
27695
|
score: member.result.score,
|
|
27693
27696
|
verdict: member.result.verdict,
|
|
27694
27697
|
assertions: [...member.result.assertions],
|
|
27695
|
-
|
|
27698
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
27696
27699
|
scores: member.result.scores,
|
|
27697
27700
|
details: member.result.details,
|
|
27698
27701
|
tokenUsage: member.result.tokenUsage
|
|
@@ -27715,7 +27718,7 @@ var CompositeEvaluator = class {
|
|
|
27715
27718
|
verdict: "skip",
|
|
27716
27719
|
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
27717
27720
|
expectedAspectCount: 1,
|
|
27718
|
-
|
|
27721
|
+
graderRawRequest: {
|
|
27719
27722
|
aggregator: "threshold",
|
|
27720
27723
|
threshold
|
|
27721
27724
|
},
|
|
@@ -27734,7 +27737,7 @@ var CompositeEvaluator = class {
|
|
|
27734
27737
|
verdict: pass ? "pass" : "fail",
|
|
27735
27738
|
assertions: allAssertions,
|
|
27736
27739
|
expectedAspectCount: allAssertions.length || 1,
|
|
27737
|
-
|
|
27740
|
+
graderRawRequest: {
|
|
27738
27741
|
aggregator: "threshold",
|
|
27739
27742
|
threshold
|
|
27740
27743
|
},
|
|
@@ -27751,7 +27754,7 @@ var CompositeEvaluator = class {
|
|
|
27751
27754
|
weight: weights?.[member.id] ?? 1,
|
|
27752
27755
|
verdict: member.result.verdict,
|
|
27753
27756
|
assertions: [...member.result.assertions],
|
|
27754
|
-
|
|
27757
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
27755
27758
|
scores: member.result.scores,
|
|
27756
27759
|
details: member.result.details
|
|
27757
27760
|
}));
|
|
@@ -27772,7 +27775,7 @@ var CompositeEvaluator = class {
|
|
|
27772
27775
|
verdict,
|
|
27773
27776
|
assertions,
|
|
27774
27777
|
expectedAspectCount: assertions.length || 1,
|
|
27775
|
-
|
|
27778
|
+
graderRawRequest: {
|
|
27776
27779
|
aggregator: "code-grader",
|
|
27777
27780
|
script: scriptPath
|
|
27778
27781
|
},
|
|
@@ -27785,7 +27788,7 @@ var CompositeEvaluator = class {
|
|
|
27785
27788
|
verdict: "fail",
|
|
27786
27789
|
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
27787
27790
|
expectedAspectCount: 1,
|
|
27788
|
-
|
|
27791
|
+
graderRawRequest: {
|
|
27789
27792
|
aggregator: "code-grader",
|
|
27790
27793
|
script: scriptPath,
|
|
27791
27794
|
error: message
|
|
@@ -27807,14 +27810,14 @@ var CompositeEvaluator = class {
|
|
|
27807
27810
|
score: member.result.score,
|
|
27808
27811
|
verdict: member.result.verdict,
|
|
27809
27812
|
assertions: [...member.result.assertions],
|
|
27810
|
-
|
|
27813
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
27811
27814
|
scores: member.result.scores,
|
|
27812
27815
|
details: member.result.details
|
|
27813
27816
|
}));
|
|
27814
27817
|
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
27815
27818
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
27816
27819
|
const systemPrompt = buildOutputSchema();
|
|
27817
|
-
const
|
|
27820
|
+
const graderRawRequest = {
|
|
27818
27821
|
aggregator: "llm-grader",
|
|
27819
27822
|
userPrompt,
|
|
27820
27823
|
systemPrompt,
|
|
@@ -27836,7 +27839,7 @@ var CompositeEvaluator = class {
|
|
|
27836
27839
|
verdict: scoreToVerdict(score2),
|
|
27837
27840
|
assertions: assertions2,
|
|
27838
27841
|
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
27839
|
-
|
|
27842
|
+
graderRawRequest,
|
|
27840
27843
|
scores
|
|
27841
27844
|
};
|
|
27842
27845
|
}
|
|
@@ -27856,7 +27859,7 @@ var CompositeEvaluator = class {
|
|
|
27856
27859
|
verdict: scoreToVerdict(score),
|
|
27857
27860
|
assertions,
|
|
27858
27861
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
27859
|
-
|
|
27862
|
+
graderRawRequest,
|
|
27860
27863
|
scores
|
|
27861
27864
|
};
|
|
27862
27865
|
} catch {
|
|
@@ -27865,13 +27868,13 @@ var CompositeEvaluator = class {
|
|
|
27865
27868
|
verdict: "fail",
|
|
27866
27869
|
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
27867
27870
|
expectedAspectCount: 1,
|
|
27868
|
-
|
|
27871
|
+
graderRawRequest,
|
|
27869
27872
|
scores
|
|
27870
27873
|
};
|
|
27871
27874
|
}
|
|
27872
27875
|
}
|
|
27873
27876
|
};
|
|
27874
|
-
var
|
|
27877
|
+
var CostGrader = class {
|
|
27875
27878
|
kind = "cost";
|
|
27876
27879
|
config;
|
|
27877
27880
|
constructor(options) {
|
|
@@ -27886,7 +27889,7 @@ var CostEvaluator = class {
|
|
|
27886
27889
|
verdict: "fail",
|
|
27887
27890
|
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
27888
27891
|
expectedAspectCount: 1,
|
|
27889
|
-
|
|
27892
|
+
graderRawRequest: {
|
|
27890
27893
|
type: "cost",
|
|
27891
27894
|
budget,
|
|
27892
27895
|
costUsd: null
|
|
@@ -27903,7 +27906,7 @@ var CostEvaluator = class {
|
|
|
27903
27906
|
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
27904
27907
|
],
|
|
27905
27908
|
expectedAspectCount: 1,
|
|
27906
|
-
|
|
27909
|
+
graderRawRequest: {
|
|
27907
27910
|
type: "cost",
|
|
27908
27911
|
budget,
|
|
27909
27912
|
costUsd
|
|
@@ -27911,7 +27914,7 @@ var CostEvaluator = class {
|
|
|
27911
27914
|
};
|
|
27912
27915
|
}
|
|
27913
27916
|
};
|
|
27914
|
-
var
|
|
27917
|
+
var ExecutionMetricsGrader = class {
|
|
27915
27918
|
kind = "execution-metrics";
|
|
27916
27919
|
config;
|
|
27917
27920
|
constructor(options) {
|
|
@@ -27935,7 +27938,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
27935
27938
|
verdict: "fail",
|
|
27936
27939
|
assertions: [{ text: "No trace summary available", passed: false }],
|
|
27937
27940
|
expectedAspectCount: 1,
|
|
27938
|
-
|
|
27941
|
+
graderRawRequest: {
|
|
27939
27942
|
type: "execution-metrics",
|
|
27940
27943
|
config: this.extractConfiguredThresholds(),
|
|
27941
27944
|
actual: null
|
|
@@ -28051,7 +28054,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
28051
28054
|
verdict: scoreToVerdict(score),
|
|
28052
28055
|
assertions,
|
|
28053
28056
|
expectedAspectCount: totalChecks || 1,
|
|
28054
|
-
|
|
28057
|
+
graderRawRequest: {
|
|
28055
28058
|
type: "execution-metrics",
|
|
28056
28059
|
config: this.extractConfiguredThresholds(),
|
|
28057
28060
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
@@ -28137,7 +28140,7 @@ var MONTH_NAMES = {
|
|
|
28137
28140
|
dec: 11,
|
|
28138
28141
|
december: 11
|
|
28139
28142
|
};
|
|
28140
|
-
var
|
|
28143
|
+
var FieldAccuracyGrader = class {
|
|
28141
28144
|
kind = "field-accuracy";
|
|
28142
28145
|
config;
|
|
28143
28146
|
constructor(options) {
|
|
@@ -28495,7 +28498,7 @@ function formatDateISO(date) {
|
|
|
28495
28498
|
function parseJsonFromTextSafe(text2) {
|
|
28496
28499
|
return parseJsonFromText(text2);
|
|
28497
28500
|
}
|
|
28498
|
-
var
|
|
28501
|
+
var LatencyGrader = class {
|
|
28499
28502
|
kind = "latency";
|
|
28500
28503
|
config;
|
|
28501
28504
|
constructor(options) {
|
|
@@ -28510,7 +28513,7 @@ var LatencyEvaluator = class {
|
|
|
28510
28513
|
verdict: "fail",
|
|
28511
28514
|
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
28512
28515
|
expectedAspectCount: 1,
|
|
28513
|
-
|
|
28516
|
+
graderRawRequest: {
|
|
28514
28517
|
type: "latency",
|
|
28515
28518
|
threshold,
|
|
28516
28519
|
durationMs: null
|
|
@@ -28526,7 +28529,7 @@ var LatencyEvaluator = class {
|
|
|
28526
28529
|
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
28527
28530
|
],
|
|
28528
28531
|
expectedAspectCount: 1,
|
|
28529
|
-
|
|
28532
|
+
graderRawRequest: {
|
|
28530
28533
|
type: "latency",
|
|
28531
28534
|
threshold,
|
|
28532
28535
|
durationMs
|
|
@@ -28534,7 +28537,7 @@ var LatencyEvaluator = class {
|
|
|
28534
28537
|
};
|
|
28535
28538
|
}
|
|
28536
28539
|
};
|
|
28537
|
-
var
|
|
28540
|
+
var SkillTriggerGrader = class {
|
|
28538
28541
|
kind = "skill-trigger";
|
|
28539
28542
|
config;
|
|
28540
28543
|
constructor(config) {
|
|
@@ -28607,7 +28610,7 @@ function assembleLlmGraderPrompt(input) {
|
|
|
28607
28610
|
promptInputs,
|
|
28608
28611
|
evaluatorConfig,
|
|
28609
28612
|
fileChanges,
|
|
28610
|
-
|
|
28613
|
+
graderTemplateOverride
|
|
28611
28614
|
} = input;
|
|
28612
28615
|
const rubrics = evaluatorConfig?.rubrics;
|
|
28613
28616
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -28617,15 +28620,9 @@ function assembleLlmGraderPrompt(input) {
|
|
|
28617
28620
|
}
|
|
28618
28621
|
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
28619
28622
|
}
|
|
28620
|
-
return assembleFreeform(
|
|
28621
|
-
evalCase,
|
|
28622
|
-
candidate,
|
|
28623
|
-
promptInputs,
|
|
28624
|
-
fileChanges,
|
|
28625
|
-
evaluatorTemplateOverride
|
|
28626
|
-
);
|
|
28623
|
+
return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
|
|
28627
28624
|
}
|
|
28628
|
-
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges,
|
|
28625
|
+
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
|
|
28629
28626
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
28630
28627
|
const variables = {
|
|
28631
28628
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -28639,9 +28636,9 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
28639
28636
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
28640
28637
|
};
|
|
28641
28638
|
const systemPrompt = buildOutputSchema();
|
|
28642
|
-
const template =
|
|
28639
|
+
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
|
|
28643
28640
|
let userPrompt = substituteVariables(template, variables);
|
|
28644
|
-
if (fileChanges && !
|
|
28641
|
+
if (fileChanges && !graderTemplateOverride) {
|
|
28645
28642
|
userPrompt += `
|
|
28646
28643
|
|
|
28647
28644
|
[[ ## file_changes ## ]]
|
|
@@ -28657,7 +28654,7 @@ ${fileChanges}`;
|
|
|
28657
28654
|
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
28658
28655
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
28659
28656
|
const parts = [
|
|
28660
|
-
"You are an expert
|
|
28657
|
+
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
28661
28658
|
"",
|
|
28662
28659
|
"[[ ## question ## ]]",
|
|
28663
28660
|
formattedQuestion,
|
|
@@ -28692,7 +28689,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
28692
28689
|
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
28693
28690
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
28694
28691
|
const parts = [
|
|
28695
|
-
"You are an expert
|
|
28692
|
+
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
28696
28693
|
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
28697
28694
|
"",
|
|
28698
28695
|
"[[ ## question ## ]]",
|
|
@@ -28739,7 +28736,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
28739
28736
|
mode: "score_range"
|
|
28740
28737
|
};
|
|
28741
28738
|
}
|
|
28742
|
-
var
|
|
28739
|
+
var TokenUsageGrader = class {
|
|
28743
28740
|
kind = "token-usage";
|
|
28744
28741
|
config;
|
|
28745
28742
|
constructor(options) {
|
|
@@ -28760,7 +28757,7 @@ var TokenUsageEvaluator = class {
|
|
|
28760
28757
|
verdict: "fail",
|
|
28761
28758
|
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
28762
28759
|
expectedAspectCount,
|
|
28763
|
-
|
|
28760
|
+
graderRawRequest: {
|
|
28764
28761
|
type: "token-usage",
|
|
28765
28762
|
max_total: maxTotal ?? null,
|
|
28766
28763
|
max_input: maxInput ?? null,
|
|
@@ -28801,7 +28798,7 @@ var TokenUsageEvaluator = class {
|
|
|
28801
28798
|
verdict: passed ? "pass" : "fail",
|
|
28802
28799
|
assertions,
|
|
28803
28800
|
expectedAspectCount,
|
|
28804
|
-
|
|
28801
|
+
graderRawRequest: {
|
|
28805
28802
|
type: "token-usage",
|
|
28806
28803
|
max_total: maxTotal ?? null,
|
|
28807
28804
|
max_input: maxInput ?? null,
|
|
@@ -28884,7 +28881,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
28884
28881
|
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
28885
28882
|
};
|
|
28886
28883
|
}
|
|
28887
|
-
var
|
|
28884
|
+
var ToolTrajectoryGrader = class {
|
|
28888
28885
|
kind = "tool-trajectory";
|
|
28889
28886
|
config;
|
|
28890
28887
|
constructor(options) {
|
|
@@ -29568,14 +29565,14 @@ function validateConcurrency(concurrency) {
|
|
|
29568
29565
|
throw new TypeError("Expected `concurrency` to be a number from 1 and up");
|
|
29569
29566
|
}
|
|
29570
29567
|
}
|
|
29571
|
-
var
|
|
29568
|
+
var GraderRegistry = class {
|
|
29572
29569
|
factories = /* @__PURE__ */ new Map();
|
|
29573
|
-
/** Register a factory function for an
|
|
29570
|
+
/** Register a factory function for a grader type. */
|
|
29574
29571
|
register(type, factory) {
|
|
29575
29572
|
this.factories.set(type, factory);
|
|
29576
29573
|
return this;
|
|
29577
29574
|
}
|
|
29578
|
-
/** Get the factory function for an
|
|
29575
|
+
/** Get the factory function for a grader type. */
|
|
29579
29576
|
get(type) {
|
|
29580
29577
|
return this.factories.get(type);
|
|
29581
29578
|
}
|
|
@@ -29583,25 +29580,25 @@ var EvaluatorRegistry = class {
|
|
|
29583
29580
|
has(type) {
|
|
29584
29581
|
return this.factories.has(type);
|
|
29585
29582
|
}
|
|
29586
|
-
/** List all registered
|
|
29583
|
+
/** List all registered grader type names. */
|
|
29587
29584
|
list() {
|
|
29588
29585
|
return [...this.factories.keys()];
|
|
29589
29586
|
}
|
|
29590
29587
|
/**
|
|
29591
29588
|
* Create an evaluator instance from a config, using the registered factory.
|
|
29592
|
-
* Throws if no factory is registered for the
|
|
29589
|
+
* Throws if no factory is registered for the grader type.
|
|
29593
29590
|
*/
|
|
29594
29591
|
async create(config, context2) {
|
|
29595
29592
|
const factory = this.factories.get(config.type);
|
|
29596
29593
|
if (!factory) {
|
|
29597
29594
|
throw new Error(
|
|
29598
|
-
`Unknown
|
|
29595
|
+
`Unknown grader type: "${config.type}". Registered types: ${this.list().join(", ")}`
|
|
29599
29596
|
);
|
|
29600
29597
|
}
|
|
29601
29598
|
return factory(config, context2);
|
|
29602
29599
|
}
|
|
29603
29600
|
};
|
|
29604
|
-
var
|
|
29601
|
+
var DeterministicAssertionGrader = class {
|
|
29605
29602
|
constructor(kind, assertFn) {
|
|
29606
29603
|
this.assertFn = assertFn;
|
|
29607
29604
|
this.kind = kind;
|
|
@@ -29611,7 +29608,7 @@ var DeterministicAssertionEvaluator = class {
|
|
|
29611
29608
|
return this.assertFn(context2);
|
|
29612
29609
|
}
|
|
29613
29610
|
};
|
|
29614
|
-
var
|
|
29611
|
+
var InlineAssertGrader = class {
|
|
29615
29612
|
constructor(fn, name21) {
|
|
29616
29613
|
this.fn = fn;
|
|
29617
29614
|
this.name = name21;
|
|
@@ -29715,7 +29712,7 @@ var llmGraderFactory = (config, context2) => {
|
|
|
29715
29712
|
);
|
|
29716
29713
|
}
|
|
29717
29714
|
const isAgent = isAgentProvider(graderTargetProvider) || graderTargetProvider.kind === "agentv";
|
|
29718
|
-
evaluator = new
|
|
29715
|
+
evaluator = new LlmGrader({
|
|
29719
29716
|
resolveGraderProvider: async (evalContext) => {
|
|
29720
29717
|
if (graderTargetProvider) return graderTargetProvider;
|
|
29721
29718
|
if (evalContext.graderProvider) return evalContext.graderProvider;
|
|
@@ -29743,11 +29740,11 @@ var llmGraderFactory = (config, context2) => {
|
|
|
29743
29740
|
agentTimeoutMs
|
|
29744
29741
|
);
|
|
29745
29742
|
const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
|
|
29746
|
-
let
|
|
29743
|
+
let graderTemplateOverride;
|
|
29747
29744
|
let evalCase = evalContext.evalCase;
|
|
29748
29745
|
if (customPrompt) {
|
|
29749
29746
|
if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
|
|
29750
|
-
|
|
29747
|
+
graderTemplateOverride = customPrompt;
|
|
29751
29748
|
} else {
|
|
29752
29749
|
evalCase = { ...evalCase, criteria: customPrompt };
|
|
29753
29750
|
}
|
|
@@ -29755,7 +29752,7 @@ var llmGraderFactory = (config, context2) => {
|
|
|
29755
29752
|
return evaluator.evaluate({
|
|
29756
29753
|
...evalContext,
|
|
29757
29754
|
evalCase,
|
|
29758
|
-
|
|
29755
|
+
graderTemplateOverride,
|
|
29759
29756
|
evaluator: c
|
|
29760
29757
|
});
|
|
29761
29758
|
}
|
|
@@ -29763,7 +29760,7 @@ var llmGraderFactory = (config, context2) => {
|
|
|
29763
29760
|
};
|
|
29764
29761
|
var codeFactory = (config, context2) => {
|
|
29765
29762
|
const c = config;
|
|
29766
|
-
return new
|
|
29763
|
+
return new CodeGrader({
|
|
29767
29764
|
command: c.command ?? c.script ?? [],
|
|
29768
29765
|
cwd: c.resolvedCwd ?? c.cwd,
|
|
29769
29766
|
agentTimeoutMs: context2.agentTimeoutMs,
|
|
@@ -29774,19 +29771,19 @@ var codeFactory = (config, context2) => {
|
|
|
29774
29771
|
var compositeFactory = (config, context2) => {
|
|
29775
29772
|
const c = config;
|
|
29776
29773
|
const evalFileDir = context2.evalFileDir ?? process.cwd();
|
|
29777
|
-
return new
|
|
29774
|
+
return new CompositeGrader({
|
|
29778
29775
|
config: c,
|
|
29779
29776
|
cwd: evalFileDir,
|
|
29780
29777
|
evaluatorFactory: {
|
|
29781
29778
|
create: (memberConfig) => {
|
|
29782
29779
|
const factory = context2.registry.get(memberConfig.type);
|
|
29783
29780
|
if (!factory) {
|
|
29784
|
-
throw new Error(`Unsupported
|
|
29781
|
+
throw new Error(`Unsupported grader type in composite: ${memberConfig.type}`);
|
|
29785
29782
|
}
|
|
29786
29783
|
const result = factory(memberConfig, context2);
|
|
29787
29784
|
if (result instanceof Promise) {
|
|
29788
29785
|
throw new Error(
|
|
29789
|
-
`
|
|
29786
|
+
`Grader factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
|
|
29790
29787
|
);
|
|
29791
29788
|
}
|
|
29792
29789
|
return result;
|
|
@@ -29795,35 +29792,35 @@ var compositeFactory = (config, context2) => {
|
|
|
29795
29792
|
});
|
|
29796
29793
|
};
|
|
29797
29794
|
var toolTrajectoryFactory = (config) => {
|
|
29798
|
-
return new
|
|
29795
|
+
return new ToolTrajectoryGrader({
|
|
29799
29796
|
config
|
|
29800
29797
|
});
|
|
29801
29798
|
};
|
|
29802
29799
|
var fieldAccuracyFactory = (config) => {
|
|
29803
|
-
return new
|
|
29800
|
+
return new FieldAccuracyGrader({
|
|
29804
29801
|
config
|
|
29805
29802
|
});
|
|
29806
29803
|
};
|
|
29807
29804
|
var latencyFactory = (config) => {
|
|
29808
|
-
return new
|
|
29805
|
+
return new LatencyGrader({ config });
|
|
29809
29806
|
};
|
|
29810
29807
|
var costFactory = (config) => {
|
|
29811
|
-
return new
|
|
29808
|
+
return new CostGrader({ config });
|
|
29812
29809
|
};
|
|
29813
29810
|
var tokenUsageFactory = (config) => {
|
|
29814
|
-
return new
|
|
29811
|
+
return new TokenUsageGrader({ config });
|
|
29815
29812
|
};
|
|
29816
29813
|
var executionMetricsFactory = (config) => {
|
|
29817
|
-
return new
|
|
29814
|
+
return new ExecutionMetricsGrader({
|
|
29818
29815
|
config
|
|
29819
29816
|
});
|
|
29820
29817
|
};
|
|
29821
29818
|
var skillTriggerFactory = (config) => {
|
|
29822
|
-
return new
|
|
29819
|
+
return new SkillTriggerGrader(config);
|
|
29823
29820
|
};
|
|
29824
29821
|
var containsFactory = (config) => {
|
|
29825
29822
|
const c = config;
|
|
29826
|
-
return new
|
|
29823
|
+
return new DeterministicAssertionGrader("contains", (ctx) => {
|
|
29827
29824
|
const result = runContainsAssertion(ctx.candidate, c.value);
|
|
29828
29825
|
return {
|
|
29829
29826
|
score: result.score,
|
|
@@ -29835,7 +29832,7 @@ var containsFactory = (config) => {
|
|
|
29835
29832
|
};
|
|
29836
29833
|
var regexFactory = (config) => {
|
|
29837
29834
|
const c = config;
|
|
29838
|
-
return new
|
|
29835
|
+
return new DeterministicAssertionGrader("regex", (ctx) => {
|
|
29839
29836
|
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
29840
29837
|
return {
|
|
29841
29838
|
score: result.score,
|
|
@@ -29846,7 +29843,7 @@ var regexFactory = (config) => {
|
|
|
29846
29843
|
});
|
|
29847
29844
|
};
|
|
29848
29845
|
var isJsonFactory = () => {
|
|
29849
|
-
return new
|
|
29846
|
+
return new DeterministicAssertionGrader("is-json", (ctx) => {
|
|
29850
29847
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
29851
29848
|
return {
|
|
29852
29849
|
score: result.score,
|
|
@@ -29858,7 +29855,7 @@ var isJsonFactory = () => {
|
|
|
29858
29855
|
};
|
|
29859
29856
|
var equalsFactory = (config) => {
|
|
29860
29857
|
const c = config;
|
|
29861
|
-
return new
|
|
29858
|
+
return new DeterministicAssertionGrader("equals", (ctx) => {
|
|
29862
29859
|
const result = runEqualsAssertion(ctx.candidate, c.value);
|
|
29863
29860
|
return {
|
|
29864
29861
|
score: result.score,
|
|
@@ -29870,7 +29867,7 @@ var equalsFactory = (config) => {
|
|
|
29870
29867
|
};
|
|
29871
29868
|
var containsAnyFactory = (config) => {
|
|
29872
29869
|
const c = config;
|
|
29873
|
-
return new
|
|
29870
|
+
return new DeterministicAssertionGrader("contains-any", (ctx) => {
|
|
29874
29871
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
29875
29872
|
return {
|
|
29876
29873
|
score: result.score,
|
|
@@ -29882,7 +29879,7 @@ var containsAnyFactory = (config) => {
|
|
|
29882
29879
|
};
|
|
29883
29880
|
var containsAllFactory = (config) => {
|
|
29884
29881
|
const c = config;
|
|
29885
|
-
return new
|
|
29882
|
+
return new DeterministicAssertionGrader("contains-all", (ctx) => {
|
|
29886
29883
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
29887
29884
|
return {
|
|
29888
29885
|
score: result.score,
|
|
@@ -29894,7 +29891,7 @@ var containsAllFactory = (config) => {
|
|
|
29894
29891
|
};
|
|
29895
29892
|
var icontainsFactory = (config) => {
|
|
29896
29893
|
const c = config;
|
|
29897
|
-
return new
|
|
29894
|
+
return new DeterministicAssertionGrader("icontains", (ctx) => {
|
|
29898
29895
|
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
29899
29896
|
return {
|
|
29900
29897
|
score: result.score,
|
|
@@ -29906,7 +29903,7 @@ var icontainsFactory = (config) => {
|
|
|
29906
29903
|
};
|
|
29907
29904
|
var icontainsAnyFactory = (config) => {
|
|
29908
29905
|
const c = config;
|
|
29909
|
-
return new
|
|
29906
|
+
return new DeterministicAssertionGrader("icontains-any", (ctx) => {
|
|
29910
29907
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
29911
29908
|
return {
|
|
29912
29909
|
score: result.score,
|
|
@@ -29918,7 +29915,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
29918
29915
|
};
|
|
29919
29916
|
var icontainsAllFactory = (config) => {
|
|
29920
29917
|
const c = config;
|
|
29921
|
-
return new
|
|
29918
|
+
return new DeterministicAssertionGrader("icontains-all", (ctx) => {
|
|
29922
29919
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
29923
29920
|
return {
|
|
29924
29921
|
score: result.score,
|
|
@@ -29930,7 +29927,7 @@ var icontainsAllFactory = (config) => {
|
|
|
29930
29927
|
};
|
|
29931
29928
|
var startsWithFactory = (config) => {
|
|
29932
29929
|
const c = config;
|
|
29933
|
-
return new
|
|
29930
|
+
return new DeterministicAssertionGrader("starts-with", (ctx) => {
|
|
29934
29931
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
29935
29932
|
return {
|
|
29936
29933
|
score: result.score,
|
|
@@ -29942,7 +29939,7 @@ var startsWithFactory = (config) => {
|
|
|
29942
29939
|
};
|
|
29943
29940
|
var endsWithFactory = (config) => {
|
|
29944
29941
|
const c = config;
|
|
29945
|
-
return new
|
|
29942
|
+
return new DeterministicAssertionGrader("ends-with", (ctx) => {
|
|
29946
29943
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
29947
29944
|
return {
|
|
29948
29945
|
score: result.score,
|
|
@@ -29953,7 +29950,7 @@ var endsWithFactory = (config) => {
|
|
|
29953
29950
|
});
|
|
29954
29951
|
};
|
|
29955
29952
|
function createBuiltinRegistry() {
|
|
29956
|
-
const registry = new
|
|
29953
|
+
const registry = new GraderRegistry();
|
|
29957
29954
|
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
29958
29955
|
const fn = config[INLINE_ASSERT_FN];
|
|
29959
29956
|
if (!fn) {
|
|
@@ -29961,7 +29958,7 @@ function createBuiltinRegistry() {
|
|
|
29961
29958
|
`No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`
|
|
29962
29959
|
);
|
|
29963
29960
|
}
|
|
29964
|
-
return new
|
|
29961
|
+
return new InlineAssertGrader(fn, config.name ?? "inline-assert");
|
|
29965
29962
|
});
|
|
29966
29963
|
return registry;
|
|
29967
29964
|
}
|
|
@@ -29994,7 +29991,7 @@ async function discoverAssertions(registry, baseDir) {
|
|
|
29994
29991
|
continue;
|
|
29995
29992
|
}
|
|
29996
29993
|
const factory = (_config, context2) => {
|
|
29997
|
-
return new
|
|
29994
|
+
return new CodeGrader({
|
|
29998
29995
|
command: ["bun", "run", filePath],
|
|
29999
29996
|
agentTimeoutMs: context2.agentTimeoutMs
|
|
30000
29997
|
});
|
|
@@ -30034,7 +30031,7 @@ async function discoverGraders(registry, baseDir) {
|
|
|
30034
30031
|
continue;
|
|
30035
30032
|
}
|
|
30036
30033
|
const factory = (_config, context2) => {
|
|
30037
|
-
return new
|
|
30034
|
+
return new CodeGrader({
|
|
30038
30035
|
command: ["bun", "run", filePath],
|
|
30039
30036
|
agentTimeoutMs: context2.agentTimeoutMs
|
|
30040
30037
|
});
|
|
@@ -30848,10 +30845,10 @@ function buildSkippedEvaluatorError(scores) {
|
|
|
30848
30845
|
}
|
|
30849
30846
|
const messages = skippedScores.map((score) => {
|
|
30850
30847
|
const label = score.name || score.type;
|
|
30851
|
-
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "
|
|
30848
|
+
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Grader skipped";
|
|
30852
30849
|
return `${label}: ${assertionMessage}`;
|
|
30853
30850
|
});
|
|
30854
|
-
return messages.length === 1 ? messages[0] : `
|
|
30851
|
+
return messages.length === 1 ? messages[0] : `Graders skipped: ${messages.join(" | ")}`;
|
|
30855
30852
|
}
|
|
30856
30853
|
function usesFileReferencePrompt(provider) {
|
|
30857
30854
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
@@ -31020,7 +31017,7 @@ async function runEvaluation(options) {
|
|
|
31020
31017
|
cleanupWorkspaces,
|
|
31021
31018
|
trials,
|
|
31022
31019
|
streamCallbacks,
|
|
31023
|
-
|
|
31020
|
+
budgetUsd,
|
|
31024
31021
|
failOnError,
|
|
31025
31022
|
poolWorkspaces,
|
|
31026
31023
|
poolMaxSlots: configPoolMaxSlots,
|
|
@@ -31549,7 +31546,7 @@ async function runEvaluation(options) {
|
|
|
31549
31546
|
async function dispatchTest(evalCase, depResults) {
|
|
31550
31547
|
const workerId = nextWorkerId++;
|
|
31551
31548
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
31552
|
-
if (
|
|
31549
|
+
if (budgetUsd !== void 0 && budgetExhausted) {
|
|
31553
31550
|
const budgetResult = {
|
|
31554
31551
|
timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
31555
31552
|
testId: evalCase.id,
|
|
@@ -31559,13 +31556,13 @@ async function runEvaluation(options) {
|
|
|
31559
31556
|
assertions: [],
|
|
31560
31557
|
output: [],
|
|
31561
31558
|
target: target.name,
|
|
31562
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${
|
|
31559
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
|
|
31563
31560
|
budgetExceeded: true,
|
|
31564
31561
|
executionStatus: "execution_error",
|
|
31565
31562
|
failureStage: "setup",
|
|
31566
31563
|
failureReasonCode: "budget_exceeded",
|
|
31567
31564
|
executionError: {
|
|
31568
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${
|
|
31565
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
|
|
31569
31566
|
stage: "setup"
|
|
31570
31567
|
}
|
|
31571
31568
|
};
|
|
@@ -31662,7 +31659,7 @@ async function runEvaluation(options) {
|
|
|
31662
31659
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
31663
31660
|
};
|
|
31664
31661
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
31665
|
-
if (
|
|
31662
|
+
if (budgetUsd !== void 0) {
|
|
31666
31663
|
let caseCost;
|
|
31667
31664
|
if (result.trials && result.trials.length > 0) {
|
|
31668
31665
|
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
@@ -31674,7 +31671,7 @@ async function runEvaluation(options) {
|
|
|
31674
31671
|
}
|
|
31675
31672
|
if (caseCost !== void 0) {
|
|
31676
31673
|
cumulativeBudgetCost += caseCost;
|
|
31677
|
-
if (cumulativeBudgetCost >=
|
|
31674
|
+
if (cumulativeBudgetCost >= budgetUsd) {
|
|
31678
31675
|
budgetExhausted = true;
|
|
31679
31676
|
}
|
|
31680
31677
|
}
|
|
@@ -32816,7 +32813,7 @@ async function evaluateCandidate(options) {
|
|
|
32816
32813
|
};
|
|
32817
32814
|
}
|
|
32818
32815
|
}
|
|
32819
|
-
const evaluatorRequest = scores ? void 0 : score.
|
|
32816
|
+
const evaluatorRequest = scores ? void 0 : score.graderRawRequest;
|
|
32820
32817
|
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
32821
32818
|
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
32822
32819
|
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
@@ -33032,7 +33029,7 @@ async function runEvaluatorList(options) {
|
|
|
33032
33029
|
weight,
|
|
33033
33030
|
verdict: score2.verdict,
|
|
33034
33031
|
assertions: score2.assertions,
|
|
33035
|
-
input: score2.
|
|
33032
|
+
input: score2.graderRawRequest,
|
|
33036
33033
|
target: score2.graderTarget,
|
|
33037
33034
|
details: score2.details,
|
|
33038
33035
|
scores: mapChildResults(score2.scores),
|
|
@@ -33048,7 +33045,7 @@ async function runEvaluatorList(options) {
|
|
|
33048
33045
|
score: 0,
|
|
33049
33046
|
verdict: "fail",
|
|
33050
33047
|
assertions: [
|
|
33051
|
-
{ text: `
|
|
33048
|
+
{ text: `Grader '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
33052
33049
|
],
|
|
33053
33050
|
expectedAspectCount: 1
|
|
33054
33051
|
};
|
|
@@ -33069,7 +33066,7 @@ async function runEvaluatorList(options) {
|
|
|
33069
33066
|
verdict: "fail",
|
|
33070
33067
|
assertions: [
|
|
33071
33068
|
{
|
|
33072
|
-
text: `
|
|
33069
|
+
text: `Grader '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
33073
33070
|
passed: false
|
|
33074
33071
|
}
|
|
33075
33072
|
],
|
|
@@ -33126,7 +33123,7 @@ function filterEvalCases(evalCases, filter2) {
|
|
|
33126
33123
|
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter2));
|
|
33127
33124
|
}
|
|
33128
33125
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
33129
|
-
const llmGrader = overrides?.["llm-grader"] ?? new
|
|
33126
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGrader({
|
|
33130
33127
|
resolveGraderProvider: async (context2) => {
|
|
33131
33128
|
if (context2.graderProvider) {
|
|
33132
33129
|
return context2.graderProvider;
|
|
@@ -33617,7 +33614,7 @@ function mapChildResults(children) {
|
|
|
33617
33614
|
weight: child.weight,
|
|
33618
33615
|
verdict: child.verdict,
|
|
33619
33616
|
assertions: child.assertions,
|
|
33620
|
-
input: child.
|
|
33617
|
+
input: child.graderRawRequest,
|
|
33621
33618
|
scores: mapChildResults(child.scores),
|
|
33622
33619
|
details: child.details,
|
|
33623
33620
|
tokenUsage: child.tokenUsage
|
|
@@ -35656,7 +35653,7 @@ export {
|
|
|
35656
35653
|
isJsonObject,
|
|
35657
35654
|
isJsonValue,
|
|
35658
35655
|
isTestMessage,
|
|
35659
|
-
|
|
35656
|
+
isGraderKind,
|
|
35660
35657
|
fileExists,
|
|
35661
35658
|
normalizeLineEndings,
|
|
35662
35659
|
readTextFile,
|
|
@@ -35746,27 +35743,27 @@ export {
|
|
|
35746
35743
|
negateScore,
|
|
35747
35744
|
toSnakeCaseDeep,
|
|
35748
35745
|
toCamelCaseDeep,
|
|
35749
|
-
|
|
35746
|
+
CodeGrader,
|
|
35750
35747
|
executeScript,
|
|
35751
|
-
|
|
35748
|
+
DEFAULT_GRADER_TEMPLATE,
|
|
35752
35749
|
freeformEvaluationSchema,
|
|
35753
35750
|
rubricEvaluationSchema,
|
|
35754
|
-
|
|
35751
|
+
LlmGrader,
|
|
35755
35752
|
buildOutputSchema,
|
|
35756
35753
|
buildRubricOutputSchema,
|
|
35757
35754
|
substituteVariables,
|
|
35758
35755
|
calculateRubricScore,
|
|
35759
35756
|
buildScoreRangeOutputSchema,
|
|
35760
35757
|
extractImageBlocks,
|
|
35761
|
-
|
|
35762
|
-
|
|
35763
|
-
|
|
35764
|
-
|
|
35765
|
-
|
|
35766
|
-
|
|
35758
|
+
CompositeGrader,
|
|
35759
|
+
CostGrader,
|
|
35760
|
+
ExecutionMetricsGrader,
|
|
35761
|
+
FieldAccuracyGrader,
|
|
35762
|
+
LatencyGrader,
|
|
35763
|
+
SkillTriggerGrader,
|
|
35767
35764
|
assembleLlmGraderPrompt,
|
|
35768
|
-
|
|
35769
|
-
|
|
35765
|
+
TokenUsageGrader,
|
|
35766
|
+
ToolTrajectoryGrader,
|
|
35770
35767
|
runContainsAssertion,
|
|
35771
35768
|
runContainsAnyAssertion,
|
|
35772
35769
|
runContainsAllAssertion,
|
|
@@ -35778,8 +35775,8 @@ export {
|
|
|
35778
35775
|
runRegexAssertion,
|
|
35779
35776
|
runIsJsonAssertion,
|
|
35780
35777
|
runEqualsAssertion,
|
|
35781
|
-
|
|
35782
|
-
|
|
35778
|
+
GraderRegistry,
|
|
35779
|
+
DeterministicAssertionGrader,
|
|
35783
35780
|
createBuiltinRegistry,
|
|
35784
35781
|
discoverAssertions,
|
|
35785
35782
|
discoverGraders,
|
|
@@ -35845,4 +35842,4 @@ export {
|
|
|
35845
35842
|
TranscriptProvider,
|
|
35846
35843
|
createAgentKernel
|
|
35847
35844
|
};
|
|
35848
|
-
//# sourceMappingURL=chunk-
|
|
35845
|
+
//# sourceMappingURL=chunk-RCOAXXHP.js.map
|