@agentv/core 4.17.1-next.1 → 4.18.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6VZY3B6M.js → chunk-PYDBJOAO.js} +6 -6
- package/dist/chunk-PYDBJOAO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +5 -5
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -3
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +229 -238
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +156 -158
- package/dist/index.d.ts +156 -158
- package/dist/index.js +210 -216
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-6VZY3B6M.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1768,22 +1768,21 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1768
1768
|
var index_exports = {};
|
|
1769
1769
|
__export(index_exports, {
|
|
1770
1770
|
COMMON_TARGET_SETTINGS: () => COMMON_TARGET_SETTINGS,
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1771
|
+
CodeGrader: () => CodeGrader,
|
|
1772
|
+
CompositeGrader: () => CompositeGrader,
|
|
1773
|
+
CostGrader: () => CostGrader,
|
|
1774
1774
|
DEFAULT_CATEGORY: () => DEFAULT_CATEGORY,
|
|
1775
|
-
DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
|
|
1776
1775
|
DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
|
|
1777
1776
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
1777
|
+
DEFAULT_GRADER_TEMPLATE: () => DEFAULT_GRADER_TEMPLATE,
|
|
1778
1778
|
DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
|
|
1779
|
-
|
|
1779
|
+
DeterministicAssertionGrader: () => DeterministicAssertionGrader,
|
|
1780
1780
|
DockerWorkspaceProvider: () => DockerWorkspaceProvider,
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
LlmJudgeEvaluator: () => LlmGraderEvaluator,
|
|
1781
|
+
ExecutionMetricsGrader: () => ExecutionMetricsGrader,
|
|
1782
|
+
FieldAccuracyGrader: () => FieldAccuracyGrader,
|
|
1783
|
+
GraderRegistry: () => GraderRegistry,
|
|
1784
|
+
LatencyGrader: () => LatencyGrader,
|
|
1785
|
+
LlmGrader: () => LlmGrader,
|
|
1787
1786
|
OTEL_BACKEND_PRESETS: () => OTEL_BACKEND_PRESETS,
|
|
1788
1787
|
OtelStreamingObserver: () => OtelStreamingObserver,
|
|
1789
1788
|
OtelTraceExporter: () => OtelTraceExporter,
|
|
@@ -1792,18 +1791,17 @@ __export(index_exports, {
|
|
|
1792
1791
|
ProviderRegistry: () => ProviderRegistry,
|
|
1793
1792
|
RepoManager: () => RepoManager,
|
|
1794
1793
|
ResponseCache: () => ResponseCache,
|
|
1795
|
-
|
|
1794
|
+
SkillTriggerGrader: () => SkillTriggerGrader,
|
|
1796
1795
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
1797
1796
|
TemplateNotDirectoryError: () => TemplateNotDirectoryError,
|
|
1798
1797
|
TemplateNotFoundError: () => TemplateNotFoundError,
|
|
1799
|
-
|
|
1800
|
-
|
|
1798
|
+
TokenUsageGrader: () => TokenUsageGrader,
|
|
1799
|
+
ToolTrajectoryGrader: () => ToolTrajectoryGrader,
|
|
1801
1800
|
TranscriptProvider: () => TranscriptProvider,
|
|
1802
1801
|
WorkspaceCreationError: () => WorkspaceCreationError,
|
|
1803
1802
|
WorkspacePoolManager: () => WorkspacePoolManager,
|
|
1804
1803
|
addBenchmark: () => addBenchmark,
|
|
1805
1804
|
assembleLlmGraderPrompt: () => assembleLlmGraderPrompt,
|
|
1806
|
-
assembleLlmJudgePrompt: () => assembleLlmGraderPrompt,
|
|
1807
1805
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
1808
1806
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
1809
1807
|
buildOutputSchema: () => buildOutputSchema,
|
|
@@ -1843,7 +1841,6 @@ __export(index_exports, {
|
|
|
1843
1841
|
discoverCodexSessions: () => discoverCodexSessions,
|
|
1844
1842
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
1845
1843
|
discoverGraders: () => discoverGraders,
|
|
1846
|
-
discoverJudges: () => discoverGraders,
|
|
1847
1844
|
discoverProviders: () => discoverProviders,
|
|
1848
1845
|
ensureResultsRepoClone: () => ensureResultsRepoClone,
|
|
1849
1846
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
@@ -1885,7 +1882,7 @@ __export(index_exports, {
|
|
|
1885
1882
|
isAgentSkillsFormat: () => isAgentSkillsFormat,
|
|
1886
1883
|
isContent: () => isContent,
|
|
1887
1884
|
isContentArray: () => isContentArray,
|
|
1888
|
-
|
|
1885
|
+
isGraderKind: () => isGraderKind,
|
|
1889
1886
|
isJsonObject: () => isJsonObject,
|
|
1890
1887
|
isJsonValue: () => isJsonValue,
|
|
1891
1888
|
isNonEmptyString: () => isNonEmptyString,
|
|
@@ -2038,7 +2035,7 @@ function isTestMessage(value) {
|
|
|
2038
2035
|
}
|
|
2039
2036
|
return false;
|
|
2040
2037
|
}
|
|
2041
|
-
var
|
|
2038
|
+
var GRADER_KIND_VALUES = [
|
|
2042
2039
|
"code-grader",
|
|
2043
2040
|
"llm-grader",
|
|
2044
2041
|
"rubric",
|
|
@@ -2064,9 +2061,9 @@ var EVALUATOR_KIND_VALUES = [
|
|
|
2064
2061
|
"rubrics",
|
|
2065
2062
|
"inline-assert"
|
|
2066
2063
|
];
|
|
2067
|
-
var
|
|
2068
|
-
function
|
|
2069
|
-
return typeof value === "string" &&
|
|
2064
|
+
var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
|
|
2065
|
+
function isGraderKind(value) {
|
|
2066
|
+
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
2070
2067
|
}
|
|
2071
2068
|
|
|
2072
2069
|
// src/evaluation/trace.ts
|
|
@@ -2821,22 +2818,25 @@ function extractCacheConfig(suite) {
|
|
|
2821
2818
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
2822
2819
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
2823
2820
|
}
|
|
2824
|
-
function
|
|
2821
|
+
function extractBudgetUsd(suite) {
|
|
2825
2822
|
const execution = suite.execution;
|
|
2826
2823
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
2827
2824
|
return void 0;
|
|
2828
2825
|
}
|
|
2829
2826
|
const executionObj = execution;
|
|
2830
|
-
|
|
2827
|
+
if ("total_budget_usd" in executionObj || "totalBudgetUsd" in executionObj) {
|
|
2828
|
+
throw new Error(
|
|
2829
|
+
"execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."
|
|
2830
|
+
);
|
|
2831
|
+
}
|
|
2832
|
+
const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
|
|
2831
2833
|
if (rawBudget === void 0 || rawBudget === null) {
|
|
2832
2834
|
return void 0;
|
|
2833
2835
|
}
|
|
2834
2836
|
if (typeof rawBudget === "number" && rawBudget > 0) {
|
|
2835
2837
|
return rawBudget;
|
|
2836
2838
|
}
|
|
2837
|
-
logWarning(
|
|
2838
|
-
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
|
|
2839
|
-
);
|
|
2839
|
+
logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
|
|
2840
2840
|
return void 0;
|
|
2841
2841
|
}
|
|
2842
2842
|
function extractFailOnError(suite) {
|
|
@@ -2986,7 +2986,7 @@ function logWarning(message) {
|
|
|
2986
2986
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
|
|
2987
2987
|
}
|
|
2988
2988
|
|
|
2989
|
-
// src/evaluation/loaders/
|
|
2989
|
+
// src/evaluation/loaders/grader-parser.ts
|
|
2990
2990
|
init_cjs_shims();
|
|
2991
2991
|
var import_promises7 = require("fs/promises");
|
|
2992
2992
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
@@ -3230,38 +3230,38 @@ function validateTemplateVariables(content, source) {
|
|
|
3230
3230
|
);
|
|
3231
3231
|
}
|
|
3232
3232
|
if (invalidVariables.length > 0) {
|
|
3233
|
-
const warningMessage = `${ANSI_YELLOW3}Warning: Custom
|
|
3233
|
+
const warningMessage = `${ANSI_YELLOW3}Warning: Custom grader template at ${source}
|
|
3234
3234
|
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
3235
3235
|
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET4}`;
|
|
3236
3236
|
console.warn(warningMessage);
|
|
3237
3237
|
}
|
|
3238
3238
|
}
|
|
3239
3239
|
|
|
3240
|
-
// src/evaluation/loaders/
|
|
3240
|
+
// src/evaluation/loaders/grader-parser.ts
|
|
3241
3241
|
var ANSI_YELLOW4 = "\x1B[33m";
|
|
3242
3242
|
var ANSI_RESET5 = "\x1B[0m";
|
|
3243
3243
|
var MAX_ASSERTION_INCLUDE_DEPTH = 3;
|
|
3244
3244
|
var PROMPT_FILE_PREFIX = "file://";
|
|
3245
|
-
function
|
|
3245
|
+
function normalizeGraderType(type) {
|
|
3246
3246
|
return type.replace(/_/g, "-");
|
|
3247
3247
|
}
|
|
3248
3248
|
function isDeprecatedJudgeType(type) {
|
|
3249
3249
|
return type === "code-judge" || type === "llm-judge";
|
|
3250
3250
|
}
|
|
3251
|
-
async function
|
|
3251
|
+
async function parseGraders(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
|
|
3252
3252
|
const execution = rawEvalCase.execution;
|
|
3253
3253
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
3254
3254
|
const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? // deprecated: use assertions
|
|
3255
3255
|
rawEvalCase.evaluators;
|
|
3256
3256
|
const skipDefaults = executionObject?.skip_defaults === true;
|
|
3257
3257
|
const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
|
|
3258
|
-
const parsedCase = await
|
|
3258
|
+
const parsedCase = await parseGraderList(
|
|
3259
3259
|
caseEvaluators,
|
|
3260
3260
|
searchRoots,
|
|
3261
3261
|
evalId,
|
|
3262
3262
|
defaultPreprocessors
|
|
3263
3263
|
);
|
|
3264
|
-
const parsedRoot = await
|
|
3264
|
+
const parsedRoot = await parseGraderList(
|
|
3265
3265
|
rootEvaluators,
|
|
3266
3266
|
searchRoots,
|
|
3267
3267
|
evalId,
|
|
@@ -3340,12 +3340,12 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
|
|
|
3340
3340
|
templateDir,
|
|
3341
3341
|
...searchRoots.filter((root) => import_node_path6.default.resolve(root) !== templateDir)
|
|
3342
3342
|
];
|
|
3343
|
-
return await
|
|
3343
|
+
return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
|
|
3344
3344
|
depth: nextDepth,
|
|
3345
3345
|
chain: [...includeContext.chain, resolved.resolvedPath]
|
|
3346
3346
|
}) ?? [];
|
|
3347
3347
|
}
|
|
3348
|
-
async function
|
|
3348
|
+
async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
|
|
3349
3349
|
if (candidateEvaluators === void 0) {
|
|
3350
3350
|
return void 0;
|
|
3351
3351
|
}
|
|
@@ -3369,8 +3369,8 @@ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId,
|
|
|
3369
3369
|
}
|
|
3370
3370
|
return expanded;
|
|
3371
3371
|
}
|
|
3372
|
-
async function
|
|
3373
|
-
const expandedEvaluators = await
|
|
3372
|
+
async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
|
|
3373
|
+
const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
|
|
3374
3374
|
if (!expandedEvaluators) {
|
|
3375
3375
|
return void 0;
|
|
3376
3376
|
}
|
|
@@ -3416,14 +3416,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
3416
3416
|
}
|
|
3417
3417
|
const rawName = asString(rawEvaluator.name);
|
|
3418
3418
|
const rawType = rawEvaluator.type;
|
|
3419
|
-
const typeValue = typeof rawType === "string" ?
|
|
3419
|
+
const typeValue = typeof rawType === "string" ? normalizeGraderType(rawType) : rawType;
|
|
3420
3420
|
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
3421
3421
|
logWarning2(
|
|
3422
3422
|
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
3423
3423
|
);
|
|
3424
3424
|
continue;
|
|
3425
3425
|
}
|
|
3426
|
-
const isCustomType = typeof typeValue === "string" && !
|
|
3426
|
+
const isCustomType = typeof typeValue === "string" && !isGraderKind(typeValue);
|
|
3427
3427
|
if (typeof typeValue !== "string") {
|
|
3428
3428
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
3429
3429
|
continue;
|
|
@@ -3586,7 +3586,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
3586
3586
|
continue;
|
|
3587
3587
|
}
|
|
3588
3588
|
const aggregatorType = asString(rawAggregator.type);
|
|
3589
|
-
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType :
|
|
3589
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeGraderType(aggregatorType) : aggregatorType;
|
|
3590
3590
|
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
3591
3591
|
logWarning2(
|
|
3592
3592
|
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
@@ -3599,7 +3599,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
3599
3599
|
);
|
|
3600
3600
|
continue;
|
|
3601
3601
|
}
|
|
3602
|
-
const expandedMembers = await
|
|
3602
|
+
const expandedMembers = await expandGraderEntries(
|
|
3603
3603
|
rawMembers,
|
|
3604
3604
|
searchRoots,
|
|
3605
3605
|
`${evalId}:${name}`
|
|
@@ -3615,11 +3615,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
3615
3615
|
}
|
|
3616
3616
|
const memberName = asString(rawMember.name);
|
|
3617
3617
|
const memberType = rawMember.type;
|
|
3618
|
-
if (!memberName || !
|
|
3618
|
+
if (!memberName || !isGraderKind(memberType)) {
|
|
3619
3619
|
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
3620
3620
|
continue;
|
|
3621
3621
|
}
|
|
3622
|
-
const memberConfigs = await
|
|
3622
|
+
const memberConfigs = await parseGraders(
|
|
3623
3623
|
{ evaluators: [rawMember] },
|
|
3624
3624
|
void 0,
|
|
3625
3625
|
searchRoots,
|
|
@@ -4360,7 +4360,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
4360
4360
|
`prompt.command for evaluator '${name}' in '${evalId}'`
|
|
4361
4361
|
);
|
|
4362
4362
|
if (!commandArray) {
|
|
4363
|
-
throw new Error(`
|
|
4363
|
+
throw new Error(`Grader '${name}' in '${evalId}': prompt object requires command array`);
|
|
4364
4364
|
}
|
|
4365
4365
|
const commandPath = commandArray[commandArray.length - 1];
|
|
4366
4366
|
const resolved = await resolveFileReference2(commandPath, searchRoots);
|
|
@@ -4368,7 +4368,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
4368
4368
|
resolvedPromptScript = [...commandArray.slice(0, -1), import_node_path6.default.resolve(resolved.resolvedPath)];
|
|
4369
4369
|
} else {
|
|
4370
4370
|
throw new Error(
|
|
4371
|
-
`
|
|
4371
|
+
`Grader '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
|
|
4372
4372
|
);
|
|
4373
4373
|
}
|
|
4374
4374
|
if (isJsonObject2(rawPrompt.config)) {
|
|
@@ -4385,11 +4385,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
4385
4385
|
await validateCustomPromptContent(promptPath);
|
|
4386
4386
|
} catch (error) {
|
|
4387
4387
|
const message = error instanceof Error ? error.message : String(error);
|
|
4388
|
-
throw new Error(`
|
|
4388
|
+
throw new Error(`Grader '${name}' template (${promptPath}): ${message}`);
|
|
4389
4389
|
}
|
|
4390
4390
|
} else {
|
|
4391
4391
|
throw new Error(
|
|
4392
|
-
`
|
|
4392
|
+
`Grader '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
|
|
4393
4393
|
);
|
|
4394
4394
|
}
|
|
4395
4395
|
} else {
|
|
@@ -4506,18 +4506,18 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
4506
4506
|
return void 0;
|
|
4507
4507
|
}
|
|
4508
4508
|
if (!Array.isArray(rawValue)) {
|
|
4509
|
-
throw new Error(`
|
|
4509
|
+
throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
|
|
4510
4510
|
}
|
|
4511
4511
|
const preprocessors = [];
|
|
4512
4512
|
for (const rawEntry of rawValue) {
|
|
4513
4513
|
if (!isJsonObject2(rawEntry)) {
|
|
4514
4514
|
throw new Error(
|
|
4515
|
-
`
|
|
4515
|
+
`Grader '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
|
|
4516
4516
|
);
|
|
4517
4517
|
}
|
|
4518
4518
|
const type = asString(rawEntry.type)?.trim();
|
|
4519
4519
|
if (!type) {
|
|
4520
|
-
throw new Error(`
|
|
4520
|
+
throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
|
|
4521
4521
|
}
|
|
4522
4522
|
const command = asStringArray(
|
|
4523
4523
|
rawEntry.command,
|
|
@@ -4525,14 +4525,14 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
|
|
|
4525
4525
|
);
|
|
4526
4526
|
if (!command || command.length === 0) {
|
|
4527
4527
|
throw new Error(
|
|
4528
|
-
`
|
|
4528
|
+
`Grader '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
|
|
4529
4529
|
);
|
|
4530
4530
|
}
|
|
4531
4531
|
const commandPath = command[command.length - 1];
|
|
4532
4532
|
const resolved = await resolveFileReference2(commandPath, searchRoots);
|
|
4533
4533
|
if (!resolved.resolvedPath) {
|
|
4534
4534
|
throw new Error(
|
|
4535
|
-
`
|
|
4535
|
+
`Grader '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
|
|
4536
4536
|
);
|
|
4537
4537
|
}
|
|
4538
4538
|
preprocessors.push({
|
|
@@ -4583,13 +4583,13 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
4583
4583
|
if (typeof candidate !== "string") {
|
|
4584
4584
|
return void 0;
|
|
4585
4585
|
}
|
|
4586
|
-
const normalized =
|
|
4586
|
+
const normalized = normalizeGraderType(candidate);
|
|
4587
4587
|
if (isDeprecatedJudgeType(normalized)) {
|
|
4588
4588
|
throw new Error(
|
|
4589
4589
|
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
4590
4590
|
);
|
|
4591
4591
|
}
|
|
4592
|
-
if (
|
|
4592
|
+
if (isGraderKind(normalized)) {
|
|
4593
4593
|
return normalized;
|
|
4594
4594
|
}
|
|
4595
4595
|
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
@@ -4661,7 +4661,7 @@ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalI
|
|
|
4661
4661
|
}
|
|
4662
4662
|
result.required = rawRequired;
|
|
4663
4663
|
logWarning2(
|
|
4664
|
-
`
|
|
4664
|
+
`Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
4665
4665
|
);
|
|
4666
4666
|
}
|
|
4667
4667
|
return result;
|
|
@@ -5485,7 +5485,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
5485
5485
|
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
5486
5486
|
let evaluators;
|
|
5487
5487
|
try {
|
|
5488
|
-
evaluators = await
|
|
5488
|
+
evaluators = await parseGraders(
|
|
5489
5489
|
testCaseConfig,
|
|
5490
5490
|
mergedExecution,
|
|
5491
5491
|
searchRoots,
|
|
@@ -5834,7 +5834,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
5834
5834
|
targetRefs: extractTargetRefsFromSuite(parsed),
|
|
5835
5835
|
workers: extractWorkersFromSuite(parsed),
|
|
5836
5836
|
cacheConfig: extractCacheConfig(parsed),
|
|
5837
|
-
|
|
5837
|
+
budgetUsd: extractBudgetUsd(parsed),
|
|
5838
5838
|
...metadata !== void 0 && { metadata },
|
|
5839
5839
|
...failOnError !== void 0 && { failOnError },
|
|
5840
5840
|
...threshold !== void 0 && { threshold },
|
|
@@ -5975,7 +5975,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
5975
5975
|
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
5976
5976
|
let evaluators;
|
|
5977
5977
|
try {
|
|
5978
|
-
evaluators = await
|
|
5978
|
+
evaluators = await parseGraders(
|
|
5979
5979
|
testCaseConfig,
|
|
5980
5980
|
globalExecution,
|
|
5981
5981
|
searchRoots,
|
|
@@ -15421,13 +15421,13 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
15421
15421
|
return createProvider(resolved);
|
|
15422
15422
|
}
|
|
15423
15423
|
|
|
15424
|
-
// src/evaluation/
|
|
15424
|
+
// src/evaluation/graders.ts
|
|
15425
15425
|
init_cjs_shims();
|
|
15426
15426
|
|
|
15427
|
-
// src/evaluation/
|
|
15427
|
+
// src/evaluation/graders/index.ts
|
|
15428
15428
|
init_cjs_shims();
|
|
15429
15429
|
|
|
15430
|
-
// src/evaluation/
|
|
15430
|
+
// src/evaluation/graders/scoring.ts
|
|
15431
15431
|
init_cjs_shims();
|
|
15432
15432
|
var DEFAULT_THRESHOLD = 0.8;
|
|
15433
15433
|
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
@@ -15516,7 +15516,7 @@ function negateScore(score) {
|
|
|
15516
15516
|
};
|
|
15517
15517
|
}
|
|
15518
15518
|
|
|
15519
|
-
// src/evaluation/
|
|
15519
|
+
// src/evaluation/graders/code-grader.ts
|
|
15520
15520
|
init_cjs_shims();
|
|
15521
15521
|
var import_promises31 = require("fs/promises");
|
|
15522
15522
|
var import_node_os9 = require("os");
|
|
@@ -15814,7 +15814,7 @@ function getRepoCheckoutTargets(repos) {
|
|
|
15814
15814
|
}));
|
|
15815
15815
|
}
|
|
15816
15816
|
|
|
15817
|
-
// src/evaluation/
|
|
15817
|
+
// src/evaluation/graders/code-grader.ts
|
|
15818
15818
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
15819
15819
|
var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
|
|
15820
15820
|
async function materializeContentForGrader(messages, getWorkDir) {
|
|
@@ -15866,7 +15866,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
|
|
|
15866
15866
|
}
|
|
15867
15867
|
return result;
|
|
15868
15868
|
}
|
|
15869
|
-
var
|
|
15869
|
+
var CodeGrader = class {
|
|
15870
15870
|
kind = "code-grader";
|
|
15871
15871
|
command;
|
|
15872
15872
|
cwd;
|
|
@@ -15984,7 +15984,7 @@ var CodeEvaluator = class {
|
|
|
15984
15984
|
})) : [];
|
|
15985
15985
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
15986
15986
|
const proxyUsage = getProxyUsage?.();
|
|
15987
|
-
const
|
|
15987
|
+
const graderRawRequest = {
|
|
15988
15988
|
command: this.command,
|
|
15989
15989
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
15990
15990
|
...proxyUsage ? {
|
|
@@ -15999,7 +15999,7 @@ var CodeEvaluator = class {
|
|
|
15999
15999
|
verdict: scoreToVerdict(score),
|
|
16000
16000
|
assertions,
|
|
16001
16001
|
expectedAspectCount: assertions.length || 1,
|
|
16002
|
-
|
|
16002
|
+
graderRawRequest,
|
|
16003
16003
|
...details ? { details } : {},
|
|
16004
16004
|
tokenUsage: proxyUsage?.tokenUsage
|
|
16005
16005
|
};
|
|
@@ -16011,7 +16011,7 @@ var CodeEvaluator = class {
|
|
|
16011
16011
|
verdict: "fail",
|
|
16012
16012
|
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
16013
16013
|
expectedAspectCount: 1,
|
|
16014
|
-
|
|
16014
|
+
graderRawRequest: {
|
|
16015
16015
|
command: this.command,
|
|
16016
16016
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
16017
16017
|
...proxyUsage ? {
|
|
@@ -16060,11 +16060,11 @@ function formatStderr(stderr) {
|
|
|
16060
16060
|
${tail}`;
|
|
16061
16061
|
}
|
|
16062
16062
|
|
|
16063
|
-
// src/evaluation/
|
|
16063
|
+
// src/evaluation/graders/composite.ts
|
|
16064
16064
|
init_cjs_shims();
|
|
16065
16065
|
var import_ai3 = require("ai");
|
|
16066
16066
|
|
|
16067
|
-
// src/evaluation/
|
|
16067
|
+
// src/evaluation/graders/llm-grader.ts
|
|
16068
16068
|
init_cjs_shims();
|
|
16069
16069
|
var import_promises32 = __toESM(require("fs/promises"), 1);
|
|
16070
16070
|
var import_node_path41 = __toESM(require("path"), 1);
|
|
@@ -16105,7 +16105,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
|
16105
16105
|
".so",
|
|
16106
16106
|
".dylib"
|
|
16107
16107
|
]);
|
|
16108
|
-
var
|
|
16108
|
+
var DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
|
|
16109
16109
|
|
|
16110
16110
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
16111
16111
|
|
|
@@ -16160,19 +16160,19 @@ function resolveContentBasePath(context2) {
|
|
|
16160
16160
|
}
|
|
16161
16161
|
return void 0;
|
|
16162
16162
|
}
|
|
16163
|
-
var
|
|
16163
|
+
var LlmGrader = class {
|
|
16164
16164
|
kind = "llm-grader";
|
|
16165
16165
|
resolveGraderProvider;
|
|
16166
16166
|
maxOutputTokens;
|
|
16167
16167
|
temperature;
|
|
16168
|
-
|
|
16168
|
+
graderTemplate;
|
|
16169
16169
|
maxSteps;
|
|
16170
16170
|
graderTargetProvider;
|
|
16171
16171
|
constructor(options) {
|
|
16172
16172
|
this.resolveGraderProvider = options.resolveGraderProvider ?? options.resolveJudgeProvider;
|
|
16173
16173
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
16174
16174
|
this.temperature = options.temperature;
|
|
16175
|
-
this.
|
|
16175
|
+
this.graderTemplate = options.graderTemplate;
|
|
16176
16176
|
this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT);
|
|
16177
16177
|
this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider;
|
|
16178
16178
|
}
|
|
@@ -16235,16 +16235,16 @@ var LlmGraderEvaluator = class {
|
|
|
16235
16235
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
16236
16236
|
};
|
|
16237
16237
|
const systemPrompt = buildOutputSchema();
|
|
16238
|
-
const
|
|
16239
|
-
warnDeprecatedTemplateVars(
|
|
16240
|
-
let userPrompt = substituteVariables(
|
|
16241
|
-
if (context2.fileChanges && !context2.
|
|
16238
|
+
const graderTemplate = context2.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
16239
|
+
warnDeprecatedTemplateVars(graderTemplate);
|
|
16240
|
+
let userPrompt = substituteVariables(graderTemplate, variables);
|
|
16241
|
+
if (context2.fileChanges && !context2.graderTemplateOverride && !this.graderTemplate) {
|
|
16242
16242
|
userPrompt += `
|
|
16243
16243
|
|
|
16244
16244
|
[[ ## file_changes ## ]]
|
|
16245
16245
|
${context2.fileChanges}`;
|
|
16246
16246
|
}
|
|
16247
|
-
const
|
|
16247
|
+
const graderRawRequest = {
|
|
16248
16248
|
userPrompt,
|
|
16249
16249
|
systemPrompt
|
|
16250
16250
|
};
|
|
@@ -16265,7 +16265,7 @@ ${context2.fileChanges}`;
|
|
|
16265
16265
|
verdict: scoreToVerdict(score),
|
|
16266
16266
|
assertions,
|
|
16267
16267
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
16268
|
-
|
|
16268
|
+
graderRawRequest,
|
|
16269
16269
|
graderTarget: graderProvider.targetName,
|
|
16270
16270
|
details: data.details,
|
|
16271
16271
|
tokenUsage
|
|
@@ -16279,7 +16279,7 @@ ${context2.fileChanges}`;
|
|
|
16279
16279
|
verdict: "skip",
|
|
16280
16280
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
16281
16281
|
expectedAspectCount: 1,
|
|
16282
|
-
|
|
16282
|
+
graderRawRequest,
|
|
16283
16283
|
graderTarget: graderProvider.targetName
|
|
16284
16284
|
};
|
|
16285
16285
|
}
|
|
@@ -16296,7 +16296,7 @@ ${context2.fileChanges}`;
|
|
|
16296
16296
|
}
|
|
16297
16297
|
const prompt = this.buildRubricPrompt(context2, rubrics);
|
|
16298
16298
|
const systemPrompt = buildRubricOutputSchema();
|
|
16299
|
-
const
|
|
16299
|
+
const graderRawRequest = {
|
|
16300
16300
|
userPrompt: prompt,
|
|
16301
16301
|
systemPrompt
|
|
16302
16302
|
};
|
|
@@ -16316,7 +16316,7 @@ ${context2.fileChanges}`;
|
|
|
16316
16316
|
verdict,
|
|
16317
16317
|
assertions,
|
|
16318
16318
|
expectedAspectCount: rubrics.length,
|
|
16319
|
-
|
|
16319
|
+
graderRawRequest,
|
|
16320
16320
|
graderTarget: graderProvider.targetName,
|
|
16321
16321
|
tokenUsage
|
|
16322
16322
|
};
|
|
@@ -16329,7 +16329,7 @@ ${context2.fileChanges}`;
|
|
|
16329
16329
|
verdict: "skip",
|
|
16330
16330
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
16331
16331
|
expectedAspectCount: rubrics.length,
|
|
16332
|
-
|
|
16332
|
+
graderRawRequest,
|
|
16333
16333
|
graderTarget: graderProvider.targetName
|
|
16334
16334
|
};
|
|
16335
16335
|
}
|
|
@@ -16341,7 +16341,7 @@ ${context2.fileChanges}`;
|
|
|
16341
16341
|
async evaluateWithScoreRanges(context2, graderProvider, rubrics) {
|
|
16342
16342
|
const prompt = this.buildScoreRangePrompt(context2, rubrics);
|
|
16343
16343
|
const systemPrompt = buildScoreRangeOutputSchema();
|
|
16344
|
-
const
|
|
16344
|
+
const graderRawRequest = {
|
|
16345
16345
|
userPrompt: prompt,
|
|
16346
16346
|
systemPrompt
|
|
16347
16347
|
};
|
|
@@ -16361,7 +16361,7 @@ ${context2.fileChanges}`;
|
|
|
16361
16361
|
verdict,
|
|
16362
16362
|
assertions,
|
|
16363
16363
|
expectedAspectCount: rubrics.length,
|
|
16364
|
-
|
|
16364
|
+
graderRawRequest,
|
|
16365
16365
|
graderTarget: graderProvider.targetName,
|
|
16366
16366
|
details,
|
|
16367
16367
|
tokenUsage
|
|
@@ -16375,7 +16375,7 @@ ${context2.fileChanges}`;
|
|
|
16375
16375
|
verdict: "skip",
|
|
16376
16376
|
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
16377
16377
|
expectedAspectCount: rubrics.length,
|
|
16378
|
-
|
|
16378
|
+
graderRawRequest,
|
|
16379
16379
|
graderTarget: graderProvider.targetName
|
|
16380
16380
|
};
|
|
16381
16381
|
}
|
|
@@ -16404,7 +16404,7 @@ ${context2.fileChanges}`;
|
|
|
16404
16404
|
const config = context2.evaluator;
|
|
16405
16405
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
16406
16406
|
const fsTools = createFilesystemTools(workspacePath);
|
|
16407
|
-
const
|
|
16407
|
+
const graderRawRequest = {
|
|
16408
16408
|
mode: "built-in",
|
|
16409
16409
|
systemPrompt,
|
|
16410
16410
|
userPrompt,
|
|
@@ -16428,7 +16428,7 @@ ${context2.fileChanges}`;
|
|
|
16428
16428
|
return this.parseAgentResult(
|
|
16429
16429
|
text,
|
|
16430
16430
|
rubrics,
|
|
16431
|
-
|
|
16431
|
+
graderRawRequest,
|
|
16432
16432
|
details,
|
|
16433
16433
|
graderProvider.targetName
|
|
16434
16434
|
);
|
|
@@ -16439,7 +16439,7 @@ ${context2.fileChanges}`;
|
|
|
16439
16439
|
verdict: "fail",
|
|
16440
16440
|
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
16441
16441
|
expectedAspectCount: 1,
|
|
16442
|
-
|
|
16442
|
+
graderRawRequest,
|
|
16443
16443
|
graderTarget: graderProvider.targetName,
|
|
16444
16444
|
details: { mode: "built-in", error: message }
|
|
16445
16445
|
};
|
|
@@ -16471,7 +16471,7 @@ ${context2.fileChanges}`;
|
|
|
16471
16471
|
async evaluateWithDelegate(context2, provider, modeLabel) {
|
|
16472
16472
|
const workspacePath = context2.workspacePath;
|
|
16473
16473
|
const prompt = this.buildDelegatedPrompt(context2);
|
|
16474
|
-
const
|
|
16474
|
+
const graderRawRequest = {
|
|
16475
16475
|
mode: modeLabel,
|
|
16476
16476
|
grader_target: provider.targetName,
|
|
16477
16477
|
prompt
|
|
@@ -16492,7 +16492,7 @@ ${context2.fileChanges}`;
|
|
|
16492
16492
|
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
16493
16493
|
],
|
|
16494
16494
|
expectedAspectCount: 1,
|
|
16495
|
-
|
|
16495
|
+
graderRawRequest,
|
|
16496
16496
|
graderTarget: provider.targetName,
|
|
16497
16497
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
16498
16498
|
};
|
|
@@ -16506,7 +16506,7 @@ ${context2.fileChanges}`;
|
|
|
16506
16506
|
return this.parseAgentResult(
|
|
16507
16507
|
assistantContent,
|
|
16508
16508
|
rubrics,
|
|
16509
|
-
|
|
16509
|
+
graderRawRequest,
|
|
16510
16510
|
details,
|
|
16511
16511
|
provider.targetName
|
|
16512
16512
|
);
|
|
@@ -16519,7 +16519,7 @@ ${context2.fileChanges}`;
|
|
|
16519
16519
|
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
16520
16520
|
],
|
|
16521
16521
|
expectedAspectCount: 1,
|
|
16522
|
-
|
|
16522
|
+
graderRawRequest,
|
|
16523
16523
|
graderTarget: provider.targetName,
|
|
16524
16524
|
details: {
|
|
16525
16525
|
mode: modeLabel,
|
|
@@ -16540,7 +16540,7 @@ ${context2.fileChanges}`;
|
|
|
16540
16540
|
const config = context2.evaluator;
|
|
16541
16541
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
16542
16542
|
const parts = [
|
|
16543
|
-
"You are an expert
|
|
16543
|
+
"You are an expert grader with access to the workspace filesystem.",
|
|
16544
16544
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
16545
16545
|
"Thoroughly examine relevant files before making your assessment.",
|
|
16546
16546
|
""
|
|
@@ -16569,9 +16569,9 @@ ${context2.fileChanges}`;
|
|
|
16569
16569
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
16570
16570
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
16571
16571
|
};
|
|
16572
|
-
if (this.
|
|
16573
|
-
warnDeprecatedTemplateVars(this.
|
|
16574
|
-
return substituteVariables(this.
|
|
16572
|
+
if (this.graderTemplate) {
|
|
16573
|
+
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
16574
|
+
return substituteVariables(this.graderTemplate, variables);
|
|
16575
16575
|
}
|
|
16576
16576
|
const config = context2.evaluator;
|
|
16577
16577
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
@@ -16618,7 +16618,7 @@ ${context2.fileChanges}`;
|
|
|
16618
16618
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
16619
16619
|
const config = context2.evaluator;
|
|
16620
16620
|
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
16621
|
-
if (this.
|
|
16621
|
+
if (this.graderTemplate) {
|
|
16622
16622
|
const variables = {
|
|
16623
16623
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
16624
16624
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -16630,15 +16630,15 @@ ${context2.fileChanges}`;
|
|
|
16630
16630
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
16631
16631
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
16632
16632
|
};
|
|
16633
|
-
warnDeprecatedTemplateVars(this.
|
|
16634
|
-
const customPrompt = substituteVariables(this.
|
|
16633
|
+
warnDeprecatedTemplateVars(this.graderTemplate);
|
|
16634
|
+
const customPrompt = substituteVariables(this.graderTemplate, variables);
|
|
16635
16635
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
16636
16636
|
return `${customPrompt}
|
|
16637
16637
|
|
|
16638
16638
|
${outputSchema}`;
|
|
16639
16639
|
}
|
|
16640
16640
|
const parts = [
|
|
16641
|
-
"You are an expert
|
|
16641
|
+
"You are an expert grader. Investigate the workspace to verify the criteria are met.",
|
|
16642
16642
|
"",
|
|
16643
16643
|
"[[ ## question ## ]]",
|
|
16644
16644
|
formattedQuestion,
|
|
@@ -16675,7 +16675,7 @@ ${outputSchema}`;
|
|
|
16675
16675
|
* Parse the agent's response text into an EvaluationScore.
|
|
16676
16676
|
* Supports both freeform and rubric modes.
|
|
16677
16677
|
*/
|
|
16678
|
-
parseAgentResult(text, rubrics,
|
|
16678
|
+
parseAgentResult(text, rubrics, graderRawRequest, details, graderTarget) {
|
|
16679
16679
|
try {
|
|
16680
16680
|
const parsed = parseJsonFromText(text);
|
|
16681
16681
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -16686,7 +16686,7 @@ ${outputSchema}`;
|
|
|
16686
16686
|
verdict,
|
|
16687
16687
|
assertions: assertions2,
|
|
16688
16688
|
expectedAspectCount: rubrics.length,
|
|
16689
|
-
|
|
16689
|
+
graderRawRequest,
|
|
16690
16690
|
graderTarget,
|
|
16691
16691
|
details
|
|
16692
16692
|
};
|
|
@@ -16699,7 +16699,7 @@ ${outputSchema}`;
|
|
|
16699
16699
|
verdict: scoreToVerdict(score),
|
|
16700
16700
|
assertions,
|
|
16701
16701
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
16702
|
-
|
|
16702
|
+
graderRawRequest,
|
|
16703
16703
|
graderTarget,
|
|
16704
16704
|
details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
|
|
16705
16705
|
};
|
|
@@ -16714,7 +16714,7 @@ ${outputSchema}`;
|
|
|
16714
16714
|
}
|
|
16715
16715
|
],
|
|
16716
16716
|
expectedAspectCount: 1,
|
|
16717
|
-
|
|
16717
|
+
graderRawRequest,
|
|
16718
16718
|
graderTarget,
|
|
16719
16719
|
details
|
|
16720
16720
|
};
|
|
@@ -16729,7 +16729,7 @@ ${outputSchema}`;
|
|
|
16729
16729
|
buildScoreRangePrompt(context2, rubrics) {
|
|
16730
16730
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
16731
16731
|
const parts = [
|
|
16732
|
-
"You are an expert
|
|
16732
|
+
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
16733
16733
|
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
16734
16734
|
"",
|
|
16735
16735
|
"[[ ## question ## ]]",
|
|
@@ -16772,7 +16772,7 @@ ${outputSchema}`;
|
|
|
16772
16772
|
buildRubricPrompt(context2, rubrics) {
|
|
16773
16773
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
16774
16774
|
const parts = [
|
|
16775
|
-
"You are an expert
|
|
16775
|
+
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
16776
16776
|
"",
|
|
16777
16777
|
"[[ ## question ## ]]",
|
|
16778
16778
|
formattedQuestion,
|
|
@@ -16946,7 +16946,7 @@ function sumTokenUsage(first, second) {
|
|
|
16946
16946
|
};
|
|
16947
16947
|
}
|
|
16948
16948
|
function buildRubricOutputSchema() {
|
|
16949
|
-
return `You are an expert
|
|
16949
|
+
return `You are an expert grader. Evaluate the candidate answer against each rubric item.
|
|
16950
16950
|
You must return a valid JSON object matching this schema:
|
|
16951
16951
|
{
|
|
16952
16952
|
"checks": [
|
|
@@ -16980,7 +16980,7 @@ function warnDeprecatedTemplateVars(template) {
|
|
|
16980
16980
|
console.warn(
|
|
16981
16981
|
`${ANSI_YELLOW8}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
|
|
16982
16982
|
${used.join("\n ")}
|
|
16983
|
-
Update your custom
|
|
16983
|
+
Update your custom grader template to use the new names.${ANSI_RESET9}`
|
|
16984
16984
|
);
|
|
16985
16985
|
}
|
|
16986
16986
|
}
|
|
@@ -17012,7 +17012,7 @@ function calculateRubricScore(result, rubrics) {
|
|
|
17012
17012
|
return { score, verdict, assertions };
|
|
17013
17013
|
}
|
|
17014
17014
|
function buildScoreRangeOutputSchema() {
|
|
17015
|
-
return `You are an expert
|
|
17015
|
+
return `You are an expert grader. Score the candidate answer on each criterion.
|
|
17016
17016
|
You must return a valid JSON object matching this schema:
|
|
17017
17017
|
{
|
|
17018
17018
|
"checks": [
|
|
@@ -17220,13 +17220,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
17220
17220
|
}
|
|
17221
17221
|
}
|
|
17222
17222
|
|
|
17223
|
-
// src/evaluation/
|
|
17223
|
+
// src/evaluation/graders/composite.ts
|
|
17224
17224
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
17225
17225
|
{{EVALUATOR_RESULTS_JSON}}
|
|
17226
17226
|
|
|
17227
|
-
Decide the final score and verdict based on all
|
|
17227
|
+
Decide the final score and verdict based on all grader results.
|
|
17228
17228
|
Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
|
|
17229
|
-
var
|
|
17229
|
+
var CompositeGrader = class {
|
|
17230
17230
|
kind = "composite";
|
|
17231
17231
|
config;
|
|
17232
17232
|
evaluatorFactory;
|
|
@@ -17277,7 +17277,7 @@ var CompositeEvaluator = class {
|
|
|
17277
17277
|
weight,
|
|
17278
17278
|
verdict: member.result.verdict,
|
|
17279
17279
|
assertions: [...member.result.assertions],
|
|
17280
|
-
|
|
17280
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
17281
17281
|
scores: member.result.scores,
|
|
17282
17282
|
details: member.result.details,
|
|
17283
17283
|
tokenUsage: member.result.tokenUsage
|
|
@@ -17298,7 +17298,7 @@ var CompositeEvaluator = class {
|
|
|
17298
17298
|
verdict: "skip",
|
|
17299
17299
|
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
17300
17300
|
expectedAspectCount: 1,
|
|
17301
|
-
|
|
17301
|
+
graderRawRequest: {
|
|
17302
17302
|
aggregator: "weighted_average",
|
|
17303
17303
|
...weights ? { weights } : {}
|
|
17304
17304
|
},
|
|
@@ -17311,7 +17311,7 @@ var CompositeEvaluator = class {
|
|
|
17311
17311
|
verdict: scoreToVerdict(finalScore),
|
|
17312
17312
|
assertions: allAssertions,
|
|
17313
17313
|
expectedAspectCount: allAssertions.length || 1,
|
|
17314
|
-
|
|
17314
|
+
graderRawRequest: {
|
|
17315
17315
|
aggregator: "weighted_average",
|
|
17316
17316
|
...weights ? { weights } : {}
|
|
17317
17317
|
},
|
|
@@ -17330,7 +17330,7 @@ var CompositeEvaluator = class {
|
|
|
17330
17330
|
score: member.result.score,
|
|
17331
17331
|
verdict: member.result.verdict,
|
|
17332
17332
|
assertions: [...member.result.assertions],
|
|
17333
|
-
|
|
17333
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
17334
17334
|
scores: member.result.scores,
|
|
17335
17335
|
details: member.result.details,
|
|
17336
17336
|
tokenUsage: member.result.tokenUsage
|
|
@@ -17353,7 +17353,7 @@ var CompositeEvaluator = class {
|
|
|
17353
17353
|
verdict: "skip",
|
|
17354
17354
|
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
17355
17355
|
expectedAspectCount: 1,
|
|
17356
|
-
|
|
17356
|
+
graderRawRequest: {
|
|
17357
17357
|
aggregator: "threshold",
|
|
17358
17358
|
threshold
|
|
17359
17359
|
},
|
|
@@ -17372,7 +17372,7 @@ var CompositeEvaluator = class {
|
|
|
17372
17372
|
verdict: pass ? "pass" : "fail",
|
|
17373
17373
|
assertions: allAssertions,
|
|
17374
17374
|
expectedAspectCount: allAssertions.length || 1,
|
|
17375
|
-
|
|
17375
|
+
graderRawRequest: {
|
|
17376
17376
|
aggregator: "threshold",
|
|
17377
17377
|
threshold
|
|
17378
17378
|
},
|
|
@@ -17389,7 +17389,7 @@ var CompositeEvaluator = class {
|
|
|
17389
17389
|
weight: weights?.[member.id] ?? 1,
|
|
17390
17390
|
verdict: member.result.verdict,
|
|
17391
17391
|
assertions: [...member.result.assertions],
|
|
17392
|
-
|
|
17392
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
17393
17393
|
scores: member.result.scores,
|
|
17394
17394
|
details: member.result.details
|
|
17395
17395
|
}));
|
|
@@ -17410,7 +17410,7 @@ var CompositeEvaluator = class {
|
|
|
17410
17410
|
verdict,
|
|
17411
17411
|
assertions,
|
|
17412
17412
|
expectedAspectCount: assertions.length || 1,
|
|
17413
|
-
|
|
17413
|
+
graderRawRequest: {
|
|
17414
17414
|
aggregator: "code-grader",
|
|
17415
17415
|
script: scriptPath
|
|
17416
17416
|
},
|
|
@@ -17423,7 +17423,7 @@ var CompositeEvaluator = class {
|
|
|
17423
17423
|
verdict: "fail",
|
|
17424
17424
|
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
17425
17425
|
expectedAspectCount: 1,
|
|
17426
|
-
|
|
17426
|
+
graderRawRequest: {
|
|
17427
17427
|
aggregator: "code-grader",
|
|
17428
17428
|
script: scriptPath,
|
|
17429
17429
|
error: message
|
|
@@ -17445,14 +17445,14 @@ var CompositeEvaluator = class {
|
|
|
17445
17445
|
score: member.result.score,
|
|
17446
17446
|
verdict: member.result.verdict,
|
|
17447
17447
|
assertions: [...member.result.assertions],
|
|
17448
|
-
|
|
17448
|
+
graderRawRequest: member.result.graderRawRequest,
|
|
17449
17449
|
scores: member.result.scores,
|
|
17450
17450
|
details: member.result.details
|
|
17451
17451
|
}));
|
|
17452
17452
|
const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
|
|
17453
17453
|
const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
|
|
17454
17454
|
const systemPrompt = buildOutputSchema();
|
|
17455
|
-
const
|
|
17455
|
+
const graderRawRequest = {
|
|
17456
17456
|
aggregator: "llm-grader",
|
|
17457
17457
|
userPrompt,
|
|
17458
17458
|
systemPrompt,
|
|
@@ -17474,7 +17474,7 @@ var CompositeEvaluator = class {
|
|
|
17474
17474
|
verdict: scoreToVerdict(score2),
|
|
17475
17475
|
assertions: assertions2,
|
|
17476
17476
|
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
17477
|
-
|
|
17477
|
+
graderRawRequest,
|
|
17478
17478
|
scores
|
|
17479
17479
|
};
|
|
17480
17480
|
}
|
|
@@ -17494,7 +17494,7 @@ var CompositeEvaluator = class {
|
|
|
17494
17494
|
verdict: scoreToVerdict(score),
|
|
17495
17495
|
assertions,
|
|
17496
17496
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
17497
|
-
|
|
17497
|
+
graderRawRequest,
|
|
17498
17498
|
scores
|
|
17499
17499
|
};
|
|
17500
17500
|
} catch {
|
|
@@ -17503,16 +17503,16 @@ var CompositeEvaluator = class {
|
|
|
17503
17503
|
verdict: "fail",
|
|
17504
17504
|
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
17505
17505
|
expectedAspectCount: 1,
|
|
17506
|
-
|
|
17506
|
+
graderRawRequest,
|
|
17507
17507
|
scores
|
|
17508
17508
|
};
|
|
17509
17509
|
}
|
|
17510
17510
|
}
|
|
17511
17511
|
};
|
|
17512
17512
|
|
|
17513
|
-
// src/evaluation/
|
|
17513
|
+
// src/evaluation/graders/cost.ts
|
|
17514
17514
|
init_cjs_shims();
|
|
17515
|
-
var
|
|
17515
|
+
var CostGrader = class {
|
|
17516
17516
|
kind = "cost";
|
|
17517
17517
|
config;
|
|
17518
17518
|
constructor(options) {
|
|
@@ -17527,7 +17527,7 @@ var CostEvaluator = class {
|
|
|
17527
17527
|
verdict: "fail",
|
|
17528
17528
|
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
17529
17529
|
expectedAspectCount: 1,
|
|
17530
|
-
|
|
17530
|
+
graderRawRequest: {
|
|
17531
17531
|
type: "cost",
|
|
17532
17532
|
budget,
|
|
17533
17533
|
costUsd: null
|
|
@@ -17544,7 +17544,7 @@ var CostEvaluator = class {
|
|
|
17544
17544
|
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
17545
17545
|
],
|
|
17546
17546
|
expectedAspectCount: 1,
|
|
17547
|
-
|
|
17547
|
+
graderRawRequest: {
|
|
17548
17548
|
type: "cost",
|
|
17549
17549
|
budget,
|
|
17550
17550
|
costUsd
|
|
@@ -17553,9 +17553,9 @@ var CostEvaluator = class {
|
|
|
17553
17553
|
}
|
|
17554
17554
|
};
|
|
17555
17555
|
|
|
17556
|
-
// src/evaluation/
|
|
17556
|
+
// src/evaluation/graders/execution-metrics.ts
|
|
17557
17557
|
init_cjs_shims();
|
|
17558
|
-
var
|
|
17558
|
+
var ExecutionMetricsGrader = class {
|
|
17559
17559
|
kind = "execution-metrics";
|
|
17560
17560
|
config;
|
|
17561
17561
|
constructor(options) {
|
|
@@ -17579,7 +17579,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
17579
17579
|
verdict: "fail",
|
|
17580
17580
|
assertions: [{ text: "No trace summary available", passed: false }],
|
|
17581
17581
|
expectedAspectCount: 1,
|
|
17582
|
-
|
|
17582
|
+
graderRawRequest: {
|
|
17583
17583
|
type: "execution-metrics",
|
|
17584
17584
|
config: this.extractConfiguredThresholds(),
|
|
17585
17585
|
actual: null
|
|
@@ -17695,7 +17695,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
17695
17695
|
verdict: scoreToVerdict(score),
|
|
17696
17696
|
assertions,
|
|
17697
17697
|
expectedAspectCount: totalChecks || 1,
|
|
17698
|
-
|
|
17698
|
+
graderRawRequest: {
|
|
17699
17699
|
type: "execution-metrics",
|
|
17700
17700
|
config: this.extractConfiguredThresholds(),
|
|
17701
17701
|
actual: this.filterDefinedMetrics(actualMetrics)
|
|
@@ -17738,7 +17738,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
17738
17738
|
}
|
|
17739
17739
|
};
|
|
17740
17740
|
|
|
17741
|
-
// src/evaluation/
|
|
17741
|
+
// src/evaluation/graders/field-accuracy.ts
|
|
17742
17742
|
init_cjs_shims();
|
|
17743
17743
|
var DEFAULT_DATE_FORMATS = [
|
|
17744
17744
|
"YYYY-MM-DDTHH:mm:ssZ",
|
|
@@ -17784,7 +17784,7 @@ var MONTH_NAMES = {
|
|
|
17784
17784
|
dec: 11,
|
|
17785
17785
|
december: 11
|
|
17786
17786
|
};
|
|
17787
|
-
var
|
|
17787
|
+
var FieldAccuracyGrader = class {
|
|
17788
17788
|
kind = "field-accuracy";
|
|
17789
17789
|
config;
|
|
17790
17790
|
constructor(options) {
|
|
@@ -18143,9 +18143,9 @@ function parseJsonFromTextSafe(text) {
|
|
|
18143
18143
|
return parseJsonFromText(text);
|
|
18144
18144
|
}
|
|
18145
18145
|
|
|
18146
|
-
// src/evaluation/
|
|
18146
|
+
// src/evaluation/graders/latency.ts
|
|
18147
18147
|
init_cjs_shims();
|
|
18148
|
-
var
|
|
18148
|
+
var LatencyGrader = class {
|
|
18149
18149
|
kind = "latency";
|
|
18150
18150
|
config;
|
|
18151
18151
|
constructor(options) {
|
|
@@ -18160,7 +18160,7 @@ var LatencyEvaluator = class {
|
|
|
18160
18160
|
verdict: "fail",
|
|
18161
18161
|
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
18162
18162
|
expectedAspectCount: 1,
|
|
18163
|
-
|
|
18163
|
+
graderRawRequest: {
|
|
18164
18164
|
type: "latency",
|
|
18165
18165
|
threshold,
|
|
18166
18166
|
durationMs: null
|
|
@@ -18176,7 +18176,7 @@ var LatencyEvaluator = class {
|
|
|
18176
18176
|
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
18177
18177
|
],
|
|
18178
18178
|
expectedAspectCount: 1,
|
|
18179
|
-
|
|
18179
|
+
graderRawRequest: {
|
|
18180
18180
|
type: "latency",
|
|
18181
18181
|
threshold,
|
|
18182
18182
|
durationMs
|
|
@@ -18185,9 +18185,9 @@ var LatencyEvaluator = class {
|
|
|
18185
18185
|
}
|
|
18186
18186
|
};
|
|
18187
18187
|
|
|
18188
|
-
// src/evaluation/
|
|
18188
|
+
// src/evaluation/graders/skill-trigger.ts
|
|
18189
18189
|
init_cjs_shims();
|
|
18190
|
-
var
|
|
18190
|
+
var SkillTriggerGrader = class {
|
|
18191
18191
|
kind = "skill-trigger";
|
|
18192
18192
|
config;
|
|
18193
18193
|
constructor(config) {
|
|
@@ -18254,7 +18254,7 @@ var SkillTriggerEvaluator = class {
|
|
|
18254
18254
|
}
|
|
18255
18255
|
};
|
|
18256
18256
|
|
|
18257
|
-
// src/evaluation/
|
|
18257
|
+
// src/evaluation/graders/llm-grader-prompt.ts
|
|
18258
18258
|
init_cjs_shims();
|
|
18259
18259
|
function assembleLlmGraderPrompt(input) {
|
|
18260
18260
|
const {
|
|
@@ -18263,7 +18263,7 @@ function assembleLlmGraderPrompt(input) {
|
|
|
18263
18263
|
promptInputs,
|
|
18264
18264
|
evaluatorConfig,
|
|
18265
18265
|
fileChanges,
|
|
18266
|
-
|
|
18266
|
+
graderTemplateOverride
|
|
18267
18267
|
} = input;
|
|
18268
18268
|
const rubrics = evaluatorConfig?.rubrics;
|
|
18269
18269
|
if (rubrics && rubrics.length > 0) {
|
|
@@ -18273,15 +18273,9 @@ function assembleLlmGraderPrompt(input) {
|
|
|
18273
18273
|
}
|
|
18274
18274
|
return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
|
|
18275
18275
|
}
|
|
18276
|
-
return assembleFreeform(
|
|
18277
|
-
evalCase,
|
|
18278
|
-
candidate,
|
|
18279
|
-
promptInputs,
|
|
18280
|
-
fileChanges,
|
|
18281
|
-
evaluatorTemplateOverride
|
|
18282
|
-
);
|
|
18276
|
+
return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
|
|
18283
18277
|
}
|
|
18284
|
-
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges,
|
|
18278
|
+
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
|
|
18285
18279
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
18286
18280
|
const variables = {
|
|
18287
18281
|
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
@@ -18295,9 +18289,9 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
18295
18289
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
18296
18290
|
};
|
|
18297
18291
|
const systemPrompt = buildOutputSchema();
|
|
18298
|
-
const template =
|
|
18292
|
+
const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
|
|
18299
18293
|
let userPrompt = substituteVariables(template, variables);
|
|
18300
|
-
if (fileChanges && !
|
|
18294
|
+
if (fileChanges && !graderTemplateOverride) {
|
|
18301
18295
|
userPrompt += `
|
|
18302
18296
|
|
|
18303
18297
|
[[ ## file_changes ## ]]
|
|
@@ -18313,7 +18307,7 @@ ${fileChanges}`;
|
|
|
18313
18307
|
function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
18314
18308
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
18315
18309
|
const parts = [
|
|
18316
|
-
"You are an expert
|
|
18310
|
+
"You are an expert grader. Evaluate the candidate answer against each rubric item below.",
|
|
18317
18311
|
"",
|
|
18318
18312
|
"[[ ## question ## ]]",
|
|
18319
18313
|
formattedQuestion,
|
|
@@ -18348,7 +18342,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
|
|
|
18348
18342
|
function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
|
|
18349
18343
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
18350
18344
|
const parts = [
|
|
18351
|
-
"You are an expert
|
|
18345
|
+
"You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
|
|
18352
18346
|
"For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
|
|
18353
18347
|
"",
|
|
18354
18348
|
"[[ ## question ## ]]",
|
|
@@ -18396,9 +18390,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
|
|
|
18396
18390
|
};
|
|
18397
18391
|
}
|
|
18398
18392
|
|
|
18399
|
-
// src/evaluation/
|
|
18393
|
+
// src/evaluation/graders/token-usage.ts
|
|
18400
18394
|
init_cjs_shims();
|
|
18401
|
-
var
|
|
18395
|
+
var TokenUsageGrader = class {
|
|
18402
18396
|
kind = "token-usage";
|
|
18403
18397
|
config;
|
|
18404
18398
|
constructor(options) {
|
|
@@ -18419,7 +18413,7 @@ var TokenUsageEvaluator = class {
|
|
|
18419
18413
|
verdict: "fail",
|
|
18420
18414
|
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
18421
18415
|
expectedAspectCount,
|
|
18422
|
-
|
|
18416
|
+
graderRawRequest: {
|
|
18423
18417
|
type: "token-usage",
|
|
18424
18418
|
max_total: maxTotal ?? null,
|
|
18425
18419
|
max_input: maxInput ?? null,
|
|
@@ -18460,7 +18454,7 @@ var TokenUsageEvaluator = class {
|
|
|
18460
18454
|
verdict: passed ? "pass" : "fail",
|
|
18461
18455
|
assertions,
|
|
18462
18456
|
expectedAspectCount,
|
|
18463
|
-
|
|
18457
|
+
graderRawRequest: {
|
|
18464
18458
|
type: "token-usage",
|
|
18465
18459
|
max_total: maxTotal ?? null,
|
|
18466
18460
|
max_input: maxInput ?? null,
|
|
@@ -18476,7 +18470,7 @@ var TokenUsageEvaluator = class {
|
|
|
18476
18470
|
}
|
|
18477
18471
|
};
|
|
18478
18472
|
|
|
18479
|
-
// src/evaluation/
|
|
18473
|
+
// src/evaluation/graders/tool-trajectory.ts
|
|
18480
18474
|
init_cjs_shims();
|
|
18481
18475
|
function getNestedValue(obj, path56) {
|
|
18482
18476
|
const parts = path56.split(".");
|
|
@@ -18546,7 +18540,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
|
|
|
18546
18540
|
message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
|
|
18547
18541
|
};
|
|
18548
18542
|
}
|
|
18549
|
-
var
|
|
18543
|
+
var ToolTrajectoryGrader = class {
|
|
18550
18544
|
kind = "tool-trajectory";
|
|
18551
18545
|
config;
|
|
18552
18546
|
constructor(options) {
|
|
@@ -18951,7 +18945,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
18951
18945
|
}
|
|
18952
18946
|
};
|
|
18953
18947
|
|
|
18954
|
-
// src/evaluation/
|
|
18948
|
+
// src/evaluation/graders/assertions.ts
|
|
18955
18949
|
init_cjs_shims();
|
|
18956
18950
|
function runContainsAssertion(output, value) {
|
|
18957
18951
|
const passed = output.includes(value);
|
|
@@ -19255,16 +19249,16 @@ function validateConcurrency(concurrency) {
|
|
|
19255
19249
|
// src/evaluation/registry/index.ts
|
|
19256
19250
|
init_cjs_shims();
|
|
19257
19251
|
|
|
19258
|
-
// src/evaluation/registry/
|
|
19252
|
+
// src/evaluation/registry/grader-registry.ts
|
|
19259
19253
|
init_cjs_shims();
|
|
19260
|
-
var
|
|
19254
|
+
var GraderRegistry = class {
|
|
19261
19255
|
factories = /* @__PURE__ */ new Map();
|
|
19262
|
-
/** Register a factory function for an
|
|
19256
|
+
/** Register a factory function for an grader type. */
|
|
19263
19257
|
register(type, factory) {
|
|
19264
19258
|
this.factories.set(type, factory);
|
|
19265
19259
|
return this;
|
|
19266
19260
|
}
|
|
19267
|
-
/** Get the factory function for an
|
|
19261
|
+
/** Get the factory function for an grader type. */
|
|
19268
19262
|
get(type) {
|
|
19269
19263
|
return this.factories.get(type);
|
|
19270
19264
|
}
|
|
@@ -19272,25 +19266,25 @@ var EvaluatorRegistry = class {
|
|
|
19272
19266
|
has(type) {
|
|
19273
19267
|
return this.factories.has(type);
|
|
19274
19268
|
}
|
|
19275
|
-
/** List all registered
|
|
19269
|
+
/** List all registered grader type names. */
|
|
19276
19270
|
list() {
|
|
19277
19271
|
return [...this.factories.keys()];
|
|
19278
19272
|
}
|
|
19279
19273
|
/**
|
|
19280
19274
|
* Create an evaluator instance from a config, using the registered factory.
|
|
19281
|
-
* Throws if no factory is registered for the
|
|
19275
|
+
* Throws if no factory is registered for the grader type.
|
|
19282
19276
|
*/
|
|
19283
19277
|
async create(config, context2) {
|
|
19284
19278
|
const factory = this.factories.get(config.type);
|
|
19285
19279
|
if (!factory) {
|
|
19286
19280
|
throw new Error(
|
|
19287
|
-
`Unknown
|
|
19281
|
+
`Unknown grader type: "${config.type}". Registered types: ${this.list().join(", ")}`
|
|
19288
19282
|
);
|
|
19289
19283
|
}
|
|
19290
19284
|
return factory(config, context2);
|
|
19291
19285
|
}
|
|
19292
19286
|
};
|
|
19293
|
-
var
|
|
19287
|
+
var DeterministicAssertionGrader = class {
|
|
19294
19288
|
constructor(kind, assertFn) {
|
|
19295
19289
|
this.assertFn = assertFn;
|
|
19296
19290
|
this.kind = kind;
|
|
@@ -19301,12 +19295,12 @@ var DeterministicAssertionEvaluator = class {
|
|
|
19301
19295
|
}
|
|
19302
19296
|
};
|
|
19303
19297
|
|
|
19304
|
-
// src/evaluation/registry/builtin-
|
|
19298
|
+
// src/evaluation/registry/builtin-graders.ts
|
|
19305
19299
|
init_cjs_shims();
|
|
19306
19300
|
|
|
19307
|
-
// src/evaluation/
|
|
19301
|
+
// src/evaluation/graders/inline-assert.ts
|
|
19308
19302
|
init_cjs_shims();
|
|
19309
|
-
var
|
|
19303
|
+
var InlineAssertGrader = class {
|
|
19310
19304
|
constructor(fn, name) {
|
|
19311
19305
|
this.fn = fn;
|
|
19312
19306
|
this.name = name;
|
|
@@ -19331,7 +19325,7 @@ var InlineAssertEvaluator = class {
|
|
|
19331
19325
|
}
|
|
19332
19326
|
};
|
|
19333
19327
|
|
|
19334
|
-
// src/evaluation/
|
|
19328
|
+
// src/evaluation/graders/prompt-resolution.ts
|
|
19335
19329
|
init_cjs_shims();
|
|
19336
19330
|
var import_node_path42 = __toESM(require("path"), 1);
|
|
19337
19331
|
async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
|
|
@@ -19399,7 +19393,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
19399
19393
|
}
|
|
19400
19394
|
}
|
|
19401
19395
|
|
|
19402
|
-
// src/evaluation/registry/builtin-
|
|
19396
|
+
// src/evaluation/registry/builtin-graders.ts
|
|
19403
19397
|
var INLINE_ASSERT_FN = Symbol.for("agentv.inline-assert-fn");
|
|
19404
19398
|
var llmGraderFactory = (config, context2) => {
|
|
19405
19399
|
const c = config;
|
|
@@ -19416,7 +19410,7 @@ var llmGraderFactory = (config, context2) => {
|
|
|
19416
19410
|
);
|
|
19417
19411
|
}
|
|
19418
19412
|
const isAgent = isAgentProvider(graderTargetProvider) || graderTargetProvider.kind === "agentv";
|
|
19419
|
-
evaluator = new
|
|
19413
|
+
evaluator = new LlmGrader({
|
|
19420
19414
|
resolveGraderProvider: async (evalContext) => {
|
|
19421
19415
|
if (graderTargetProvider) return graderTargetProvider;
|
|
19422
19416
|
if (evalContext.graderProvider) return evalContext.graderProvider;
|
|
@@ -19444,11 +19438,11 @@ var llmGraderFactory = (config, context2) => {
|
|
|
19444
19438
|
agentTimeoutMs
|
|
19445
19439
|
);
|
|
19446
19440
|
const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
|
|
19447
|
-
let
|
|
19441
|
+
let graderTemplateOverride;
|
|
19448
19442
|
let evalCase = evalContext.evalCase;
|
|
19449
19443
|
if (customPrompt) {
|
|
19450
19444
|
if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
|
|
19451
|
-
|
|
19445
|
+
graderTemplateOverride = customPrompt;
|
|
19452
19446
|
} else {
|
|
19453
19447
|
evalCase = { ...evalCase, criteria: customPrompt };
|
|
19454
19448
|
}
|
|
@@ -19456,7 +19450,7 @@ var llmGraderFactory = (config, context2) => {
|
|
|
19456
19450
|
return evaluator.evaluate({
|
|
19457
19451
|
...evalContext,
|
|
19458
19452
|
evalCase,
|
|
19459
|
-
|
|
19453
|
+
graderTemplateOverride,
|
|
19460
19454
|
evaluator: c
|
|
19461
19455
|
});
|
|
19462
19456
|
}
|
|
@@ -19464,7 +19458,7 @@ var llmGraderFactory = (config, context2) => {
|
|
|
19464
19458
|
};
|
|
19465
19459
|
var codeFactory = (config, context2) => {
|
|
19466
19460
|
const c = config;
|
|
19467
|
-
return new
|
|
19461
|
+
return new CodeGrader({
|
|
19468
19462
|
command: c.command ?? c.script ?? [],
|
|
19469
19463
|
cwd: c.resolvedCwd ?? c.cwd,
|
|
19470
19464
|
agentTimeoutMs: context2.agentTimeoutMs,
|
|
@@ -19475,19 +19469,19 @@ var codeFactory = (config, context2) => {
|
|
|
19475
19469
|
var compositeFactory = (config, context2) => {
|
|
19476
19470
|
const c = config;
|
|
19477
19471
|
const evalFileDir = context2.evalFileDir ?? process.cwd();
|
|
19478
|
-
return new
|
|
19472
|
+
return new CompositeGrader({
|
|
19479
19473
|
config: c,
|
|
19480
19474
|
cwd: evalFileDir,
|
|
19481
19475
|
evaluatorFactory: {
|
|
19482
19476
|
create: (memberConfig) => {
|
|
19483
19477
|
const factory = context2.registry.get(memberConfig.type);
|
|
19484
19478
|
if (!factory) {
|
|
19485
|
-
throw new Error(`Unsupported
|
|
19479
|
+
throw new Error(`Unsupported grader type in composite: ${memberConfig.type}`);
|
|
19486
19480
|
}
|
|
19487
19481
|
const result = factory(memberConfig, context2);
|
|
19488
19482
|
if (result instanceof Promise) {
|
|
19489
19483
|
throw new Error(
|
|
19490
|
-
`
|
|
19484
|
+
`Grader factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
|
|
19491
19485
|
);
|
|
19492
19486
|
}
|
|
19493
19487
|
return result;
|
|
@@ -19496,35 +19490,35 @@ var compositeFactory = (config, context2) => {
|
|
|
19496
19490
|
});
|
|
19497
19491
|
};
|
|
19498
19492
|
var toolTrajectoryFactory = (config) => {
|
|
19499
|
-
return new
|
|
19493
|
+
return new ToolTrajectoryGrader({
|
|
19500
19494
|
config
|
|
19501
19495
|
});
|
|
19502
19496
|
};
|
|
19503
19497
|
var fieldAccuracyFactory = (config) => {
|
|
19504
|
-
return new
|
|
19498
|
+
return new FieldAccuracyGrader({
|
|
19505
19499
|
config
|
|
19506
19500
|
});
|
|
19507
19501
|
};
|
|
19508
19502
|
var latencyFactory = (config) => {
|
|
19509
|
-
return new
|
|
19503
|
+
return new LatencyGrader({ config });
|
|
19510
19504
|
};
|
|
19511
19505
|
var costFactory = (config) => {
|
|
19512
|
-
return new
|
|
19506
|
+
return new CostGrader({ config });
|
|
19513
19507
|
};
|
|
19514
19508
|
var tokenUsageFactory = (config) => {
|
|
19515
|
-
return new
|
|
19509
|
+
return new TokenUsageGrader({ config });
|
|
19516
19510
|
};
|
|
19517
19511
|
var executionMetricsFactory = (config) => {
|
|
19518
|
-
return new
|
|
19512
|
+
return new ExecutionMetricsGrader({
|
|
19519
19513
|
config
|
|
19520
19514
|
});
|
|
19521
19515
|
};
|
|
19522
19516
|
var skillTriggerFactory = (config) => {
|
|
19523
|
-
return new
|
|
19517
|
+
return new SkillTriggerGrader(config);
|
|
19524
19518
|
};
|
|
19525
19519
|
var containsFactory = (config) => {
|
|
19526
19520
|
const c = config;
|
|
19527
|
-
return new
|
|
19521
|
+
return new DeterministicAssertionGrader("contains", (ctx) => {
|
|
19528
19522
|
const result = runContainsAssertion(ctx.candidate, c.value);
|
|
19529
19523
|
return {
|
|
19530
19524
|
score: result.score,
|
|
@@ -19536,7 +19530,7 @@ var containsFactory = (config) => {
|
|
|
19536
19530
|
};
|
|
19537
19531
|
var regexFactory = (config) => {
|
|
19538
19532
|
const c = config;
|
|
19539
|
-
return new
|
|
19533
|
+
return new DeterministicAssertionGrader("regex", (ctx) => {
|
|
19540
19534
|
const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
|
|
19541
19535
|
return {
|
|
19542
19536
|
score: result.score,
|
|
@@ -19547,7 +19541,7 @@ var regexFactory = (config) => {
|
|
|
19547
19541
|
});
|
|
19548
19542
|
};
|
|
19549
19543
|
var isJsonFactory = () => {
|
|
19550
|
-
return new
|
|
19544
|
+
return new DeterministicAssertionGrader("is-json", (ctx) => {
|
|
19551
19545
|
const result = runIsJsonAssertion(ctx.candidate);
|
|
19552
19546
|
return {
|
|
19553
19547
|
score: result.score,
|
|
@@ -19559,7 +19553,7 @@ var isJsonFactory = () => {
|
|
|
19559
19553
|
};
|
|
19560
19554
|
var equalsFactory = (config) => {
|
|
19561
19555
|
const c = config;
|
|
19562
|
-
return new
|
|
19556
|
+
return new DeterministicAssertionGrader("equals", (ctx) => {
|
|
19563
19557
|
const result = runEqualsAssertion(ctx.candidate, c.value);
|
|
19564
19558
|
return {
|
|
19565
19559
|
score: result.score,
|
|
@@ -19571,7 +19565,7 @@ var equalsFactory = (config) => {
|
|
|
19571
19565
|
};
|
|
19572
19566
|
var containsAnyFactory = (config) => {
|
|
19573
19567
|
const c = config;
|
|
19574
|
-
return new
|
|
19568
|
+
return new DeterministicAssertionGrader("contains-any", (ctx) => {
|
|
19575
19569
|
const result = runContainsAnyAssertion(ctx.candidate, c.value);
|
|
19576
19570
|
return {
|
|
19577
19571
|
score: result.score,
|
|
@@ -19583,7 +19577,7 @@ var containsAnyFactory = (config) => {
|
|
|
19583
19577
|
};
|
|
19584
19578
|
var containsAllFactory = (config) => {
|
|
19585
19579
|
const c = config;
|
|
19586
|
-
return new
|
|
19580
|
+
return new DeterministicAssertionGrader("contains-all", (ctx) => {
|
|
19587
19581
|
const result = runContainsAllAssertion(ctx.candidate, c.value);
|
|
19588
19582
|
return {
|
|
19589
19583
|
score: result.score,
|
|
@@ -19595,7 +19589,7 @@ var containsAllFactory = (config) => {
|
|
|
19595
19589
|
};
|
|
19596
19590
|
var icontainsFactory = (config) => {
|
|
19597
19591
|
const c = config;
|
|
19598
|
-
return new
|
|
19592
|
+
return new DeterministicAssertionGrader("icontains", (ctx) => {
|
|
19599
19593
|
const result = runIcontainsAssertion(ctx.candidate, c.value);
|
|
19600
19594
|
return {
|
|
19601
19595
|
score: result.score,
|
|
@@ -19607,7 +19601,7 @@ var icontainsFactory = (config) => {
|
|
|
19607
19601
|
};
|
|
19608
19602
|
var icontainsAnyFactory = (config) => {
|
|
19609
19603
|
const c = config;
|
|
19610
|
-
return new
|
|
19604
|
+
return new DeterministicAssertionGrader("icontains-any", (ctx) => {
|
|
19611
19605
|
const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
|
|
19612
19606
|
return {
|
|
19613
19607
|
score: result.score,
|
|
@@ -19619,7 +19613,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
19619
19613
|
};
|
|
19620
19614
|
var icontainsAllFactory = (config) => {
|
|
19621
19615
|
const c = config;
|
|
19622
|
-
return new
|
|
19616
|
+
return new DeterministicAssertionGrader("icontains-all", (ctx) => {
|
|
19623
19617
|
const result = runIcontainsAllAssertion(ctx.candidate, c.value);
|
|
19624
19618
|
return {
|
|
19625
19619
|
score: result.score,
|
|
@@ -19631,7 +19625,7 @@ var icontainsAllFactory = (config) => {
|
|
|
19631
19625
|
};
|
|
19632
19626
|
var startsWithFactory = (config) => {
|
|
19633
19627
|
const c = config;
|
|
19634
|
-
return new
|
|
19628
|
+
return new DeterministicAssertionGrader("starts-with", (ctx) => {
|
|
19635
19629
|
const result = runStartsWithAssertion(ctx.candidate, c.value);
|
|
19636
19630
|
return {
|
|
19637
19631
|
score: result.score,
|
|
@@ -19643,7 +19637,7 @@ var startsWithFactory = (config) => {
|
|
|
19643
19637
|
};
|
|
19644
19638
|
var endsWithFactory = (config) => {
|
|
19645
19639
|
const c = config;
|
|
19646
|
-
return new
|
|
19640
|
+
return new DeterministicAssertionGrader("ends-with", (ctx) => {
|
|
19647
19641
|
const result = runEndsWithAssertion(ctx.candidate, c.value);
|
|
19648
19642
|
return {
|
|
19649
19643
|
score: result.score,
|
|
@@ -19654,7 +19648,7 @@ var endsWithFactory = (config) => {
|
|
|
19654
19648
|
});
|
|
19655
19649
|
};
|
|
19656
19650
|
function createBuiltinRegistry() {
|
|
19657
|
-
const registry = new
|
|
19651
|
+
const registry = new GraderRegistry();
|
|
19658
19652
|
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
19659
19653
|
const fn = config[INLINE_ASSERT_FN];
|
|
19660
19654
|
if (!fn) {
|
|
@@ -19662,7 +19656,7 @@ function createBuiltinRegistry() {
|
|
|
19662
19656
|
`No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`
|
|
19663
19657
|
);
|
|
19664
19658
|
}
|
|
19665
|
-
return new
|
|
19659
|
+
return new InlineAssertGrader(fn, config.name ?? "inline-assert");
|
|
19666
19660
|
});
|
|
19667
19661
|
return registry;
|
|
19668
19662
|
}
|
|
@@ -19700,7 +19694,7 @@ async function discoverAssertions(registry, baseDir) {
|
|
|
19700
19694
|
continue;
|
|
19701
19695
|
}
|
|
19702
19696
|
const factory = (_config, context2) => {
|
|
19703
|
-
return new
|
|
19697
|
+
return new CodeGrader({
|
|
19704
19698
|
command: ["bun", "run", filePath],
|
|
19705
19699
|
agentTimeoutMs: context2.agentTimeoutMs
|
|
19706
19700
|
});
|
|
@@ -19745,7 +19739,7 @@ async function discoverGraders(registry, baseDir) {
|
|
|
19745
19739
|
continue;
|
|
19746
19740
|
}
|
|
19747
19741
|
const factory = (_config, context2) => {
|
|
19748
|
-
return new
|
|
19742
|
+
return new CodeGrader({
|
|
19749
19743
|
command: ["bun", "run", filePath],
|
|
19750
19744
|
agentTimeoutMs: context2.agentTimeoutMs
|
|
19751
19745
|
});
|
|
@@ -20594,10 +20588,10 @@ function buildSkippedEvaluatorError(scores) {
|
|
|
20594
20588
|
}
|
|
20595
20589
|
const messages = skippedScores.map((score) => {
|
|
20596
20590
|
const label = score.name || score.type;
|
|
20597
|
-
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "
|
|
20591
|
+
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Grader skipped";
|
|
20598
20592
|
return `${label}: ${assertionMessage}`;
|
|
20599
20593
|
});
|
|
20600
|
-
return messages.length === 1 ? messages[0] : `
|
|
20594
|
+
return messages.length === 1 ? messages[0] : `Graders skipped: ${messages.join(" | ")}`;
|
|
20601
20595
|
}
|
|
20602
20596
|
function usesFileReferencePrompt(provider) {
|
|
20603
20597
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
@@ -20766,7 +20760,7 @@ async function runEvaluation(options) {
|
|
|
20766
20760
|
cleanupWorkspaces,
|
|
20767
20761
|
trials,
|
|
20768
20762
|
streamCallbacks,
|
|
20769
|
-
|
|
20763
|
+
budgetUsd,
|
|
20770
20764
|
failOnError,
|
|
20771
20765
|
poolWorkspaces,
|
|
20772
20766
|
poolMaxSlots: configPoolMaxSlots,
|
|
@@ -21295,7 +21289,7 @@ async function runEvaluation(options) {
|
|
|
21295
21289
|
async function dispatchTest(evalCase, depResults) {
|
|
21296
21290
|
const workerId = nextWorkerId++;
|
|
21297
21291
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
21298
|
-
if (
|
|
21292
|
+
if (budgetUsd !== void 0 && budgetExhausted) {
|
|
21299
21293
|
const budgetResult = {
|
|
21300
21294
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
21301
21295
|
testId: evalCase.id,
|
|
@@ -21305,13 +21299,13 @@ async function runEvaluation(options) {
|
|
|
21305
21299
|
assertions: [],
|
|
21306
21300
|
output: [],
|
|
21307
21301
|
target: target.name,
|
|
21308
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${
|
|
21302
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
|
|
21309
21303
|
budgetExceeded: true,
|
|
21310
21304
|
executionStatus: "execution_error",
|
|
21311
21305
|
failureStage: "setup",
|
|
21312
21306
|
failureReasonCode: "budget_exceeded",
|
|
21313
21307
|
executionError: {
|
|
21314
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${
|
|
21308
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
|
|
21315
21309
|
stage: "setup"
|
|
21316
21310
|
}
|
|
21317
21311
|
};
|
|
@@ -21408,7 +21402,7 @@ async function runEvaluation(options) {
|
|
|
21408
21402
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
21409
21403
|
};
|
|
21410
21404
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
21411
|
-
if (
|
|
21405
|
+
if (budgetUsd !== void 0) {
|
|
21412
21406
|
let caseCost;
|
|
21413
21407
|
if (result.trials && result.trials.length > 0) {
|
|
21414
21408
|
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
@@ -21420,7 +21414,7 @@ async function runEvaluation(options) {
|
|
|
21420
21414
|
}
|
|
21421
21415
|
if (caseCost !== void 0) {
|
|
21422
21416
|
cumulativeBudgetCost += caseCost;
|
|
21423
|
-
if (cumulativeBudgetCost >=
|
|
21417
|
+
if (cumulativeBudgetCost >= budgetUsd) {
|
|
21424
21418
|
budgetExhausted = true;
|
|
21425
21419
|
}
|
|
21426
21420
|
}
|
|
@@ -22562,7 +22556,7 @@ async function evaluateCandidate(options) {
|
|
|
22562
22556
|
};
|
|
22563
22557
|
}
|
|
22564
22558
|
}
|
|
22565
|
-
const evaluatorRequest = scores ? void 0 : score.
|
|
22559
|
+
const evaluatorRequest = scores ? void 0 : score.graderRawRequest;
|
|
22566
22560
|
const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
|
|
22567
22561
|
const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
|
|
22568
22562
|
...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
|
|
@@ -22778,7 +22772,7 @@ async function runEvaluatorList(options) {
|
|
|
22778
22772
|
weight,
|
|
22779
22773
|
verdict: score2.verdict,
|
|
22780
22774
|
assertions: score2.assertions,
|
|
22781
|
-
input: score2.
|
|
22775
|
+
input: score2.graderRawRequest,
|
|
22782
22776
|
target: score2.graderTarget,
|
|
22783
22777
|
details: score2.details,
|
|
22784
22778
|
scores: mapChildResults(score2.scores),
|
|
@@ -22794,7 +22788,7 @@ async function runEvaluatorList(options) {
|
|
|
22794
22788
|
score: 0,
|
|
22795
22789
|
verdict: "fail",
|
|
22796
22790
|
assertions: [
|
|
22797
|
-
{ text: `
|
|
22791
|
+
{ text: `Grader '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
22798
22792
|
],
|
|
22799
22793
|
expectedAspectCount: 1
|
|
22800
22794
|
};
|
|
@@ -22815,7 +22809,7 @@ async function runEvaluatorList(options) {
|
|
|
22815
22809
|
verdict: "fail",
|
|
22816
22810
|
assertions: [
|
|
22817
22811
|
{
|
|
22818
|
-
text: `
|
|
22812
|
+
text: `Grader '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
22819
22813
|
passed: false
|
|
22820
22814
|
}
|
|
22821
22815
|
],
|
|
@@ -22872,7 +22866,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
22872
22866
|
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
22873
22867
|
}
|
|
22874
22868
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
22875
|
-
const llmGrader = overrides?.["llm-grader"] ?? new
|
|
22869
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGrader({
|
|
22876
22870
|
resolveGraderProvider: async (context2) => {
|
|
22877
22871
|
if (context2.graderProvider) {
|
|
22878
22872
|
return context2.graderProvider;
|
|
@@ -23363,7 +23357,7 @@ function mapChildResults(children) {
|
|
|
23363
23357
|
weight: child.weight,
|
|
23364
23358
|
verdict: child.verdict,
|
|
23365
23359
|
assertions: child.assertions,
|
|
23366
|
-
input: child.
|
|
23360
|
+
input: child.graderRawRequest,
|
|
23367
23361
|
scores: mapChildResults(child.scores),
|
|
23368
23362
|
details: child.details,
|
|
23369
23363
|
tokenUsage: child.tokenUsage
|
|
@@ -25496,22 +25490,21 @@ function createAgentKernel() {
|
|
|
25496
25490
|
// Annotate the CommonJS export names for ESM import in node:
|
|
25497
25491
|
0 && (module.exports = {
|
|
25498
25492
|
COMMON_TARGET_SETTINGS,
|
|
25499
|
-
|
|
25500
|
-
|
|
25501
|
-
|
|
25493
|
+
CodeGrader,
|
|
25494
|
+
CompositeGrader,
|
|
25495
|
+
CostGrader,
|
|
25502
25496
|
DEFAULT_CATEGORY,
|
|
25503
|
-
DEFAULT_EVALUATOR_TEMPLATE,
|
|
25504
25497
|
DEFAULT_EVAL_PATTERNS,
|
|
25505
25498
|
DEFAULT_EXPLORATION_TOOLS,
|
|
25499
|
+
DEFAULT_GRADER_TEMPLATE,
|
|
25506
25500
|
DEFAULT_THRESHOLD,
|
|
25507
|
-
|
|
25501
|
+
DeterministicAssertionGrader,
|
|
25508
25502
|
DockerWorkspaceProvider,
|
|
25509
|
-
|
|
25510
|
-
|
|
25511
|
-
|
|
25512
|
-
|
|
25513
|
-
|
|
25514
|
-
LlmJudgeEvaluator,
|
|
25503
|
+
ExecutionMetricsGrader,
|
|
25504
|
+
FieldAccuracyGrader,
|
|
25505
|
+
GraderRegistry,
|
|
25506
|
+
LatencyGrader,
|
|
25507
|
+
LlmGrader,
|
|
25515
25508
|
OTEL_BACKEND_PRESETS,
|
|
25516
25509
|
OtelStreamingObserver,
|
|
25517
25510
|
OtelTraceExporter,
|
|
@@ -25520,18 +25513,17 @@ function createAgentKernel() {
|
|
|
25520
25513
|
ProviderRegistry,
|
|
25521
25514
|
RepoManager,
|
|
25522
25515
|
ResponseCache,
|
|
25523
|
-
|
|
25516
|
+
SkillTriggerGrader,
|
|
25524
25517
|
TEST_MESSAGE_ROLES,
|
|
25525
25518
|
TemplateNotDirectoryError,
|
|
25526
25519
|
TemplateNotFoundError,
|
|
25527
|
-
|
|
25528
|
-
|
|
25520
|
+
TokenUsageGrader,
|
|
25521
|
+
ToolTrajectoryGrader,
|
|
25529
25522
|
TranscriptProvider,
|
|
25530
25523
|
WorkspaceCreationError,
|
|
25531
25524
|
WorkspacePoolManager,
|
|
25532
25525
|
addBenchmark,
|
|
25533
25526
|
assembleLlmGraderPrompt,
|
|
25534
|
-
assembleLlmJudgePrompt,
|
|
25535
25527
|
avgToolDurationMs,
|
|
25536
25528
|
buildDirectoryChain,
|
|
25537
25529
|
buildOutputSchema,
|
|
@@ -25571,7 +25563,6 @@ function createAgentKernel() {
|
|
|
25571
25563
|
discoverCodexSessions,
|
|
25572
25564
|
discoverCopilotSessions,
|
|
25573
25565
|
discoverGraders,
|
|
25574
|
-
discoverJudges,
|
|
25575
25566
|
discoverProviders,
|
|
25576
25567
|
ensureResultsRepoClone,
|
|
25577
25568
|
ensureVSCodeSubagents,
|
|
@@ -25613,7 +25604,7 @@ function createAgentKernel() {
|
|
|
25613
25604
|
isAgentSkillsFormat,
|
|
25614
25605
|
isContent,
|
|
25615
25606
|
isContentArray,
|
|
25616
|
-
|
|
25607
|
+
isGraderKind,
|
|
25617
25608
|
isJsonObject,
|
|
25618
25609
|
isJsonValue,
|
|
25619
25610
|
isNonEmptyString,
|