@agentv/core 4.17.1-next.1 → 4.18.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -13,7 +13,7 @@ import {
  isAgentProvider,
  isContent,
  isContentArray,
- isEvaluatorKind,
+ isGraderKind,
  isJsonObject,
  isJsonValue,
  isTestMessage,
@@ -25,7 +25,7 @@ import {
  resolveDelegatedTargetDefinition,
  resolveFileReference,
  resolveTargetDefinition
- } from "./chunk-6VZY3B6M.js";
+ } from "./chunk-PYDBJOAO.js";
  import {
  execFileWithStdin,
  execShellWithStdin
@@ -647,22 +647,25 @@ function extractCacheConfig(suite) {
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
  return { enabled: cache, cachePath: resolvedCachePath };
  }
- function extractTotalBudgetUsd(suite) {
+ function extractBudgetUsd(suite) {
  const execution = suite.execution;
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
  return void 0;
  }
  const executionObj = execution;
- const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
+ if ("total_budget_usd" in executionObj || "totalBudgetUsd" in executionObj) {
+ throw new Error(
+ "execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."
+ );
+ }
+ const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
  if (rawBudget === void 0 || rawBudget === null) {
  return void 0;
  }
  if (typeof rawBudget === "number" && rawBudget > 0) {
  return rawBudget;
  }
- logWarning(
- `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
- );
+ logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
  return void 0;
  }
  function extractFailOnError(suite) {
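Note: this hunk is a breaking configuration change, not just a rename — `execution.total_budget_usd` (and its camelCase twin) now throws instead of being read. A minimal sketch of the resulting behavior, using hypothetical suite objects shaped the way the YAML loader hands them to this function:

    extractBudgetUsd({ execution: { budget_usd: 5 } });        // 5
    extractBudgetUsd({ execution: { total_budget_usd: 5 } });  // throws: renamed to execution.budget_usd
    extractBudgetUsd({ execution: { budget_usd: -1 } });       // warns "Invalid execution.budget_usd: -1 ..." and returns undefined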
@@ -812,7 +815,7 @@ function logWarning(message) {
  console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET2}`);
  }

- // src/evaluation/loaders/evaluator-parser.ts
+ // src/evaluation/loaders/grader-parser.ts
  import { readFile as readFile5 } from "node:fs/promises";
  import path5 from "node:path";
  import { parse as parse2 } from "yaml";
@@ -1051,38 +1054,38 @@ function validateTemplateVariables(content, source) {
  );
  }
  if (invalidVariables.length > 0) {
- const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
+ const warningMessage = `${ANSI_YELLOW2}Warning: Custom grader template at ${source}
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
  Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET3}`;
  console.warn(warningMessage);
  }
  }

- // src/evaluation/loaders/evaluator-parser.ts
+ // src/evaluation/loaders/grader-parser.ts
  var ANSI_YELLOW3 = "\x1B[33m";
  var ANSI_RESET4 = "\x1B[0m";
  var MAX_ASSERTION_INCLUDE_DEPTH = 3;
  var PROMPT_FILE_PREFIX = "file://";
- function normalizeEvaluatorType(type) {
+ function normalizeGraderType(type) {
  return type.replace(/_/g, "-");
  }
  function isDeprecatedJudgeType(type) {
  return type === "code-judge" || type === "llm-judge";
  }
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
+ async function parseGraders(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
  const execution = rawEvalCase.execution;
  const executionObject = isJsonObject2(execution) ? execution : void 0;
  const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? // deprecated: use assertions
  rawEvalCase.evaluators;
  const skipDefaults = executionObject?.skip_defaults === true;
  const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
- const parsedCase = await parseEvaluatorList(
+ const parsedCase = await parseGraderList(
  caseEvaluators,
  searchRoots,
  evalId,
  defaultPreprocessors
  );
- const parsedRoot = await parseEvaluatorList(
+ const parsedRoot = await parseGraderList(
  rootEvaluators,
  searchRoots,
  evalId,
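Note: the two helpers renamed above are pure string utilities — underscores normalize to hyphens, and the old judge aliases are detected (and skipped downstream) rather than silently mapped. For illustration:

    normalizeGraderType("llm_grader");   // "llm-grader"
    normalizeGraderType("code_judge");   // "code-judge"
    isDeprecatedJudgeType("code-judge"); // true — parseGraderList logs a warning and skips the entry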
@@ -1161,12 +1164,12 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
  templateDir,
  ...searchRoots.filter((root) => path5.resolve(root) !== templateDir)
  ];
- return await expandEvaluatorEntries(assertions, nestedSearchRoots, evalId, {
+ return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
  depth: nextDepth,
  chain: [...includeContext.chain, resolved.resolvedPath]
  }) ?? [];
  }
- async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
+ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
  if (candidateEvaluators === void 0) {
  return void 0;
  }
@@ -1190,8 +1193,8 @@ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId,
  }
  return expanded;
  }
- async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
- const expandedEvaluators = await expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId);
+ async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
+ const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
  if (!expandedEvaluators) {
  return void 0;
  }
@@ -1237,14 +1240,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  }
  const rawName = asString(rawEvaluator.name);
  const rawType = rawEvaluator.type;
- const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
+ const typeValue = typeof rawType === "string" ? normalizeGraderType(rawType) : rawType;
  if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
  logWarning2(
  `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
  );
  continue;
  }
- const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
+ const isCustomType = typeof typeValue === "string" && !isGraderKind(typeValue);
  if (typeof typeValue !== "string") {
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
  continue;
@@ -1407,7 +1410,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  continue;
  }
  const aggregatorType = asString(rawAggregator.type);
- const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeGraderType(aggregatorType) : aggregatorType;
  if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
  logWarning2(
  `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
@@ -1420,7 +1423,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  );
  continue;
  }
- const expandedMembers = await expandEvaluatorEntries(
+ const expandedMembers = await expandGraderEntries(
  rawMembers,
  searchRoots,
  `${evalId}:${name}`
@@ -1436,11 +1439,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  }
  const memberName = asString(rawMember.name);
  const memberType = rawMember.type;
- if (!memberName || !isEvaluatorKind(memberType)) {
+ if (!memberName || !isGraderKind(memberType)) {
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
  continue;
  }
- const memberConfigs = await parseEvaluators(
+ const memberConfigs = await parseGraders(
  { evaluators: [rawMember] },
  void 0,
  searchRoots,
@@ -2181,7 +2184,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  `prompt.command for evaluator '${name}' in '${evalId}'`
  );
  if (!commandArray) {
- throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires command array`);
+ throw new Error(`Grader '${name}' in '${evalId}': prompt object requires command array`);
  }
  const commandPath = commandArray[commandArray.length - 1];
  const resolved = await resolveFileReference2(commandPath, searchRoots);
@@ -2189,7 +2192,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  resolvedPromptScript = [...commandArray.slice(0, -1), path5.resolve(resolved.resolvedPath)];
  } else {
  throw new Error(
- `Evaluator '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
+ `Grader '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
  );
  }
  if (isJsonObject2(rawPrompt.config)) {
@@ -2206,11 +2209,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
  await validateCustomPromptContent(promptPath);
  } catch (error) {
  const message = error instanceof Error ? error.message : String(error);
- throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
+ throw new Error(`Grader '${name}' template (${promptPath}): ${message}`);
  }
  } else {
  throw new Error(
- `Evaluator '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
+ `Grader '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
  );
  }
  } else {
@@ -2327,18 +2330,18 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
  return void 0;
  }
  if (!Array.isArray(rawValue)) {
- throw new Error(`Evaluator '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
+ throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
  }
  const preprocessors = [];
  for (const rawEntry of rawValue) {
  if (!isJsonObject2(rawEntry)) {
  throw new Error(
- `Evaluator '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
+ `Grader '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
  );
  }
  const type = asString(rawEntry.type)?.trim();
  if (!type) {
- throw new Error(`Evaluator '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
+ throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
  }
  const command = asStringArray(
  rawEntry.command,
@@ -2346,14 +2349,14 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
  );
  if (!command || command.length === 0) {
  throw new Error(
- `Evaluator '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
+ `Grader '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
  );
  }
  const commandPath = command[command.length - 1];
  const resolved = await resolveFileReference2(commandPath, searchRoots);
  if (!resolved.resolvedPath) {
  throw new Error(
- `Evaluator '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
+ `Grader '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
  );
  }
  preprocessors.push({
@@ -2404,13 +2407,13 @@ function coerceEvaluator(candidate, contextId) {
  if (typeof candidate !== "string") {
  return void 0;
  }
- const normalized = normalizeEvaluatorType(candidate);
+ const normalized = normalizeGraderType(candidate);
  if (isDeprecatedJudgeType(normalized)) {
  throw new Error(
  `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
  );
  }
- if (isEvaluatorKind(normalized)) {
+ if (isGraderKind(normalized)) {
  return normalized;
  }
  logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
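Note: coerceEvaluator (used for the suite-level `evaluator:` field) now routes through the renamed helpers, giving three distinct outcomes. A sketch, assuming "llm-grader" is a registered built-in kind:

    coerceEvaluator("llm_grader", "suite-1"); // "llm-grader" (normalized and accepted)
    coerceEvaluator("llm-judge", "suite-1");  // throws: Unsupported grader 'llm-judge' ... Use 'llm-grader' instead.
    coerceEvaluator("bogus", "suite-1");      // warns; the caller's `?? globalEvaluator` fallback then applies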
@@ -2482,7 +2485,7 @@ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalI
  }
  result.required = rawRequired;
  logWarning2(
- `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
+ `Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
  );
  }
  return result;
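Note: the reworded warning concerns assertion entries where `required` carried a number. In the JSON shape the parser receives (hypothetical values):

    { name: "quality", type: "llm-grader", required: 0.7 }                   // deprecated: the number does double duty
    { name: "quality", type: "llm-grader", required: true, min_score: 0.7 }  // preferred: intent split into two keys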
@@ -3302,7 +3305,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
  const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
  let evaluators;
  try {
- evaluators = await parseEvaluators(
+ evaluators = await parseGraders(
  testCaseConfig,
  mergedExecution,
  searchRoots,
@@ -3648,7 +3651,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
  targetRefs: extractTargetRefsFromSuite(parsed),
  workers: extractWorkersFromSuite(parsed),
  cacheConfig: extractCacheConfig(parsed),
- totalBudgetUsd: extractTotalBudgetUsd(parsed),
+ budgetUsd: extractBudgetUsd(parsed),
  ...metadata !== void 0 && { metadata },
  ...failOnError !== void 0 && { failOnError },
  ...threshold !== void 0 && { threshold },
@@ -3789,7 +3792,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
  const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
  let evaluators;
  try {
- evaluators = await parseEvaluators(
+ evaluators = await parseGraders(
  testCaseConfig,
  globalExecution,
  searchRoots,
@@ -11600,7 +11603,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
  return createProvider(resolved);
  }

- // src/evaluation/evaluators/scoring.ts
+ // src/evaluation/graders/scoring.ts
  var DEFAULT_THRESHOLD = 0.8;
  var PASS_THRESHOLD = DEFAULT_THRESHOLD;
  function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
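Note: scoring.ts moves under graders/ unchanged, with the 0.8 default pass threshold intact. The mapping below is inferred from that default and the pass/fail verdicts used throughout this file, not shown in the hunk itself:

    scoreToVerdict(0.85);     // "pass" (score >= 0.8)
    scoreToVerdict(0.5);      // "fail"
    scoreToVerdict(0.5, 0.4); // "pass" against an explicit threshold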
@@ -11688,7 +11691,7 @@ function negateScore(score) {
  };
  }

- // src/evaluation/evaluators/code-evaluator.ts
+ // src/evaluation/graders/code-grader.ts
  import { mkdtemp as mkdtemp2, rm as rm3, writeFile as writeFile6 } from "node:fs/promises";
  import { tmpdir as tmpdir2 } from "node:os";
  import { dirname, join } from "node:path";
@@ -11981,7 +11984,7 @@ function getRepoCheckoutTargets(repos) {
  }));
  }

- // src/evaluation/evaluators/code-evaluator.ts
+ // src/evaluation/graders/code-grader.ts
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
  var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
  async function materializeContentForGrader(messages, getWorkDir) {
@@ -12033,7 +12036,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
  }
  return result;
  }
- var CodeEvaluator = class {
+ var CodeGrader = class {
  kind = "code-grader";
  command;
  cwd;
@@ -12151,7 +12154,7 @@ var CodeEvaluator = class {
  })) : [];
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
  const proxyUsage = getProxyUsage?.();
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  command: this.command,
  ...this.cwd ? { cwd: this.cwd } : {},
  ...proxyUsage ? {
@@ -12166,7 +12169,7 @@ var CodeEvaluator = class {
  verdict: scoreToVerdict(score),
  assertions,
  expectedAspectCount: assertions.length || 1,
- evaluatorRawRequest,
+ graderRawRequest,
  ...details ? { details } : {},
  tokenUsage: proxyUsage?.tokenUsage
  };
@@ -12178,7 +12181,7 @@ var CodeEvaluator = class {
  verdict: "fail",
  assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  command: this.command,
  ...this.cwd ? { cwd: this.cwd } : {},
  ...proxyUsage ? {
@@ -12227,10 +12230,10 @@ function formatStderr(stderr) {
${tail}`;
  }

- // src/evaluation/evaluators/composite.ts
+ // src/evaluation/graders/composite.ts
  import { generateText as generateText3 } from "ai";

- // src/evaluation/evaluators/llm-grader.ts
+ // src/evaluation/graders/llm-grader.ts
  import fs2 from "node:fs/promises";
  import path37 from "node:path";
  import { generateText as generateText2, stepCountIs, tool } from "ai";
@@ -12270,7 +12273,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
  ".so",
  ".dylib"
  ]);
- var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
+ var DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is to grade the answer based on how well it achieves the criteria for the original task.

  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit.

@@ -12325,19 +12328,19 @@ function resolveContentBasePath(context) {
  }
  return void 0;
  }
- var LlmGraderEvaluator = class {
+ var LlmGrader = class {
  kind = "llm-grader";
  resolveGraderProvider;
  maxOutputTokens;
  temperature;
- evaluatorTemplate;
+ graderTemplate;
  maxSteps;
  graderTargetProvider;
  constructor(options) {
  this.resolveGraderProvider = options.resolveGraderProvider ?? options.resolveJudgeProvider;
  this.maxOutputTokens = options.maxOutputTokens;
  this.temperature = options.temperature;
- this.evaluatorTemplate = options.evaluatorTemplate;
+ this.graderTemplate = options.graderTemplate;
  this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT);
  this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider;
  }
@@ -12400,16 +12403,16 @@ var LlmGraderEvaluator = class {
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
  };
  const systemPrompt = buildOutputSchema();
- const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
- warnDeprecatedTemplateVars(evaluatorTemplate);
- let userPrompt = substituteVariables(evaluatorTemplate, variables);
- if (context.fileChanges && !context.evaluatorTemplateOverride && !this.evaluatorTemplate) {
+ const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
+ warnDeprecatedTemplateVars(graderTemplate);
+ let userPrompt = substituteVariables(graderTemplate, variables);
+ if (context.fileChanges && !context.graderTemplateOverride && !this.graderTemplate) {
  userPrompt += `

  [[ ## file_changes ## ]]
  ${context.fileChanges}`;
  }
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  userPrompt,
  systemPrompt
  };
@@ -12430,7 +12433,7 @@ ${context.fileChanges}`;
  verdict: scoreToVerdict(score),
  assertions,
  expectedAspectCount: Math.max(assertions.length, 1),
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName,
  details: data.details,
  tokenUsage
@@ -12444,7 +12447,7 @@ ${context.fileChanges}`;
  verdict: "skip",
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName
  };
  }
@@ -12461,7 +12464,7 @@ ${context.fileChanges}`;
  }
  const prompt = this.buildRubricPrompt(context, rubrics);
  const systemPrompt = buildRubricOutputSchema();
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  userPrompt: prompt,
  systemPrompt
  };
@@ -12481,7 +12484,7 @@ ${context.fileChanges}`;
  verdict,
  assertions,
  expectedAspectCount: rubrics.length,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName,
  tokenUsage
  };
@@ -12494,7 +12497,7 @@ ${context.fileChanges}`;
  verdict: "skip",
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
  expectedAspectCount: rubrics.length,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName
  };
  }
@@ -12506,7 +12509,7 @@ ${context.fileChanges}`;
  async evaluateWithScoreRanges(context, graderProvider, rubrics) {
  const prompt = this.buildScoreRangePrompt(context, rubrics);
  const systemPrompt = buildScoreRangeOutputSchema();
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  userPrompt: prompt,
  systemPrompt
  };
@@ -12526,7 +12529,7 @@ ${context.fileChanges}`;
  verdict,
  assertions,
  expectedAspectCount: rubrics.length,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName,
  details,
  tokenUsage
@@ -12540,7 +12543,7 @@ ${context.fileChanges}`;
  verdict: "skip",
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
  expectedAspectCount: rubrics.length,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName
  };
  }
@@ -12569,7 +12572,7 @@ ${context.fileChanges}`;
  const config = context.evaluator;
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
  const fsTools = createFilesystemTools(workspacePath);
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  mode: "built-in",
  systemPrompt,
  userPrompt,
@@ -12593,7 +12596,7 @@ ${context.fileChanges}`;
  return this.parseAgentResult(
  text,
  rubrics,
- evaluatorRawRequest,
+ graderRawRequest,
  details,
  graderProvider.targetName
  );
@@ -12604,7 +12607,7 @@ ${context.fileChanges}`;
  verdict: "fail",
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: graderProvider.targetName,
  details: { mode: "built-in", error: message }
  };
@@ -12636,7 +12639,7 @@ ${context.fileChanges}`;
  async evaluateWithDelegate(context, provider, modeLabel) {
  const workspacePath = context.workspacePath;
  const prompt = this.buildDelegatedPrompt(context);
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  mode: modeLabel,
  grader_target: provider.targetName,
  prompt
@@ -12657,7 +12660,7 @@ ${context.fileChanges}`;
  { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
  ],
  expectedAspectCount: 1,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: provider.targetName,
  details: { mode: modeLabel, grader_target: provider.targetName }
  };
@@ -12671,7 +12674,7 @@ ${context.fileChanges}`;
  return this.parseAgentResult(
  assistantContent,
  rubrics,
- evaluatorRawRequest,
+ graderRawRequest,
  details,
  provider.targetName
  );
@@ -12684,7 +12687,7 @@ ${context.fileChanges}`;
  { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
  ],
  expectedAspectCount: 1,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget: provider.targetName,
  details: {
  mode: modeLabel,
@@ -12705,7 +12708,7 @@ ${context.fileChanges}`;
  const config = context.evaluator;
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
  const parts = [
- "You are an expert evaluator with access to the workspace filesystem.",
+ "You are an expert grader with access to the workspace filesystem.",
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
  "Thoroughly examine relevant files before making your assessment.",
  ""
@@ -12734,9 +12737,9 @@ ${context.fileChanges}`;
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
  };
- if (this.evaluatorTemplate) {
-  warnDeprecatedTemplateVars(this.evaluatorTemplate);
-  return substituteVariables(this.evaluatorTemplate, variables);
+ if (this.graderTemplate) {
+  warnDeprecatedTemplateVars(this.graderTemplate);
+  return substituteVariables(this.graderTemplate, variables);
  }
  const config = context.evaluator;
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
@@ -12783,7 +12786,7 @@ ${context.fileChanges}`;
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
  const config = context.evaluator;
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
- if (this.evaluatorTemplate) {
+ if (this.graderTemplate) {
  const variables = {
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -12795,15 +12798,15 @@ ${context.fileChanges}`;
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
  };
- warnDeprecatedTemplateVars(this.evaluatorTemplate);
- const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
+ warnDeprecatedTemplateVars(this.graderTemplate);
+ const customPrompt = substituteVariables(this.graderTemplate, variables);
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
  return `${customPrompt}

  ${outputSchema}`;
  }
  const parts = [
- "You are an expert evaluator. Investigate the workspace to verify the criteria are met.",
+ "You are an expert grader. Investigate the workspace to verify the criteria are met.",
  "",
  "[[ ## question ## ]]",
  formattedQuestion,
@@ -12840,7 +12843,7 @@ ${outputSchema}`;
  * Parse the agent's response text into an EvaluationScore.
  * Supports both freeform and rubric modes.
  */
- parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
+ parseAgentResult(text, rubrics, graderRawRequest, details, graderTarget) {
  try {
  const parsed = parseJsonFromText(text);
  if (rubrics && rubrics.length > 0) {
@@ -12851,7 +12854,7 @@ ${outputSchema}`;
  verdict,
  assertions: assertions2,
  expectedAspectCount: rubrics.length,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget,
  details
  };
@@ -12864,7 +12867,7 @@ ${outputSchema}`;
  verdict: scoreToVerdict(score),
  assertions,
  expectedAspectCount: Math.max(assertions.length, 1),
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget,
  details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
  };
@@ -12879,7 +12882,7 @@ ${outputSchema}`;
  }
  ],
  expectedAspectCount: 1,
- evaluatorRawRequest,
+ graderRawRequest,
  graderTarget,
  details
  };
@@ -12894,7 +12897,7 @@ ${outputSchema}`;
  buildScoreRangePrompt(context, rubrics) {
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
  const parts = [
- "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
+ "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
  "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
  "",
  "[[ ## question ## ]]",
@@ -12937,7 +12940,7 @@ ${outputSchema}`;
  buildRubricPrompt(context, rubrics) {
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
  const parts = [
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+ "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
  "",
  "[[ ## question ## ]]",
  formattedQuestion,
@@ -13111,7 +13114,7 @@ function sumTokenUsage(first, second) {
  };
  }
  function buildRubricOutputSchema() {
- return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+ return `You are an expert grader. Evaluate the candidate answer against each rubric item.
  You must return a valid JSON object matching this schema:
  {
  "checks": [
@@ -13145,7 +13148,7 @@ function warnDeprecatedTemplateVars(template) {
  console.warn(
  `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
${used.join("\n ")}
- Update your custom evaluator template to use the new names.${ANSI_RESET8}`
+ Update your custom grader template to use the new names.${ANSI_RESET8}`
  );
  }
  }
@@ -13177,7 +13180,7 @@ function calculateRubricScore(result, rubrics) {
  return { score, verdict, assertions };
  }
  function buildScoreRangeOutputSchema() {
- return `You are an expert evaluator. Score the candidate answer on each criterion.
+ return `You are an expert grader. Score the candidate answer on each criterion.
  You must return a valid JSON object matching this schema:
  {
  "checks": [
@@ -13385,13 +13388,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
  }
  }

- // src/evaluation/evaluators/composite.ts
+ // src/evaluation/graders/composite.ts
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
  {{EVALUATOR_RESULTS_JSON}}

- Decide the final score and verdict based on all evaluator results.
+ Decide the final score and verdict based on all grader results.
  Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
- var CompositeEvaluator = class {
+ var CompositeGrader = class {
  kind = "composite";
  config;
  evaluatorFactory;
@@ -13442,7 +13445,7 @@ var CompositeEvaluator = class {
  weight,
  verdict: member.result.verdict,
  assertions: [...member.result.assertions],
- evaluatorRawRequest: member.result.evaluatorRawRequest,
+ graderRawRequest: member.result.graderRawRequest,
  scores: member.result.scores,
  details: member.result.details,
  tokenUsage: member.result.tokenUsage
@@ -13463,7 +13466,7 @@ var CompositeEvaluator = class {
  verdict: "skip",
  assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  aggregator: "weighted_average",
  ...weights ? { weights } : {}
  },
@@ -13476,7 +13479,7 @@ var CompositeEvaluator = class {
  verdict: scoreToVerdict(finalScore),
  assertions: allAssertions,
  expectedAspectCount: allAssertions.length || 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  aggregator: "weighted_average",
  ...weights ? { weights } : {}
  },
@@ -13495,7 +13498,7 @@ var CompositeEvaluator = class {
  score: member.result.score,
  verdict: member.result.verdict,
  assertions: [...member.result.assertions],
- evaluatorRawRequest: member.result.evaluatorRawRequest,
+ graderRawRequest: member.result.graderRawRequest,
  scores: member.result.scores,
  details: member.result.details,
  tokenUsage: member.result.tokenUsage
@@ -13518,7 +13521,7 @@ var CompositeEvaluator = class {
  verdict: "skip",
  assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  aggregator: "threshold",
  threshold
  },
@@ -13537,7 +13540,7 @@ var CompositeEvaluator = class {
  verdict: pass ? "pass" : "fail",
  assertions: allAssertions,
  expectedAspectCount: allAssertions.length || 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  aggregator: "threshold",
  threshold
  },
@@ -13554,7 +13557,7 @@ var CompositeEvaluator = class {
  weight: weights?.[member.id] ?? 1,
  verdict: member.result.verdict,
  assertions: [...member.result.assertions],
- evaluatorRawRequest: member.result.evaluatorRawRequest,
+ graderRawRequest: member.result.graderRawRequest,
  scores: member.result.scores,
  details: member.result.details
  }));
@@ -13575,7 +13578,7 @@ var CompositeEvaluator = class {
  verdict,
  assertions,
  expectedAspectCount: assertions.length || 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  aggregator: "code-grader",
  script: scriptPath
  },
@@ -13588,7 +13591,7 @@ var CompositeEvaluator = class {
  verdict: "fail",
  assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  aggregator: "code-grader",
  script: scriptPath,
  error: message
@@ -13610,14 +13613,14 @@ var CompositeEvaluator = class {
  score: member.result.score,
  verdict: member.result.verdict,
  assertions: [...member.result.assertions],
- evaluatorRawRequest: member.result.evaluatorRawRequest,
+ graderRawRequest: member.result.graderRawRequest,
  scores: member.result.scores,
  details: member.result.details
  }));
  const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
  const systemPrompt = buildOutputSchema();
- const evaluatorRawRequest = {
+ const graderRawRequest = {
  aggregator: "llm-grader",
  userPrompt,
  systemPrompt,
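Note: the LLM aggregator splices the serialized member results into the prompt template before asking the grader model for a final score. For illustration, with the default template and a hypothetical single member result:

    const resultsJson = JSON.stringify([{ id: "contains-check", score: 1, verdict: "pass" }], null, 2);
    const userPrompt = DEFAULT_COMPOSITE_AGGREGATOR_PROMPT
      .replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
    // userPrompt now ends with "Decide the final score and verdict based on all grader results. ..."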
@@ -13639,7 +13642,7 @@ var CompositeEvaluator = class {
  verdict: scoreToVerdict(score2),
  assertions: assertions2,
  expectedAspectCount: Math.max(assertions2.length, 1),
- evaluatorRawRequest,
+ graderRawRequest,
  scores
  };
  }
@@ -13659,7 +13662,7 @@ var CompositeEvaluator = class {
  verdict: scoreToVerdict(score),
  assertions,
  expectedAspectCount: Math.max(assertions.length, 1),
- evaluatorRawRequest,
+ graderRawRequest,
  scores
  };
  } catch {
@@ -13668,15 +13671,15 @@ var CompositeEvaluator = class {
  verdict: "fail",
  assertions: [{ text: "LLM aggregator failed", passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest,
+ graderRawRequest,
  scores
  };
  }
  }
  };

- // src/evaluation/evaluators/cost.ts
- var CostEvaluator = class {
+ // src/evaluation/graders/cost.ts
+ var CostGrader = class {
  kind = "cost";
  config;
  constructor(options) {
@@ -13691,7 +13694,7 @@ var CostEvaluator = class {
  verdict: "fail",
  assertions: [{ text: "No cost data available in trace", passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "cost",
  budget,
  costUsd: null
@@ -13708,7 +13711,7 @@ var CostEvaluator = class {
  passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
  ],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "cost",
  budget,
  costUsd
@@ -13717,8 +13720,8 @@ var CostEvaluator = class {
  }
  };

- // src/evaluation/evaluators/execution-metrics.ts
- var ExecutionMetricsEvaluator = class {
+ // src/evaluation/graders/execution-metrics.ts
+ var ExecutionMetricsGrader = class {
  kind = "execution-metrics";
  config;
  constructor(options) {
@@ -13742,7 +13745,7 @@ var ExecutionMetricsEvaluator = class {
  verdict: "fail",
  assertions: [{ text: "No trace summary available", passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "execution-metrics",
  config: this.extractConfiguredThresholds(),
  actual: null
@@ -13858,7 +13861,7 @@ var ExecutionMetricsEvaluator = class {
  verdict: scoreToVerdict(score),
  assertions,
  expectedAspectCount: totalChecks || 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "execution-metrics",
  config: this.extractConfiguredThresholds(),
  actual: this.filterDefinedMetrics(actualMetrics)
@@ -13901,7 +13904,7 @@ var ExecutionMetricsEvaluator = class {
  }
  };

- // src/evaluation/evaluators/field-accuracy.ts
+ // src/evaluation/graders/field-accuracy.ts
  var DEFAULT_DATE_FORMATS = [
  "YYYY-MM-DDTHH:mm:ssZ",
  // ISO with timezone
@@ -13946,7 +13949,7 @@ var MONTH_NAMES = {
  dec: 11,
  december: 11
  };
- var FieldAccuracyEvaluator = class {
+ var FieldAccuracyGrader = class {
  kind = "field-accuracy";
  config;
  constructor(options) {
@@ -14305,8 +14308,8 @@ function parseJsonFromTextSafe(text) {
  return parseJsonFromText(text);
  }

- // src/evaluation/evaluators/latency.ts
- var LatencyEvaluator = class {
+ // src/evaluation/graders/latency.ts
+ var LatencyGrader = class {
  kind = "latency";
  config;
  constructor(options) {
@@ -14321,7 +14324,7 @@ var LatencyEvaluator = class {
  verdict: "fail",
  assertions: [{ text: "No duration data available in trace", passed: false }],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "latency",
  threshold,
  durationMs: null
@@ -14337,7 +14340,7 @@ var LatencyEvaluator = class {
  passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
  ],
  expectedAspectCount: 1,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "latency",
  threshold,
  durationMs
@@ -14346,8 +14349,8 @@ var LatencyEvaluator = class {
  }
  };

- // src/evaluation/evaluators/skill-trigger.ts
- var SkillTriggerEvaluator = class {
+ // src/evaluation/graders/skill-trigger.ts
+ var SkillTriggerGrader = class {
  kind = "skill-trigger";
  config;
  constructor(config) {
@@ -14414,7 +14417,7 @@ var SkillTriggerEvaluator = class {
  }
  };

- // src/evaluation/evaluators/llm-grader-prompt.ts
+ // src/evaluation/graders/llm-grader-prompt.ts
  function assembleLlmGraderPrompt(input) {
  const {
  evalCase,
@@ -14422,7 +14425,7 @@ function assembleLlmGraderPrompt(input) {
  promptInputs,
  evaluatorConfig,
  fileChanges,
- evaluatorTemplateOverride
+ graderTemplateOverride
  } = input;
  const rubrics = evaluatorConfig?.rubrics;
  if (rubrics && rubrics.length > 0) {
@@ -14432,15 +14435,9 @@ function assembleLlmGraderPrompt(input) {
  }
  return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
  }
- return assembleFreeform(
-  evalCase,
-  candidate,
-  promptInputs,
-  fileChanges,
-  evaluatorTemplateOverride
- );
+ return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
  }
- function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
+ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
  const variables = {
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -14454,9 +14451,9 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
  };
  const systemPrompt = buildOutputSchema();
- const template = evaluatorTemplateOverride ?? DEFAULT_EVALUATOR_TEMPLATE;
+ const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
  let userPrompt = substituteVariables(template, variables);
- if (fileChanges && !evaluatorTemplateOverride) {
+ if (fileChanges && !graderTemplateOverride) {
  userPrompt += `

  [[ ## file_changes ## ]]
@@ -14472,7 +14469,7 @@ ${fileChanges}`;
  function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
  const parts = [
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+ "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
  "",
  "[[ ## question ## ]]",
  formattedQuestion,
@@ -14507,7 +14504,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
  function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
  const parts = [
- "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
+ "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
  "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
  "",
  "[[ ## question ## ]]",
@@ -14555,8 +14552,8 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
  };
  }

- // src/evaluation/evaluators/token-usage.ts
- var TokenUsageEvaluator = class {
+ // src/evaluation/graders/token-usage.ts
+ var TokenUsageGrader = class {
  kind = "token-usage";
  config;
  constructor(options) {
@@ -14577,7 +14574,7 @@ var TokenUsageEvaluator = class {
  verdict: "fail",
  assertions: [{ text: "No token usage data available in trace", passed: false }],
  expectedAspectCount,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "token-usage",
  max_total: maxTotal ?? null,
  max_input: maxInput ?? null,
@@ -14618,7 +14615,7 @@ var TokenUsageEvaluator = class {
  verdict: passed ? "pass" : "fail",
  assertions,
  expectedAspectCount,
- evaluatorRawRequest: {
+ graderRawRequest: {
  type: "token-usage",
  max_total: maxTotal ?? null,
  max_input: maxInput ?? null,
@@ -14634,7 +14631,7 @@ var TokenUsageEvaluator = class {
  }
  };

- // src/evaluation/evaluators/tool-trajectory.ts
+ // src/evaluation/graders/tool-trajectory.ts
  function getNestedValue(obj, path53) {
  const parts = path53.split(".");
  let current = obj;
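Note: getNestedValue walks dot-separated paths through tool-call payloads. Its body is truncated in this hunk, but from the split/walk setup the expected behavior is:

    getNestedValue({ args: { file: { path: "a.ts" } } }, "args.file.path"); // "a.ts"
    getNestedValue({ args: {} }, "args.file.path");                         // presumably undefined on a missing segment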
@@ -14703,7 +14700,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
  message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
  };
  }
- var ToolTrajectoryEvaluator = class {
+ var ToolTrajectoryGrader = class {
  kind = "tool-trajectory";
  config;
  constructor(options) {
@@ -15108,7 +15105,7 @@ var ToolTrajectoryEvaluator = class {
  }
  };

- // src/evaluation/evaluators/assertions.ts
+ // src/evaluation/graders/assertions.ts
  function runContainsAssertion(output, value) {
  const passed = output.includes(value);
  return {
@@ -15403,15 +15400,15 @@ function validateConcurrency(concurrency) {
  }
  }

- // src/evaluation/registry/evaluator-registry.ts
- var EvaluatorRegistry = class {
+ // src/evaluation/registry/grader-registry.ts
+ var GraderRegistry = class {
  factories = /* @__PURE__ */ new Map();
- /** Register a factory function for an evaluator type. */
+ /** Register a factory function for a grader type. */
  register(type, factory) {
  this.factories.set(type, factory);
  return this;
  }
- /** Get the factory function for an evaluator type. */
+ /** Get the factory function for a grader type. */
  get(type) {
  return this.factories.get(type);
  }
@@ -15419,25 +15416,25 @@ var EvaluatorRegistry = class {
  has(type) {
  return this.factories.has(type);
  }
- /** List all registered evaluator type names. */
+ /** List all registered grader type names. */
  list() {
  return [...this.factories.keys()];
  }
  /**
  * Create an evaluator instance from a config, using the registered factory.
- * Throws if no factory is registered for the evaluator type.
+ * Throws if no factory is registered for the grader type.
  */
  async create(config, context) {
  const factory = this.factories.get(config.type);
  if (!factory) {
  throw new Error(
- `Unknown evaluator type: "${config.type}". Registered types: ${this.list().join(", ")}`
+ `Unknown grader type: "${config.type}". Registered types: ${this.list().join(", ")}`
  );
  }
  return factory(config, context);
  }
  };
- var DeterministicAssertionEvaluator = class {
+ var DeterministicAssertionGrader = class {
  constructor(kind, assertFn) {
  this.assertFn = assertFn;
  this.kind = kind;
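Note: GraderRegistry is a thin Map wrapper keyed by type name. A usage sketch of the renamed API (the second argument to create is the factory context, left empty here for brevity):

    const registry = new GraderRegistry();
    registry.register("latency", (config) => new LatencyGrader({ config }));
    registry.has("latency");  // true
    registry.list();          // ["latency"]
    await registry.create({ type: "latency" }, {});  // builds a LatencyGrader via the factory
    await registry.create({ type: "unknown" }, {});  // throws: Unknown grader type: "unknown". Registered types: latency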
@@ -15448,8 +15445,8 @@ var DeterministicAssertionEvaluator = class {
  }
  };

- // src/evaluation/evaluators/inline-assert.ts
- var InlineAssertEvaluator = class {
+ // src/evaluation/graders/inline-assert.ts
+ var InlineAssertGrader = class {
  constructor(fn, name) {
  this.fn = fn;
  this.name = name;
@@ -15474,7 +15471,7 @@ var InlineAssertEvaluator = class {
  }
  };

- // src/evaluation/evaluators/prompt-resolution.ts
+ // src/evaluation/graders/prompt-resolution.ts
  import path38 from "node:path";
  async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
  if (promptConfig.resolvedPromptScript && promptConfig.resolvedPromptScript.length > 0) {
@@ -15541,7 +15538,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
  }
  }

- // src/evaluation/registry/builtin-evaluators.ts
+ // src/evaluation/registry/builtin-graders.ts
  var INLINE_ASSERT_FN = Symbol.for("agentv.inline-assert-fn");
  var llmGraderFactory = (config, context) => {
  const c = config;
@@ -15558,7 +15555,7 @@ var llmGraderFactory = (config, context) => {
  );
  }
  const isAgent = isAgentProvider(graderTargetProvider) || graderTargetProvider.kind === "agentv";
- evaluator = new LlmGraderEvaluator({
+ evaluator = new LlmGrader({
  resolveGraderProvider: async (evalContext) => {
  if (graderTargetProvider) return graderTargetProvider;
  if (evalContext.graderProvider) return evalContext.graderProvider;
@@ -15586,11 +15583,11 @@ var llmGraderFactory = (config, context) => {
  agentTimeoutMs
  );
  const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
- let evaluatorTemplateOverride;
+ let graderTemplateOverride;
  let evalCase = evalContext.evalCase;
  if (customPrompt) {
  if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
- evaluatorTemplateOverride = customPrompt;
+ graderTemplateOverride = customPrompt;
  } else {
  evalCase = { ...evalCase, criteria: customPrompt };
  }
@@ -15598,7 +15595,7 @@ var llmGraderFactory = (config, context) => {
  return evaluator.evaluate({
  ...evalContext,
  evalCase,
- evaluatorTemplateOverride,
+ graderTemplateOverride,
  evaluator: c
  });
  }
@@ -15606,7 +15603,7 @@
  };
  var codeFactory = (config, context) => {
  const c = config;
- return new CodeEvaluator({
+ return new CodeGrader({
  command: c.command ?? c.script ?? [],
  cwd: c.resolvedCwd ?? c.cwd,
  agentTimeoutMs: context.agentTimeoutMs,
@@ -15617,19 +15614,19 @@ var codeFactory = (config, context) => {
  var compositeFactory = (config, context) => {
  const c = config;
  const evalFileDir = context.evalFileDir ?? process.cwd();
- return new CompositeEvaluator({
+ return new CompositeGrader({
  config: c,
  cwd: evalFileDir,
  evaluatorFactory: {
  create: (memberConfig) => {
  const factory = context.registry.get(memberConfig.type);
  if (!factory) {
- throw new Error(`Unsupported evaluator type in composite: ${memberConfig.type}`);
+ throw new Error(`Unsupported grader type in composite: ${memberConfig.type}`);
  }
  const result = factory(memberConfig, context);
  if (result instanceof Promise) {
  throw new Error(
- `Evaluator factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
+ `Grader factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
  );
  }
  return result;
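One behavioral detail survives the rename unchanged: composite members are constructed synchronously. An async factory is acceptable at the top level, because create() awaits whatever the factory returns, but the composite member path above rejects Promises outright. Continuing the registry sketch from earlier (the "slow" type name is hypothetical):

// Fine standalone: registry.create() awaits the factory's Promise.
registry.register("slow", async (_config) => {
  await new Promise((resolve) => setTimeout(resolve, 10)); // e.g. lazy setup
  return { async evaluate() { return { score: 1, verdict: "pass" }; } };
});

// Referenced as a composite member, the same factory now fails with:
//   Grader factory for type "slow" is async \u2014 not supported inside
//   composite members. Use synchronous factories for composite child evaluators.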
@@ -15638,35 +15635,35 @@ var compositeFactory = (config, context) => {
  });
  };
  var toolTrajectoryFactory = (config) => {
- return new ToolTrajectoryEvaluator({
+ return new ToolTrajectoryGrader({
  config
  });
  };
  var fieldAccuracyFactory = (config) => {
- return new FieldAccuracyEvaluator({
+ return new FieldAccuracyGrader({
  config
  });
  };
  var latencyFactory = (config) => {
- return new LatencyEvaluator({ config });
+ return new LatencyGrader({ config });
  };
  var costFactory = (config) => {
- return new CostEvaluator({ config });
+ return new CostGrader({ config });
  };
  var tokenUsageFactory = (config) => {
- return new TokenUsageEvaluator({ config });
+ return new TokenUsageGrader({ config });
  };
  var executionMetricsFactory = (config) => {
- return new ExecutionMetricsEvaluator({
+ return new ExecutionMetricsGrader({
  config
  });
  };
  var skillTriggerFactory = (config) => {
- return new SkillTriggerEvaluator(config);
+ return new SkillTriggerGrader(config);
  };
  var containsFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("contains", (ctx) => {
+ return new DeterministicAssertionGrader("contains", (ctx) => {
  const result = runContainsAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
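All of the string assertions below funnel through DeterministicAssertionGrader, whose contract is visible in these hunks: a kind string plus a synchronous assert function returning at least a score. A sketch of a custom factory in the same style; the "max-length" kind, the ctx fields, and the result fields beyond score are assumptions patterned on shapes elsewhere in this file, not a documented extension point.

import { DeterministicAssertionGrader } from "@agentv/core";

// Hypothetical deterministic assertion in the style of the builtin factories.
const maxLengthFactory = (config: { value: number }) => {
  return new DeterministicAssertionGrader("max-length", (ctx: { candidate: string }) => {
    const passed = ctx.candidate.length <= config.value;
    return {
      score: passed ? 1 : 0,
      verdict: passed ? "pass" : "fail",
      assertions: [{ text: `candidate length <= ${config.value}`, passed }],
    };
  });
};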
@@ -15678,7 +15675,7 @@ var containsFactory = (config) => {
  };
  var regexFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("regex", (ctx) => {
+ return new DeterministicAssertionGrader("regex", (ctx) => {
  const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
  return {
  score: result.score,
@@ -15689,7 +15686,7 @@ var regexFactory = (config) => {
  });
  };
  var isJsonFactory = () => {
- return new DeterministicAssertionEvaluator("is-json", (ctx) => {
+ return new DeterministicAssertionGrader("is-json", (ctx) => {
  const result = runIsJsonAssertion(ctx.candidate);
  return {
  score: result.score,
@@ -15701,7 +15698,7 @@
  };
  var equalsFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("equals", (ctx) => {
+ return new DeterministicAssertionGrader("equals", (ctx) => {
  const result = runEqualsAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15713,7 +15710,7 @@
  };
  var containsAnyFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
+ return new DeterministicAssertionGrader("contains-any", (ctx) => {
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15725,7 +15722,7 @@ var containsAnyFactory = (config) => {
  };
  var containsAllFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
+ return new DeterministicAssertionGrader("contains-all", (ctx) => {
  const result = runContainsAllAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15737,7 +15734,7 @@ var containsAllFactory = (config) => {
  };
  var icontainsFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("icontains", (ctx) => {
+ return new DeterministicAssertionGrader("icontains", (ctx) => {
  const result = runIcontainsAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15749,7 +15746,7 @@ var icontainsFactory = (config) => {
  };
  var icontainsAnyFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
+ return new DeterministicAssertionGrader("icontains-any", (ctx) => {
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15761,7 +15758,7 @@ var icontainsAnyFactory = (config) => {
  };
  var icontainsAllFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
+ return new DeterministicAssertionGrader("icontains-all", (ctx) => {
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15773,7 +15770,7 @@ var icontainsAllFactory = (config) => {
  };
  var startsWithFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
+ return new DeterministicAssertionGrader("starts-with", (ctx) => {
  const result = runStartsWithAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15785,7 +15782,7 @@ var startsWithFactory = (config) => {
  };
  var endsWithFactory = (config) => {
  const c = config;
- return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
+ return new DeterministicAssertionGrader("ends-with", (ctx) => {
  const result = runEndsWithAssertion(ctx.candidate, c.value);
  return {
  score: result.score,
@@ -15796,7 +15793,7 @@ var endsWithFactory = (config) => {
  });
  };
  function createBuiltinRegistry() {
- const registry = new EvaluatorRegistry();
+ const registry = new GraderRegistry();
  registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
  const fn = config[INLINE_ASSERT_FN];
  if (!fn) {
@@ -15804,7 +15801,7 @@ function createBuiltinRegistry() {
  `No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`
  );
  }
- return new InlineAssertEvaluator(fn, config.name ?? "inline-assert");
+ return new InlineAssertGrader(fn, config.name ?? "inline-assert");
  });
  return registry;
  }
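The inline-assert wiring is unchanged apart from the class name: the function must ride on the config object under the shared symbol, or resolution fails with the error above. A sketch; the symbol key is taken verbatim from this diff, while the ctx shape and boolean return of the assert function are assumptions.

// The registered well-known symbol, as defined in builtin-graders.ts above.
const INLINE_ASSERT_FN = Symbol.for("agentv.inline-assert-fn");

const config = {
  type: "inline-assert",
  name: "no-apologies",
  // Hypothetical assert function; InlineAssertGrader wraps it under the
  // name "no-apologies" (falling back to "inline-assert" when unnamed).
  [INLINE_ASSERT_FN]: (ctx: { candidate: string }) =>
    !ctx.candidate.toLowerCase().includes("sorry"),
};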
@@ -15841,7 +15838,7 @@ async function discoverAssertions(registry, baseDir) {
  continue;
  }
  const factory = (_config, context) => {
- return new CodeEvaluator({
+ return new CodeGrader({
  command: ["bun", "run", filePath],
  agentTimeoutMs: context.agentTimeoutMs
  });
@@ -15885,7 +15882,7 @@ async function discoverGraders(registry, baseDir) {
  continue;
  }
  const factory = (_config, context) => {
- return new CodeEvaluator({
+ return new CodeGrader({
  command: ["bun", "run", filePath],
  agentTimeoutMs: context.agentTimeoutMs
  });
@@ -16727,10 +16724,10 @@ function buildSkippedEvaluatorError(scores) {
  }
  const messages = skippedScores.map((score) => {
  const label = score.name || score.type;
- const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
+ const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Grader skipped";
  return `${label}: ${assertionMessage}`;
  });
- return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
+ return messages.length === 1 ? messages[0] : `Graders skipped: ${messages.join(" | ")}`;
  }
  function usesFileReferencePrompt(provider) {
  return isAgentProvider(provider) || provider.kind === "cli";
@@ -16899,7 +16896,7 @@ async function runEvaluation(options) {
  cleanupWorkspaces,
  trials,
  streamCallbacks,
- totalBudgetUsd,
+ budgetUsd,
  failOnError,
  poolWorkspaces,
  poolMaxSlots: configPoolMaxSlots,
@@ -17428,7 +17425,7 @@ async function runEvaluation(options) {
  async function dispatchTest(evalCase, depResults) {
  const workerId = nextWorkerId++;
  workerIdByEvalId.set(evalCase.id, workerId);
- if (totalBudgetUsd !== void 0 && budgetExhausted) {
+ if (budgetUsd !== void 0 && budgetExhausted) {
  const budgetResult = {
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
  testId: evalCase.id,
@@ -17438,13 +17435,13 @@
  assertions: [],
  output: [],
  target: target.name,
- error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
  budgetExceeded: true,
  executionStatus: "execution_error",
  failureStage: "setup",
  failureReasonCode: "budget_exceeded",
  executionError: {
- message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
  stage: "setup"
  }
  };
@@ -17541,7 +17538,7 @@
  ...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
  };
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
- if (totalBudgetUsd !== void 0) {
+ if (budgetUsd !== void 0) {
  let caseCost;
  if (result.trials && result.trials.length > 0) {
  const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
@@ -17553,7 +17550,7 @@
  }
  }
  if (caseCost !== void 0) {
- if (cumulativeBudgetCost >= totalBudgetUsd) {
+ if (cumulativeBudgetCost >= budgetUsd) {
  budgetExhausted = true;
  }
  }
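The budget gate itself is unchanged by the rename: per-case cost accumulates into cumulativeBudgetCost and, once it reaches budgetUsd, later cases short-circuit with failureReasonCode "budget_exceeded". Only the option name moves. A sketch of the renamed option as consumed by runEvaluation's destructuring above; other options are elided, and the snake_case YAML spelling follows the loader change earlier in this diff.

// Renamed option: totalBudgetUsd -> budgetUsd (YAML: execution.budget_usd).
// Must be a positive number; the suite stops dispatching new cases once
// cumulative cost reaches it.
const options = {
  budgetUsd: 5.0,
  failOnError: true, // neighboring option from the same destructuring
};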
@@ -18695,7 +18692,7 @@ async function evaluateCandidate(options) {
  };
  }
  }
- const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
+ const evaluatorRequest = scores ? void 0 : score.graderRawRequest;
  const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
  const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
  ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
@@ -18911,7 +18908,7 @@ async function runEvaluatorList(options) {
  weight,
  verdict: score2.verdict,
  assertions: score2.assertions,
- input: score2.evaluatorRawRequest,
+ input: score2.graderRawRequest,
  target: score2.graderTarget,
  details: score2.details,
  scores: mapChildResults(score2.scores),
@@ -18927,7 +18924,7 @@
  score: 0,
  verdict: "fail",
  assertions: [
- { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
+ { text: `Grader '${evaluatorConfig.name}' failed: ${message}`, passed: false }
  ],
  expectedAspectCount: 1
  };
@@ -18948,7 +18945,7 @@
  verdict: "fail",
  assertions: [
  {
- text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
+ text: `Grader '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
  passed: false
  }
  ],
@@ -19005,7 +19002,7 @@ function filterEvalCases(evalCases, filter) {
  return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
  }
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
- const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGrader({
  resolveGraderProvider: async (context) => {
  if (context.graderProvider) {
  return context.graderProvider;
@@ -19496,7 +19493,7 @@ function mapChildResults(children) {
  weight: child.weight,
  verdict: child.verdict,
  assertions: child.assertions,
- input: child.evaluatorRawRequest,
+ input: child.graderRawRequest,
  scores: mapChildResults(child.scores),
  details: child.details,
  tokenUsage: child.tokenUsage
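Note the asymmetry in mapChildResults: the source field is renamed to graderRawRequest, but the mapped output key is still input, so serialized results keep their shape. A sketch of a downstream reader that only touches the renamed source field; the type name here is invented for illustration.

type ChildScore = {
  graderRawRequest?: unknown; // renamed from evaluatorRawRequest
  scores?: ChildScore[];
};

// Collects raw grader requests from a nested result tree.
function collectRawRequests(children: ChildScore[] = []): unknown[] {
  return children.flatMap((child) => [
    ...(child.graderRawRequest !== undefined ? [child.graderRawRequest] : []),
    ...collectRawRequests(child.scores),
  ]);
}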
@@ -21599,22 +21596,21 @@ function createAgentKernel() {
  }
  export {
  COMMON_TARGET_SETTINGS,
- CodeEvaluator,
- CompositeEvaluator,
- CostEvaluator,
+ CodeGrader,
+ CompositeGrader,
+ CostGrader,
  DEFAULT_CATEGORY,
- DEFAULT_EVALUATOR_TEMPLATE,
  DEFAULT_EVAL_PATTERNS,
  DEFAULT_EXPLORATION_TOOLS,
+ DEFAULT_GRADER_TEMPLATE,
  DEFAULT_THRESHOLD,
- DeterministicAssertionEvaluator,
+ DeterministicAssertionGrader,
  DockerWorkspaceProvider,
- EvaluatorRegistry,
- ExecutionMetricsEvaluator,
- FieldAccuracyEvaluator,
- LatencyEvaluator,
- LlmGraderEvaluator,
- LlmGraderEvaluator as LlmJudgeEvaluator,
+ ExecutionMetricsGrader,
+ FieldAccuracyGrader,
+ GraderRegistry,
+ LatencyGrader,
+ LlmGrader,
  OTEL_BACKEND_PRESETS,
  OtelStreamingObserver,
  OtelTraceExporter,
@@ -21623,18 +21619,17 @@ export {
  ProviderRegistry,
  RepoManager,
  ResponseCache,
- SkillTriggerEvaluator,
+ SkillTriggerGrader,
  TEST_MESSAGE_ROLES,
  TemplateNotDirectoryError,
  TemplateNotFoundError,
- TokenUsageEvaluator,
- ToolTrajectoryEvaluator,
+ TokenUsageGrader,
+ ToolTrajectoryGrader,
  TranscriptProvider,
  WorkspaceCreationError,
  WorkspacePoolManager,
  addBenchmark,
  assembleLlmGraderPrompt,
- assembleLlmGraderPrompt as assembleLlmJudgePrompt,
  avgToolDurationMs,
  buildDirectoryChain,
  buildOutputSchema,
@@ -21674,7 +21669,6 @@ export {
  discoverCodexSessions,
  discoverCopilotSessions,
  discoverGraders,
- discoverGraders as discoverJudges,
  discoverProviders,
  ensureResultsRepoClone,
  ensureVSCodeSubagents,
@@ -21716,7 +21710,7 @@ export {
  isAgentSkillsFormat,
  isContent,
  isContentArray,
- isEvaluatorKind,
+ isGraderKind,
  isJsonObject,
  isJsonValue,
  isNonEmptyString,
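Taken together, the export changes are a mechanical Evaluator-to-Grader rename plus removal of the legacy Judge aliases (LlmJudgeEvaluator, assembleLlmJudgePrompt, discoverJudges). A migration sketch for downstream imports, using only names visible in the export diff above:

// Before (4.17.x):
// import {
//   CodeEvaluator, EvaluatorRegistry, LlmGraderEvaluator,
//   DEFAULT_EVALUATOR_TEMPLATE, isEvaluatorKind,
// } from "@agentv/core";

// After (4.18.0-next.1):
import {
  CodeGrader,
  GraderRegistry,
  LlmGrader,
  DEFAULT_GRADER_TEMPLATE,
  isGraderKind,
} from "@agentv/core";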