@agentv/core 4.17.1 → 4.18.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1768,22 +1768,21 @@ var init_otlp_json_file_exporter = __esm({
1768
1768
  var index_exports = {};
1769
1769
  __export(index_exports, {
1770
1770
  COMMON_TARGET_SETTINGS: () => COMMON_TARGET_SETTINGS,
1771
- CodeEvaluator: () => CodeEvaluator,
1772
- CompositeEvaluator: () => CompositeEvaluator,
1773
- CostEvaluator: () => CostEvaluator,
1771
+ CodeGrader: () => CodeGrader,
1772
+ CompositeGrader: () => CompositeGrader,
1773
+ CostGrader: () => CostGrader,
1774
1774
  DEFAULT_CATEGORY: () => DEFAULT_CATEGORY,
1775
- DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
1776
1775
  DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
1777
1776
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
1777
+ DEFAULT_GRADER_TEMPLATE: () => DEFAULT_GRADER_TEMPLATE,
1778
1778
  DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
1779
- DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
1779
+ DeterministicAssertionGrader: () => DeterministicAssertionGrader,
1780
1780
  DockerWorkspaceProvider: () => DockerWorkspaceProvider,
1781
- EvaluatorRegistry: () => EvaluatorRegistry,
1782
- ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
1783
- FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
1784
- LatencyEvaluator: () => LatencyEvaluator,
1785
- LlmGraderEvaluator: () => LlmGraderEvaluator,
1786
- LlmJudgeEvaluator: () => LlmGraderEvaluator,
1781
+ ExecutionMetricsGrader: () => ExecutionMetricsGrader,
1782
+ FieldAccuracyGrader: () => FieldAccuracyGrader,
1783
+ GraderRegistry: () => GraderRegistry,
1784
+ LatencyGrader: () => LatencyGrader,
1785
+ LlmGrader: () => LlmGrader,
1787
1786
  OTEL_BACKEND_PRESETS: () => OTEL_BACKEND_PRESETS,
1788
1787
  OtelStreamingObserver: () => OtelStreamingObserver,
1789
1788
  OtelTraceExporter: () => OtelTraceExporter,
@@ -1792,18 +1791,17 @@ __export(index_exports, {
1792
1791
  ProviderRegistry: () => ProviderRegistry,
1793
1792
  RepoManager: () => RepoManager,
1794
1793
  ResponseCache: () => ResponseCache,
1795
- SkillTriggerEvaluator: () => SkillTriggerEvaluator,
1794
+ SkillTriggerGrader: () => SkillTriggerGrader,
1796
1795
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
1797
1796
  TemplateNotDirectoryError: () => TemplateNotDirectoryError,
1798
1797
  TemplateNotFoundError: () => TemplateNotFoundError,
1799
- TokenUsageEvaluator: () => TokenUsageEvaluator,
1800
- ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
1798
+ TokenUsageGrader: () => TokenUsageGrader,
1799
+ ToolTrajectoryGrader: () => ToolTrajectoryGrader,
1801
1800
  TranscriptProvider: () => TranscriptProvider,
1802
1801
  WorkspaceCreationError: () => WorkspaceCreationError,
1803
1802
  WorkspacePoolManager: () => WorkspacePoolManager,
1804
1803
  addBenchmark: () => addBenchmark,
1805
1804
  assembleLlmGraderPrompt: () => assembleLlmGraderPrompt,
1806
- assembleLlmJudgePrompt: () => assembleLlmGraderPrompt,
1807
1805
  avgToolDurationMs: () => avgToolDurationMs,
1808
1806
  buildDirectoryChain: () => buildDirectoryChain2,
1809
1807
  buildOutputSchema: () => buildOutputSchema,
@@ -1843,7 +1841,6 @@ __export(index_exports, {
1843
1841
  discoverCodexSessions: () => discoverCodexSessions,
1844
1842
  discoverCopilotSessions: () => discoverCopilotSessions,
1845
1843
  discoverGraders: () => discoverGraders,
1846
- discoverJudges: () => discoverGraders,
1847
1844
  discoverProviders: () => discoverProviders,
1848
1845
  ensureResultsRepoClone: () => ensureResultsRepoClone,
1849
1846
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
@@ -1885,7 +1882,7 @@ __export(index_exports, {
1885
1882
  isAgentSkillsFormat: () => isAgentSkillsFormat,
1886
1883
  isContent: () => isContent,
1887
1884
  isContentArray: () => isContentArray,
1888
- isEvaluatorKind: () => isEvaluatorKind,
1885
+ isGraderKind: () => isGraderKind,
1889
1886
  isJsonObject: () => isJsonObject,
1890
1887
  isJsonValue: () => isJsonValue,
1891
1888
  isNonEmptyString: () => isNonEmptyString,
@@ -2038,7 +2035,7 @@ function isTestMessage(value) {
2038
2035
  }
2039
2036
  return false;
2040
2037
  }
2041
- var EVALUATOR_KIND_VALUES = [
2038
+ var GRADER_KIND_VALUES = [
2042
2039
  "code-grader",
2043
2040
  "llm-grader",
2044
2041
  "rubric",
@@ -2064,9 +2061,9 @@ var EVALUATOR_KIND_VALUES = [
2064
2061
  "rubrics",
2065
2062
  "inline-assert"
2066
2063
  ];
2067
- var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
2068
- function isEvaluatorKind(value) {
2069
- return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
2064
+ var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
2065
+ function isGraderKind(value) {
2066
+ return typeof value === "string" && GRADER_KIND_SET.has(value);
2070
2067
  }
2071
2068
 
2072
2069
  // src/evaluation/trace.ts
@@ -2821,22 +2818,25 @@ function extractCacheConfig(suite) {
2821
2818
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
2822
2819
  return { enabled: cache, cachePath: resolvedCachePath };
2823
2820
  }
2824
- function extractTotalBudgetUsd(suite) {
2821
+ function extractBudgetUsd(suite) {
2825
2822
  const execution = suite.execution;
2826
2823
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2827
2824
  return void 0;
2828
2825
  }
2829
2826
  const executionObj = execution;
2830
- const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
2827
+ if ("total_budget_usd" in executionObj || "totalBudgetUsd" in executionObj) {
2828
+ throw new Error(
2829
+ "execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."
2830
+ );
2831
+ }
2832
+ const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
2831
2833
  if (rawBudget === void 0 || rawBudget === null) {
2832
2834
  return void 0;
2833
2835
  }
2834
2836
  if (typeof rawBudget === "number" && rawBudget > 0) {
2835
2837
  return rawBudget;
2836
2838
  }
2837
- logWarning(
2838
- `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
2839
- );
2839
+ logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
2840
2840
  return void 0;
2841
2841
  }
2842
2842
  function extractFailOnError(suite) {
@@ -2986,7 +2986,7 @@ function logWarning(message) {
2986
2986
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
2987
2987
  }
2988
2988
 
2989
- // src/evaluation/loaders/evaluator-parser.ts
2989
+ // src/evaluation/loaders/grader-parser.ts
2990
2990
  init_cjs_shims();
2991
2991
  var import_promises7 = require("fs/promises");
2992
2992
  var import_node_path6 = __toESM(require("path"), 1);
@@ -3230,38 +3230,38 @@ function validateTemplateVariables(content, source) {
3230
3230
  );
3231
3231
  }
3232
3232
  if (invalidVariables.length > 0) {
3233
- const warningMessage = `${ANSI_YELLOW3}Warning: Custom evaluator template at ${source}
3233
+ const warningMessage = `${ANSI_YELLOW3}Warning: Custom grader template at ${source}
3234
3234
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
3235
3235
  Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET4}`;
3236
3236
  console.warn(warningMessage);
3237
3237
  }
3238
3238
  }
3239
3239
 
3240
- // src/evaluation/loaders/evaluator-parser.ts
3240
+ // src/evaluation/loaders/grader-parser.ts
3241
3241
  var ANSI_YELLOW4 = "\x1B[33m";
3242
3242
  var ANSI_RESET5 = "\x1B[0m";
3243
3243
  var MAX_ASSERTION_INCLUDE_DEPTH = 3;
3244
3244
  var PROMPT_FILE_PREFIX = "file://";
3245
- function normalizeEvaluatorType(type) {
3245
+ function normalizeGraderType(type) {
3246
3246
  return type.replace(/_/g, "-");
3247
3247
  }
3248
3248
  function isDeprecatedJudgeType(type) {
3249
3249
  return type === "code-judge" || type === "llm-judge";
3250
3250
  }
3251
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
3251
+ async function parseGraders(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
3252
3252
  const execution = rawEvalCase.execution;
3253
3253
  const executionObject = isJsonObject2(execution) ? execution : void 0;
3254
3254
  const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? // deprecated: use assertions
3255
3255
  rawEvalCase.evaluators;
3256
3256
  const skipDefaults = executionObject?.skip_defaults === true;
3257
3257
  const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
3258
- const parsedCase = await parseEvaluatorList(
3258
+ const parsedCase = await parseGraderList(
3259
3259
  caseEvaluators,
3260
3260
  searchRoots,
3261
3261
  evalId,
3262
3262
  defaultPreprocessors
3263
3263
  );
3264
- const parsedRoot = await parseEvaluatorList(
3264
+ const parsedRoot = await parseGraderList(
3265
3265
  rootEvaluators,
3266
3266
  searchRoots,
3267
3267
  evalId,
@@ -3340,12 +3340,12 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
3340
3340
  templateDir,
3341
3341
  ...searchRoots.filter((root) => import_node_path6.default.resolve(root) !== templateDir)
3342
3342
  ];
3343
- return await expandEvaluatorEntries(assertions, nestedSearchRoots, evalId, {
3343
+ return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
3344
3344
  depth: nextDepth,
3345
3345
  chain: [...includeContext.chain, resolved.resolvedPath]
3346
3346
  }) ?? [];
3347
3347
  }
3348
- async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
3348
+ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
3349
3349
  if (candidateEvaluators === void 0) {
3350
3350
  return void 0;
3351
3351
  }
@@ -3369,8 +3369,8 @@ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId,
3369
3369
  }
3370
3370
  return expanded;
3371
3371
  }
3372
- async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
3373
- const expandedEvaluators = await expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId);
3372
+ async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
3373
+ const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
3374
3374
  if (!expandedEvaluators) {
3375
3375
  return void 0;
3376
3376
  }
@@ -3416,14 +3416,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
3416
3416
  }
3417
3417
  const rawName = asString(rawEvaluator.name);
3418
3418
  const rawType = rawEvaluator.type;
3419
- const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
3419
+ const typeValue = typeof rawType === "string" ? normalizeGraderType(rawType) : rawType;
3420
3420
  if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
3421
3421
  logWarning2(
3422
3422
  `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
3423
3423
  );
3424
3424
  continue;
3425
3425
  }
3426
- const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
3426
+ const isCustomType = typeof typeValue === "string" && !isGraderKind(typeValue);
3427
3427
  if (typeof typeValue !== "string") {
3428
3428
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
3429
3429
  continue;
@@ -3586,7 +3586,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
3586
3586
  continue;
3587
3587
  }
3588
3588
  const aggregatorType = asString(rawAggregator.type);
3589
- const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
3589
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeGraderType(aggregatorType) : aggregatorType;
3590
3590
  if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
3591
3591
  logWarning2(
3592
3592
  `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
@@ -3599,7 +3599,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
3599
3599
  );
3600
3600
  continue;
3601
3601
  }
3602
- const expandedMembers = await expandEvaluatorEntries(
3602
+ const expandedMembers = await expandGraderEntries(
3603
3603
  rawMembers,
3604
3604
  searchRoots,
3605
3605
  `${evalId}:${name}`
@@ -3615,11 +3615,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
3615
3615
  }
3616
3616
  const memberName = asString(rawMember.name);
3617
3617
  const memberType = rawMember.type;
3618
- if (!memberName || !isEvaluatorKind(memberType)) {
3618
+ if (!memberName || !isGraderKind(memberType)) {
3619
3619
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
3620
3620
  continue;
3621
3621
  }
3622
- const memberConfigs = await parseEvaluators(
3622
+ const memberConfigs = await parseGraders(
3623
3623
  { evaluators: [rawMember] },
3624
3624
  void 0,
3625
3625
  searchRoots,
@@ -4360,7 +4360,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
4360
4360
  `prompt.command for evaluator '${name}' in '${evalId}'`
4361
4361
  );
4362
4362
  if (!commandArray) {
4363
- throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires command array`);
4363
+ throw new Error(`Grader '${name}' in '${evalId}': prompt object requires command array`);
4364
4364
  }
4365
4365
  const commandPath = commandArray[commandArray.length - 1];
4366
4366
  const resolved = await resolveFileReference2(commandPath, searchRoots);
@@ -4368,7 +4368,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
4368
4368
  resolvedPromptScript = [...commandArray.slice(0, -1), import_node_path6.default.resolve(resolved.resolvedPath)];
4369
4369
  } else {
4370
4370
  throw new Error(
4371
- `Evaluator '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
4371
+ `Grader '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
4372
4372
  );
4373
4373
  }
4374
4374
  if (isJsonObject2(rawPrompt.config)) {
@@ -4385,11 +4385,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
4385
4385
  await validateCustomPromptContent(promptPath);
4386
4386
  } catch (error) {
4387
4387
  const message = error instanceof Error ? error.message : String(error);
4388
- throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
4388
+ throw new Error(`Grader '${name}' template (${promptPath}): ${message}`);
4389
4389
  }
4390
4390
  } else {
4391
4391
  throw new Error(
4392
- `Evaluator '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
4392
+ `Grader '${name}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
4393
4393
  );
4394
4394
  }
4395
4395
  } else {
@@ -4506,18 +4506,18 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
4506
4506
  return void 0;
4507
4507
  }
4508
4508
  if (!Array.isArray(rawValue)) {
4509
- throw new Error(`Evaluator '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
4509
+ throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
4510
4510
  }
4511
4511
  const preprocessors = [];
4512
4512
  for (const rawEntry of rawValue) {
4513
4513
  if (!isJsonObject2(rawEntry)) {
4514
4514
  throw new Error(
4515
- `Evaluator '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
4515
+ `Grader '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
4516
4516
  );
4517
4517
  }
4518
4518
  const type = asString(rawEntry.type)?.trim();
4519
4519
  if (!type) {
4520
- throw new Error(`Evaluator '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
4520
+ throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
4521
4521
  }
4522
4522
  const command = asStringArray(
4523
4523
  rawEntry.command,
@@ -4525,14 +4525,14 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
4525
4525
  );
4526
4526
  if (!command || command.length === 0) {
4527
4527
  throw new Error(
4528
- `Evaluator '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
4528
+ `Grader '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
4529
4529
  );
4530
4530
  }
4531
4531
  const commandPath = command[command.length - 1];
4532
4532
  const resolved = await resolveFileReference2(commandPath, searchRoots);
4533
4533
  if (!resolved.resolvedPath) {
4534
4534
  throw new Error(
4535
- `Evaluator '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
4535
+ `Grader '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
4536
4536
  );
4537
4537
  }
4538
4538
  preprocessors.push({
@@ -4583,13 +4583,13 @@ function coerceEvaluator(candidate, contextId) {
4583
4583
  if (typeof candidate !== "string") {
4584
4584
  return void 0;
4585
4585
  }
4586
- const normalized = normalizeEvaluatorType(candidate);
4586
+ const normalized = normalizeGraderType(candidate);
4587
4587
  if (isDeprecatedJudgeType(normalized)) {
4588
4588
  throw new Error(
4589
4589
  `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
4590
4590
  );
4591
4591
  }
4592
- if (isEvaluatorKind(normalized)) {
4592
+ if (isGraderKind(normalized)) {
4593
4593
  return normalized;
4594
4594
  }
4595
4595
  logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
@@ -4661,7 +4661,7 @@ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalI
4661
4661
  }
4662
4662
  result.required = rawRequired;
4663
4663
  logWarning2(
4664
- `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
4664
+ `Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
4665
4665
  );
4666
4666
  }
4667
4667
  return result;
@@ -5485,7 +5485,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
5485
5485
  const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
5486
5486
  let evaluators;
5487
5487
  try {
5488
- evaluators = await parseEvaluators(
5488
+ evaluators = await parseGraders(
5489
5489
  testCaseConfig,
5490
5490
  mergedExecution,
5491
5491
  searchRoots,
@@ -5834,7 +5834,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
5834
5834
  targetRefs: extractTargetRefsFromSuite(parsed),
5835
5835
  workers: extractWorkersFromSuite(parsed),
5836
5836
  cacheConfig: extractCacheConfig(parsed),
5837
- totalBudgetUsd: extractTotalBudgetUsd(parsed),
5837
+ budgetUsd: extractBudgetUsd(parsed),
5838
5838
  ...metadata !== void 0 && { metadata },
5839
5839
  ...failOnError !== void 0 && { failOnError },
5840
5840
  ...threshold !== void 0 && { threshold },
@@ -5975,7 +5975,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
5975
5975
  const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
5976
5976
  let evaluators;
5977
5977
  try {
5978
- evaluators = await parseEvaluators(
5978
+ evaluators = await parseGraders(
5979
5979
  testCaseConfig,
5980
5980
  globalExecution,
5981
5981
  searchRoots,
@@ -15421,13 +15421,13 @@ function resolveAndCreateProvider(definition, env = process.env) {
15421
15421
  return createProvider(resolved);
15422
15422
  }
15423
15423
 
15424
- // src/evaluation/evaluators.ts
15424
+ // src/evaluation/graders.ts
15425
15425
  init_cjs_shims();
15426
15426
 
15427
- // src/evaluation/evaluators/index.ts
15427
+ // src/evaluation/graders/index.ts
15428
15428
  init_cjs_shims();
15429
15429
 
15430
- // src/evaluation/evaluators/scoring.ts
15430
+ // src/evaluation/graders/scoring.ts
15431
15431
  init_cjs_shims();
15432
15432
  var DEFAULT_THRESHOLD = 0.8;
15433
15433
  var PASS_THRESHOLD = DEFAULT_THRESHOLD;
@@ -15516,7 +15516,7 @@ function negateScore(score) {
15516
15516
  };
15517
15517
  }
15518
15518
 
15519
- // src/evaluation/evaluators/code-evaluator.ts
15519
+ // src/evaluation/graders/code-grader.ts
15520
15520
  init_cjs_shims();
15521
15521
  var import_promises31 = require("fs/promises");
15522
15522
  var import_node_os9 = require("os");
@@ -15814,7 +15814,7 @@ function getRepoCheckoutTargets(repos) {
15814
15814
  }));
15815
15815
  }
15816
15816
 
15817
- // src/evaluation/evaluators/code-evaluator.ts
15817
+ // src/evaluation/graders/code-grader.ts
15818
15818
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
15819
15819
  var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
15820
15820
  async function materializeContentForGrader(messages, getWorkDir) {
@@ -15866,7 +15866,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
15866
15866
  }
15867
15867
  return result;
15868
15868
  }
15869
- var CodeEvaluator = class {
15869
+ var CodeGrader = class {
15870
15870
  kind = "code-grader";
15871
15871
  command;
15872
15872
  cwd;
@@ -15984,7 +15984,7 @@ var CodeEvaluator = class {
15984
15984
  })) : [];
15985
15985
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
15986
15986
  const proxyUsage = getProxyUsage?.();
15987
- const evaluatorRawRequest = {
15987
+ const graderRawRequest = {
15988
15988
  command: this.command,
15989
15989
  ...this.cwd ? { cwd: this.cwd } : {},
15990
15990
  ...proxyUsage ? {
@@ -15999,7 +15999,7 @@ var CodeEvaluator = class {
15999
15999
  verdict: scoreToVerdict(score),
16000
16000
  assertions,
16001
16001
  expectedAspectCount: assertions.length || 1,
16002
- evaluatorRawRequest,
16002
+ graderRawRequest,
16003
16003
  ...details ? { details } : {},
16004
16004
  tokenUsage: proxyUsage?.tokenUsage
16005
16005
  };
@@ -16011,7 +16011,7 @@ var CodeEvaluator = class {
16011
16011
  verdict: "fail",
16012
16012
  assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
16013
16013
  expectedAspectCount: 1,
16014
- evaluatorRawRequest: {
16014
+ graderRawRequest: {
16015
16015
  command: this.command,
16016
16016
  ...this.cwd ? { cwd: this.cwd } : {},
16017
16017
  ...proxyUsage ? {
@@ -16060,11 +16060,11 @@ function formatStderr(stderr) {
16060
16060
  ${tail}`;
16061
16061
  }
16062
16062
 
16063
- // src/evaluation/evaluators/composite.ts
16063
+ // src/evaluation/graders/composite.ts
16064
16064
  init_cjs_shims();
16065
16065
  var import_ai3 = require("ai");
16066
16066
 
16067
- // src/evaluation/evaluators/llm-grader.ts
16067
+ // src/evaluation/graders/llm-grader.ts
16068
16068
  init_cjs_shims();
16069
16069
  var import_promises32 = __toESM(require("fs/promises"), 1);
16070
16070
  var import_node_path41 = __toESM(require("path"), 1);
@@ -16105,7 +16105,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
16105
16105
  ".so",
16106
16106
  ".dylib"
16107
16107
  ]);
16108
- var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
16108
+ var DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
16109
16109
 
16110
16110
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
16111
16111
 
@@ -16160,19 +16160,19 @@ function resolveContentBasePath(context2) {
16160
16160
  }
16161
16161
  return void 0;
16162
16162
  }
16163
- var LlmGraderEvaluator = class {
16163
+ var LlmGrader = class {
16164
16164
  kind = "llm-grader";
16165
16165
  resolveGraderProvider;
16166
16166
  maxOutputTokens;
16167
16167
  temperature;
16168
- evaluatorTemplate;
16168
+ graderTemplate;
16169
16169
  maxSteps;
16170
16170
  graderTargetProvider;
16171
16171
  constructor(options) {
16172
16172
  this.resolveGraderProvider = options.resolveGraderProvider ?? options.resolveJudgeProvider;
16173
16173
  this.maxOutputTokens = options.maxOutputTokens;
16174
16174
  this.temperature = options.temperature;
16175
- this.evaluatorTemplate = options.evaluatorTemplate;
16175
+ this.graderTemplate = options.graderTemplate;
16176
16176
  this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT);
16177
16177
  this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider;
16178
16178
  }
@@ -16235,16 +16235,16 @@ var LlmGraderEvaluator = class {
16235
16235
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
16236
16236
  };
16237
16237
  const systemPrompt = buildOutputSchema();
16238
- const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
16239
- warnDeprecatedTemplateVars(evaluatorTemplate);
16240
- let userPrompt = substituteVariables(evaluatorTemplate, variables);
16241
- if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
16238
+ const graderTemplate = context2.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
16239
+ warnDeprecatedTemplateVars(graderTemplate);
16240
+ let userPrompt = substituteVariables(graderTemplate, variables);
16241
+ if (context2.fileChanges && !context2.graderTemplateOverride && !this.graderTemplate) {
16242
16242
  userPrompt += `
16243
16243
 
16244
16244
  [[ ## file_changes ## ]]
16245
16245
  ${context2.fileChanges}`;
16246
16246
  }
16247
- const evaluatorRawRequest = {
16247
+ const graderRawRequest = {
16248
16248
  userPrompt,
16249
16249
  systemPrompt
16250
16250
  };
@@ -16265,7 +16265,7 @@ ${context2.fileChanges}`;
16265
16265
  verdict: scoreToVerdict(score),
16266
16266
  assertions,
16267
16267
  expectedAspectCount: Math.max(assertions.length, 1),
16268
- evaluatorRawRequest,
16268
+ graderRawRequest,
16269
16269
  graderTarget: graderProvider.targetName,
16270
16270
  details: data.details,
16271
16271
  tokenUsage
@@ -16279,7 +16279,7 @@ ${context2.fileChanges}`;
16279
16279
  verdict: "skip",
16280
16280
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
16281
16281
  expectedAspectCount: 1,
16282
- evaluatorRawRequest,
16282
+ graderRawRequest,
16283
16283
  graderTarget: graderProvider.targetName
16284
16284
  };
16285
16285
  }
@@ -16296,7 +16296,7 @@ ${context2.fileChanges}`;
16296
16296
  }
16297
16297
  const prompt = this.buildRubricPrompt(context2, rubrics);
16298
16298
  const systemPrompt = buildRubricOutputSchema();
16299
- const evaluatorRawRequest = {
16299
+ const graderRawRequest = {
16300
16300
  userPrompt: prompt,
16301
16301
  systemPrompt
16302
16302
  };
@@ -16316,7 +16316,7 @@ ${context2.fileChanges}`;
16316
16316
  verdict,
16317
16317
  assertions,
16318
16318
  expectedAspectCount: rubrics.length,
16319
- evaluatorRawRequest,
16319
+ graderRawRequest,
16320
16320
  graderTarget: graderProvider.targetName,
16321
16321
  tokenUsage
16322
16322
  };
@@ -16329,7 +16329,7 @@ ${context2.fileChanges}`;
16329
16329
  verdict: "skip",
16330
16330
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
16331
16331
  expectedAspectCount: rubrics.length,
16332
- evaluatorRawRequest,
16332
+ graderRawRequest,
16333
16333
  graderTarget: graderProvider.targetName
16334
16334
  };
16335
16335
  }
@@ -16341,7 +16341,7 @@ ${context2.fileChanges}`;
16341
16341
  async evaluateWithScoreRanges(context2, graderProvider, rubrics) {
16342
16342
  const prompt = this.buildScoreRangePrompt(context2, rubrics);
16343
16343
  const systemPrompt = buildScoreRangeOutputSchema();
16344
- const evaluatorRawRequest = {
16344
+ const graderRawRequest = {
16345
16345
  userPrompt: prompt,
16346
16346
  systemPrompt
16347
16347
  };
@@ -16361,7 +16361,7 @@ ${context2.fileChanges}`;
16361
16361
  verdict,
16362
16362
  assertions,
16363
16363
  expectedAspectCount: rubrics.length,
16364
- evaluatorRawRequest,
16364
+ graderRawRequest,
16365
16365
  graderTarget: graderProvider.targetName,
16366
16366
  details,
16367
16367
  tokenUsage
@@ -16375,7 +16375,7 @@ ${context2.fileChanges}`;
16375
16375
  verdict: "skip",
16376
16376
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
16377
16377
  expectedAspectCount: rubrics.length,
16378
- evaluatorRawRequest,
16378
+ graderRawRequest,
16379
16379
  graderTarget: graderProvider.targetName
16380
16380
  };
16381
16381
  }
@@ -16404,7 +16404,7 @@ ${context2.fileChanges}`;
16404
16404
  const config = context2.evaluator;
16405
16405
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
16406
16406
  const fsTools = createFilesystemTools(workspacePath);
16407
- const evaluatorRawRequest = {
16407
+ const graderRawRequest = {
16408
16408
  mode: "built-in",
16409
16409
  systemPrompt,
16410
16410
  userPrompt,
@@ -16428,7 +16428,7 @@ ${context2.fileChanges}`;
16428
16428
  return this.parseAgentResult(
16429
16429
  text,
16430
16430
  rubrics,
16431
- evaluatorRawRequest,
16431
+ graderRawRequest,
16432
16432
  details,
16433
16433
  graderProvider.targetName
16434
16434
  );
@@ -16439,7 +16439,7 @@ ${context2.fileChanges}`;
16439
16439
  verdict: "fail",
16440
16440
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
16441
16441
  expectedAspectCount: 1,
16442
- evaluatorRawRequest,
16442
+ graderRawRequest,
16443
16443
  graderTarget: graderProvider.targetName,
16444
16444
  details: { mode: "built-in", error: message }
16445
16445
  };
@@ -16471,7 +16471,7 @@ ${context2.fileChanges}`;
16471
16471
  async evaluateWithDelegate(context2, provider, modeLabel) {
16472
16472
  const workspacePath = context2.workspacePath;
16473
16473
  const prompt = this.buildDelegatedPrompt(context2);
16474
- const evaluatorRawRequest = {
16474
+ const graderRawRequest = {
16475
16475
  mode: modeLabel,
16476
16476
  grader_target: provider.targetName,
16477
16477
  prompt
@@ -16492,7 +16492,7 @@ ${context2.fileChanges}`;
16492
16492
  { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
16493
16493
  ],
16494
16494
  expectedAspectCount: 1,
16495
- evaluatorRawRequest,
16495
+ graderRawRequest,
16496
16496
  graderTarget: provider.targetName,
16497
16497
  details: { mode: modeLabel, grader_target: provider.targetName }
16498
16498
  };
@@ -16506,7 +16506,7 @@ ${context2.fileChanges}`;
16506
16506
  return this.parseAgentResult(
16507
16507
  assistantContent,
16508
16508
  rubrics,
16509
- evaluatorRawRequest,
16509
+ graderRawRequest,
16510
16510
  details,
16511
16511
  provider.targetName
16512
16512
  );
@@ -16519,7 +16519,7 @@ ${context2.fileChanges}`;
16519
16519
  { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
16520
16520
  ],
16521
16521
  expectedAspectCount: 1,
16522
- evaluatorRawRequest,
16522
+ graderRawRequest,
16523
16523
  graderTarget: provider.targetName,
16524
16524
  details: {
16525
16525
  mode: modeLabel,
@@ -16540,7 +16540,7 @@ ${context2.fileChanges}`;
16540
16540
  const config = context2.evaluator;
16541
16541
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
16542
16542
  const parts = [
16543
- "You are an expert evaluator with access to the workspace filesystem.",
16543
+ "You are an expert grader with access to the workspace filesystem.",
16544
16544
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
16545
16545
  "Thoroughly examine relevant files before making your assessment.",
16546
16546
  ""
@@ -16569,9 +16569,9 @@ ${context2.fileChanges}`;
16569
16569
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
16570
16570
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
16571
16571
  };
16572
- if (this.evaluatorTemplate) {
16573
- warnDeprecatedTemplateVars(this.evaluatorTemplate);
16574
- return substituteVariables(this.evaluatorTemplate, variables);
16572
+ if (this.graderTemplate) {
16573
+ warnDeprecatedTemplateVars(this.graderTemplate);
16574
+ return substituteVariables(this.graderTemplate, variables);
16575
16575
  }
16576
16576
  const config = context2.evaluator;
16577
16577
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
@@ -16618,7 +16618,7 @@ ${context2.fileChanges}`;
16618
16618
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
16619
16619
  const config = context2.evaluator;
16620
16620
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
16621
- if (this.evaluatorTemplate) {
16621
+ if (this.graderTemplate) {
16622
16622
  const variables = {
16623
16623
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
16624
16624
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -16630,15 +16630,15 @@ ${context2.fileChanges}`;
16630
16630
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
16631
16631
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
16632
16632
  };
16633
- warnDeprecatedTemplateVars(this.evaluatorTemplate);
16634
- const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
16633
+ warnDeprecatedTemplateVars(this.graderTemplate);
16634
+ const customPrompt = substituteVariables(this.graderTemplate, variables);
16635
16635
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
16636
16636
  return `${customPrompt}
16637
16637
 
16638
16638
  ${outputSchema}`;
16639
16639
  }
16640
16640
  const parts = [
16641
- "You are an expert evaluator. Investigate the workspace to verify the criteria are met.",
16641
+ "You are an expert grader. Investigate the workspace to verify the criteria are met.",
16642
16642
  "",
16643
16643
  "[[ ## question ## ]]",
16644
16644
  formattedQuestion,
@@ -16675,7 +16675,7 @@ ${outputSchema}`;
16675
16675
  * Parse the agent's response text into an EvaluationScore.
16676
16676
  * Supports both freeform and rubric modes.
16677
16677
  */
16678
- parseAgentResult(text, rubrics, evaluatorRawRequest, details, graderTarget) {
16678
+ parseAgentResult(text, rubrics, graderRawRequest, details, graderTarget) {
16679
16679
  try {
16680
16680
  const parsed = parseJsonFromText(text);
16681
16681
  if (rubrics && rubrics.length > 0) {
@@ -16686,7 +16686,7 @@ ${outputSchema}`;
16686
16686
  verdict,
16687
16687
  assertions: assertions2,
16688
16688
  expectedAspectCount: rubrics.length,
16689
- evaluatorRawRequest,
16689
+ graderRawRequest,
16690
16690
  graderTarget,
16691
16691
  details
16692
16692
  };
@@ -16699,7 +16699,7 @@ ${outputSchema}`;
16699
16699
  verdict: scoreToVerdict(score),
16700
16700
  assertions,
16701
16701
  expectedAspectCount: Math.max(assertions.length, 1),
16702
- evaluatorRawRequest,
16702
+ graderRawRequest,
16703
16703
  graderTarget,
16704
16704
  details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
16705
16705
  };
@@ -16714,7 +16714,7 @@ ${outputSchema}`;
16714
16714
  }
16715
16715
  ],
16716
16716
  expectedAspectCount: 1,
16717
- evaluatorRawRequest,
16717
+ graderRawRequest,
16718
16718
  graderTarget,
16719
16719
  details
16720
16720
  };
@@ -16729,7 +16729,7 @@ ${outputSchema}`;
16729
16729
  buildScoreRangePrompt(context2, rubrics) {
16730
16730
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
16731
16731
  const parts = [
16732
- "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
16732
+ "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
16733
16733
  "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
16734
16734
  "",
16735
16735
  "[[ ## question ## ]]",
@@ -16772,7 +16772,7 @@ ${outputSchema}`;
16772
16772
  buildRubricPrompt(context2, rubrics) {
16773
16773
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
16774
16774
  const parts = [
16775
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
16775
+ "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
16776
16776
  "",
16777
16777
  "[[ ## question ## ]]",
16778
16778
  formattedQuestion,
@@ -16946,7 +16946,7 @@ function sumTokenUsage(first, second) {
16946
16946
  };
16947
16947
  }
16948
16948
  function buildRubricOutputSchema() {
16949
- return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
16949
+ return `You are an expert grader. Evaluate the candidate answer against each rubric item.
16950
16950
  You must return a valid JSON object matching this schema:
16951
16951
  {
16952
16952
  "checks": [
@@ -16980,7 +16980,7 @@ function warnDeprecatedTemplateVars(template) {
16980
16980
  console.warn(
16981
16981
  `${ANSI_YELLOW8}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
16982
16982
  ${used.join("\n ")}
16983
- Update your custom evaluator template to use the new names.${ANSI_RESET9}`
16983
+ Update your custom grader template to use the new names.${ANSI_RESET9}`
16984
16984
  );
16985
16985
  }
16986
16986
  }
@@ -17012,7 +17012,7 @@ function calculateRubricScore(result, rubrics) {
17012
17012
  return { score, verdict, assertions };
17013
17013
  }
17014
17014
  function buildScoreRangeOutputSchema() {
17015
- return `You are an expert evaluator. Score the candidate answer on each criterion.
17015
+ return `You are an expert grader. Score the candidate answer on each criterion.
17016
17016
  You must return a valid JSON object matching this schema:
17017
17017
  {
17018
17018
  "checks": [
@@ -17220,13 +17220,13 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
17220
17220
  }
17221
17221
  }
17222
17222
 
17223
- // src/evaluation/evaluators/composite.ts
17223
+ // src/evaluation/graders/composite.ts
17224
17224
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
17225
17225
  {{EVALUATOR_RESULTS_JSON}}
17226
17226
 
17227
- Decide the final score and verdict based on all evaluator results.
17227
+ Decide the final score and verdict based on all grader results.
17228
17228
  Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
17229
- var CompositeEvaluator = class {
17229
+ var CompositeGrader = class {
17230
17230
  kind = "composite";
17231
17231
  config;
17232
17232
  evaluatorFactory;
@@ -17277,7 +17277,7 @@ var CompositeEvaluator = class {
17277
17277
  weight,
17278
17278
  verdict: member.result.verdict,
17279
17279
  assertions: [...member.result.assertions],
17280
- evaluatorRawRequest: member.result.evaluatorRawRequest,
17280
+ graderRawRequest: member.result.graderRawRequest,
17281
17281
  scores: member.result.scores,
17282
17282
  details: member.result.details,
17283
17283
  tokenUsage: member.result.tokenUsage
@@ -17298,7 +17298,7 @@ var CompositeEvaluator = class {
17298
17298
  verdict: "skip",
17299
17299
  assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
17300
17300
  expectedAspectCount: 1,
17301
- evaluatorRawRequest: {
17301
+ graderRawRequest: {
17302
17302
  aggregator: "weighted_average",
17303
17303
  ...weights ? { weights } : {}
17304
17304
  },
@@ -17311,7 +17311,7 @@ var CompositeEvaluator = class {
17311
17311
  verdict: scoreToVerdict(finalScore),
17312
17312
  assertions: allAssertions,
17313
17313
  expectedAspectCount: allAssertions.length || 1,
17314
- evaluatorRawRequest: {
17314
+ graderRawRequest: {
17315
17315
  aggregator: "weighted_average",
17316
17316
  ...weights ? { weights } : {}
17317
17317
  },
@@ -17330,7 +17330,7 @@ var CompositeEvaluator = class {
17330
17330
  score: member.result.score,
17331
17331
  verdict: member.result.verdict,
17332
17332
  assertions: [...member.result.assertions],
17333
- evaluatorRawRequest: member.result.evaluatorRawRequest,
17333
+ graderRawRequest: member.result.graderRawRequest,
17334
17334
  scores: member.result.scores,
17335
17335
  details: member.result.details,
17336
17336
  tokenUsage: member.result.tokenUsage
@@ -17353,7 +17353,7 @@ var CompositeEvaluator = class {
17353
17353
  verdict: "skip",
17354
17354
  assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
17355
17355
  expectedAspectCount: 1,
17356
- evaluatorRawRequest: {
17356
+ graderRawRequest: {
17357
17357
  aggregator: "threshold",
17358
17358
  threshold
17359
17359
  },
@@ -17372,7 +17372,7 @@ var CompositeEvaluator = class {
17372
17372
  verdict: pass ? "pass" : "fail",
17373
17373
  assertions: allAssertions,
17374
17374
  expectedAspectCount: allAssertions.length || 1,
17375
- evaluatorRawRequest: {
17375
+ graderRawRequest: {
17376
17376
  aggregator: "threshold",
17377
17377
  threshold
17378
17378
  },
@@ -17389,7 +17389,7 @@ var CompositeEvaluator = class {
17389
17389
  weight: weights?.[member.id] ?? 1,
17390
17390
  verdict: member.result.verdict,
17391
17391
  assertions: [...member.result.assertions],
17392
- evaluatorRawRequest: member.result.evaluatorRawRequest,
17392
+ graderRawRequest: member.result.graderRawRequest,
17393
17393
  scores: member.result.scores,
17394
17394
  details: member.result.details
17395
17395
  }));
@@ -17410,7 +17410,7 @@ var CompositeEvaluator = class {
17410
17410
  verdict,
17411
17411
  assertions,
17412
17412
  expectedAspectCount: assertions.length || 1,
17413
- evaluatorRawRequest: {
17413
+ graderRawRequest: {
17414
17414
  aggregator: "code-grader",
17415
17415
  script: scriptPath
17416
17416
  },
@@ -17423,7 +17423,7 @@ var CompositeEvaluator = class {
17423
17423
  verdict: "fail",
17424
17424
  assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
17425
17425
  expectedAspectCount: 1,
17426
- evaluatorRawRequest: {
17426
+ graderRawRequest: {
17427
17427
  aggregator: "code-grader",
17428
17428
  script: scriptPath,
17429
17429
  error: message
@@ -17445,14 +17445,14 @@ var CompositeEvaluator = class {
17445
17445
  score: member.result.score,
17446
17446
  verdict: member.result.verdict,
17447
17447
  assertions: [...member.result.assertions],
17448
- evaluatorRawRequest: member.result.evaluatorRawRequest,
17448
+ graderRawRequest: member.result.graderRawRequest,
17449
17449
  scores: member.result.scores,
17450
17450
  details: member.result.details
17451
17451
  }));
17452
17452
  const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
17453
17453
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
17454
17454
  const systemPrompt = buildOutputSchema();
17455
- const evaluatorRawRequest = {
17455
+ const graderRawRequest = {
17456
17456
  aggregator: "llm-grader",
17457
17457
  userPrompt,
17458
17458
  systemPrompt,
@@ -17474,7 +17474,7 @@ var CompositeEvaluator = class {
17474
17474
  verdict: scoreToVerdict(score2),
17475
17475
  assertions: assertions2,
17476
17476
  expectedAspectCount: Math.max(assertions2.length, 1),
17477
- evaluatorRawRequest,
17477
+ graderRawRequest,
17478
17478
  scores
17479
17479
  };
17480
17480
  }
@@ -17494,7 +17494,7 @@ var CompositeEvaluator = class {
17494
17494
  verdict: scoreToVerdict(score),
17495
17495
  assertions,
17496
17496
  expectedAspectCount: Math.max(assertions.length, 1),
17497
- evaluatorRawRequest,
17497
+ graderRawRequest,
17498
17498
  scores
17499
17499
  };
17500
17500
  } catch {
@@ -17503,16 +17503,16 @@ var CompositeEvaluator = class {
17503
17503
  verdict: "fail",
17504
17504
  assertions: [{ text: "LLM aggregator failed", passed: false }],
17505
17505
  expectedAspectCount: 1,
17506
- evaluatorRawRequest,
17506
+ graderRawRequest,
17507
17507
  scores
17508
17508
  };
17509
17509
  }
17510
17510
  }
17511
17511
  };
17512
17512
 
17513
- // src/evaluation/evaluators/cost.ts
17513
+ // src/evaluation/graders/cost.ts
17514
17514
  init_cjs_shims();
17515
- var CostEvaluator = class {
17515
+ var CostGrader = class {
17516
17516
  kind = "cost";
17517
17517
  config;
17518
17518
  constructor(options) {
@@ -17527,7 +17527,7 @@ var CostEvaluator = class {
17527
17527
  verdict: "fail",
17528
17528
  assertions: [{ text: "No cost data available in trace", passed: false }],
17529
17529
  expectedAspectCount: 1,
17530
- evaluatorRawRequest: {
17530
+ graderRawRequest: {
17531
17531
  type: "cost",
17532
17532
  budget,
17533
17533
  costUsd: null
@@ -17544,7 +17544,7 @@ var CostEvaluator = class {
17544
17544
  passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
17545
17545
  ],
17546
17546
  expectedAspectCount: 1,
17547
- evaluatorRawRequest: {
17547
+ graderRawRequest: {
17548
17548
  type: "cost",
17549
17549
  budget,
17550
17550
  costUsd
@@ -17553,9 +17553,9 @@ var CostEvaluator = class {
17553
17553
  }
17554
17554
  };
17555
17555
 
17556
- // src/evaluation/evaluators/execution-metrics.ts
17556
+ // src/evaluation/graders/execution-metrics.ts
17557
17557
  init_cjs_shims();
17558
- var ExecutionMetricsEvaluator = class {
17558
+ var ExecutionMetricsGrader = class {
17559
17559
  kind = "execution-metrics";
17560
17560
  config;
17561
17561
  constructor(options) {
@@ -17579,7 +17579,7 @@ var ExecutionMetricsEvaluator = class {
17579
17579
  verdict: "fail",
17580
17580
  assertions: [{ text: "No trace summary available", passed: false }],
17581
17581
  expectedAspectCount: 1,
17582
- evaluatorRawRequest: {
17582
+ graderRawRequest: {
17583
17583
  type: "execution-metrics",
17584
17584
  config: this.extractConfiguredThresholds(),
17585
17585
  actual: null
@@ -17695,7 +17695,7 @@ var ExecutionMetricsEvaluator = class {
17695
17695
  verdict: scoreToVerdict(score),
17696
17696
  assertions,
17697
17697
  expectedAspectCount: totalChecks || 1,
17698
- evaluatorRawRequest: {
17698
+ graderRawRequest: {
17699
17699
  type: "execution-metrics",
17700
17700
  config: this.extractConfiguredThresholds(),
17701
17701
  actual: this.filterDefinedMetrics(actualMetrics)
@@ -17738,7 +17738,7 @@ var ExecutionMetricsEvaluator = class {
17738
17738
  }
17739
17739
  };
17740
17740
 
17741
- // src/evaluation/evaluators/field-accuracy.ts
17741
+ // src/evaluation/graders/field-accuracy.ts
17742
17742
  init_cjs_shims();
17743
17743
  var DEFAULT_DATE_FORMATS = [
17744
17744
  "YYYY-MM-DDTHH:mm:ssZ",
@@ -17784,7 +17784,7 @@ var MONTH_NAMES = {
17784
17784
  dec: 11,
17785
17785
  december: 11
17786
17786
  };
17787
- var FieldAccuracyEvaluator = class {
17787
+ var FieldAccuracyGrader = class {
17788
17788
  kind = "field-accuracy";
17789
17789
  config;
17790
17790
  constructor(options) {
@@ -18143,9 +18143,9 @@ function parseJsonFromTextSafe(text) {
18143
18143
  return parseJsonFromText(text);
18144
18144
  }
18145
18145
 
18146
- // src/evaluation/evaluators/latency.ts
18146
+ // src/evaluation/graders/latency.ts
18147
18147
  init_cjs_shims();
18148
- var LatencyEvaluator = class {
18148
+ var LatencyGrader = class {
18149
18149
  kind = "latency";
18150
18150
  config;
18151
18151
  constructor(options) {
@@ -18160,7 +18160,7 @@ var LatencyEvaluator = class {
18160
18160
  verdict: "fail",
18161
18161
  assertions: [{ text: "No duration data available in trace", passed: false }],
18162
18162
  expectedAspectCount: 1,
18163
- evaluatorRawRequest: {
18163
+ graderRawRequest: {
18164
18164
  type: "latency",
18165
18165
  threshold,
18166
18166
  durationMs: null
@@ -18176,7 +18176,7 @@ var LatencyEvaluator = class {
18176
18176
  passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
18177
18177
  ],
18178
18178
  expectedAspectCount: 1,
18179
- evaluatorRawRequest: {
18179
+ graderRawRequest: {
18180
18180
  type: "latency",
18181
18181
  threshold,
18182
18182
  durationMs
@@ -18185,9 +18185,9 @@ var LatencyEvaluator = class {
18185
18185
  }
18186
18186
  };
18187
18187
 
18188
- // src/evaluation/evaluators/skill-trigger.ts
18188
+ // src/evaluation/graders/skill-trigger.ts
18189
18189
  init_cjs_shims();
18190
- var SkillTriggerEvaluator = class {
18190
+ var SkillTriggerGrader = class {
18191
18191
  kind = "skill-trigger";
18192
18192
  config;
18193
18193
  constructor(config) {
@@ -18254,7 +18254,7 @@ var SkillTriggerEvaluator = class {
18254
18254
  }
18255
18255
  };
18256
18256
 
18257
- // src/evaluation/evaluators/llm-grader-prompt.ts
18257
+ // src/evaluation/graders/llm-grader-prompt.ts
18258
18258
  init_cjs_shims();
18259
18259
  function assembleLlmGraderPrompt(input) {
18260
18260
  const {
@@ -18263,7 +18263,7 @@ function assembleLlmGraderPrompt(input) {
18263
18263
  promptInputs,
18264
18264
  evaluatorConfig,
18265
18265
  fileChanges,
18266
- evaluatorTemplateOverride
18266
+ graderTemplateOverride
18267
18267
  } = input;
18268
18268
  const rubrics = evaluatorConfig?.rubrics;
18269
18269
  if (rubrics && rubrics.length > 0) {
@@ -18273,15 +18273,9 @@ function assembleLlmGraderPrompt(input) {
18273
18273
  }
18274
18274
  return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
18275
18275
  }
18276
- return assembleFreeform(
18277
- evalCase,
18278
- candidate,
18279
- promptInputs,
18280
- fileChanges,
18281
- evaluatorTemplateOverride
18282
- );
18276
+ return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
18283
18277
  }
18284
- function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
18278
+ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
18285
18279
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
18286
18280
  const variables = {
18287
18281
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -18295,9 +18289,9 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
18295
18289
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
18296
18290
  };
18297
18291
  const systemPrompt = buildOutputSchema();
18298
- const template = evaluatorTemplateOverride ?? DEFAULT_EVALUATOR_TEMPLATE;
18292
+ const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
18299
18293
  let userPrompt = substituteVariables(template, variables);
18300
- if (fileChanges && !evaluatorTemplateOverride) {
18294
+ if (fileChanges && !graderTemplateOverride) {
18301
18295
  userPrompt += `
18302
18296
 
18303
18297
  [[ ## file_changes ## ]]
@@ -18313,7 +18307,7 @@ ${fileChanges}`;
18313
18307
  function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
18314
18308
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
18315
18309
  const parts = [
18316
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
18310
+ "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
18317
18311
  "",
18318
18312
  "[[ ## question ## ]]",
18319
18313
  formattedQuestion,
@@ -18348,7 +18342,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
18348
18342
  function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
18349
18343
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
18350
18344
  const parts = [
18351
- "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
18345
+ "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
18352
18346
  "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
18353
18347
  "",
18354
18348
  "[[ ## question ## ]]",
@@ -18396,9 +18390,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
18396
18390
  };
18397
18391
  }
18398
18392
 
18399
- // src/evaluation/evaluators/token-usage.ts
18393
+ // src/evaluation/graders/token-usage.ts
18400
18394
  init_cjs_shims();
18401
- var TokenUsageEvaluator = class {
18395
+ var TokenUsageGrader = class {
18402
18396
  kind = "token-usage";
18403
18397
  config;
18404
18398
  constructor(options) {
@@ -18419,7 +18413,7 @@ var TokenUsageEvaluator = class {
18419
18413
  verdict: "fail",
18420
18414
  assertions: [{ text: "No token usage data available in trace", passed: false }],
18421
18415
  expectedAspectCount,
18422
- evaluatorRawRequest: {
18416
+ graderRawRequest: {
18423
18417
  type: "token-usage",
18424
18418
  max_total: maxTotal ?? null,
18425
18419
  max_input: maxInput ?? null,
@@ -18460,7 +18454,7 @@ var TokenUsageEvaluator = class {
18460
18454
  verdict: passed ? "pass" : "fail",
18461
18455
  assertions,
18462
18456
  expectedAspectCount,
18463
- evaluatorRawRequest: {
18457
+ graderRawRequest: {
18464
18458
  type: "token-usage",
18465
18459
  max_total: maxTotal ?? null,
18466
18460
  max_input: maxInput ?? null,
@@ -18476,7 +18470,7 @@ var TokenUsageEvaluator = class {
18476
18470
  }
18477
18471
  };
18478
18472
 
18479
- // src/evaluation/evaluators/tool-trajectory.ts
18473
+ // src/evaluation/graders/tool-trajectory.ts
18480
18474
  init_cjs_shims();
18481
18475
  function getNestedValue(obj, path56) {
18482
18476
  const parts = path56.split(".");
@@ -18546,7 +18540,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
18546
18540
  message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
18547
18541
  };
18548
18542
  }
18549
- var ToolTrajectoryEvaluator = class {
18543
+ var ToolTrajectoryGrader = class {
18550
18544
  kind = "tool-trajectory";
18551
18545
  config;
18552
18546
  constructor(options) {
@@ -18951,7 +18945,7 @@ var ToolTrajectoryEvaluator = class {
18951
18945
  }
18952
18946
  };
18953
18947
 
18954
- // src/evaluation/evaluators/assertions.ts
18948
+ // src/evaluation/graders/assertions.ts
18955
18949
  init_cjs_shims();
18956
18950
  function runContainsAssertion(output, value) {
18957
18951
  const passed = output.includes(value);
@@ -19255,16 +19249,16 @@ function validateConcurrency(concurrency) {
19255
19249
  // src/evaluation/registry/index.ts
19256
19250
  init_cjs_shims();
19257
19251
 
19258
- // src/evaluation/registry/evaluator-registry.ts
19252
+ // src/evaluation/registry/grader-registry.ts
19259
19253
  init_cjs_shims();
19260
- var EvaluatorRegistry = class {
19254
+ var GraderRegistry = class {
19261
19255
  factories = /* @__PURE__ */ new Map();
19262
- /** Register a factory function for an evaluator type. */
19256
+ /** Register a factory function for a grader type. */
19263
19257
  register(type, factory) {
19264
19258
  this.factories.set(type, factory);
19265
19259
  return this;
19266
19260
  }
19267
- /** Get the factory function for an evaluator type. */
19261
+ /** Get the factory function for a grader type. */
19268
19262
  get(type) {
19269
19263
  return this.factories.get(type);
19270
19264
  }
@@ -19272,25 +19266,25 @@ var EvaluatorRegistry = class {
19272
19266
  has(type) {
19273
19267
  return this.factories.has(type);
19274
19268
  }
19275
- /** List all registered evaluator type names. */
19269
+ /** List all registered grader type names. */
19276
19270
  list() {
19277
19271
  return [...this.factories.keys()];
19278
19272
  }
19279
19273
  /**
19280
19274
  * Create an evaluator instance from a config, using the registered factory.
19281
- * Throws if no factory is registered for the evaluator type.
19275
+ * Throws if no factory is registered for the grader type.
19282
19276
  */
19283
19277
  async create(config, context2) {
19284
19278
  const factory = this.factories.get(config.type);
19285
19279
  if (!factory) {
19286
19280
  throw new Error(
19287
- `Unknown evaluator type: "${config.type}". Registered types: ${this.list().join(", ")}`
19281
+ `Unknown grader type: "${config.type}". Registered types: ${this.list().join(", ")}`
19288
19282
  );
19289
19283
  }
19290
19284
  return factory(config, context2);
19291
19285
  }
19292
19286
  };
19293
- var DeterministicAssertionEvaluator = class {
19287
+ var DeterministicAssertionGrader = class {
19294
19288
  constructor(kind, assertFn) {
19295
19289
  this.assertFn = assertFn;
19296
19290
  this.kind = kind;
@@ -19301,12 +19295,12 @@ var DeterministicAssertionEvaluator = class {
19301
19295
  }
19302
19296
  };
19303
19297
 
19304
- // src/evaluation/registry/builtin-evaluators.ts
19298
+ // src/evaluation/registry/builtin-graders.ts
19305
19299
  init_cjs_shims();
19306
19300
 
19307
- // src/evaluation/evaluators/inline-assert.ts
19301
+ // src/evaluation/graders/inline-assert.ts
19308
19302
  init_cjs_shims();
19309
- var InlineAssertEvaluator = class {
19303
+ var InlineAssertGrader = class {
19310
19304
  constructor(fn, name) {
19311
19305
  this.fn = fn;
19312
19306
  this.name = name;
@@ -19331,7 +19325,7 @@ var InlineAssertEvaluator = class {
19331
19325
  }
19332
19326
  };
19333
19327
 
19334
- // src/evaluation/evaluators/prompt-resolution.ts
19328
+ // src/evaluation/graders/prompt-resolution.ts
19335
19329
  init_cjs_shims();
19336
19330
  var import_node_path42 = __toESM(require("path"), 1);
19337
19331
  async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
@@ -19399,7 +19393,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
19399
19393
  }
19400
19394
  }
19401
19395
 
19402
- // src/evaluation/registry/builtin-evaluators.ts
19396
+ // src/evaluation/registry/builtin-graders.ts
19403
19397
  var INLINE_ASSERT_FN = Symbol.for("agentv.inline-assert-fn");
19404
19398
  var llmGraderFactory = (config, context2) => {
19405
19399
  const c = config;
@@ -19416,7 +19410,7 @@ var llmGraderFactory = (config, context2) => {
19416
19410
  );
19417
19411
  }
19418
19412
  const isAgent = isAgentProvider(graderTargetProvider) || graderTargetProvider.kind === "agentv";
19419
- evaluator = new LlmGraderEvaluator({
19413
+ evaluator = new LlmGrader({
19420
19414
  resolveGraderProvider: async (evalContext) => {
19421
19415
  if (graderTargetProvider) return graderTargetProvider;
19422
19416
  if (evalContext.graderProvider) return evalContext.graderProvider;
@@ -19444,11 +19438,11 @@ var llmGraderFactory = (config, context2) => {
19444
19438
  agentTimeoutMs
19445
19439
  );
19446
19440
  const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
19447
- let evaluatorTemplateOverride;
19441
+ let graderTemplateOverride;
19448
19442
  let evalCase = evalContext.evalCase;
19449
19443
  if (customPrompt) {
19450
19444
  if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
19451
- evaluatorTemplateOverride = customPrompt;
19445
+ graderTemplateOverride = customPrompt;
19452
19446
  } else {
19453
19447
  evalCase = { ...evalCase, criteria: customPrompt };
19454
19448
  }
@@ -19456,7 +19450,7 @@ var llmGraderFactory = (config, context2) => {
19456
19450
  return evaluator.evaluate({
19457
19451
  ...evalContext,
19458
19452
  evalCase,
19459
- evaluatorTemplateOverride,
19453
+ graderTemplateOverride,
19460
19454
  evaluator: c
19461
19455
  });
19462
19456
  }
@@ -19464,7 +19458,7 @@ var llmGraderFactory = (config, context2) => {
19464
19458
  };
19465
19459
  var codeFactory = (config, context2) => {
19466
19460
  const c = config;
19467
- return new CodeEvaluator({
19461
+ return new CodeGrader({
19468
19462
  command: c.command ?? c.script ?? [],
19469
19463
  cwd: c.resolvedCwd ?? c.cwd,
19470
19464
  agentTimeoutMs: context2.agentTimeoutMs,
@@ -19475,19 +19469,19 @@ var codeFactory = (config, context2) => {
19475
19469
  var compositeFactory = (config, context2) => {
19476
19470
  const c = config;
19477
19471
  const evalFileDir = context2.evalFileDir ?? process.cwd();
19478
- return new CompositeEvaluator({
19472
+ return new CompositeGrader({
19479
19473
  config: c,
19480
19474
  cwd: evalFileDir,
19481
19475
  evaluatorFactory: {
19482
19476
  create: (memberConfig) => {
19483
19477
  const factory = context2.registry.get(memberConfig.type);
19484
19478
  if (!factory) {
19485
- throw new Error(`Unsupported evaluator type in composite: ${memberConfig.type}`);
19479
+ throw new Error(`Unsupported grader type in composite: ${memberConfig.type}`);
19486
19480
  }
19487
19481
  const result = factory(memberConfig, context2);
19488
19482
  if (result instanceof Promise) {
19489
19483
  throw new Error(
19490
- `Evaluator factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
19484
+ `Grader factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
19491
19485
  );
19492
19486
  }
19493
19487
  return result;
@@ -19496,35 +19490,35 @@ var compositeFactory = (config, context2) => {
19496
19490
  });
19497
19491
  };
19498
19492
  var toolTrajectoryFactory = (config) => {
19499
- return new ToolTrajectoryEvaluator({
19493
+ return new ToolTrajectoryGrader({
19500
19494
  config
19501
19495
  });
19502
19496
  };
19503
19497
  var fieldAccuracyFactory = (config) => {
19504
- return new FieldAccuracyEvaluator({
19498
+ return new FieldAccuracyGrader({
19505
19499
  config
19506
19500
  });
19507
19501
  };
19508
19502
  var latencyFactory = (config) => {
19509
- return new LatencyEvaluator({ config });
19503
+ return new LatencyGrader({ config });
19510
19504
  };
19511
19505
  var costFactory = (config) => {
19512
- return new CostEvaluator({ config });
19506
+ return new CostGrader({ config });
19513
19507
  };
19514
19508
  var tokenUsageFactory = (config) => {
19515
- return new TokenUsageEvaluator({ config });
19509
+ return new TokenUsageGrader({ config });
19516
19510
  };
19517
19511
  var executionMetricsFactory = (config) => {
19518
- return new ExecutionMetricsEvaluator({
19512
+ return new ExecutionMetricsGrader({
19519
19513
  config
19520
19514
  });
19521
19515
  };
19522
19516
  var skillTriggerFactory = (config) => {
19523
- return new SkillTriggerEvaluator(config);
19517
+ return new SkillTriggerGrader(config);
19524
19518
  };
19525
19519
  var containsFactory = (config) => {
19526
19520
  const c = config;
19527
- return new DeterministicAssertionEvaluator("contains", (ctx) => {
19521
+ return new DeterministicAssertionGrader("contains", (ctx) => {
19528
19522
  const result = runContainsAssertion(ctx.candidate, c.value);
19529
19523
  return {
19530
19524
  score: result.score,
@@ -19536,7 +19530,7 @@ var containsFactory = (config) => {
19536
19530
  };
19537
19531
  var regexFactory = (config) => {
19538
19532
  const c = config;
19539
- return new DeterministicAssertionEvaluator("regex", (ctx) => {
19533
+ return new DeterministicAssertionGrader("regex", (ctx) => {
19540
19534
  const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
19541
19535
  return {
19542
19536
  score: result.score,
@@ -19547,7 +19541,7 @@ var regexFactory = (config) => {
19547
19541
  });
19548
19542
  };
19549
19543
  var isJsonFactory = () => {
19550
- return new DeterministicAssertionEvaluator("is-json", (ctx) => {
19544
+ return new DeterministicAssertionGrader("is-json", (ctx) => {
19551
19545
  const result = runIsJsonAssertion(ctx.candidate);
19552
19546
  return {
19553
19547
  score: result.score,
@@ -19559,7 +19553,7 @@ var isJsonFactory = () => {
19559
19553
  };
19560
19554
  var equalsFactory = (config) => {
19561
19555
  const c = config;
19562
- return new DeterministicAssertionEvaluator("equals", (ctx) => {
19556
+ return new DeterministicAssertionGrader("equals", (ctx) => {
19563
19557
  const result = runEqualsAssertion(ctx.candidate, c.value);
19564
19558
  return {
19565
19559
  score: result.score,
@@ -19571,7 +19565,7 @@ var equalsFactory = (config) => {
19571
19565
  };
19572
19566
  var containsAnyFactory = (config) => {
19573
19567
  const c = config;
19574
- return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
19568
+ return new DeterministicAssertionGrader("contains-any", (ctx) => {
19575
19569
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
19576
19570
  return {
19577
19571
  score: result.score,
@@ -19583,7 +19577,7 @@ var containsAnyFactory = (config) => {
19583
19577
  };
19584
19578
  var containsAllFactory = (config) => {
19585
19579
  const c = config;
19586
- return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
19580
+ return new DeterministicAssertionGrader("contains-all", (ctx) => {
19587
19581
  const result = runContainsAllAssertion(ctx.candidate, c.value);
19588
19582
  return {
19589
19583
  score: result.score,
@@ -19595,7 +19589,7 @@ var containsAllFactory = (config) => {
19595
19589
  };
19596
19590
  var icontainsFactory = (config) => {
19597
19591
  const c = config;
19598
- return new DeterministicAssertionEvaluator("icontains", (ctx) => {
19592
+ return new DeterministicAssertionGrader("icontains", (ctx) => {
19599
19593
  const result = runIcontainsAssertion(ctx.candidate, c.value);
19600
19594
  return {
19601
19595
  score: result.score,
@@ -19607,7 +19601,7 @@ var icontainsFactory = (config) => {
19607
19601
  };
19608
19602
  var icontainsAnyFactory = (config) => {
19609
19603
  const c = config;
19610
- return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
19604
+ return new DeterministicAssertionGrader("icontains-any", (ctx) => {
19611
19605
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
19612
19606
  return {
19613
19607
  score: result.score,
@@ -19619,7 +19613,7 @@ var icontainsAnyFactory = (config) => {
19619
19613
  };
19620
19614
  var icontainsAllFactory = (config) => {
19621
19615
  const c = config;
19622
- return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
19616
+ return new DeterministicAssertionGrader("icontains-all", (ctx) => {
19623
19617
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
19624
19618
  return {
19625
19619
  score: result.score,
@@ -19631,7 +19625,7 @@ var icontainsAllFactory = (config) => {
19631
19625
  };
19632
19626
  var startsWithFactory = (config) => {
19633
19627
  const c = config;
19634
- return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
19628
+ return new DeterministicAssertionGrader("starts-with", (ctx) => {
19635
19629
  const result = runStartsWithAssertion(ctx.candidate, c.value);
19636
19630
  return {
19637
19631
  score: result.score,
@@ -19643,7 +19637,7 @@ var startsWithFactory = (config) => {
19643
19637
  };
19644
19638
  var endsWithFactory = (config) => {
19645
19639
  const c = config;
19646
- return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
19640
+ return new DeterministicAssertionGrader("ends-with", (ctx) => {
19647
19641
  const result = runEndsWithAssertion(ctx.candidate, c.value);
19648
19642
  return {
19649
19643
  score: result.score,
@@ -19654,7 +19648,7 @@ var endsWithFactory = (config) => {
19654
19648
  });
19655
19649
  };
19656
19650
  function createBuiltinRegistry() {
19657
- const registry = new EvaluatorRegistry();
19651
+ const registry = new GraderRegistry();
19658
19652
  registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
19659
19653
  const fn = config[INLINE_ASSERT_FN];
19660
19654
  if (!fn) {
@@ -19662,7 +19656,7 @@ function createBuiltinRegistry() {
19662
19656
  `No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`
19663
19657
  );
19664
19658
  }
19665
- return new InlineAssertEvaluator(fn, config.name ?? "inline-assert");
19659
+ return new InlineAssertGrader(fn, config.name ?? "inline-assert");
19666
19660
  });
19667
19661
  return registry;
19668
19662
  }
@@ -19700,7 +19694,7 @@ async function discoverAssertions(registry, baseDir) {
19700
19694
  continue;
19701
19695
  }
19702
19696
  const factory = (_config, context2) => {
19703
- return new CodeEvaluator({
19697
+ return new CodeGrader({
19704
19698
  command: ["bun", "run", filePath],
19705
19699
  agentTimeoutMs: context2.agentTimeoutMs
19706
19700
  });
@@ -19745,7 +19739,7 @@ async function discoverGraders(registry, baseDir) {
19745
19739
  continue;
19746
19740
  }
19747
19741
  const factory = (_config, context2) => {
19748
- return new CodeEvaluator({
19742
+ return new CodeGrader({
19749
19743
  command: ["bun", "run", filePath],
19750
19744
  agentTimeoutMs: context2.agentTimeoutMs
19751
19745
  });
@@ -20594,10 +20588,10 @@ function buildSkippedEvaluatorError(scores) {
20594
20588
  }
20595
20589
  const messages = skippedScores.map((score) => {
20596
20590
  const label = score.name || score.type;
20597
- const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
20591
+ const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Grader skipped";
20598
20592
  return `${label}: ${assertionMessage}`;
20599
20593
  });
20600
- return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
20594
+ return messages.length === 1 ? messages[0] : `Graders skipped: ${messages.join(" | ")}`;
20601
20595
  }
20602
20596
  function usesFileReferencePrompt(provider) {
20603
20597
  return isAgentProvider(provider) || provider.kind === "cli";
@@ -20766,7 +20760,7 @@ async function runEvaluation(options) {
20766
20760
  cleanupWorkspaces,
20767
20761
  trials,
20768
20762
  streamCallbacks,
20769
- totalBudgetUsd,
20763
+ budgetUsd,
20770
20764
  failOnError,
20771
20765
  poolWorkspaces,
20772
20766
  poolMaxSlots: configPoolMaxSlots,
@@ -21295,7 +21289,7 @@ async function runEvaluation(options) {
21295
21289
  async function dispatchTest(evalCase, depResults) {
21296
21290
  const workerId = nextWorkerId++;
21297
21291
  workerIdByEvalId.set(evalCase.id, workerId);
21298
- if (totalBudgetUsd !== void 0 && budgetExhausted) {
21292
+ if (budgetUsd !== void 0 && budgetExhausted) {
21299
21293
  const budgetResult = {
21300
21294
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
21301
21295
  testId: evalCase.id,
@@ -21305,13 +21299,13 @@ async function runEvaluation(options) {
21305
21299
  assertions: [],
21306
21300
  output: [],
21307
21301
  target: target.name,
21308
- error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
21302
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
21309
21303
  budgetExceeded: true,
21310
21304
  executionStatus: "execution_error",
21311
21305
  failureStage: "setup",
21312
21306
  failureReasonCode: "budget_exceeded",
21313
21307
  executionError: {
21314
- message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
21308
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
21315
21309
  stage: "setup"
21316
21310
  }
21317
21311
  };
@@ -21408,7 +21402,7 @@ async function runEvaluation(options) {
21408
21402
  ...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
21409
21403
  };
21410
21404
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
21411
- if (totalBudgetUsd !== void 0) {
21405
+ if (budgetUsd !== void 0) {
21412
21406
  let caseCost;
21413
21407
  if (result.trials && result.trials.length > 0) {
21414
21408
  const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
@@ -21420,7 +21414,7 @@ async function runEvaluation(options) {
21420
21414
  }
21421
21415
  if (caseCost !== void 0) {
21422
21416
  cumulativeBudgetCost += caseCost;
21423
- if (cumulativeBudgetCost >= totalBudgetUsd) {
21417
+ if (cumulativeBudgetCost >= budgetUsd) {
21424
21418
  budgetExhausted = true;
21425
21419
  }
21426
21420
  }
@@ -22562,7 +22556,7 @@ async function evaluateCandidate(options) {
22562
22556
  };
22563
22557
  }
22564
22558
  }
22565
- const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
22559
+ const evaluatorRequest = scores ? void 0 : score.graderRawRequest;
22566
22560
  const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
22567
22561
  const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
22568
22562
  ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
@@ -22778,7 +22772,7 @@ async function runEvaluatorList(options) {
22778
22772
  weight,
22779
22773
  verdict: score2.verdict,
22780
22774
  assertions: score2.assertions,
22781
- input: score2.evaluatorRawRequest,
22775
+ input: score2.graderRawRequest,
22782
22776
  target: score2.graderTarget,
22783
22777
  details: score2.details,
22784
22778
  scores: mapChildResults(score2.scores),
@@ -22794,7 +22788,7 @@ async function runEvaluatorList(options) {
22794
22788
  score: 0,
22795
22789
  verdict: "fail",
22796
22790
  assertions: [
22797
- { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
22791
+ { text: `Grader '${evaluatorConfig.name}' failed: ${message}`, passed: false }
22798
22792
  ],
22799
22793
  expectedAspectCount: 1
22800
22794
  };
@@ -22815,7 +22809,7 @@ async function runEvaluatorList(options) {
22815
22809
  verdict: "fail",
22816
22810
  assertions: [
22817
22811
  {
22818
- text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
22812
+ text: `Grader '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
22819
22813
  passed: false
22820
22814
  }
22821
22815
  ],
@@ -22872,7 +22866,7 @@ function filterEvalCases(evalCases, filter) {
22872
22866
  return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
22873
22867
  }
22874
22868
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
22875
- const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
22869
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGrader({
22876
22870
  resolveGraderProvider: async (context2) => {
22877
22871
  if (context2.graderProvider) {
22878
22872
  return context2.graderProvider;
@@ -23363,7 +23357,7 @@ function mapChildResults(children) {
23363
23357
  weight: child.weight,
23364
23358
  verdict: child.verdict,
23365
23359
  assertions: child.assertions,
23366
- input: child.evaluatorRawRequest,
23360
+ input: child.graderRawRequest,
23367
23361
  scores: mapChildResults(child.scores),
23368
23362
  details: child.details,
23369
23363
  tokenUsage: child.tokenUsage
@@ -25496,22 +25490,21 @@ function createAgentKernel() {
25496
25490
  // Annotate the CommonJS export names for ESM import in node:
25497
25491
  0 && (module.exports = {
25498
25492
  COMMON_TARGET_SETTINGS,
25499
- CodeEvaluator,
25500
- CompositeEvaluator,
25501
- CostEvaluator,
25493
+ CodeGrader,
25494
+ CompositeGrader,
25495
+ CostGrader,
25502
25496
  DEFAULT_CATEGORY,
25503
- DEFAULT_EVALUATOR_TEMPLATE,
25504
25497
  DEFAULT_EVAL_PATTERNS,
25505
25498
  DEFAULT_EXPLORATION_TOOLS,
25499
+ DEFAULT_GRADER_TEMPLATE,
25506
25500
  DEFAULT_THRESHOLD,
25507
- DeterministicAssertionEvaluator,
25501
+ DeterministicAssertionGrader,
25508
25502
  DockerWorkspaceProvider,
25509
- EvaluatorRegistry,
25510
- ExecutionMetricsEvaluator,
25511
- FieldAccuracyEvaluator,
25512
- LatencyEvaluator,
25513
- LlmGraderEvaluator,
25514
- LlmJudgeEvaluator,
25503
+ ExecutionMetricsGrader,
25504
+ FieldAccuracyGrader,
25505
+ GraderRegistry,
25506
+ LatencyGrader,
25507
+ LlmGrader,
25515
25508
  OTEL_BACKEND_PRESETS,
25516
25509
  OtelStreamingObserver,
25517
25510
  OtelTraceExporter,
@@ -25520,18 +25513,17 @@ function createAgentKernel() {
25520
25513
  ProviderRegistry,
25521
25514
  RepoManager,
25522
25515
  ResponseCache,
25523
- SkillTriggerEvaluator,
25516
+ SkillTriggerGrader,
25524
25517
  TEST_MESSAGE_ROLES,
25525
25518
  TemplateNotDirectoryError,
25526
25519
  TemplateNotFoundError,
25527
- TokenUsageEvaluator,
25528
- ToolTrajectoryEvaluator,
25520
+ TokenUsageGrader,
25521
+ ToolTrajectoryGrader,
25529
25522
  TranscriptProvider,
25530
25523
  WorkspaceCreationError,
25531
25524
  WorkspacePoolManager,
25532
25525
  addBenchmark,
25533
25526
  assembleLlmGraderPrompt,
25534
- assembleLlmJudgePrompt,
25535
25527
  avgToolDurationMs,
25536
25528
  buildDirectoryChain,
25537
25529
  buildOutputSchema,
@@ -25571,7 +25563,6 @@ function createAgentKernel() {
25571
25563
  discoverCodexSessions,
25572
25564
  discoverCopilotSessions,
25573
25565
  discoverGraders,
25574
- discoverJudges,
25575
25566
  discoverProviders,
25576
25567
  ensureResultsRepoClone,
25577
25568
  ensureVSCodeSubagents,
@@ -25613,7 +25604,7 @@ function createAgentKernel() {
25613
25604
  isAgentSkillsFormat,
25614
25605
  isContent,
25615
25606
  isContentArray,
25616
- isEvaluatorKind,
25607
+ isGraderKind,
25617
25608
  isJsonObject,
25618
25609
  isJsonValue,
25619
25610
  isNonEmptyString,