agentv 4.17.1 → 4.18.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -305,7 +305,7 @@ var require_dist = __commonJS({
305
305
  }
306
306
  });
307
307
 
308
- // ../../packages/core/dist/chunk-6VZY3B6M.js
308
+ // ../../packages/core/dist/chunk-PYDBJOAO.js
309
309
  import { constants } from "node:fs";
310
310
  import { access, readFile } from "node:fs/promises";
311
311
  import path from "node:path";
@@ -425,7 +425,7 @@ __export(external_exports2, {
425
425
  void: () => voidType
426
426
  });
427
427
 
428
- // ../../packages/core/dist/chunk-6VZY3B6M.js
428
+ // ../../packages/core/dist/chunk-PYDBJOAO.js
429
429
  import { readFile as readFile2 } from "node:fs/promises";
430
430
  import path3 from "node:path";
431
431
  import fg from "fast-glob";
@@ -497,7 +497,7 @@ function isTestMessage(value) {
497
497
  }
498
498
  return false;
499
499
  }
500
- var EVALUATOR_KIND_VALUES = [
500
+ var GRADER_KIND_VALUES = [
501
501
  "code-grader",
502
502
  "llm-grader",
503
503
  "rubric",
@@ -523,9 +523,9 @@ var EVALUATOR_KIND_VALUES = [
523
523
  "rubrics",
524
524
  "inline-assert"
525
525
  ];
526
- var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
527
- function isEvaluatorKind(value) {
528
- return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
526
+ var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
527
+ function isGraderKind(value) {
528
+ return typeof value === "string" && GRADER_KIND_SET.has(value);
529
529
  }
530
530
  async function fileExists(filePath) {
531
531
  try {
@@ -15138,22 +15138,25 @@ function extractCacheConfig(suite) {
15138
15138
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
15139
15139
  return { enabled: cache, cachePath: resolvedCachePath };
15140
15140
  }
15141
- function extractTotalBudgetUsd(suite) {
15141
+ function extractBudgetUsd(suite) {
15142
15142
  const execution = suite.execution;
15143
15143
  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
15144
15144
  return void 0;
15145
15145
  }
15146
15146
  const executionObj = execution;
15147
- const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
15147
+ if ("total_budget_usd" in executionObj || "totalBudgetUsd" in executionObj) {
15148
+ throw new Error(
15149
+ "execution.total_budget_usd has been renamed to execution.budget_usd. Update your eval YAML."
15150
+ );
15151
+ }
15152
+ const rawBudget = executionObj.budget_usd ?? executionObj.budgetUsd;
15148
15153
  if (rawBudget === void 0 || rawBudget === null) {
15149
15154
  return void 0;
15150
15155
  }
15151
15156
  if (typeof rawBudget === "number" && rawBudget > 0) {
15152
15157
  return rawBudget;
15153
15158
  }
15154
- logWarning(
15155
- `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
15156
- );
15159
+ logWarning(`Invalid execution.budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`);
15157
15160
  return void 0;
15158
15161
  }
15159
15162
  function extractFailOnError(suite) {
@@ -15525,7 +15528,7 @@ function validateTemplateVariables(content, source) {
15525
15528
  );
15526
15529
  }
15527
15530
  if (invalidVariables.length > 0) {
15528
- const warningMessage = `${ANSI_YELLOW22}Warning: Custom evaluator template at ${source}
15531
+ const warningMessage = `${ANSI_YELLOW22}Warning: Custom grader template at ${source}
15529
15532
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
15530
15533
  Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET3}`;
15531
15534
  console.warn(warningMessage);
@@ -15535,26 +15538,26 @@ var ANSI_YELLOW3 = "\x1B[33m";
15535
15538
  var ANSI_RESET4 = "\x1B[0m";
15536
15539
  var MAX_ASSERTION_INCLUDE_DEPTH = 3;
15537
15540
  var PROMPT_FILE_PREFIX = "file://";
15538
- function normalizeEvaluatorType(type) {
15541
+ function normalizeGraderType(type) {
15539
15542
  return type.replace(/_/g, "-");
15540
15543
  }
15541
15544
  function isDeprecatedJudgeType(type) {
15542
15545
  return type === "code-judge" || type === "llm-judge";
15543
15546
  }
15544
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
15547
+ async function parseGraders(rawEvalCase, globalExecution, searchRoots, evalId, defaultPreprocessors) {
15545
15548
  const execution = rawEvalCase.execution;
15546
15549
  const executionObject = isJsonObject2(execution) ? execution : void 0;
15547
15550
  const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? // deprecated: use assertions
15548
15551
  rawEvalCase.evaluators;
15549
15552
  const skipDefaults = executionObject?.skip_defaults === true;
15550
15553
  const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
15551
- const parsedCase = await parseEvaluatorList(
15554
+ const parsedCase = await parseGraderList(
15552
15555
  caseEvaluators,
15553
15556
  searchRoots,
15554
15557
  evalId,
15555
15558
  defaultPreprocessors
15556
15559
  );
15557
- const parsedRoot = await parseEvaluatorList(
15560
+ const parsedRoot = await parseGraderList(
15558
15561
  rootEvaluators,
15559
15562
  searchRoots,
15560
15563
  evalId,
@@ -15633,12 +15636,12 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
15633
15636
  templateDir,
15634
15637
  ...searchRoots.filter((root) => path5.resolve(root) !== templateDir)
15635
15638
  ];
15636
- return await expandEvaluatorEntries(assertions, nestedSearchRoots, evalId, {
15639
+ return await expandGraderEntries(assertions, nestedSearchRoots, evalId, {
15637
15640
  depth: nextDepth,
15638
15641
  chain: [...includeContext.chain, resolved.resolvedPath]
15639
15642
  }) ?? [];
15640
15643
  }
15641
- async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
15644
+ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
15642
15645
  if (candidateEvaluators === void 0) {
15643
15646
  return void 0;
15644
15647
  }
@@ -15662,8 +15665,8 @@ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId,
15662
15665
  }
15663
15666
  return expanded;
15664
15667
  }
15665
- async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
15666
- const expandedEvaluators = await expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId);
15668
+ async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
15669
+ const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
15667
15670
  if (!expandedEvaluators) {
15668
15671
  return void 0;
15669
15672
  }
@@ -15709,14 +15712,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
15709
15712
  }
15710
15713
  const rawName = asString(rawEvaluator.name);
15711
15714
  const rawType = rawEvaluator.type;
15712
- const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
15715
+ const typeValue = typeof rawType === "string" ? normalizeGraderType(rawType) : rawType;
15713
15716
  if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
15714
15717
  logWarning2(
15715
15718
  `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
15716
15719
  );
15717
15720
  continue;
15718
15721
  }
15719
- const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
15722
+ const isCustomType = typeof typeValue === "string" && !isGraderKind(typeValue);
15720
15723
  if (typeof typeValue !== "string") {
15721
15724
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
15722
15725
  continue;
@@ -15879,7 +15882,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
15879
15882
  continue;
15880
15883
  }
15881
15884
  const aggregatorType = asString(rawAggregator.type);
15882
- const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
15885
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeGraderType(aggregatorType) : aggregatorType;
15883
15886
  if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
15884
15887
  logWarning2(
15885
15888
  `Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
@@ -15892,7 +15895,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
15892
15895
  );
15893
15896
  continue;
15894
15897
  }
15895
- const expandedMembers = await expandEvaluatorEntries(
15898
+ const expandedMembers = await expandGraderEntries(
15896
15899
  rawMembers,
15897
15900
  searchRoots,
15898
15901
  `${evalId}:${name21}`
@@ -15908,11 +15911,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
15908
15911
  }
15909
15912
  const memberName = asString(rawMember.name);
15910
15913
  const memberType = rawMember.type;
15911
- if (!memberName || !isEvaluatorKind(memberType)) {
15914
+ if (!memberName || !isGraderKind(memberType)) {
15912
15915
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name21}'`);
15913
15916
  continue;
15914
15917
  }
15915
- const memberConfigs = await parseEvaluators(
15918
+ const memberConfigs = await parseGraders(
15916
15919
  { evaluators: [rawMember] },
15917
15920
  void 0,
15918
15921
  searchRoots,
@@ -16653,7 +16656,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
16653
16656
  `prompt.command for evaluator '${name21}' in '${evalId}'`
16654
16657
  );
16655
16658
  if (!commandArray) {
16656
- throw new Error(`Evaluator '${name21}' in '${evalId}': prompt object requires command array`);
16659
+ throw new Error(`Grader '${name21}' in '${evalId}': prompt object requires command array`);
16657
16660
  }
16658
16661
  const commandPath = commandArray[commandArray.length - 1];
16659
16662
  const resolved = await resolveFileReference22(commandPath, searchRoots);
@@ -16661,7 +16664,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
16661
16664
  resolvedPromptScript = [...commandArray.slice(0, -1), path5.resolve(resolved.resolvedPath)];
16662
16665
  } else {
16663
16666
  throw new Error(
16664
- `Evaluator '${name21}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
16667
+ `Grader '${name21}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
16665
16668
  );
16666
16669
  }
16667
16670
  if (isJsonObject2(rawPrompt.config)) {
@@ -16678,11 +16681,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
16678
16681
  await validateCustomPromptContent(promptPath);
16679
16682
  } catch (error) {
16680
16683
  const message = error instanceof Error ? error.message : String(error);
16681
- throw new Error(`Evaluator '${name21}' template (${promptPath}): ${message}`);
16684
+ throw new Error(`Grader '${name21}' template (${promptPath}): ${message}`);
16682
16685
  }
16683
16686
  } else {
16684
16687
  throw new Error(
16685
- `Evaluator '${name21}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
16688
+ `Grader '${name21}' in '${evalId}': prompt file not found: ${resolved.displayPath}`
16686
16689
  );
16687
16690
  }
16688
16691
  } else {
@@ -16799,18 +16802,18 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
16799
16802
  return void 0;
16800
16803
  }
16801
16804
  if (!Array.isArray(rawValue)) {
16802
- throw new Error(`Evaluator '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
16805
+ throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessors must be an array`);
16803
16806
  }
16804
16807
  const preprocessors = [];
16805
16808
  for (const rawEntry of rawValue) {
16806
16809
  if (!isJsonObject2(rawEntry)) {
16807
16810
  throw new Error(
16808
- `Evaluator '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
16811
+ `Grader '${evaluatorName}' in '${evalId}': each preprocessor must be an object`
16809
16812
  );
16810
16813
  }
16811
16814
  const type = asString(rawEntry.type)?.trim();
16812
16815
  if (!type) {
16813
- throw new Error(`Evaluator '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
16816
+ throw new Error(`Grader '${evaluatorName}' in '${evalId}': preprocessor.type is required`);
16814
16817
  }
16815
16818
  const command = asStringArray(
16816
16819
  rawEntry.command,
@@ -16818,14 +16821,14 @@ async function parsePreprocessors(rawValue, searchRoots, evaluatorName, evalId)
16818
16821
  );
16819
16822
  if (!command || command.length === 0) {
16820
16823
  throw new Error(
16821
- `Evaluator '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
16824
+ `Grader '${evaluatorName}' in '${evalId}': preprocessor '${type}' requires command`
16822
16825
  );
16823
16826
  }
16824
16827
  const commandPath = command[command.length - 1];
16825
16828
  const resolved = await resolveFileReference22(commandPath, searchRoots);
16826
16829
  if (!resolved.resolvedPath) {
16827
16830
  throw new Error(
16828
- `Evaluator '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
16831
+ `Grader '${evaluatorName}' in '${evalId}': preprocessor command file not found: ${resolved.displayPath}`
16829
16832
  );
16830
16833
  }
16831
16834
  preprocessors.push({
@@ -16876,13 +16879,13 @@ function coerceEvaluator(candidate, contextId) {
16876
16879
  if (typeof candidate !== "string") {
16877
16880
  return void 0;
16878
16881
  }
16879
- const normalized = normalizeEvaluatorType(candidate);
16882
+ const normalized = normalizeGraderType(candidate);
16880
16883
  if (isDeprecatedJudgeType(normalized)) {
16881
16884
  throw new Error(
16882
16885
  `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
16883
16886
  );
16884
16887
  }
16885
- if (isEvaluatorKind(normalized)) {
16888
+ if (isGraderKind(normalized)) {
16886
16889
  return normalized;
16887
16890
  }
16888
16891
  logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
@@ -16954,7 +16957,7 @@ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalI
16954
16957
  }
16955
16958
  result.required = rawRequired;
16956
16959
  logWarning2(
16957
- `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
16960
+ `Grader '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
16958
16961
  );
16959
16962
  }
16960
16963
  return result;
@@ -17756,7 +17759,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
17756
17759
  const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
17757
17760
  let evaluators;
17758
17761
  try {
17759
- evaluators = await parseEvaluators(
17762
+ evaluators = await parseGraders(
17760
17763
  testCaseConfig,
17761
17764
  mergedExecution,
17762
17765
  searchRoots,
@@ -18093,7 +18096,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
18093
18096
  targetRefs: extractTargetRefsFromSuite(parsed),
18094
18097
  workers: extractWorkersFromSuite(parsed),
18095
18098
  cacheConfig: extractCacheConfig(parsed),
18096
- totalBudgetUsd: extractTotalBudgetUsd(parsed),
18099
+ budgetUsd: extractBudgetUsd(parsed),
18097
18100
  ...metadata !== void 0 && { metadata },
18098
18101
  ...failOnError !== void 0 && { failOnError },
18099
18102
  ...threshold !== void 0 && { threshold },
@@ -18234,7 +18237,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
18234
18237
  const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
18235
18238
  let evaluators;
18236
18239
  try {
18237
- evaluators = await parseEvaluators(
18240
+ evaluators = await parseGraders(
18238
18241
  testCaseConfig,
18239
18242
  globalExecution,
18240
18243
  searchRoots,
@@ -26241,7 +26244,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
26241
26244
  }
26242
26245
  return result;
26243
26246
  }
26244
- var CodeEvaluator = class {
26247
+ var CodeGrader = class {
26245
26248
  kind = "code-grader";
26246
26249
  command;
26247
26250
  cwd;
@@ -26359,7 +26362,7 @@ var CodeEvaluator = class {
26359
26362
  })) : [];
26360
26363
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
26361
26364
  const proxyUsage = getProxyUsage?.();
26362
- const evaluatorRawRequest = {
26365
+ const graderRawRequest = {
26363
26366
  command: this.command,
26364
26367
  ...this.cwd ? { cwd: this.cwd } : {},
26365
26368
  ...proxyUsage ? {
@@ -26374,7 +26377,7 @@ var CodeEvaluator = class {
26374
26377
  verdict: scoreToVerdict(score),
26375
26378
  assertions,
26376
26379
  expectedAspectCount: assertions.length || 1,
26377
- evaluatorRawRequest,
26380
+ graderRawRequest,
26378
26381
  ...details ? { details } : {},
26379
26382
  tokenUsage: proxyUsage?.tokenUsage
26380
26383
  };
@@ -26386,7 +26389,7 @@ var CodeEvaluator = class {
26386
26389
  verdict: "fail",
26387
26390
  assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
26388
26391
  expectedAspectCount: 1,
26389
- evaluatorRawRequest: {
26392
+ graderRawRequest: {
26390
26393
  command: this.command,
26391
26394
  ...this.cwd ? { cwd: this.cwd } : {},
26392
26395
  ...proxyUsage ? {
@@ -26469,7 +26472,7 @@ var BINARY_EXTENSIONS = /* @__PURE__ */ new Set([
26469
26472
  ".so",
26470
26473
  ".dylib"
26471
26474
  ]);
26472
- var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
26475
+ var DEFAULT_GRADER_TEMPLATE = `You are an expert grader. Your goal is to grade the answer based on how well it achieves the criteria for the original task.
26473
26476
 
26474
26477
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
26475
26478
 
@@ -26524,19 +26527,19 @@ function resolveContentBasePath(context2) {
26524
26527
  }
26525
26528
  return void 0;
26526
26529
  }
26527
- var LlmGraderEvaluator = class {
26530
+ var LlmGrader = class {
26528
26531
  kind = "llm-grader";
26529
26532
  resolveGraderProvider;
26530
26533
  maxOutputTokens;
26531
26534
  temperature;
26532
- evaluatorTemplate;
26535
+ graderTemplate;
26533
26536
  maxSteps;
26534
26537
  graderTargetProvider;
26535
26538
  constructor(options) {
26536
26539
  this.resolveGraderProvider = options.resolveGraderProvider ?? options.resolveJudgeProvider;
26537
26540
  this.maxOutputTokens = options.maxOutputTokens;
26538
26541
  this.temperature = options.temperature;
26539
- this.evaluatorTemplate = options.evaluatorTemplate;
26542
+ this.graderTemplate = options.graderTemplate;
26540
26543
  this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT);
26541
26544
  this.graderTargetProvider = options.graderTargetProvider ?? options.judgeTargetProvider;
26542
26545
  }
@@ -26599,16 +26602,16 @@ var LlmGraderEvaluator = class {
26599
26602
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
26600
26603
  };
26601
26604
  const systemPrompt = buildOutputSchema();
26602
- const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
26603
- warnDeprecatedTemplateVars(evaluatorTemplate);
26604
- let userPrompt = substituteVariables(evaluatorTemplate, variables);
26605
- if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
26605
+ const graderTemplate = context2.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
26606
+ warnDeprecatedTemplateVars(graderTemplate);
26607
+ let userPrompt = substituteVariables(graderTemplate, variables);
26608
+ if (context2.fileChanges && !context2.graderTemplateOverride && !this.graderTemplate) {
26606
26609
  userPrompt += `
26607
26610
 
26608
26611
  [[ ## file_changes ## ]]
26609
26612
  ${context2.fileChanges}`;
26610
26613
  }
26611
- const evaluatorRawRequest = {
26614
+ const graderRawRequest = {
26612
26615
  userPrompt,
26613
26616
  systemPrompt
26614
26617
  };
@@ -26629,7 +26632,7 @@ ${context2.fileChanges}`;
26629
26632
  verdict: scoreToVerdict(score),
26630
26633
  assertions,
26631
26634
  expectedAspectCount: Math.max(assertions.length, 1),
26632
- evaluatorRawRequest,
26635
+ graderRawRequest,
26633
26636
  graderTarget: graderProvider.targetName,
26634
26637
  details: data.details,
26635
26638
  tokenUsage
@@ -26643,7 +26646,7 @@ ${context2.fileChanges}`;
26643
26646
  verdict: "skip",
26644
26647
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
26645
26648
  expectedAspectCount: 1,
26646
- evaluatorRawRequest,
26649
+ graderRawRequest,
26647
26650
  graderTarget: graderProvider.targetName
26648
26651
  };
26649
26652
  }
@@ -26660,7 +26663,7 @@ ${context2.fileChanges}`;
26660
26663
  }
26661
26664
  const prompt = this.buildRubricPrompt(context2, rubrics);
26662
26665
  const systemPrompt = buildRubricOutputSchema();
26663
- const evaluatorRawRequest = {
26666
+ const graderRawRequest = {
26664
26667
  userPrompt: prompt,
26665
26668
  systemPrompt
26666
26669
  };
@@ -26680,7 +26683,7 @@ ${context2.fileChanges}`;
26680
26683
  verdict,
26681
26684
  assertions,
26682
26685
  expectedAspectCount: rubrics.length,
26683
- evaluatorRawRequest,
26686
+ graderRawRequest,
26684
26687
  graderTarget: graderProvider.targetName,
26685
26688
  tokenUsage
26686
26689
  };
@@ -26693,7 +26696,7 @@ ${context2.fileChanges}`;
26693
26696
  verdict: "skip",
26694
26697
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
26695
26698
  expectedAspectCount: rubrics.length,
26696
- evaluatorRawRequest,
26699
+ graderRawRequest,
26697
26700
  graderTarget: graderProvider.targetName
26698
26701
  };
26699
26702
  }
@@ -26705,7 +26708,7 @@ ${context2.fileChanges}`;
26705
26708
  async evaluateWithScoreRanges(context2, graderProvider, rubrics) {
26706
26709
  const prompt = this.buildScoreRangePrompt(context2, rubrics);
26707
26710
  const systemPrompt = buildScoreRangeOutputSchema();
26708
- const evaluatorRawRequest = {
26711
+ const graderRawRequest = {
26709
26712
  userPrompt: prompt,
26710
26713
  systemPrompt
26711
26714
  };
@@ -26725,7 +26728,7 @@ ${context2.fileChanges}`;
26725
26728
  verdict,
26726
26729
  assertions,
26727
26730
  expectedAspectCount: rubrics.length,
26728
- evaluatorRawRequest,
26731
+ graderRawRequest,
26729
26732
  graderTarget: graderProvider.targetName,
26730
26733
  details,
26731
26734
  tokenUsage
@@ -26739,7 +26742,7 @@ ${context2.fileChanges}`;
26739
26742
  verdict: "skip",
26740
26743
  assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
26741
26744
  expectedAspectCount: rubrics.length,
26742
- evaluatorRawRequest,
26745
+ graderRawRequest,
26743
26746
  graderTarget: graderProvider.targetName
26744
26747
  };
26745
26748
  }
@@ -26768,7 +26771,7 @@ ${context2.fileChanges}`;
26768
26771
  const config = context2.evaluator;
26769
26772
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
26770
26773
  const fsTools = createFilesystemTools(workspacePath);
26771
- const evaluatorRawRequest = {
26774
+ const graderRawRequest = {
26772
26775
  mode: "built-in",
26773
26776
  systemPrompt,
26774
26777
  userPrompt,
@@ -26792,7 +26795,7 @@ ${context2.fileChanges}`;
26792
26795
  return this.parseAgentResult(
26793
26796
  text2,
26794
26797
  rubrics,
26795
- evaluatorRawRequest,
26798
+ graderRawRequest,
26796
26799
  details,
26797
26800
  graderProvider.targetName
26798
26801
  );
@@ -26803,7 +26806,7 @@ ${context2.fileChanges}`;
26803
26806
  verdict: "fail",
26804
26807
  assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
26805
26808
  expectedAspectCount: 1,
26806
- evaluatorRawRequest,
26809
+ graderRawRequest,
26807
26810
  graderTarget: graderProvider.targetName,
26808
26811
  details: { mode: "built-in", error: message }
26809
26812
  };
@@ -26835,7 +26838,7 @@ ${context2.fileChanges}`;
26835
26838
  async evaluateWithDelegate(context2, provider, modeLabel) {
26836
26839
  const workspacePath = context2.workspacePath;
26837
26840
  const prompt = this.buildDelegatedPrompt(context2);
26838
- const evaluatorRawRequest = {
26841
+ const graderRawRequest = {
26839
26842
  mode: modeLabel,
26840
26843
  grader_target: provider.targetName,
26841
26844
  prompt
@@ -26856,7 +26859,7 @@ ${context2.fileChanges}`;
26856
26859
  { text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
26857
26860
  ],
26858
26861
  expectedAspectCount: 1,
26859
- evaluatorRawRequest,
26862
+ graderRawRequest,
26860
26863
  graderTarget: provider.targetName,
26861
26864
  details: { mode: modeLabel, grader_target: provider.targetName }
26862
26865
  };
@@ -26870,7 +26873,7 @@ ${context2.fileChanges}`;
26870
26873
  return this.parseAgentResult(
26871
26874
  assistantContent,
26872
26875
  rubrics,
26873
- evaluatorRawRequest,
26876
+ graderRawRequest,
26874
26877
  details,
26875
26878
  provider.targetName
26876
26879
  );
@@ -26883,7 +26886,7 @@ ${context2.fileChanges}`;
26883
26886
  { text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
26884
26887
  ],
26885
26888
  expectedAspectCount: 1,
26886
- evaluatorRawRequest,
26889
+ graderRawRequest,
26887
26890
  graderTarget: provider.targetName,
26888
26891
  details: {
26889
26892
  mode: modeLabel,
@@ -26904,7 +26907,7 @@ ${context2.fileChanges}`;
26904
26907
  const config = context2.evaluator;
26905
26908
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
26906
26909
  const parts = [
26907
- "You are an expert evaluator with access to the workspace filesystem.",
26910
+ "You are an expert grader with access to the workspace filesystem.",
26908
26911
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
26909
26912
  "Thoroughly examine relevant files before making your assessment.",
26910
26913
  ""
@@ -26933,9 +26936,9 @@ ${context2.fileChanges}`;
26933
26936
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
26934
26937
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
26935
26938
  };
26936
- if (this.evaluatorTemplate) {
26937
- warnDeprecatedTemplateVars(this.evaluatorTemplate);
26938
- return substituteVariables(this.evaluatorTemplate, variables);
26939
+ if (this.graderTemplate) {
26940
+ warnDeprecatedTemplateVars(this.graderTemplate);
26941
+ return substituteVariables(this.graderTemplate, variables);
26939
26942
  }
26940
26943
  const config = context2.evaluator;
26941
26944
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
@@ -26982,7 +26985,7 @@ ${context2.fileChanges}`;
26982
26985
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
26983
26986
  const config = context2.evaluator;
26984
26987
  const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
26985
- if (this.evaluatorTemplate) {
26988
+ if (this.graderTemplate) {
26986
26989
  const variables = {
26987
26990
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
26988
26991
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -26994,15 +26997,15 @@ ${context2.fileChanges}`;
26994
26997
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
26995
26998
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
26996
26999
  };
26997
- warnDeprecatedTemplateVars(this.evaluatorTemplate);
26998
- const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
27000
+ warnDeprecatedTemplateVars(this.graderTemplate);
27001
+ const customPrompt = substituteVariables(this.graderTemplate, variables);
26999
27002
  const outputSchema2 = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
27000
27003
  return `${customPrompt}
27001
27004
 
27002
27005
  ${outputSchema2}`;
27003
27006
  }
27004
27007
  const parts = [
27005
- "You are an expert evaluator. Investigate the workspace to verify the criteria are met.",
27008
+ "You are an expert grader. Investigate the workspace to verify the criteria are met.",
27006
27009
  "",
27007
27010
  "[[ ## question ## ]]",
27008
27011
  formattedQuestion,
@@ -27039,7 +27042,7 @@ ${outputSchema2}`;
27039
27042
  * Parse the agent's response text into an EvaluationScore.
27040
27043
  * Supports both freeform and rubric modes.
27041
27044
  */
27042
- parseAgentResult(text2, rubrics, evaluatorRawRequest, details, graderTarget) {
27045
+ parseAgentResult(text2, rubrics, graderRawRequest, details, graderTarget) {
27043
27046
  try {
27044
27047
  const parsed = parseJsonFromText(text2);
27045
27048
  if (rubrics && rubrics.length > 0) {
@@ -27050,7 +27053,7 @@ ${outputSchema2}`;
27050
27053
  verdict,
27051
27054
  assertions: assertions2,
27052
27055
  expectedAspectCount: rubrics.length,
27053
- evaluatorRawRequest,
27056
+ graderRawRequest,
27054
27057
  graderTarget,
27055
27058
  details
27056
27059
  };
@@ -27063,7 +27066,7 @@ ${outputSchema2}`;
27063
27066
  verdict: scoreToVerdict(score),
27064
27067
  assertions,
27065
27068
  expectedAspectCount: Math.max(assertions.length, 1),
27066
- evaluatorRawRequest,
27069
+ graderRawRequest,
27067
27070
  graderTarget,
27068
27071
  details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
27069
27072
  };
@@ -27078,7 +27081,7 @@ ${outputSchema2}`;
27078
27081
  }
27079
27082
  ],
27080
27083
  expectedAspectCount: 1,
27081
- evaluatorRawRequest,
27084
+ graderRawRequest,
27082
27085
  graderTarget,
27083
27086
  details
27084
27087
  };
@@ -27093,7 +27096,7 @@ ${outputSchema2}`;
27093
27096
  buildScoreRangePrompt(context2, rubrics) {
27094
27097
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
27095
27098
  const parts = [
27096
- "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
27099
+ "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
27097
27100
  "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
27098
27101
  "",
27099
27102
  "[[ ## question ## ]]",
@@ -27136,7 +27139,7 @@ ${outputSchema2}`;
27136
27139
  buildRubricPrompt(context2, rubrics) {
27137
27140
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
27138
27141
  const parts = [
27139
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
27142
+ "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
27140
27143
  "",
27141
27144
  "[[ ## question ## ]]",
27142
27145
  formattedQuestion,
@@ -27310,7 +27313,7 @@ function sumTokenUsage(first, second) {
27310
27313
  };
27311
27314
  }
27312
27315
  function buildRubricOutputSchema() {
27313
- return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
27316
+ return `You are an expert grader. Evaluate the candidate answer against each rubric item.
27314
27317
  You must return a valid JSON object matching this schema:
27315
27318
  {
27316
27319
  "checks": [
@@ -27344,7 +27347,7 @@ function warnDeprecatedTemplateVars(template) {
27344
27347
  console.warn(
27345
27348
  `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
27346
27349
  ${used.join("\n ")}
27347
- Update your custom evaluator template to use the new names.${ANSI_RESET8}`
27350
+ Update your custom grader template to use the new names.${ANSI_RESET8}`
27348
27351
  );
27349
27352
  }
27350
27353
  }
@@ -27376,7 +27379,7 @@ function calculateRubricScore(result, rubrics) {
27376
27379
  return { score, verdict, assertions };
27377
27380
  }
27378
27381
  function buildScoreRangeOutputSchema() {
27379
- return `You are an expert evaluator. Score the candidate answer on each criterion.
27382
+ return `You are an expert grader. Score the candidate answer on each criterion.
27380
27383
  You must return a valid JSON object matching this schema:
27381
27384
  {
27382
27385
  "checks": [
@@ -27586,9 +27589,9 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
27586
27589
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
27587
27590
  {{EVALUATOR_RESULTS_JSON}}
27588
27591
 
27589
- Decide the final score and verdict based on all evaluator results.
27592
+ Decide the final score and verdict based on all grader results.
27590
27593
  Return a JSON object with: score (0.0-1.0), verdict (pass/fail), and reasoning.`;
27591
- var CompositeEvaluator = class {
27594
+ var CompositeGrader = class {
27592
27595
  kind = "composite";
27593
27596
  config;
27594
27597
  evaluatorFactory;
@@ -27639,7 +27642,7 @@ var CompositeEvaluator = class {
27639
27642
  weight,
27640
27643
  verdict: member.result.verdict,
27641
27644
  assertions: [...member.result.assertions],
27642
- evaluatorRawRequest: member.result.evaluatorRawRequest,
27645
+ graderRawRequest: member.result.graderRawRequest,
27643
27646
  scores: member.result.scores,
27644
27647
  details: member.result.details,
27645
27648
  tokenUsage: member.result.tokenUsage
@@ -27660,7 +27663,7 @@ var CompositeEvaluator = class {
27660
27663
  verdict: "skip",
27661
27664
  assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
27662
27665
  expectedAspectCount: 1,
27663
- evaluatorRawRequest: {
27666
+ graderRawRequest: {
27664
27667
  aggregator: "weighted_average",
27665
27668
  ...weights ? { weights } : {}
27666
27669
  },
@@ -27673,7 +27676,7 @@ var CompositeEvaluator = class {
27673
27676
  verdict: scoreToVerdict(finalScore),
27674
27677
  assertions: allAssertions,
27675
27678
  expectedAspectCount: allAssertions.length || 1,
27676
- evaluatorRawRequest: {
27679
+ graderRawRequest: {
27677
27680
  aggregator: "weighted_average",
27678
27681
  ...weights ? { weights } : {}
27679
27682
  },
@@ -27692,7 +27695,7 @@ var CompositeEvaluator = class {
27692
27695
  score: member.result.score,
27693
27696
  verdict: member.result.verdict,
27694
27697
  assertions: [...member.result.assertions],
27695
- evaluatorRawRequest: member.result.evaluatorRawRequest,
27698
+ graderRawRequest: member.result.graderRawRequest,
27696
27699
  scores: member.result.scores,
27697
27700
  details: member.result.details,
27698
27701
  tokenUsage: member.result.tokenUsage
@@ -27715,7 +27718,7 @@ var CompositeEvaluator = class {
27715
27718
  verdict: "skip",
27716
27719
  assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
27717
27720
  expectedAspectCount: 1,
27718
- evaluatorRawRequest: {
27721
+ graderRawRequest: {
27719
27722
  aggregator: "threshold",
27720
27723
  threshold
27721
27724
  },
@@ -27734,7 +27737,7 @@ var CompositeEvaluator = class {
27734
27737
  verdict: pass ? "pass" : "fail",
27735
27738
  assertions: allAssertions,
27736
27739
  expectedAspectCount: allAssertions.length || 1,
27737
- evaluatorRawRequest: {
27740
+ graderRawRequest: {
27738
27741
  aggregator: "threshold",
27739
27742
  threshold
27740
27743
  },
@@ -27751,7 +27754,7 @@ var CompositeEvaluator = class {
27751
27754
  weight: weights?.[member.id] ?? 1,
27752
27755
  verdict: member.result.verdict,
27753
27756
  assertions: [...member.result.assertions],
27754
- evaluatorRawRequest: member.result.evaluatorRawRequest,
27757
+ graderRawRequest: member.result.graderRawRequest,
27755
27758
  scores: member.result.scores,
27756
27759
  details: member.result.details
27757
27760
  }));
@@ -27772,7 +27775,7 @@ var CompositeEvaluator = class {
27772
27775
  verdict,
27773
27776
  assertions,
27774
27777
  expectedAspectCount: assertions.length || 1,
27775
- evaluatorRawRequest: {
27778
+ graderRawRequest: {
27776
27779
  aggregator: "code-grader",
27777
27780
  script: scriptPath
27778
27781
  },
@@ -27785,7 +27788,7 @@ var CompositeEvaluator = class {
27785
27788
  verdict: "fail",
27786
27789
  assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
27787
27790
  expectedAspectCount: 1,
27788
- evaluatorRawRequest: {
27791
+ graderRawRequest: {
27789
27792
  aggregator: "code-grader",
27790
27793
  script: scriptPath,
27791
27794
  error: message
@@ -27807,14 +27810,14 @@ var CompositeEvaluator = class {
27807
27810
  score: member.result.score,
27808
27811
  verdict: member.result.verdict,
27809
27812
  assertions: [...member.result.assertions],
27810
- evaluatorRawRequest: member.result.evaluatorRawRequest,
27813
+ graderRawRequest: member.result.graderRawRequest,
27811
27814
  scores: member.result.scores,
27812
27815
  details: member.result.details
27813
27816
  }));
27814
27817
  const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
27815
27818
  const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
27816
27819
  const systemPrompt = buildOutputSchema();
27817
- const evaluatorRawRequest = {
27820
+ const graderRawRequest = {
27818
27821
  aggregator: "llm-grader",
27819
27822
  userPrompt,
27820
27823
  systemPrompt,
@@ -27836,7 +27839,7 @@ var CompositeEvaluator = class {
27836
27839
  verdict: scoreToVerdict(score2),
27837
27840
  assertions: assertions2,
27838
27841
  expectedAspectCount: Math.max(assertions2.length, 1),
27839
- evaluatorRawRequest,
27842
+ graderRawRequest,
27840
27843
  scores
27841
27844
  };
27842
27845
  }
@@ -27856,7 +27859,7 @@ var CompositeEvaluator = class {
27856
27859
  verdict: scoreToVerdict(score),
27857
27860
  assertions,
27858
27861
  expectedAspectCount: Math.max(assertions.length, 1),
27859
- evaluatorRawRequest,
27862
+ graderRawRequest,
27860
27863
  scores
27861
27864
  };
27862
27865
  } catch {
@@ -27865,13 +27868,13 @@ var CompositeEvaluator = class {
27865
27868
  verdict: "fail",
27866
27869
  assertions: [{ text: "LLM aggregator failed", passed: false }],
27867
27870
  expectedAspectCount: 1,
27868
- evaluatorRawRequest,
27871
+ graderRawRequest,
27869
27872
  scores
27870
27873
  };
27871
27874
  }
27872
27875
  }
27873
27876
  };
27874
- var CostEvaluator = class {
27877
+ var CostGrader = class {
27875
27878
  kind = "cost";
27876
27879
  config;
27877
27880
  constructor(options) {
@@ -27886,7 +27889,7 @@ var CostEvaluator = class {
27886
27889
  verdict: "fail",
27887
27890
  assertions: [{ text: "No cost data available in trace", passed: false }],
27888
27891
  expectedAspectCount: 1,
27889
- evaluatorRawRequest: {
27892
+ graderRawRequest: {
27890
27893
  type: "cost",
27891
27894
  budget,
27892
27895
  costUsd: null
@@ -27903,7 +27906,7 @@ var CostEvaluator = class {
27903
27906
  passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
27904
27907
  ],
27905
27908
  expectedAspectCount: 1,
27906
- evaluatorRawRequest: {
27909
+ graderRawRequest: {
27907
27910
  type: "cost",
27908
27911
  budget,
27909
27912
  costUsd
@@ -27911,7 +27914,7 @@ var CostEvaluator = class {
27911
27914
  };
27912
27915
  }
27913
27916
  };
27914
- var ExecutionMetricsEvaluator = class {
27917
+ var ExecutionMetricsGrader = class {
27915
27918
  kind = "execution-metrics";
27916
27919
  config;
27917
27920
  constructor(options) {
@@ -27935,7 +27938,7 @@ var ExecutionMetricsEvaluator = class {
27935
27938
  verdict: "fail",
27936
27939
  assertions: [{ text: "No trace summary available", passed: false }],
27937
27940
  expectedAspectCount: 1,
27938
- evaluatorRawRequest: {
27941
+ graderRawRequest: {
27939
27942
  type: "execution-metrics",
27940
27943
  config: this.extractConfiguredThresholds(),
27941
27944
  actual: null
@@ -28051,7 +28054,7 @@ var ExecutionMetricsEvaluator = class {
28051
28054
  verdict: scoreToVerdict(score),
28052
28055
  assertions,
28053
28056
  expectedAspectCount: totalChecks || 1,
28054
- evaluatorRawRequest: {
28057
+ graderRawRequest: {
28055
28058
  type: "execution-metrics",
28056
28059
  config: this.extractConfiguredThresholds(),
28057
28060
  actual: this.filterDefinedMetrics(actualMetrics)
@@ -28137,7 +28140,7 @@ var MONTH_NAMES = {
28137
28140
  dec: 11,
28138
28141
  december: 11
28139
28142
  };
28140
- var FieldAccuracyEvaluator = class {
28143
+ var FieldAccuracyGrader = class {
28141
28144
  kind = "field-accuracy";
28142
28145
  config;
28143
28146
  constructor(options) {
@@ -28495,7 +28498,7 @@ function formatDateISO(date) {
28495
28498
  function parseJsonFromTextSafe(text2) {
28496
28499
  return parseJsonFromText(text2);
28497
28500
  }
28498
- var LatencyEvaluator = class {
28501
+ var LatencyGrader = class {
28499
28502
  kind = "latency";
28500
28503
  config;
28501
28504
  constructor(options) {
@@ -28510,7 +28513,7 @@ var LatencyEvaluator = class {
28510
28513
  verdict: "fail",
28511
28514
  assertions: [{ text: "No duration data available in trace", passed: false }],
28512
28515
  expectedAspectCount: 1,
28513
- evaluatorRawRequest: {
28516
+ graderRawRequest: {
28514
28517
  type: "latency",
28515
28518
  threshold,
28516
28519
  durationMs: null
@@ -28526,7 +28529,7 @@ var LatencyEvaluator = class {
28526
28529
  passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
28527
28530
  ],
28528
28531
  expectedAspectCount: 1,
28529
- evaluatorRawRequest: {
28532
+ graderRawRequest: {
28530
28533
  type: "latency",
28531
28534
  threshold,
28532
28535
  durationMs
@@ -28534,7 +28537,7 @@ var LatencyEvaluator = class {
28534
28537
  };
28535
28538
  }
28536
28539
  };
28537
- var SkillTriggerEvaluator = class {
28540
+ var SkillTriggerGrader = class {
28538
28541
  kind = "skill-trigger";
28539
28542
  config;
28540
28543
  constructor(config) {
@@ -28607,7 +28610,7 @@ function assembleLlmGraderPrompt(input) {
28607
28610
  promptInputs,
28608
28611
  evaluatorConfig,
28609
28612
  fileChanges,
28610
- evaluatorTemplateOverride
28613
+ graderTemplateOverride
28611
28614
  } = input;
28612
28615
  const rubrics = evaluatorConfig?.rubrics;
28613
28616
  if (rubrics && rubrics.length > 0) {
@@ -28617,15 +28620,9 @@ function assembleLlmGraderPrompt(input) {
28617
28620
  }
28618
28621
  return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges);
28619
28622
  }
28620
- return assembleFreeform(
28621
- evalCase,
28622
- candidate,
28623
- promptInputs,
28624
- fileChanges,
28625
- evaluatorTemplateOverride
28626
- );
28623
+ return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride);
28627
28624
  }
28628
- function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
28625
+ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride) {
28629
28626
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
28630
28627
  const variables = {
28631
28628
  [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
@@ -28639,9 +28636,9 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
28639
28636
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
28640
28637
  };
28641
28638
  const systemPrompt = buildOutputSchema();
28642
- const template = evaluatorTemplateOverride ?? DEFAULT_EVALUATOR_TEMPLATE;
28639
+ const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
28643
28640
  let userPrompt = substituteVariables(template, variables);
28644
- if (fileChanges && !evaluatorTemplateOverride) {
28641
+ if (fileChanges && !graderTemplateOverride) {
28645
28642
  userPrompt += `
28646
28643
 
28647
28644
  [[ ## file_changes ## ]]
@@ -28657,7 +28654,7 @@ ${fileChanges}`;
28657
28654
  function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges) {
28658
28655
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
28659
28656
  const parts = [
28660
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
28657
+ "You are an expert grader. Evaluate the candidate answer against each rubric item below.",
28661
28658
  "",
28662
28659
  "[[ ## question ## ]]",
28663
28660
  formattedQuestion,
@@ -28692,7 +28689,7 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
28692
28689
  function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges) {
28693
28690
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
28694
28691
  const parts = [
28695
- "You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.",
28692
+ "You are an expert grader. Score the candidate answer on each criterion below using the provided score ranges.",
28696
28693
  "For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.",
28697
28694
  "",
28698
28695
  "[[ ## question ## ]]",
@@ -28739,7 +28736,7 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
28739
28736
  mode: "score_range"
28740
28737
  };
28741
28738
  }
28742
- var TokenUsageEvaluator = class {
28739
+ var TokenUsageGrader = class {
28743
28740
  kind = "token-usage";
28744
28741
  config;
28745
28742
  constructor(options) {
@@ -28760,7 +28757,7 @@ var TokenUsageEvaluator = class {
28760
28757
  verdict: "fail",
28761
28758
  assertions: [{ text: "No token usage data available in trace", passed: false }],
28762
28759
  expectedAspectCount,
28763
- evaluatorRawRequest: {
28760
+ graderRawRequest: {
28764
28761
  type: "token-usage",
28765
28762
  max_total: maxTotal ?? null,
28766
28763
  max_input: maxInput ?? null,
@@ -28801,7 +28798,7 @@ var TokenUsageEvaluator = class {
28801
28798
  verdict: passed ? "pass" : "fail",
28802
28799
  assertions,
28803
28800
  expectedAspectCount,
28804
- evaluatorRawRequest: {
28801
+ graderRawRequest: {
28805
28802
  type: "token-usage",
28806
28803
  max_total: maxTotal ?? null,
28807
28804
  max_input: maxInput ?? null,
@@ -28884,7 +28881,7 @@ function checkLatency(toolName, maxDurationMs, actualDurationMs) {
28884
28881
  message: `${toolName} took ${actualDurationMs}ms (max: ${maxDurationMs}ms)`
28885
28882
  };
28886
28883
  }
28887
- var ToolTrajectoryEvaluator = class {
28884
+ var ToolTrajectoryGrader = class {
28888
28885
  kind = "tool-trajectory";
28889
28886
  config;
28890
28887
  constructor(options) {
@@ -29568,14 +29565,14 @@ function validateConcurrency(concurrency) {
29568
29565
  throw new TypeError("Expected `concurrency` to be a number from 1 and up");
29569
29566
  }
29570
29567
  }
29571
- var EvaluatorRegistry = class {
29568
+ var GraderRegistry = class {
29572
29569
  factories = /* @__PURE__ */ new Map();
29573
- /** Register a factory function for an evaluator type. */
29570
+ /** Register a factory function for an grader type. */
29574
29571
  register(type, factory) {
29575
29572
  this.factories.set(type, factory);
29576
29573
  return this;
29577
29574
  }
29578
- /** Get the factory function for an evaluator type. */
29575
+ /** Get the factory function for an grader type. */
29579
29576
  get(type) {
29580
29577
  return this.factories.get(type);
29581
29578
  }
@@ -29583,25 +29580,25 @@ var EvaluatorRegistry = class {
29583
29580
  has(type) {
29584
29581
  return this.factories.has(type);
29585
29582
  }
29586
- /** List all registered evaluator type names. */
29583
+ /** List all registered grader type names. */
29587
29584
  list() {
29588
29585
  return [...this.factories.keys()];
29589
29586
  }
29590
29587
  /**
29591
29588
  * Create an evaluator instance from a config, using the registered factory.
29592
- * Throws if no factory is registered for the evaluator type.
29589
+ * Throws if no factory is registered for the grader type.
29593
29590
  */
29594
29591
  async create(config, context2) {
29595
29592
  const factory = this.factories.get(config.type);
29596
29593
  if (!factory) {
29597
29594
  throw new Error(
29598
- `Unknown evaluator type: "${config.type}". Registered types: ${this.list().join(", ")}`
29595
+ `Unknown grader type: "${config.type}". Registered types: ${this.list().join(", ")}`
29599
29596
  );
29600
29597
  }
29601
29598
  return factory(config, context2);
29602
29599
  }
29603
29600
  };
29604
- var DeterministicAssertionEvaluator = class {
29601
+ var DeterministicAssertionGrader = class {
29605
29602
  constructor(kind, assertFn) {
29606
29603
  this.assertFn = assertFn;
29607
29604
  this.kind = kind;
@@ -29611,7 +29608,7 @@ var DeterministicAssertionEvaluator = class {
29611
29608
  return this.assertFn(context2);
29612
29609
  }
29613
29610
  };
29614
- var InlineAssertEvaluator = class {
29611
+ var InlineAssertGrader = class {
29615
29612
  constructor(fn, name21) {
29616
29613
  this.fn = fn;
29617
29614
  this.name = name21;
@@ -29715,7 +29712,7 @@ var llmGraderFactory = (config, context2) => {
29715
29712
  );
29716
29713
  }
29717
29714
  const isAgent = isAgentProvider(graderTargetProvider) || graderTargetProvider.kind === "agentv";
29718
- evaluator = new LlmGraderEvaluator({
29715
+ evaluator = new LlmGrader({
29719
29716
  resolveGraderProvider: async (evalContext) => {
29720
29717
  if (graderTargetProvider) return graderTargetProvider;
29721
29718
  if (evalContext.graderProvider) return evalContext.graderProvider;
@@ -29743,11 +29740,11 @@ var llmGraderFactory = (config, context2) => {
29743
29740
  agentTimeoutMs
29744
29741
  );
29745
29742
  const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
29746
- let evaluatorTemplateOverride;
29743
+ let graderTemplateOverride;
29747
29744
  let evalCase = evalContext.evalCase;
29748
29745
  if (customPrompt) {
29749
29746
  if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
29750
- evaluatorTemplateOverride = customPrompt;
29747
+ graderTemplateOverride = customPrompt;
29751
29748
  } else {
29752
29749
  evalCase = { ...evalCase, criteria: customPrompt };
29753
29750
  }
@@ -29755,7 +29752,7 @@ var llmGraderFactory = (config, context2) => {
29755
29752
  return evaluator.evaluate({
29756
29753
  ...evalContext,
29757
29754
  evalCase,
29758
- evaluatorTemplateOverride,
29755
+ graderTemplateOverride,
29759
29756
  evaluator: c
29760
29757
  });
29761
29758
  }
@@ -29763,7 +29760,7 @@ var llmGraderFactory = (config, context2) => {
29763
29760
  };
29764
29761
  var codeFactory = (config, context2) => {
29765
29762
  const c = config;
29766
- return new CodeEvaluator({
29763
+ return new CodeGrader({
29767
29764
  command: c.command ?? c.script ?? [],
29768
29765
  cwd: c.resolvedCwd ?? c.cwd,
29769
29766
  agentTimeoutMs: context2.agentTimeoutMs,
@@ -29774,19 +29771,19 @@ var codeFactory = (config, context2) => {
29774
29771
  var compositeFactory = (config, context2) => {
29775
29772
  const c = config;
29776
29773
  const evalFileDir = context2.evalFileDir ?? process.cwd();
29777
- return new CompositeEvaluator({
29774
+ return new CompositeGrader({
29778
29775
  config: c,
29779
29776
  cwd: evalFileDir,
29780
29777
  evaluatorFactory: {
29781
29778
  create: (memberConfig) => {
29782
29779
  const factory = context2.registry.get(memberConfig.type);
29783
29780
  if (!factory) {
29784
- throw new Error(`Unsupported evaluator type in composite: ${memberConfig.type}`);
29781
+ throw new Error(`Unsupported grader type in composite: ${memberConfig.type}`);
29785
29782
  }
29786
29783
  const result = factory(memberConfig, context2);
29787
29784
  if (result instanceof Promise) {
29788
29785
  throw new Error(
29789
- `Evaluator factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
29786
+ `Grader factory for type "${memberConfig.type}" is async \u2014 not supported inside composite members. Use synchronous factories for composite child evaluators.`
29790
29787
  );
29791
29788
  }
29792
29789
  return result;
@@ -29795,35 +29792,35 @@ var compositeFactory = (config, context2) => {
29795
29792
  });
29796
29793
  };
29797
29794
  var toolTrajectoryFactory = (config) => {
29798
- return new ToolTrajectoryEvaluator({
29795
+ return new ToolTrajectoryGrader({
29799
29796
  config
29800
29797
  });
29801
29798
  };
29802
29799
  var fieldAccuracyFactory = (config) => {
29803
- return new FieldAccuracyEvaluator({
29800
+ return new FieldAccuracyGrader({
29804
29801
  config
29805
29802
  });
29806
29803
  };
29807
29804
  var latencyFactory = (config) => {
29808
- return new LatencyEvaluator({ config });
29805
+ return new LatencyGrader({ config });
29809
29806
  };
29810
29807
  var costFactory = (config) => {
29811
- return new CostEvaluator({ config });
29808
+ return new CostGrader({ config });
29812
29809
  };
29813
29810
  var tokenUsageFactory = (config) => {
29814
- return new TokenUsageEvaluator({ config });
29811
+ return new TokenUsageGrader({ config });
29815
29812
  };
29816
29813
  var executionMetricsFactory = (config) => {
29817
- return new ExecutionMetricsEvaluator({
29814
+ return new ExecutionMetricsGrader({
29818
29815
  config
29819
29816
  });
29820
29817
  };
29821
29818
  var skillTriggerFactory = (config) => {
29822
- return new SkillTriggerEvaluator(config);
29819
+ return new SkillTriggerGrader(config);
29823
29820
  };
29824
29821
  var containsFactory = (config) => {
29825
29822
  const c = config;
29826
- return new DeterministicAssertionEvaluator("contains", (ctx) => {
29823
+ return new DeterministicAssertionGrader("contains", (ctx) => {
29827
29824
  const result = runContainsAssertion(ctx.candidate, c.value);
29828
29825
  return {
29829
29826
  score: result.score,
@@ -29835,7 +29832,7 @@ var containsFactory = (config) => {
29835
29832
  };
29836
29833
  var regexFactory = (config) => {
29837
29834
  const c = config;
29838
- return new DeterministicAssertionEvaluator("regex", (ctx) => {
29835
+ return new DeterministicAssertionGrader("regex", (ctx) => {
29839
29836
  const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
29840
29837
  return {
29841
29838
  score: result.score,
@@ -29846,7 +29843,7 @@ var regexFactory = (config) => {
29846
29843
  });
29847
29844
  };
29848
29845
  var isJsonFactory = () => {
29849
- return new DeterministicAssertionEvaluator("is-json", (ctx) => {
29846
+ return new DeterministicAssertionGrader("is-json", (ctx) => {
29850
29847
  const result = runIsJsonAssertion(ctx.candidate);
29851
29848
  return {
29852
29849
  score: result.score,
@@ -29858,7 +29855,7 @@ var isJsonFactory = () => {
29858
29855
  };
29859
29856
  var equalsFactory = (config) => {
29860
29857
  const c = config;
29861
- return new DeterministicAssertionEvaluator("equals", (ctx) => {
29858
+ return new DeterministicAssertionGrader("equals", (ctx) => {
29862
29859
  const result = runEqualsAssertion(ctx.candidate, c.value);
29863
29860
  return {
29864
29861
  score: result.score,
@@ -29870,7 +29867,7 @@ var equalsFactory = (config) => {
29870
29867
  };
29871
29868
  var containsAnyFactory = (config) => {
29872
29869
  const c = config;
29873
- return new DeterministicAssertionEvaluator("contains-any", (ctx) => {
29870
+ return new DeterministicAssertionGrader("contains-any", (ctx) => {
29874
29871
  const result = runContainsAnyAssertion(ctx.candidate, c.value);
29875
29872
  return {
29876
29873
  score: result.score,
@@ -29882,7 +29879,7 @@ var containsAnyFactory = (config) => {
29882
29879
  };
29883
29880
  var containsAllFactory = (config) => {
29884
29881
  const c = config;
29885
- return new DeterministicAssertionEvaluator("contains-all", (ctx) => {
29882
+ return new DeterministicAssertionGrader("contains-all", (ctx) => {
29886
29883
  const result = runContainsAllAssertion(ctx.candidate, c.value);
29887
29884
  return {
29888
29885
  score: result.score,
@@ -29894,7 +29891,7 @@ var containsAllFactory = (config) => {
29894
29891
  };
29895
29892
  var icontainsFactory = (config) => {
29896
29893
  const c = config;
29897
- return new DeterministicAssertionEvaluator("icontains", (ctx) => {
29894
+ return new DeterministicAssertionGrader("icontains", (ctx) => {
29898
29895
  const result = runIcontainsAssertion(ctx.candidate, c.value);
29899
29896
  return {
29900
29897
  score: result.score,
@@ -29906,7 +29903,7 @@ var icontainsFactory = (config) => {
29906
29903
  };
29907
29904
  var icontainsAnyFactory = (config) => {
29908
29905
  const c = config;
29909
- return new DeterministicAssertionEvaluator("icontains-any", (ctx) => {
29906
+ return new DeterministicAssertionGrader("icontains-any", (ctx) => {
29910
29907
  const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
29911
29908
  return {
29912
29909
  score: result.score,
@@ -29918,7 +29915,7 @@ var icontainsAnyFactory = (config) => {
29918
29915
  };
29919
29916
  var icontainsAllFactory = (config) => {
29920
29917
  const c = config;
29921
- return new DeterministicAssertionEvaluator("icontains-all", (ctx) => {
29918
+ return new DeterministicAssertionGrader("icontains-all", (ctx) => {
29922
29919
  const result = runIcontainsAllAssertion(ctx.candidate, c.value);
29923
29920
  return {
29924
29921
  score: result.score,
@@ -29930,7 +29927,7 @@ var icontainsAllFactory = (config) => {
29930
29927
  };
29931
29928
  var startsWithFactory = (config) => {
29932
29929
  const c = config;
29933
- return new DeterministicAssertionEvaluator("starts-with", (ctx) => {
29930
+ return new DeterministicAssertionGrader("starts-with", (ctx) => {
29934
29931
  const result = runStartsWithAssertion(ctx.candidate, c.value);
29935
29932
  return {
29936
29933
  score: result.score,
@@ -29942,7 +29939,7 @@ var startsWithFactory = (config) => {
29942
29939
  };
29943
29940
  var endsWithFactory = (config) => {
29944
29941
  const c = config;
29945
- return new DeterministicAssertionEvaluator("ends-with", (ctx) => {
29942
+ return new DeterministicAssertionGrader("ends-with", (ctx) => {
29946
29943
  const result = runEndsWithAssertion(ctx.candidate, c.value);
29947
29944
  return {
29948
29945
  score: result.score,
@@ -29953,7 +29950,7 @@ var endsWithFactory = (config) => {
29953
29950
  });
29954
29951
  };
29955
29952
  function createBuiltinRegistry() {
29956
- const registry = new EvaluatorRegistry();
29953
+ const registry = new GraderRegistry();
29957
29954
  registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
29958
29955
  const fn = config[INLINE_ASSERT_FN];
29959
29956
  if (!fn) {
@@ -29961,7 +29958,7 @@ function createBuiltinRegistry() {
29961
29958
  `No inline assert function found on config for "${config.name}". Inline assert functions must be attached via INLINE_ASSERT_FN symbol.`
29962
29959
  );
29963
29960
  }
29964
- return new InlineAssertEvaluator(fn, config.name ?? "inline-assert");
29961
+ return new InlineAssertGrader(fn, config.name ?? "inline-assert");
29965
29962
  });
29966
29963
  return registry;
29967
29964
  }
@@ -29994,7 +29991,7 @@ async function discoverAssertions(registry, baseDir) {
29994
29991
  continue;
29995
29992
  }
29996
29993
  const factory = (_config, context2) => {
29997
- return new CodeEvaluator({
29994
+ return new CodeGrader({
29998
29995
  command: ["bun", "run", filePath],
29999
29996
  agentTimeoutMs: context2.agentTimeoutMs
30000
29997
  });
@@ -30034,7 +30031,7 @@ async function discoverGraders(registry, baseDir) {
30034
30031
  continue;
30035
30032
  }
30036
30033
  const factory = (_config, context2) => {
30037
- return new CodeEvaluator({
30034
+ return new CodeGrader({
30038
30035
  command: ["bun", "run", filePath],
30039
30036
  agentTimeoutMs: context2.agentTimeoutMs
30040
30037
  });
@@ -30848,10 +30845,10 @@ function buildSkippedEvaluatorError(scores) {
30848
30845
  }
30849
30846
  const messages = skippedScores.map((score) => {
30850
30847
  const label = score.name || score.type;
30851
- const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
30848
+ const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Grader skipped";
30852
30849
  return `${label}: ${assertionMessage}`;
30853
30850
  });
30854
- return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
30851
+ return messages.length === 1 ? messages[0] : `Graders skipped: ${messages.join(" | ")}`;
30855
30852
  }
30856
30853
  function usesFileReferencePrompt(provider) {
30857
30854
  return isAgentProvider(provider) || provider.kind === "cli";
@@ -31020,7 +31017,7 @@ async function runEvaluation(options) {
31020
31017
  cleanupWorkspaces,
31021
31018
  trials,
31022
31019
  streamCallbacks,
31023
- totalBudgetUsd,
31020
+ budgetUsd,
31024
31021
  failOnError,
31025
31022
  poolWorkspaces,
31026
31023
  poolMaxSlots: configPoolMaxSlots,
@@ -31549,7 +31546,7 @@ async function runEvaluation(options) {
31549
31546
  async function dispatchTest(evalCase, depResults) {
31550
31547
  const workerId = nextWorkerId++;
31551
31548
  workerIdByEvalId.set(evalCase.id, workerId);
31552
- if (totalBudgetUsd !== void 0 && budgetExhausted) {
31549
+ if (budgetUsd !== void 0 && budgetExhausted) {
31553
31550
  const budgetResult = {
31554
31551
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
31555
31552
  testId: evalCase.id,
@@ -31559,13 +31556,13 @@ async function runEvaluation(options) {
31559
31556
  assertions: [],
31560
31557
  output: [],
31561
31558
  target: target.name,
31562
- error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
31559
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
31563
31560
  budgetExceeded: true,
31564
31561
  executionStatus: "execution_error",
31565
31562
  failureStage: "setup",
31566
31563
  failureReasonCode: "budget_exceeded",
31567
31564
  executionError: {
31568
- message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
31565
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${budgetUsd.toFixed(4)})`,
31569
31566
  stage: "setup"
31570
31567
  }
31571
31568
  };
@@ -31662,7 +31659,7 @@ async function runEvaluation(options) {
31662
31659
  ...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
31663
31660
  };
31664
31661
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
31665
- if (totalBudgetUsd !== void 0) {
31662
+ if (budgetUsd !== void 0) {
31666
31663
  let caseCost;
31667
31664
  if (result.trials && result.trials.length > 0) {
31668
31665
  const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
@@ -31674,7 +31671,7 @@ async function runEvaluation(options) {
31674
31671
  }
31675
31672
  if (caseCost !== void 0) {
31676
31673
  cumulativeBudgetCost += caseCost;
31677
- if (cumulativeBudgetCost >= totalBudgetUsd) {
31674
+ if (cumulativeBudgetCost >= budgetUsd) {
31678
31675
  budgetExhausted = true;
31679
31676
  }
31680
31677
  }
@@ -32816,7 +32813,7 @@ async function evaluateCandidate(options) {
32816
32813
  };
32817
32814
  }
32818
32815
  }
32819
- const evaluatorRequest = scores ? void 0 : score.evaluatorRawRequest;
32816
+ const evaluatorRequest = scores ? void 0 : score.graderRawRequest;
32820
32817
  const effectiveAgentRequest = agentRequest && Object.keys(agentRequest).length > 0 ? agentRequest : void 0;
32821
32818
  const requests = effectiveAgentRequest || lmRequest || evaluatorRequest ? {
32822
32819
  ...effectiveAgentRequest ? { agent: effectiveAgentRequest } : {},
@@ -33032,7 +33029,7 @@ async function runEvaluatorList(options) {
33032
33029
  weight,
33033
33030
  verdict: score2.verdict,
33034
33031
  assertions: score2.assertions,
33035
- input: score2.evaluatorRawRequest,
33032
+ input: score2.graderRawRequest,
33036
33033
  target: score2.graderTarget,
33037
33034
  details: score2.details,
33038
33035
  scores: mapChildResults(score2.scores),
@@ -33048,7 +33045,7 @@ async function runEvaluatorList(options) {
33048
33045
  score: 0,
33049
33046
  verdict: "fail",
33050
33047
  assertions: [
33051
- { text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
33048
+ { text: `Grader '${evaluatorConfig.name}' failed: ${message}`, passed: false }
33052
33049
  ],
33053
33050
  expectedAspectCount: 1
33054
33051
  };
@@ -33069,7 +33066,7 @@ async function runEvaluatorList(options) {
33069
33066
  verdict: "fail",
33070
33067
  assertions: [
33071
33068
  {
33072
- text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
33069
+ text: `Grader '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
33073
33070
  passed: false
33074
33071
  }
33075
33072
  ],
@@ -33126,7 +33123,7 @@ function filterEvalCases(evalCases, filter2) {
33126
33123
  return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter2));
33127
33124
  }
33128
33125
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
33129
- const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
33126
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGrader({
33130
33127
  resolveGraderProvider: async (context2) => {
33131
33128
  if (context2.graderProvider) {
33132
33129
  return context2.graderProvider;
@@ -33617,7 +33614,7 @@ function mapChildResults(children) {
33617
33614
  weight: child.weight,
33618
33615
  verdict: child.verdict,
33619
33616
  assertions: child.assertions,
33620
- input: child.evaluatorRawRequest,
33617
+ input: child.graderRawRequest,
33621
33618
  scores: mapChildResults(child.scores),
33622
33619
  details: child.details,
33623
33620
  tokenUsage: child.tokenUsage
@@ -35656,7 +35653,7 @@ export {
35656
35653
  isJsonObject,
35657
35654
  isJsonValue,
35658
35655
  isTestMessage,
35659
- isEvaluatorKind,
35656
+ isGraderKind,
35660
35657
  fileExists,
35661
35658
  normalizeLineEndings,
35662
35659
  readTextFile,
@@ -35746,27 +35743,27 @@ export {
35746
35743
  negateScore,
35747
35744
  toSnakeCaseDeep,
35748
35745
  toCamelCaseDeep,
35749
- CodeEvaluator,
35746
+ CodeGrader,
35750
35747
  executeScript,
35751
- DEFAULT_EVALUATOR_TEMPLATE,
35748
+ DEFAULT_GRADER_TEMPLATE,
35752
35749
  freeformEvaluationSchema,
35753
35750
  rubricEvaluationSchema,
35754
- LlmGraderEvaluator,
35751
+ LlmGrader,
35755
35752
  buildOutputSchema,
35756
35753
  buildRubricOutputSchema,
35757
35754
  substituteVariables,
35758
35755
  calculateRubricScore,
35759
35756
  buildScoreRangeOutputSchema,
35760
35757
  extractImageBlocks,
35761
- CompositeEvaluator,
35762
- CostEvaluator,
35763
- ExecutionMetricsEvaluator,
35764
- FieldAccuracyEvaluator,
35765
- LatencyEvaluator,
35766
- SkillTriggerEvaluator,
35758
+ CompositeGrader,
35759
+ CostGrader,
35760
+ ExecutionMetricsGrader,
35761
+ FieldAccuracyGrader,
35762
+ LatencyGrader,
35763
+ SkillTriggerGrader,
35767
35764
  assembleLlmGraderPrompt,
35768
- TokenUsageEvaluator,
35769
- ToolTrajectoryEvaluator,
35765
+ TokenUsageGrader,
35766
+ ToolTrajectoryGrader,
35770
35767
  runContainsAssertion,
35771
35768
  runContainsAnyAssertion,
35772
35769
  runContainsAllAssertion,
@@ -35778,8 +35775,8 @@ export {
35778
35775
  runRegexAssertion,
35779
35776
  runIsJsonAssertion,
35780
35777
  runEqualsAssertion,
35781
- EvaluatorRegistry,
35782
- DeterministicAssertionEvaluator,
35778
+ GraderRegistry,
35779
+ DeterministicAssertionGrader,
35783
35780
  createBuiltinRegistry,
35784
35781
  discoverAssertions,
35785
35782
  discoverGraders,
@@ -35845,4 +35842,4 @@ export {
35845
35842
  TranscriptProvider,
35846
35843
  createAgentKernel
35847
35844
  };
35848
- //# sourceMappingURL=chunk-IRU2UOWN.js.map
35845
+ //# sourceMappingURL=chunk-RCOAXXHP.js.map