@agentv/core 3.13.0 → 3.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
473
473
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
474
474
  */
475
475
  declare function isTestMessage(value: unknown): value is TestMessage;
476
- declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
476
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
477
477
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
478
478
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
479
479
  /**
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
576
576
  };
577
577
  type CodeEvaluatorConfig = {
578
578
  readonly name: string;
579
- readonly type: 'code-judge' | 'code-grader';
579
+ readonly type: 'code-grader';
580
580
  readonly command: readonly string[];
581
581
  /** @deprecated Use `command` instead */
582
582
  readonly script?: readonly string[];
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
606
606
  };
607
607
  type LlmGraderEvaluatorConfig = {
608
608
  readonly name: string;
609
- readonly type: 'llm-grader' | 'llm-judge';
609
+ readonly type: 'llm-grader';
610
610
  /** Text prompt (inline or file path) or executable script config */
611
611
  readonly prompt?: string | PromptScriptConfig;
612
612
  readonly promptPath?: string;
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
678
678
  readonly type: 'code-grader';
679
679
  readonly path: string;
680
680
  readonly cwd?: string;
681
- } | {
682
- readonly type: 'code-judge';
683
- readonly path: string;
684
- readonly cwd?: string;
685
681
  } | {
686
682
  readonly type: 'llm-grader';
687
683
  readonly prompt?: string;
688
684
  readonly promptPath?: string;
689
685
  readonly model?: string;
690
- } | {
691
- readonly type: 'llm-judge';
692
- readonly prompt?: string;
693
- readonly promptPath?: string;
694
- readonly model?: string;
695
686
  } | {
696
687
  readonly type: 'threshold';
697
688
  readonly threshold: number;
@@ -2117,7 +2108,7 @@ interface CodeEvaluatorOptions {
2117
2108
  readonly target?: TargetAccessConfig;
2118
2109
  }
2119
2110
  declare class CodeEvaluator implements Evaluator {
2120
- readonly kind = "code-judge";
2111
+ readonly kind = "code-grader";
2121
2112
  private readonly command;
2122
2113
  private readonly cwd?;
2123
2114
  private readonly agentTimeoutMs?;
@@ -2852,7 +2843,7 @@ interface EvalTestInput {
2852
2843
  readonly expectedOutput?: string;
2853
2844
  /** @deprecated Use `expectedOutput` instead */
2854
2845
  readonly expected_output?: string;
2855
- /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
2846
+ /** Assertion graders — accepts factory functions, config objects, or inline functions */
2856
2847
  readonly assert?: readonly AssertEntry[];
2857
2848
  /** Arbitrary metadata */
2858
2849
  readonly metadata?: Record<string, unknown>;
@@ -2862,7 +2853,7 @@ interface EvalTestInput {
2862
2853
  * Matches the YAML `assert` block structure.
2863
2854
  */
2864
2855
  interface EvalAssertionInput {
2865
- /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2856
+ /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
2866
2857
  readonly type: string;
2867
2858
  /** Display name */
2868
2859
  readonly name?: string;
@@ -2872,9 +2863,9 @@ interface EvalAssertionInput {
2872
2863
  readonly weight?: number;
2873
2864
  /** Whether this assertion is required to pass */
2874
2865
  readonly required?: boolean | number;
2875
- /** Prompt file for llm_judge */
2866
+ /** Prompt file for llm_grader */
2876
2867
  readonly prompt?: string;
2877
- /** Script for code_judge */
2868
+ /** Script for code_grader */
2878
2869
  readonly script?: string | readonly string[];
2879
2870
  /** Additional config passed to the assertion */
2880
2871
  readonly config?: Record<string, unknown>;
@@ -3568,17 +3559,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3568
3559
  * Convention-based discovery of custom assertion scripts.
3569
3560
  *
3570
3561
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3571
- * them as code-judge evaluators in the registry. The file name (without
3572
- * extension) becomes the evaluator type name.
3562
+ * them as code graders in the registry. The file name (without
3563
+ * extension) becomes the grader type name.
3573
3564
  *
3574
3565
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
3575
3566
  */
3576
3567
 
3577
3568
  /**
3578
3569
  * Discover custom assertion scripts from `.agentv/assertions/` and register
3579
- * them as evaluator types in the registry.
3570
+ * them as grader types in the registry.
3580
3571
  *
3581
- * @param registry - The evaluator registry to register discovered assertions into
3572
+ * @param registry - The grader registry to register discovered assertions into
3582
3573
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
3583
3574
  * @returns Names of discovered assertion types
3584
3575
  */
package/dist/index.d.ts CHANGED
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
473
473
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
474
474
  */
475
475
  declare function isTestMessage(value: unknown): value is TestMessage;
476
- declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
476
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
477
477
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
478
478
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
479
479
  /**
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
576
576
  };
577
577
  type CodeEvaluatorConfig = {
578
578
  readonly name: string;
579
- readonly type: 'code-judge' | 'code-grader';
579
+ readonly type: 'code-grader';
580
580
  readonly command: readonly string[];
581
581
  /** @deprecated Use `command` instead */
582
582
  readonly script?: readonly string[];
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
606
606
  };
607
607
  type LlmGraderEvaluatorConfig = {
608
608
  readonly name: string;
609
- readonly type: 'llm-grader' | 'llm-judge';
609
+ readonly type: 'llm-grader';
610
610
  /** Text prompt (inline or file path) or executable script config */
611
611
  readonly prompt?: string | PromptScriptConfig;
612
612
  readonly promptPath?: string;
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
678
678
  readonly type: 'code-grader';
679
679
  readonly path: string;
680
680
  readonly cwd?: string;
681
- } | {
682
- readonly type: 'code-judge';
683
- readonly path: string;
684
- readonly cwd?: string;
685
681
  } | {
686
682
  readonly type: 'llm-grader';
687
683
  readonly prompt?: string;
688
684
  readonly promptPath?: string;
689
685
  readonly model?: string;
690
- } | {
691
- readonly type: 'llm-judge';
692
- readonly prompt?: string;
693
- readonly promptPath?: string;
694
- readonly model?: string;
695
686
  } | {
696
687
  readonly type: 'threshold';
697
688
  readonly threshold: number;
@@ -2117,7 +2108,7 @@ interface CodeEvaluatorOptions {
2117
2108
  readonly target?: TargetAccessConfig;
2118
2109
  }
2119
2110
  declare class CodeEvaluator implements Evaluator {
2120
- readonly kind = "code-judge";
2111
+ readonly kind = "code-grader";
2121
2112
  private readonly command;
2122
2113
  private readonly cwd?;
2123
2114
  private readonly agentTimeoutMs?;
@@ -2852,7 +2843,7 @@ interface EvalTestInput {
2852
2843
  readonly expectedOutput?: string;
2853
2844
  /** @deprecated Use `expectedOutput` instead */
2854
2845
  readonly expected_output?: string;
2855
- /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
2846
+ /** Assertion graders — accepts factory functions, config objects, or inline functions */
2856
2847
  readonly assert?: readonly AssertEntry[];
2857
2848
  /** Arbitrary metadata */
2858
2849
  readonly metadata?: Record<string, unknown>;
@@ -2862,7 +2853,7 @@ interface EvalTestInput {
2862
2853
  * Matches the YAML `assert` block structure.
2863
2854
  */
2864
2855
  interface EvalAssertionInput {
2865
- /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2856
+ /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
2866
2857
  readonly type: string;
2867
2858
  /** Display name */
2868
2859
  readonly name?: string;
@@ -2872,9 +2863,9 @@ interface EvalAssertionInput {
2872
2863
  readonly weight?: number;
2873
2864
  /** Whether this assertion is required to pass */
2874
2865
  readonly required?: boolean | number;
2875
- /** Prompt file for llm_judge */
2866
+ /** Prompt file for llm_grader */
2876
2867
  readonly prompt?: string;
2877
- /** Script for code_judge */
2868
+ /** Script for code_grader */
2878
2869
  readonly script?: string | readonly string[];
2879
2870
  /** Additional config passed to the assertion */
2880
2871
  readonly config?: Record<string, unknown>;
@@ -3568,17 +3559,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3568
3559
  * Convention-based discovery of custom assertion scripts.
3569
3560
  *
3570
3561
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3571
- * them as code-judge evaluators in the registry. The file name (without
3572
- * extension) becomes the evaluator type name.
3562
+ * them as code graders in the registry. The file name (without
3563
+ * extension) becomes the grader type name.
3573
3564
  *
3574
3565
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
3575
3566
  */
3576
3567
 
3577
3568
  /**
3578
3569
  * Discover custom assertion scripts from `.agentv/assertions/` and register
3579
- * them as evaluator types in the registry.
3570
+ * them as grader types in the registry.
3580
3571
  *
3581
- * @param registry - The evaluator registry to register discovered assertions into
3572
+ * @param registry - The grader registry to register discovered assertions into
3582
3573
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
3583
3574
  * @returns Names of discovered assertion types
3584
3575
  */
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  readTextFile,
20
20
  resolveFileReference,
21
21
  resolveTargetDefinition
22
- } from "./chunk-4XWPXNQM.js";
22
+ } from "./chunk-ZB3AUPES.js";
23
23
  import {
24
24
  AgentvProvider
25
25
  } from "./chunk-W5YDZWT4.js";
@@ -728,6 +728,9 @@ var ANSI_RESET4 = "\x1B[0m";
728
728
  function normalizeEvaluatorType(type) {
729
729
  return type.replace(/_/g, "-");
730
730
  }
731
+ function isDeprecatedJudgeType(type) {
732
+ return type === "code-judge" || type === "llm-judge";
733
+ }
731
734
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
732
735
  const execution = rawEvalCase.execution;
733
736
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -790,6 +793,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
790
793
  const rawName = asString(rawEvaluator.name);
791
794
  const rawType = rawEvaluator.type;
792
795
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
796
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
797
+ logWarning2(
798
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
799
+ );
800
+ continue;
801
+ }
793
802
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
794
803
  if (typeof typeValue !== "string") {
795
804
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -822,7 +831,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
822
831
  });
823
832
  continue;
824
833
  }
825
- if (typeValue === "code-grader" || typeValue === "code-judge") {
834
+ if (typeValue === "code-grader") {
826
835
  let command;
827
836
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
828
837
  console.warn(
@@ -932,7 +941,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
932
941
  continue;
933
942
  }
934
943
  const aggregatorType = asString(rawAggregator.type);
935
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
944
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
945
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
946
+ logWarning2(
947
+ `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
948
+ );
949
+ continue;
950
+ }
951
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
936
952
  logWarning2(
937
953
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
938
954
  );
@@ -967,7 +983,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
967
983
  continue;
968
984
  }
969
985
  let aggregator;
970
- if (aggregatorType === "weighted_average") {
986
+ if (normalizedAggregatorType === "weighted_average") {
971
987
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
972
988
  const parsedWeights = {};
973
989
  if (weights) {
@@ -981,7 +997,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
981
997
  type: "weighted_average",
982
998
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
983
999
  };
984
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
1000
+ } else if (normalizedAggregatorType === "code-grader") {
985
1001
  const aggregatorPath = asString(rawAggregator.path);
986
1002
  if (!aggregatorPath) {
987
1003
  logWarning2(
@@ -994,7 +1010,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
994
1010
  path: aggregatorPath,
995
1011
  cwd: searchRoots[0]
996
1012
  };
997
- } else if (aggregatorType === "threshold") {
1013
+ } else if (normalizedAggregatorType === "threshold") {
998
1014
  const thresholdValue = rawAggregator.threshold;
999
1015
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
1000
1016
  logWarning2(
@@ -1742,10 +1758,15 @@ function coerceEvaluator(candidate, contextId) {
1742
1758
  return void 0;
1743
1759
  }
1744
1760
  const normalized = normalizeEvaluatorType(candidate);
1761
+ if (isDeprecatedJudgeType(normalized)) {
1762
+ throw new Error(
1763
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
1764
+ );
1765
+ }
1745
1766
  if (isEvaluatorKind(normalized)) {
1746
1767
  return normalized;
1747
1768
  }
1748
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
1769
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
1749
1770
  return void 0;
1750
1771
  }
1751
1772
  function asString(value) {
@@ -3178,9 +3199,7 @@ function assertionToNaturalLanguage(entry) {
3178
3199
  case "ends_with":
3179
3200
  return `Output ends with '${entry.value}'`;
3180
3201
  case "llm-grader":
3181
- case "llm_grader":
3182
- case "llm-judge":
3183
- case "llm_judge": {
3202
+ case "llm_grader": {
3184
3203
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
3185
3204
  return null;
3186
3205
  }
@@ -3193,9 +3212,7 @@ function assertionToNaturalLanguage(entry) {
3193
3212
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
3194
3213
  }
3195
3214
  case "code-grader":
3196
- case "code_grader":
3197
- case "code-judge":
3198
- case "code_judge": {
3215
+ case "code_grader": {
3199
3216
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
3200
3217
  const desc = typeof entry.description === "string" ? entry.description : void 0;
3201
3218
  return codeGraderInstruction(graderName, desc);
@@ -3226,7 +3243,7 @@ function assertionToNaturalLanguage(entry) {
3226
3243
  }
3227
3244
  }
3228
3245
  function assertionToNaturalLanguageList(entry) {
3229
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
3246
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
3230
3247
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
3231
3248
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
3232
3249
  }
@@ -10083,7 +10100,7 @@ function toCamelCaseDeep(obj) {
10083
10100
  // src/evaluation/evaluators/code-evaluator.ts
10084
10101
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
10085
10102
  var CodeEvaluator = class {
10086
- kind = "code-judge";
10103
+ kind = "code-grader";
10087
10104
  command;
10088
10105
  cwd;
10089
10106
  agentTimeoutMs;
@@ -10102,7 +10119,7 @@ var CodeEvaluator = class {
10102
10119
  if (outputForPayload) {
10103
10120
  const serialized = JSON.stringify(outputForPayload);
10104
10121
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
10105
- const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-judge-"));
10122
+ const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
10106
10123
  outputPath = join(tmpDir, "output.json");
10107
10124
  await writeFile6(outputPath, serialized);
10108
10125
  outputForPayload = null;
@@ -10360,7 +10377,7 @@ var LlmGraderEvaluator = class {
10360
10377
  return this.evaluateWithDelegatedAgent(context, graderProvider);
10361
10378
  }
10362
10379
  const config = context.evaluator;
10363
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
10380
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
10364
10381
  return this.evaluateWithRubrics(context, graderProvider, config.rubrics);
10365
10382
  }
10366
10383
  return this.evaluateFreeform(context, graderProvider);
@@ -10545,7 +10562,7 @@ ${context.fileChanges}`;
10545
10562
  const systemPrompt = this.buildAgentSystemPrompt(context);
10546
10563
  const userPrompt = this.buildAgentUserPrompt(context);
10547
10564
  const config = context.evaluator;
10548
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10565
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10549
10566
  const fsTools = createFilesystemTools(workspacePath);
10550
10567
  const evaluatorRawRequest = {
10551
10568
  mode: "built-in",
@@ -10641,7 +10658,7 @@ ${context.fileChanges}`;
10641
10658
  };
10642
10659
  }
10643
10660
  const config = context.evaluator;
10644
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10661
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10645
10662
  const details = {
10646
10663
  mode: modeLabel,
10647
10664
  grader_target: provider.targetName
@@ -10681,7 +10698,7 @@ ${context.fileChanges}`;
10681
10698
  */
10682
10699
  buildAgentSystemPrompt(context) {
10683
10700
  const config = context.evaluator;
10684
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10701
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10685
10702
  const parts = [
10686
10703
  "You are an expert evaluator with access to the workspace filesystem.",
10687
10704
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -10712,7 +10729,7 @@ ${context.fileChanges}`;
10712
10729
  return substituteVariables(this.evaluatorTemplate, variables);
10713
10730
  }
10714
10731
  const config = context.evaluator;
10715
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10732
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10716
10733
  const parts = [
10717
10734
  "Evaluate the candidate answer by investigating the workspace.",
10718
10735
  "",
@@ -10755,7 +10772,7 @@ ${context.fileChanges}`;
10755
10772
  buildDelegatedPrompt(context) {
10756
10773
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10757
10774
  const config = context.evaluator;
10758
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10775
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10759
10776
  if (this.evaluatorTemplate) {
10760
10777
  const variables = {
10761
10778
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
@@ -11252,10 +11269,8 @@ var CompositeEvaluator = class {
11252
11269
  const aggregator = this.config.aggregator;
11253
11270
  switch (aggregator.type) {
11254
11271
  case "code-grader":
11255
- case "code-judge":
11256
11272
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
11257
11273
  case "llm-grader":
11258
- case "llm-judge":
11259
11274
  return this.runLlmAggregator(results, context, aggregator);
11260
11275
  case "threshold":
11261
11276
  return this.runThreshold(results, aggregator.threshold);
@@ -13677,7 +13692,7 @@ var endsWithFactory = (config) => {
13677
13692
  };
13678
13693
  function createBuiltinRegistry() {
13679
13694
  const registry = new EvaluatorRegistry();
13680
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
13695
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
13681
13696
  const fn = config[INLINE_ASSERT_FN];
13682
13697
  if (!fn) {
13683
13698
  throw new Error(
@@ -16395,7 +16410,7 @@ function filterEvalCases(evalCases, filter) {
16395
16410
  return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
16396
16411
  }
16397
16412
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
16398
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
16413
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
16399
16414
  resolveGraderProvider: async (context) => {
16400
16415
  if (context.graderProvider) {
16401
16416
  return context.graderProvider;
@@ -17239,10 +17254,10 @@ var OtelTraceExporter = class {
17239
17254
  }
17240
17255
  if (result.scores) {
17241
17256
  for (const score of result.scores) {
17242
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
17243
- "agentv.evaluator.score": score.score,
17244
- "agentv.evaluator.type": score.type,
17245
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
17257
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
17258
+ "agentv.grader.score": score.score,
17259
+ "agentv.grader.type": score.type,
17260
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
17246
17261
  });
17247
17262
  }
17248
17263
  }