@agentv/core 3.13.0 → 3.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ import {
8
8
  isEvaluatorKind,
9
9
  loadCasesFromFile,
10
10
  resolveFileReference
11
- } from "../../chunk-4XWPXNQM.js";
11
+ } from "../../chunk-ZB3AUPES.js";
12
12
 
13
13
  // src/evaluation/validation/file-type.ts
14
14
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1581,8 +1581,6 @@ function isTestMessage(value) {
1581
1581
  var EVALUATOR_KIND_VALUES = [
1582
1582
  "code-grader",
1583
1583
  "llm-grader",
1584
- "code-judge",
1585
- "llm-judge",
1586
1584
  "rubric",
1587
1585
  "composite",
1588
1586
  "tool-trajectory",
@@ -2449,6 +2447,9 @@ var ANSI_RESET5 = "\x1B[0m";
2449
2447
  function normalizeEvaluatorType(type) {
2450
2448
  return type.replace(/_/g, "-");
2451
2449
  }
2450
+ function isDeprecatedJudgeType(type) {
2451
+ return type === "code-judge" || type === "llm-judge";
2452
+ }
2452
2453
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
2453
2454
  const execution = rawEvalCase.execution;
2454
2455
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -2511,6 +2512,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2511
2512
  const rawName = asString(rawEvaluator.name);
2512
2513
  const rawType = rawEvaluator.type;
2513
2514
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
2515
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
2516
+ logWarning2(
2517
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
2518
+ );
2519
+ continue;
2520
+ }
2514
2521
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
2515
2522
  if (typeof typeValue !== "string") {
2516
2523
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -2543,7 +2550,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2543
2550
  });
2544
2551
  continue;
2545
2552
  }
2546
- if (typeValue === "code-grader" || typeValue === "code-judge") {
2553
+ if (typeValue === "code-grader") {
2547
2554
  let command;
2548
2555
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
2549
2556
  console.warn(
@@ -2653,7 +2660,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2653
2660
  continue;
2654
2661
  }
2655
2662
  const aggregatorType = asString(rawAggregator.type);
2656
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
2663
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
2664
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
2665
+ logWarning2(
2666
+ `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
2667
+ );
2668
+ continue;
2669
+ }
2670
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
2657
2671
  logWarning2(
2658
2672
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
2659
2673
  );
@@ -2688,7 +2702,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2688
2702
  continue;
2689
2703
  }
2690
2704
  let aggregator;
2691
- if (aggregatorType === "weighted_average") {
2705
+ if (normalizedAggregatorType === "weighted_average") {
2692
2706
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
2693
2707
  const parsedWeights = {};
2694
2708
  if (weights) {
@@ -2702,7 +2716,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2702
2716
  type: "weighted_average",
2703
2717
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
2704
2718
  };
2705
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
2719
+ } else if (normalizedAggregatorType === "code-grader") {
2706
2720
  const aggregatorPath = asString(rawAggregator.path);
2707
2721
  if (!aggregatorPath) {
2708
2722
  logWarning2(
@@ -2715,7 +2729,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2715
2729
  path: aggregatorPath,
2716
2730
  cwd: searchRoots[0]
2717
2731
  };
2718
- } else if (aggregatorType === "threshold") {
2732
+ } else if (normalizedAggregatorType === "threshold") {
2719
2733
  const thresholdValue = rawAggregator.threshold;
2720
2734
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
2721
2735
  logWarning2(
@@ -3463,10 +3477,15 @@ function coerceEvaluator(candidate, contextId) {
3463
3477
  return void 0;
3464
3478
  }
3465
3479
  const normalized = normalizeEvaluatorType(candidate);
3480
+ if (isDeprecatedJudgeType(normalized)) {
3481
+ throw new Error(
3482
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
3483
+ );
3484
+ }
3466
3485
  if (isEvaluatorKind(normalized)) {
3467
3486
  return normalized;
3468
3487
  }
3469
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
3488
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
3470
3489
  return void 0;
3471
3490
  }
3472
3491
  function asString(value) {
@@ -4899,9 +4918,7 @@ function assertionToNaturalLanguage(entry) {
4899
4918
  case "ends_with":
4900
4919
  return `Output ends with '${entry.value}'`;
4901
4920
  case "llm-grader":
4902
- case "llm_grader":
4903
- case "llm-judge":
4904
- case "llm_judge": {
4921
+ case "llm_grader": {
4905
4922
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
4906
4923
  return null;
4907
4924
  }
@@ -4914,9 +4931,7 @@ function assertionToNaturalLanguage(entry) {
4914
4931
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
4915
4932
  }
4916
4933
  case "code-grader":
4917
- case "code_grader":
4918
- case "code-judge":
4919
- case "code_judge": {
4934
+ case "code_grader": {
4920
4935
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
4921
4936
  const desc = typeof entry.description === "string" ? entry.description : void 0;
4922
4937
  return codeGraderInstruction(graderName, desc);
@@ -4947,7 +4962,7 @@ function assertionToNaturalLanguage(entry) {
4947
4962
  }
4948
4963
  }
4949
4964
  function assertionToNaturalLanguageList(entry) {
4950
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
4965
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
4951
4966
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
4952
4967
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
4953
4968
  }
@@ -13168,7 +13183,7 @@ function toCamelCaseDeep(obj) {
13168
13183
  // src/evaluation/evaluators/code-evaluator.ts
13169
13184
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
13170
13185
  var CodeEvaluator = class {
13171
- kind = "code-judge";
13186
+ kind = "code-grader";
13172
13187
  command;
13173
13188
  cwd;
13174
13189
  agentTimeoutMs;
@@ -13187,7 +13202,7 @@ var CodeEvaluator = class {
13187
13202
  if (outputForPayload) {
13188
13203
  const serialized = JSON.stringify(outputForPayload);
13189
13204
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
13190
- const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-judge-"));
13205
+ const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-grader-"));
13191
13206
  outputPath = (0, import_node_path36.join)(tmpDir, "output.json");
13192
13207
  await (0, import_promises26.writeFile)(outputPath, serialized);
13193
13208
  outputForPayload = null;
@@ -13477,7 +13492,7 @@ var LlmGraderEvaluator = class {
13477
13492
  return this.evaluateWithDelegatedAgent(context2, graderProvider);
13478
13493
  }
13479
13494
  const config = context2.evaluator;
13480
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
13495
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
13481
13496
  return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
13482
13497
  }
13483
13498
  return this.evaluateFreeform(context2, graderProvider);
@@ -13662,7 +13677,7 @@ ${context2.fileChanges}`;
13662
13677
  const systemPrompt = this.buildAgentSystemPrompt(context2);
13663
13678
  const userPrompt = this.buildAgentUserPrompt(context2);
13664
13679
  const config = context2.evaluator;
13665
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13680
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13666
13681
  const fsTools = createFilesystemTools(workspacePath);
13667
13682
  const evaluatorRawRequest = {
13668
13683
  mode: "built-in",
@@ -13758,7 +13773,7 @@ ${context2.fileChanges}`;
13758
13773
  };
13759
13774
  }
13760
13775
  const config = context2.evaluator;
13761
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13776
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13762
13777
  const details = {
13763
13778
  mode: modeLabel,
13764
13779
  grader_target: provider.targetName
@@ -13798,7 +13813,7 @@ ${context2.fileChanges}`;
13798
13813
  */
13799
13814
  buildAgentSystemPrompt(context2) {
13800
13815
  const config = context2.evaluator;
13801
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13816
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13802
13817
  const parts = [
13803
13818
  "You are an expert evaluator with access to the workspace filesystem.",
13804
13819
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -13829,7 +13844,7 @@ ${context2.fileChanges}`;
13829
13844
  return substituteVariables(this.evaluatorTemplate, variables);
13830
13845
  }
13831
13846
  const config = context2.evaluator;
13832
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13847
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13833
13848
  const parts = [
13834
13849
  "Evaluate the candidate answer by investigating the workspace.",
13835
13850
  "",
@@ -13872,7 +13887,7 @@ ${context2.fileChanges}`;
13872
13887
  buildDelegatedPrompt(context2) {
13873
13888
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13874
13889
  const config = context2.evaluator;
13875
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13890
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13876
13891
  if (this.evaluatorTemplate) {
13877
13892
  const variables = {
13878
13893
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
@@ -14369,10 +14384,8 @@ var CompositeEvaluator = class {
14369
14384
  const aggregator = this.config.aggregator;
14370
14385
  switch (aggregator.type) {
14371
14386
  case "code-grader":
14372
- case "code-judge":
14373
14387
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
14374
14388
  case "llm-grader":
14375
- case "llm-judge":
14376
14389
  return this.runLlmAggregator(results, context2, aggregator);
14377
14390
  case "threshold":
14378
14391
  return this.runThreshold(results, aggregator.threshold);
@@ -16794,7 +16807,7 @@ var endsWithFactory = (config) => {
16794
16807
  };
16795
16808
  function createBuiltinRegistry() {
16796
16809
  const registry = new EvaluatorRegistry();
16797
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
16810
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
16798
16811
  const fn = config[INLINE_ASSERT_FN];
16799
16812
  if (!fn) {
16800
16813
  throw new Error(
@@ -19512,7 +19525,7 @@ function filterEvalCases(evalCases, filter) {
19512
19525
  return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
19513
19526
  }
19514
19527
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
19515
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
19528
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
19516
19529
  resolveGraderProvider: async (context2) => {
19517
19530
  if (context2.graderProvider) {
19518
19531
  return context2.graderProvider;
@@ -20356,10 +20369,10 @@ var OtelTraceExporter = class {
20356
20369
  }
20357
20370
  if (result.scores) {
20358
20371
  for (const score of result.scores) {
20359
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
20360
- "agentv.evaluator.score": score.score,
20361
- "agentv.evaluator.type": score.type,
20362
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
20372
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
20373
+ "agentv.grader.score": score.score,
20374
+ "agentv.grader.type": score.type,
20375
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
20363
20376
  });
20364
20377
  }
20365
20378
  }