@agentv/core 3.13.0 → 3.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ import {
8
8
  isEvaluatorKind,
9
9
  loadCasesFromFile,
10
10
  resolveFileReference
11
- } from "../../chunk-4XWPXNQM.js";
11
+ } from "../../chunk-ZB3AUPES.js";
12
12
 
13
13
  // src/evaluation/validation/file-type.ts
14
14
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1455,6 +1455,7 @@ __export(index_exports, {
1455
1455
  extractTargetFromSuite: () => extractTargetFromSuite,
1456
1456
  extractTargetsFromSuite: () => extractTargetsFromSuite,
1457
1457
  extractTargetsFromTestCase: () => extractTargetsFromTestCase,
1458
+ extractThreshold: () => extractThreshold,
1458
1459
  extractTrialsConfig: () => extractTrialsConfig,
1459
1460
  extractWorkersFromSuite: () => extractWorkersFromSuite,
1460
1461
  fileExists: () => fileExists2,
@@ -1581,8 +1582,6 @@ function isTestMessage(value) {
1581
1582
  var EVALUATOR_KIND_VALUES = [
1582
1583
  "code-grader",
1583
1584
  "llm-grader",
1584
- "code-judge",
1585
- "llm-judge",
1586
1585
  "rubric",
1587
1586
  "composite",
1588
1587
  "tool-trajectory",
@@ -2322,6 +2321,22 @@ function extractFailOnError(suite) {
2322
2321
  logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
2323
2322
  return void 0;
2324
2323
  }
2324
+ function extractThreshold(suite) {
2325
+ const execution = suite.execution;
2326
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2327
+ return void 0;
2328
+ }
2329
+ const executionObj = execution;
2330
+ const raw = executionObj.threshold;
2331
+ if (raw === void 0 || raw === null) {
2332
+ return void 0;
2333
+ }
2334
+ if (typeof raw === "number" && raw >= 0 && raw <= 1) {
2335
+ return raw;
2336
+ }
2337
+ logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
2338
+ return void 0;
2339
+ }
2325
2340
  function parseExecutionDefaults(raw, configPath) {
2326
2341
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
2327
2342
  return void 0;
@@ -2449,6 +2464,9 @@ var ANSI_RESET5 = "\x1B[0m";
2449
2464
  function normalizeEvaluatorType(type) {
2450
2465
  return type.replace(/_/g, "-");
2451
2466
  }
2467
+ function isDeprecatedJudgeType(type) {
2468
+ return type === "code-judge" || type === "llm-judge";
2469
+ }
2452
2470
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
2453
2471
  const execution = rawEvalCase.execution;
2454
2472
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -2511,6 +2529,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2511
2529
  const rawName = asString(rawEvaluator.name);
2512
2530
  const rawType = rawEvaluator.type;
2513
2531
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
2532
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
2533
+ logWarning2(
2534
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
2535
+ );
2536
+ continue;
2537
+ }
2514
2538
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
2515
2539
  if (typeof typeValue !== "string") {
2516
2540
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -2543,7 +2567,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2543
2567
  });
2544
2568
  continue;
2545
2569
  }
2546
- if (typeValue === "code-grader" || typeValue === "code-judge") {
2570
+ if (typeValue === "code-grader") {
2547
2571
  let command;
2548
2572
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
2549
2573
  console.warn(
@@ -2653,7 +2677,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2653
2677
  continue;
2654
2678
  }
2655
2679
  const aggregatorType = asString(rawAggregator.type);
2656
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
2680
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
2681
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
2682
+ logWarning2(
2683
+ `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
2684
+ );
2685
+ continue;
2686
+ }
2687
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
2657
2688
  logWarning2(
2658
2689
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
2659
2690
  );
@@ -2688,7 +2719,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2688
2719
  continue;
2689
2720
  }
2690
2721
  let aggregator;
2691
- if (aggregatorType === "weighted_average") {
2722
+ if (normalizedAggregatorType === "weighted_average") {
2692
2723
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
2693
2724
  const parsedWeights = {};
2694
2725
  if (weights) {
@@ -2702,7 +2733,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2702
2733
  type: "weighted_average",
2703
2734
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
2704
2735
  };
2705
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
2736
+ } else if (normalizedAggregatorType === "code-grader") {
2706
2737
  const aggregatorPath = asString(rawAggregator.path);
2707
2738
  if (!aggregatorPath) {
2708
2739
  logWarning2(
@@ -2715,7 +2746,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2715
2746
  path: aggregatorPath,
2716
2747
  cwd: searchRoots[0]
2717
2748
  };
2718
- } else if (aggregatorType === "threshold") {
2749
+ } else if (normalizedAggregatorType === "threshold") {
2719
2750
  const thresholdValue = rawAggregator.threshold;
2720
2751
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
2721
2752
  logWarning2(
@@ -3463,10 +3494,15 @@ function coerceEvaluator(candidate, contextId) {
3463
3494
  return void 0;
3464
3495
  }
3465
3496
  const normalized = normalizeEvaluatorType(candidate);
3497
+ if (isDeprecatedJudgeType(normalized)) {
3498
+ throw new Error(
3499
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
3500
+ );
3501
+ }
3466
3502
  if (isEvaluatorKind(normalized)) {
3467
3503
  return normalized;
3468
3504
  }
3469
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
3505
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
3470
3506
  return void 0;
3471
3507
  }
3472
3508
  function asString(value) {
@@ -4450,6 +4486,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4450
4486
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
4451
4487
  const metadata = parseMetadata(parsed);
4452
4488
  const failOnError = extractFailOnError(parsed);
4489
+ const threshold = extractThreshold(parsed);
4453
4490
  return {
4454
4491
  tests,
4455
4492
  trials: extractTrialsConfig(parsed),
@@ -4458,7 +4495,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4458
4495
  cacheConfig: extractCacheConfig(parsed),
4459
4496
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
4460
4497
  ...metadata !== void 0 && { metadata },
4461
- ...failOnError !== void 0 && { failOnError }
4498
+ ...failOnError !== void 0 && { failOnError },
4499
+ ...threshold !== void 0 && { threshold }
4462
4500
  };
4463
4501
  }
4464
4502
  var loadEvalSuite = loadTestSuite;
@@ -4899,9 +4937,7 @@ function assertionToNaturalLanguage(entry) {
4899
4937
  case "ends_with":
4900
4938
  return `Output ends with '${entry.value}'`;
4901
4939
  case "llm-grader":
4902
- case "llm_grader":
4903
- case "llm-judge":
4904
- case "llm_judge": {
4940
+ case "llm_grader": {
4905
4941
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
4906
4942
  return null;
4907
4943
  }
@@ -4914,9 +4950,7 @@ function assertionToNaturalLanguage(entry) {
4914
4950
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
4915
4951
  }
4916
4952
  case "code-grader":
4917
- case "code_grader":
4918
- case "code-judge":
4919
- case "code_judge": {
4953
+ case "code_grader": {
4920
4954
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
4921
4955
  const desc = typeof entry.description === "string" ? entry.description : void 0;
4922
4956
  return codeGraderInstruction(graderName, desc);
@@ -4947,7 +4981,7 @@ function assertionToNaturalLanguage(entry) {
4947
4981
  }
4948
4982
  }
4949
4983
  function assertionToNaturalLanguageList(entry) {
4950
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
4984
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
4951
4985
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
4952
4986
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
4953
4987
  }
@@ -13168,7 +13202,7 @@ function toCamelCaseDeep(obj) {
13168
13202
  // src/evaluation/evaluators/code-evaluator.ts
13169
13203
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
13170
13204
  var CodeEvaluator = class {
13171
- kind = "code-judge";
13205
+ kind = "code-grader";
13172
13206
  command;
13173
13207
  cwd;
13174
13208
  agentTimeoutMs;
@@ -13187,7 +13221,7 @@ var CodeEvaluator = class {
13187
13221
  if (outputForPayload) {
13188
13222
  const serialized = JSON.stringify(outputForPayload);
13189
13223
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
13190
- const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-judge-"));
13224
+ const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-grader-"));
13191
13225
  outputPath = (0, import_node_path36.join)(tmpDir, "output.json");
13192
13226
  await (0, import_promises26.writeFile)(outputPath, serialized);
13193
13227
  outputForPayload = null;
@@ -13477,7 +13511,7 @@ var LlmGraderEvaluator = class {
13477
13511
  return this.evaluateWithDelegatedAgent(context2, graderProvider);
13478
13512
  }
13479
13513
  const config = context2.evaluator;
13480
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
13514
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
13481
13515
  return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
13482
13516
  }
13483
13517
  return this.evaluateFreeform(context2, graderProvider);
@@ -13662,7 +13696,7 @@ ${context2.fileChanges}`;
13662
13696
  const systemPrompt = this.buildAgentSystemPrompt(context2);
13663
13697
  const userPrompt = this.buildAgentUserPrompt(context2);
13664
13698
  const config = context2.evaluator;
13665
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13699
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13666
13700
  const fsTools = createFilesystemTools(workspacePath);
13667
13701
  const evaluatorRawRequest = {
13668
13702
  mode: "built-in",
@@ -13758,7 +13792,7 @@ ${context2.fileChanges}`;
13758
13792
  };
13759
13793
  }
13760
13794
  const config = context2.evaluator;
13761
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13795
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13762
13796
  const details = {
13763
13797
  mode: modeLabel,
13764
13798
  grader_target: provider.targetName
@@ -13798,7 +13832,7 @@ ${context2.fileChanges}`;
13798
13832
  */
13799
13833
  buildAgentSystemPrompt(context2) {
13800
13834
  const config = context2.evaluator;
13801
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13835
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13802
13836
  const parts = [
13803
13837
  "You are an expert evaluator with access to the workspace filesystem.",
13804
13838
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -13829,7 +13863,7 @@ ${context2.fileChanges}`;
13829
13863
  return substituteVariables(this.evaluatorTemplate, variables);
13830
13864
  }
13831
13865
  const config = context2.evaluator;
13832
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13866
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13833
13867
  const parts = [
13834
13868
  "Evaluate the candidate answer by investigating the workspace.",
13835
13869
  "",
@@ -13872,7 +13906,7 @@ ${context2.fileChanges}`;
13872
13906
  buildDelegatedPrompt(context2) {
13873
13907
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13874
13908
  const config = context2.evaluator;
13875
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13909
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
13876
13910
  if (this.evaluatorTemplate) {
13877
13911
  const variables = {
13878
13912
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
@@ -14369,10 +14403,8 @@ var CompositeEvaluator = class {
14369
14403
  const aggregator = this.config.aggregator;
14370
14404
  switch (aggregator.type) {
14371
14405
  case "code-grader":
14372
- case "code-judge":
14373
14406
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
14374
14407
  case "llm-grader":
14375
- case "llm-judge":
14376
14408
  return this.runLlmAggregator(results, context2, aggregator);
14377
14409
  case "threshold":
14378
14410
  return this.runThreshold(results, aggregator.threshold);
@@ -16794,7 +16826,7 @@ var endsWithFactory = (config) => {
16794
16826
  };
16795
16827
  function createBuiltinRegistry() {
16796
16828
  const registry = new EvaluatorRegistry();
16797
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
16829
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
16798
16830
  const fn = config[INLINE_ASSERT_FN];
16799
16831
  if (!fn) {
16800
16832
  throw new Error(
@@ -19512,7 +19544,7 @@ function filterEvalCases(evalCases, filter) {
19512
19544
  return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
19513
19545
  }
19514
19546
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
19515
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
19547
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
19516
19548
  resolveGraderProvider: async (context2) => {
19517
19549
  if (context2.graderProvider) {
19518
19550
  return context2.graderProvider;
@@ -20356,10 +20388,10 @@ var OtelTraceExporter = class {
20356
20388
  }
20357
20389
  if (result.scores) {
20358
20390
  for (const score of result.scores) {
20359
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
20360
- "agentv.evaluator.score": score.score,
20361
- "agentv.evaluator.type": score.type,
20362
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
20391
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
20392
+ "agentv.grader.score": score.score,
20393
+ "agentv.grader.type": score.type,
20394
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
20363
20395
  });
20364
20396
  }
20365
20397
  }
@@ -20749,6 +20781,7 @@ function createAgentKernel() {
20749
20781
  extractTargetFromSuite,
20750
20782
  extractTargetsFromSuite,
20751
20783
  extractTargetsFromTestCase,
20784
+ extractThreshold,
20752
20785
  extractTrialsConfig,
20753
20786
  extractWorkersFromSuite,
20754
20787
  fileExists,