agentv 3.13.0 → 3.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-4XWPXNQM.js
304
+ // ../../packages/core/dist/chunk-ZB3AUPES.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-4XWPXNQM.js
422
+ // ../../packages/core/dist/chunk-ZB3AUPES.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
@@ -473,8 +473,6 @@ function isTestMessage(value) {
473
473
  var EVALUATOR_KIND_VALUES = [
474
474
  "code-grader",
475
475
  "llm-grader",
476
- "code-judge",
477
- "llm-judge",
478
476
  "rubric",
479
477
  "composite",
480
478
  "tool-trajectory",
@@ -14845,6 +14843,22 @@ function extractFailOnError(suite) {
14845
14843
  logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
14846
14844
  return void 0;
14847
14845
  }
14846
+ function extractThreshold(suite) {
14847
+ const execution = suite.execution;
14848
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
14849
+ return void 0;
14850
+ }
14851
+ const executionObj = execution;
14852
+ const raw = executionObj.threshold;
14853
+ if (raw === void 0 || raw === null) {
14854
+ return void 0;
14855
+ }
14856
+ if (typeof raw === "number" && raw >= 0 && raw <= 1) {
14857
+ return raw;
14858
+ }
14859
+ logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
14860
+ return void 0;
14861
+ }
14848
14862
  function parseExecutionDefaults(raw, configPath) {
14849
14863
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
14850
14864
  return void 0;
@@ -14960,6 +14974,9 @@ var ANSI_RESET4 = "\x1B[0m";
14960
14974
  function normalizeEvaluatorType(type) {
14961
14975
  return type.replace(/_/g, "-");
14962
14976
  }
14977
+ function isDeprecatedJudgeType(type) {
14978
+ return type === "code-judge" || type === "llm-judge";
14979
+ }
14963
14980
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
14964
14981
  const execution = rawEvalCase.execution;
14965
14982
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -15022,6 +15039,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15022
15039
  const rawName = asString(rawEvaluator.name);
15023
15040
  const rawType = rawEvaluator.type;
15024
15041
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
15042
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
15043
+ logWarning2(
15044
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
15045
+ );
15046
+ continue;
15047
+ }
15025
15048
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
15026
15049
  if (typeof typeValue !== "string") {
15027
15050
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -15054,7 +15077,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15054
15077
  });
15055
15078
  continue;
15056
15079
  }
15057
- if (typeValue === "code-grader" || typeValue === "code-judge") {
15080
+ if (typeValue === "code-grader") {
15058
15081
  let command;
15059
15082
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
15060
15083
  console.warn(
@@ -15164,7 +15187,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15164
15187
  continue;
15165
15188
  }
15166
15189
  const aggregatorType = asString(rawAggregator.type);
15167
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
15190
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
15191
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
15192
+ logWarning2(
15193
+ `Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
15194
+ );
15195
+ continue;
15196
+ }
15197
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
15168
15198
  logWarning2(
15169
15199
  `Skipping composite evaluator '${name21}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
15170
15200
  );
@@ -15199,7 +15229,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15199
15229
  continue;
15200
15230
  }
15201
15231
  let aggregator;
15202
- if (aggregatorType === "weighted_average") {
15232
+ if (normalizedAggregatorType === "weighted_average") {
15203
15233
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
15204
15234
  const parsedWeights = {};
15205
15235
  if (weights) {
@@ -15213,7 +15243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15213
15243
  type: "weighted_average",
15214
15244
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
15215
15245
  };
15216
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
15246
+ } else if (normalizedAggregatorType === "code-grader") {
15217
15247
  const aggregatorPath = asString(rawAggregator.path);
15218
15248
  if (!aggregatorPath) {
15219
15249
  logWarning2(
@@ -15226,7 +15256,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15226
15256
  path: aggregatorPath,
15227
15257
  cwd: searchRoots[0]
15228
15258
  };
15229
- } else if (aggregatorType === "threshold") {
15259
+ } else if (normalizedAggregatorType === "threshold") {
15230
15260
  const thresholdValue = rawAggregator.threshold;
15231
15261
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
15232
15262
  logWarning2(
@@ -15974,10 +16004,15 @@ function coerceEvaluator(candidate, contextId) {
15974
16004
  return void 0;
15975
16005
  }
15976
16006
  const normalized = normalizeEvaluatorType(candidate);
16007
+ if (isDeprecatedJudgeType(normalized)) {
16008
+ throw new Error(
16009
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
16010
+ );
16011
+ }
15977
16012
  if (isEvaluatorKind(normalized)) {
15978
16013
  return normalized;
15979
16014
  }
15980
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
16015
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
15981
16016
  return void 0;
15982
16017
  }
15983
16018
  function asString(value) {
@@ -16936,6 +16971,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
16936
16971
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
16937
16972
  const metadata = parseMetadata(parsed);
16938
16973
  const failOnError = extractFailOnError(parsed);
16974
+ const threshold = extractThreshold(parsed);
16939
16975
  return {
16940
16976
  tests,
16941
16977
  trials: extractTrialsConfig(parsed),
@@ -16944,7 +16980,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
16944
16980
  cacheConfig: extractCacheConfig(parsed),
16945
16981
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
16946
16982
  ...metadata !== void 0 && { metadata },
16947
- ...failOnError !== void 0 && { failOnError }
16983
+ ...failOnError !== void 0 && { failOnError },
16984
+ ...threshold !== void 0 && { threshold }
16948
16985
  };
16949
16986
  }
16950
16987
  var loadEvalSuite = loadTestSuite;
@@ -17380,9 +17417,7 @@ function assertionToNaturalLanguage(entry) {
17380
17417
  case "ends_with":
17381
17418
  return `Output ends with '${entry.value}'`;
17382
17419
  case "llm-grader":
17383
- case "llm_grader":
17384
- case "llm-judge":
17385
- case "llm_judge": {
17420
+ case "llm_grader": {
17386
17421
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
17387
17422
  return null;
17388
17423
  }
@@ -17395,9 +17430,7 @@ function assertionToNaturalLanguage(entry) {
17395
17430
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
17396
17431
  }
17397
17432
  case "code-grader":
17398
- case "code_grader":
17399
- case "code-judge":
17400
- case "code_judge": {
17433
+ case "code_grader": {
17401
17434
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
17402
17435
  const desc = typeof entry.description === "string" ? entry.description : void 0;
17403
17436
  return codeGraderInstruction(graderName, desc);
@@ -17428,7 +17461,7 @@ function assertionToNaturalLanguage(entry) {
17428
17461
  }
17429
17462
  }
17430
17463
  function assertionToNaturalLanguageList(entry) {
17431
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
17464
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
17432
17465
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
17433
17466
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
17434
17467
  }
@@ -24084,7 +24117,7 @@ function toCamelCaseDeep(obj) {
24084
24117
  }
24085
24118
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
24086
24119
  var CodeEvaluator = class {
24087
- kind = "code-judge";
24120
+ kind = "code-grader";
24088
24121
  command;
24089
24122
  cwd;
24090
24123
  agentTimeoutMs;
@@ -24103,7 +24136,7 @@ var CodeEvaluator = class {
24103
24136
  if (outputForPayload) {
24104
24137
  const serialized = JSON.stringify(outputForPayload);
24105
24138
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
24106
- const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-judge-"));
24139
+ const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
24107
24140
  outputPath = join(tmpDir, "output.json");
24108
24141
  await writeFile6(outputPath, serialized);
24109
24142
  outputForPayload = null;
@@ -24352,7 +24385,7 @@ var LlmGraderEvaluator = class {
24352
24385
  return this.evaluateWithDelegatedAgent(context2, graderProvider);
24353
24386
  }
24354
24387
  const config = context2.evaluator;
24355
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
24388
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
24356
24389
  return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
24357
24390
  }
24358
24391
  return this.evaluateFreeform(context2, graderProvider);
@@ -24537,7 +24570,7 @@ ${context2.fileChanges}`;
24537
24570
  const systemPrompt = this.buildAgentSystemPrompt(context2);
24538
24571
  const userPrompt = this.buildAgentUserPrompt(context2);
24539
24572
  const config = context2.evaluator;
24540
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24573
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24541
24574
  const fsTools = createFilesystemTools(workspacePath);
24542
24575
  const evaluatorRawRequest = {
24543
24576
  mode: "built-in",
@@ -24633,7 +24666,7 @@ ${context2.fileChanges}`;
24633
24666
  };
24634
24667
  }
24635
24668
  const config = context2.evaluator;
24636
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24669
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24637
24670
  const details = {
24638
24671
  mode: modeLabel,
24639
24672
  grader_target: provider.targetName
@@ -24673,7 +24706,7 @@ ${context2.fileChanges}`;
24673
24706
  */
24674
24707
  buildAgentSystemPrompt(context2) {
24675
24708
  const config = context2.evaluator;
24676
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24709
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24677
24710
  const parts = [
24678
24711
  "You are an expert evaluator with access to the workspace filesystem.",
24679
24712
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -24704,7 +24737,7 @@ ${context2.fileChanges}`;
24704
24737
  return substituteVariables(this.evaluatorTemplate, variables);
24705
24738
  }
24706
24739
  const config = context2.evaluator;
24707
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24740
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24708
24741
  const parts = [
24709
24742
  "Evaluate the candidate answer by investigating the workspace.",
24710
24743
  "",
@@ -24747,7 +24780,7 @@ ${context2.fileChanges}`;
24747
24780
  buildDelegatedPrompt(context2) {
24748
24781
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
24749
24782
  const config = context2.evaluator;
24750
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24783
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24751
24784
  if (this.evaluatorTemplate) {
24752
24785
  const variables = {
24753
24786
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
@@ -25242,10 +25275,8 @@ var CompositeEvaluator = class {
25242
25275
  const aggregator = this.config.aggregator;
25243
25276
  switch (aggregator.type) {
25244
25277
  case "code-grader":
25245
- case "code-judge":
25246
25278
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
25247
25279
  case "llm-grader":
25248
- case "llm-judge":
25249
25280
  return this.runLlmAggregator(results, context2, aggregator);
25250
25281
  case "threshold":
25251
25282
  return this.runThreshold(results, aggregator.threshold);
@@ -27630,7 +27661,7 @@ var endsWithFactory = (config) => {
27630
27661
  };
27631
27662
  function createBuiltinRegistry() {
27632
27663
  const registry = new EvaluatorRegistry();
27633
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
27664
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
27634
27665
  const fn = config[INLINE_ASSERT_FN];
27635
27666
  if (!fn) {
27636
27667
  throw new Error(
@@ -30306,7 +30337,7 @@ function filterEvalCases(evalCases, filter2) {
30306
30337
  return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
30307
30338
  }
30308
30339
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
30309
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
30340
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
30310
30341
  resolveGraderProvider: async (context2) => {
30311
30342
  if (context2.graderProvider) {
30312
30343
  return context2.graderProvider;
@@ -31127,10 +31158,10 @@ var OtelTraceExporter = class {
31127
31158
  }
31128
31159
  if (result.scores) {
31129
31160
  for (const score of result.scores) {
31130
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
31131
- "agentv.evaluator.score": score.score,
31132
- "agentv.evaluator.type": score.type,
31133
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
31161
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
31162
+ "agentv.grader.score": score.score,
31163
+ "agentv.grader.type": score.type,
31164
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
31134
31165
  });
31135
31166
  }
31136
31167
  }
@@ -31480,6 +31511,7 @@ export {
31480
31511
  extractTrialsConfig,
31481
31512
  extractCacheConfig,
31482
31513
  extractFailOnError,
31514
+ extractThreshold,
31483
31515
  detectFormat,
31484
31516
  buildPromptInputs,
31485
31517
  readTestSuiteMetadata,
@@ -31590,4 +31622,4 @@ export {
31590
31622
  OtelStreamingObserver,
31591
31623
  createAgentKernel
31592
31624
  };
31593
- //# sourceMappingURL=chunk-7OHZAFND.js.map
31625
+ //# sourceMappingURL=chunk-D3LNJUUB.js.map