agentv 3.13.0 → 3.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -221,13 +221,13 @@ agentv eval evals/my-eval.yaml -o results.xml
221
221
 
222
222
  The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes.
223
223
 
224
- By default, `agentv eval` creates a run workspace under `.agentv/results/raw/<run>/`
224
+ By default, `agentv eval` creates a run workspace under `.agentv/results/runs/<run>/`
225
225
  with `index.jsonl` as the machine-facing manifest.
226
226
 
227
227
  You can also convert an existing manifest to HTML after the fact:
228
228
 
229
229
  ```bash
230
- agentv convert .agentv/results/raw/eval_<timestamp>/index.jsonl -o report.html
230
+ agentv convert .agentv/results/runs/eval_<timestamp>/index.jsonl -o report.html
231
231
  ```
232
232
 
233
233
  #### Timeouts
@@ -358,7 +358,7 @@ agentv create eval my-eval # → evals/my-eval.eval.yaml + .cases.jsonl
358
358
  Compare a combined results file across all targets (N-way matrix):
359
359
 
360
360
  ```bash
361
- agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl
361
+ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
362
362
  ```
363
363
 
364
364
  ```
@@ -379,8 +379,8 @@ Pairwise Summary:
379
379
  Designate a baseline for CI regression gating, or compare two specific targets:
380
380
 
381
381
  ```bash
382
- agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl --baseline gpt-4.1
383
- agentv compare .agentv/results/raw/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
382
+ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
383
+ agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
384
384
  agentv compare before.jsonl after.jsonl # two-file pairwise
385
385
  ```
386
386
 
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-4XWPXNQM.js
304
+ // ../../packages/core/dist/chunk-ZB3AUPES.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-4XWPXNQM.js
422
+ // ../../packages/core/dist/chunk-ZB3AUPES.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
@@ -473,8 +473,6 @@ function isTestMessage(value) {
473
473
  var EVALUATOR_KIND_VALUES = [
474
474
  "code-grader",
475
475
  "llm-grader",
476
- "code-judge",
477
- "llm-judge",
478
476
  "rubric",
479
477
  "composite",
480
478
  "tool-trajectory",
@@ -14960,6 +14958,9 @@ var ANSI_RESET4 = "\x1B[0m";
14960
14958
  function normalizeEvaluatorType(type) {
14961
14959
  return type.replace(/_/g, "-");
14962
14960
  }
14961
+ function isDeprecatedJudgeType(type) {
14962
+ return type === "code-judge" || type === "llm-judge";
14963
+ }
14963
14964
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
14964
14965
  const execution = rawEvalCase.execution;
14965
14966
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -15022,6 +15023,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15022
15023
  const rawName = asString(rawEvaluator.name);
15023
15024
  const rawType = rawEvaluator.type;
15024
15025
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
15026
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
15027
+ logWarning2(
15028
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
15029
+ );
15030
+ continue;
15031
+ }
15025
15032
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
15026
15033
  if (typeof typeValue !== "string") {
15027
15034
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -15054,7 +15061,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15054
15061
  });
15055
15062
  continue;
15056
15063
  }
15057
- if (typeValue === "code-grader" || typeValue === "code-judge") {
15064
+ if (typeValue === "code-grader") {
15058
15065
  let command;
15059
15066
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
15060
15067
  console.warn(
@@ -15164,7 +15171,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15164
15171
  continue;
15165
15172
  }
15166
15173
  const aggregatorType = asString(rawAggregator.type);
15167
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
15174
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
15175
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
15176
+ logWarning2(
15177
+ `Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
15178
+ );
15179
+ continue;
15180
+ }
15181
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
15168
15182
  logWarning2(
15169
15183
  `Skipping composite evaluator '${name21}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
15170
15184
  );
@@ -15199,7 +15213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15199
15213
  continue;
15200
15214
  }
15201
15215
  let aggregator;
15202
- if (aggregatorType === "weighted_average") {
15216
+ if (normalizedAggregatorType === "weighted_average") {
15203
15217
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
15204
15218
  const parsedWeights = {};
15205
15219
  if (weights) {
@@ -15213,7 +15227,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15213
15227
  type: "weighted_average",
15214
15228
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
15215
15229
  };
15216
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
15230
+ } else if (normalizedAggregatorType === "code-grader") {
15217
15231
  const aggregatorPath = asString(rawAggregator.path);
15218
15232
  if (!aggregatorPath) {
15219
15233
  logWarning2(
@@ -15226,7 +15240,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15226
15240
  path: aggregatorPath,
15227
15241
  cwd: searchRoots[0]
15228
15242
  };
15229
- } else if (aggregatorType === "threshold") {
15243
+ } else if (normalizedAggregatorType === "threshold") {
15230
15244
  const thresholdValue = rawAggregator.threshold;
15231
15245
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
15232
15246
  logWarning2(
@@ -15974,10 +15988,15 @@ function coerceEvaluator(candidate, contextId) {
15974
15988
  return void 0;
15975
15989
  }
15976
15990
  const normalized = normalizeEvaluatorType(candidate);
15991
+ if (isDeprecatedJudgeType(normalized)) {
15992
+ throw new Error(
15993
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
15994
+ );
15995
+ }
15977
15996
  if (isEvaluatorKind(normalized)) {
15978
15997
  return normalized;
15979
15998
  }
15980
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
15999
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
15981
16000
  return void 0;
15982
16001
  }
15983
16002
  function asString(value) {
@@ -17380,9 +17399,7 @@ function assertionToNaturalLanguage(entry) {
17380
17399
  case "ends_with":
17381
17400
  return `Output ends with '${entry.value}'`;
17382
17401
  case "llm-grader":
17383
- case "llm_grader":
17384
- case "llm-judge":
17385
- case "llm_judge": {
17402
+ case "llm_grader": {
17386
17403
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
17387
17404
  return null;
17388
17405
  }
@@ -17395,9 +17412,7 @@ function assertionToNaturalLanguage(entry) {
17395
17412
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
17396
17413
  }
17397
17414
  case "code-grader":
17398
- case "code_grader":
17399
- case "code-judge":
17400
- case "code_judge": {
17415
+ case "code_grader": {
17401
17416
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
17402
17417
  const desc = typeof entry.description === "string" ? entry.description : void 0;
17403
17418
  return codeGraderInstruction(graderName, desc);
@@ -17428,7 +17443,7 @@ function assertionToNaturalLanguage(entry) {
17428
17443
  }
17429
17444
  }
17430
17445
  function assertionToNaturalLanguageList(entry) {
17431
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
17446
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
17432
17447
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
17433
17448
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
17434
17449
  }
@@ -24084,7 +24099,7 @@ function toCamelCaseDeep(obj) {
24084
24099
  }
24085
24100
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
24086
24101
  var CodeEvaluator = class {
24087
- kind = "code-judge";
24102
+ kind = "code-grader";
24088
24103
  command;
24089
24104
  cwd;
24090
24105
  agentTimeoutMs;
@@ -24103,7 +24118,7 @@ var CodeEvaluator = class {
24103
24118
  if (outputForPayload) {
24104
24119
  const serialized = JSON.stringify(outputForPayload);
24105
24120
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
24106
- const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-judge-"));
24121
+ const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
24107
24122
  outputPath = join(tmpDir, "output.json");
24108
24123
  await writeFile6(outputPath, serialized);
24109
24124
  outputForPayload = null;
@@ -24352,7 +24367,7 @@ var LlmGraderEvaluator = class {
24352
24367
  return this.evaluateWithDelegatedAgent(context2, graderProvider);
24353
24368
  }
24354
24369
  const config = context2.evaluator;
24355
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
24370
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
24356
24371
  return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
24357
24372
  }
24358
24373
  return this.evaluateFreeform(context2, graderProvider);
@@ -24537,7 +24552,7 @@ ${context2.fileChanges}`;
24537
24552
  const systemPrompt = this.buildAgentSystemPrompt(context2);
24538
24553
  const userPrompt = this.buildAgentUserPrompt(context2);
24539
24554
  const config = context2.evaluator;
24540
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24555
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24541
24556
  const fsTools = createFilesystemTools(workspacePath);
24542
24557
  const evaluatorRawRequest = {
24543
24558
  mode: "built-in",
@@ -24633,7 +24648,7 @@ ${context2.fileChanges}`;
24633
24648
  };
24634
24649
  }
24635
24650
  const config = context2.evaluator;
24636
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24651
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24637
24652
  const details = {
24638
24653
  mode: modeLabel,
24639
24654
  grader_target: provider.targetName
@@ -24673,7 +24688,7 @@ ${context2.fileChanges}`;
24673
24688
  */
24674
24689
  buildAgentSystemPrompt(context2) {
24675
24690
  const config = context2.evaluator;
24676
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24691
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24677
24692
  const parts = [
24678
24693
  "You are an expert evaluator with access to the workspace filesystem.",
24679
24694
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -24704,7 +24719,7 @@ ${context2.fileChanges}`;
24704
24719
  return substituteVariables(this.evaluatorTemplate, variables);
24705
24720
  }
24706
24721
  const config = context2.evaluator;
24707
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24722
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24708
24723
  const parts = [
24709
24724
  "Evaluate the candidate answer by investigating the workspace.",
24710
24725
  "",
@@ -24747,7 +24762,7 @@ ${context2.fileChanges}`;
24747
24762
  buildDelegatedPrompt(context2) {
24748
24763
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
24749
24764
  const config = context2.evaluator;
24750
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
24765
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
24751
24766
  if (this.evaluatorTemplate) {
24752
24767
  const variables = {
24753
24768
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
@@ -25242,10 +25257,8 @@ var CompositeEvaluator = class {
25242
25257
  const aggregator = this.config.aggregator;
25243
25258
  switch (aggregator.type) {
25244
25259
  case "code-grader":
25245
- case "code-judge":
25246
25260
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
25247
25261
  case "llm-grader":
25248
- case "llm-judge":
25249
25262
  return this.runLlmAggregator(results, context2, aggregator);
25250
25263
  case "threshold":
25251
25264
  return this.runThreshold(results, aggregator.threshold);
@@ -27630,7 +27643,7 @@ var endsWithFactory = (config) => {
27630
27643
  };
27631
27644
  function createBuiltinRegistry() {
27632
27645
  const registry = new EvaluatorRegistry();
27633
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
27646
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
27634
27647
  const fn = config[INLINE_ASSERT_FN];
27635
27648
  if (!fn) {
27636
27649
  throw new Error(
@@ -30306,7 +30319,7 @@ function filterEvalCases(evalCases, filter2) {
30306
30319
  return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
30307
30320
  }
30308
30321
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
30309
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
30322
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
30310
30323
  resolveGraderProvider: async (context2) => {
30311
30324
  if (context2.graderProvider) {
30312
30325
  return context2.graderProvider;
@@ -31127,10 +31140,10 @@ var OtelTraceExporter = class {
31127
31140
  }
31128
31141
  if (result.scores) {
31129
31142
  for (const score of result.scores) {
31130
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
31131
- "agentv.evaluator.score": score.score,
31132
- "agentv.evaluator.type": score.type,
31133
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
31143
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
31144
+ "agentv.grader.score": score.score,
31145
+ "agentv.grader.type": score.type,
31146
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
31134
31147
  });
31135
31148
  }
31136
31149
  }
@@ -31590,4 +31603,4 @@ export {
31590
31603
  OtelStreamingObserver,
31591
31604
  createAgentKernel
31592
31605
  };
31593
- //# sourceMappingURL=chunk-7OHZAFND.js.map
31606
+ //# sourceMappingURL=chunk-K747KGDP.js.map