@agentv/core 3.12.0 → 3.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -19,16 +19,13 @@ import {
19
19
  readTextFile,
20
20
  resolveFileReference,
21
21
  resolveTargetDefinition
22
- } from "./chunk-4XWPXNQM.js";
22
+ } from "./chunk-ZB3AUPES.js";
23
23
  import {
24
24
  AgentvProvider
25
25
  } from "./chunk-W5YDZWT4.js";
26
26
  import {
27
27
  OtlpJsonFileExporter
28
28
  } from "./chunk-HFSYZHGF.js";
29
- import {
30
- SimpleTraceFileExporter
31
- } from "./chunk-3G2KXH7N.js";
32
29
 
33
30
  // src/evaluation/trace.ts
34
31
  function computeTraceSummary(messages) {
@@ -615,12 +612,6 @@ function parseExecutionDefaults(raw, configPath) {
615
612
  } else if (obj.verbose !== void 0) {
616
613
  logWarning(`Invalid execution.verbose in ${configPath}, expected boolean`);
617
614
  }
618
- const traceFile = obj.trace_file;
619
- if (typeof traceFile === "string" && traceFile.trim().length > 0) {
620
- result.trace_file = traceFile.trim();
621
- } else if (traceFile !== void 0) {
622
- logWarning(`Invalid execution.trace_file in ${configPath}, expected non-empty string`);
623
- }
624
615
  if (typeof obj.keep_workspaces === "boolean") {
625
616
  result.keep_workspaces = obj.keep_workspaces;
626
617
  } else if (obj.keep_workspaces !== void 0) {
@@ -737,6 +728,9 @@ var ANSI_RESET4 = "\x1B[0m";
737
728
  function normalizeEvaluatorType(type) {
738
729
  return type.replace(/_/g, "-");
739
730
  }
731
+ function isDeprecatedJudgeType(type) {
732
+ return type === "code-judge" || type === "llm-judge";
733
+ }
740
734
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
741
735
  const execution = rawEvalCase.execution;
742
736
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -799,6 +793,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
799
793
  const rawName = asString(rawEvaluator.name);
800
794
  const rawType = rawEvaluator.type;
801
795
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
796
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
797
+ logWarning2(
798
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
799
+ );
800
+ continue;
801
+ }
802
802
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
803
803
  if (typeof typeValue !== "string") {
804
804
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -831,7 +831,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
831
831
  });
832
832
  continue;
833
833
  }
834
- if (typeValue === "code-grader" || typeValue === "code-judge") {
834
+ if (typeValue === "code-grader") {
835
835
  let command;
836
836
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
837
837
  console.warn(
@@ -941,7 +941,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
941
941
  continue;
942
942
  }
943
943
  const aggregatorType = asString(rawAggregator.type);
944
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
944
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
945
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
946
+ logWarning2(
947
+ `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
948
+ );
949
+ continue;
950
+ }
951
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
945
952
  logWarning2(
946
953
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
947
954
  );
@@ -976,7 +983,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
976
983
  continue;
977
984
  }
978
985
  let aggregator;
979
- if (aggregatorType === "weighted_average") {
986
+ if (normalizedAggregatorType === "weighted_average") {
980
987
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
981
988
  const parsedWeights = {};
982
989
  if (weights) {
@@ -990,7 +997,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
990
997
  type: "weighted_average",
991
998
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
992
999
  };
993
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
1000
+ } else if (normalizedAggregatorType === "code-grader") {
994
1001
  const aggregatorPath = asString(rawAggregator.path);
995
1002
  if (!aggregatorPath) {
996
1003
  logWarning2(
@@ -1003,7 +1010,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1003
1010
  path: aggregatorPath,
1004
1011
  cwd: searchRoots[0]
1005
1012
  };
1006
- } else if (aggregatorType === "threshold") {
1013
+ } else if (normalizedAggregatorType === "threshold") {
1007
1014
  const thresholdValue = rawAggregator.threshold;
1008
1015
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
1009
1016
  logWarning2(
@@ -1751,10 +1758,15 @@ function coerceEvaluator(candidate, contextId) {
1751
1758
  return void 0;
1752
1759
  }
1753
1760
  const normalized = normalizeEvaluatorType(candidate);
1761
+ if (isDeprecatedJudgeType(normalized)) {
1762
+ throw new Error(
1763
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
1764
+ );
1765
+ }
1754
1766
  if (isEvaluatorKind(normalized)) {
1755
1767
  return normalized;
1756
1768
  }
1757
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
1769
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
1758
1770
  return void 0;
1759
1771
  }
1760
1772
  function asString(value) {
@@ -3187,9 +3199,7 @@ function assertionToNaturalLanguage(entry) {
3187
3199
  case "ends_with":
3188
3200
  return `Output ends with '${entry.value}'`;
3189
3201
  case "llm-grader":
3190
- case "llm_grader":
3191
- case "llm-judge":
3192
- case "llm_judge": {
3202
+ case "llm_grader": {
3193
3203
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
3194
3204
  return null;
3195
3205
  }
@@ -3202,9 +3212,7 @@ function assertionToNaturalLanguage(entry) {
3202
3212
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
3203
3213
  }
3204
3214
  case "code-grader":
3205
- case "code_grader":
3206
- case "code-judge":
3207
- case "code_judge": {
3215
+ case "code_grader": {
3208
3216
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
3209
3217
  const desc = typeof entry.description === "string" ? entry.description : void 0;
3210
3218
  return codeGraderInstruction(graderName, desc);
@@ -3235,7 +3243,7 @@ function assertionToNaturalLanguage(entry) {
3235
3243
  }
3236
3244
  }
3237
3245
  function assertionToNaturalLanguageList(entry) {
3238
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
3246
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
3239
3247
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
3240
3248
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
3241
3249
  }
@@ -9601,10 +9609,26 @@ function extractJsonBlob(text) {
9601
9609
  const match = text.match(/\{[\s\S]*\}/);
9602
9610
  return match?.[0];
9603
9611
  }
9612
+ function repairSchemaNearBooleanFields(text) {
9613
+ return text.replace(
9614
+ /("passed"\s*:\s*)(?:"([^"]+)"|([A-Za-z_][A-Za-z0-9_-]*))/gi,
9615
+ (_match, prefix, quotedValue, bareValue) => {
9616
+ const value = (quotedValue ?? bareValue ?? "").trim().toLowerCase();
9617
+ if (value === "true") {
9618
+ return `${prefix}true`;
9619
+ }
9620
+ if (value === "false") {
9621
+ return `${prefix}false`;
9622
+ }
9623
+ return `${prefix}false`;
9624
+ }
9625
+ );
9626
+ }
9604
9627
  function parseJsonFromText(text) {
9605
9628
  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
9606
9629
  const blob = extractJsonBlob(cleaned) ?? cleaned;
9607
- return JSON.parse(blob);
9630
+ const repaired = repairSchemaNearBooleanFields(blob);
9631
+ return JSON.parse(repaired);
9608
9632
  }
9609
9633
  function isNonEmptyString(value) {
9610
9634
  return typeof value === "string" && value.trim().length > 0;
@@ -10076,7 +10100,7 @@ function toCamelCaseDeep(obj) {
10076
10100
  // src/evaluation/evaluators/code-evaluator.ts
10077
10101
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
10078
10102
  var CodeEvaluator = class {
10079
- kind = "code-judge";
10103
+ kind = "code-grader";
10080
10104
  command;
10081
10105
  cwd;
10082
10106
  agentTimeoutMs;
@@ -10095,7 +10119,7 @@ var CodeEvaluator = class {
10095
10119
  if (outputForPayload) {
10096
10120
  const serialized = JSON.stringify(outputForPayload);
10097
10121
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
10098
- const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-judge-"));
10122
+ const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
10099
10123
  outputPath = join(tmpDir, "output.json");
10100
10124
  await writeFile6(outputPath, serialized);
10101
10125
  outputForPayload = null;
@@ -10353,7 +10377,7 @@ var LlmGraderEvaluator = class {
10353
10377
  return this.evaluateWithDelegatedAgent(context, graderProvider);
10354
10378
  }
10355
10379
  const config = context.evaluator;
10356
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
10380
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
10357
10381
  return this.evaluateWithRubrics(context, graderProvider, config.rubrics);
10358
10382
  }
10359
10383
  return this.evaluateFreeform(context, graderProvider);
@@ -10538,7 +10562,7 @@ ${context.fileChanges}`;
10538
10562
  const systemPrompt = this.buildAgentSystemPrompt(context);
10539
10563
  const userPrompt = this.buildAgentUserPrompt(context);
10540
10564
  const config = context.evaluator;
10541
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10565
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10542
10566
  const fsTools = createFilesystemTools(workspacePath);
10543
10567
  const evaluatorRawRequest = {
10544
10568
  mode: "built-in",
@@ -10634,7 +10658,7 @@ ${context.fileChanges}`;
10634
10658
  };
10635
10659
  }
10636
10660
  const config = context.evaluator;
10637
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10661
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10638
10662
  const details = {
10639
10663
  mode: modeLabel,
10640
10664
  grader_target: provider.targetName
@@ -10674,7 +10698,7 @@ ${context.fileChanges}`;
10674
10698
  */
10675
10699
  buildAgentSystemPrompt(context) {
10676
10700
  const config = context.evaluator;
10677
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10701
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10678
10702
  const parts = [
10679
10703
  "You are an expert evaluator with access to the workspace filesystem.",
10680
10704
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -10705,7 +10729,7 @@ ${context.fileChanges}`;
10705
10729
  return substituteVariables(this.evaluatorTemplate, variables);
10706
10730
  }
10707
10731
  const config = context.evaluator;
10708
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10732
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10709
10733
  const parts = [
10710
10734
  "Evaluate the candidate answer by investigating the workspace.",
10711
10735
  "",
@@ -10748,7 +10772,7 @@ ${context.fileChanges}`;
10748
10772
  buildDelegatedPrompt(context) {
10749
10773
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10750
10774
  const config = context.evaluator;
10751
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10775
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10752
10776
  if (this.evaluatorTemplate) {
10753
10777
  const variables = {
10754
10778
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
@@ -11245,10 +11269,8 @@ var CompositeEvaluator = class {
11245
11269
  const aggregator = this.config.aggregator;
11246
11270
  switch (aggregator.type) {
11247
11271
  case "code-grader":
11248
- case "code-judge":
11249
11272
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
11250
11273
  case "llm-grader":
11251
- case "llm-judge":
11252
11274
  return this.runLlmAggregator(results, context, aggregator);
11253
11275
  case "threshold":
11254
11276
  return this.runThreshold(results, aggregator.threshold);
@@ -13670,7 +13692,7 @@ var endsWithFactory = (config) => {
13670
13692
  };
13671
13693
  function createBuiltinRegistry() {
13672
13694
  const registry = new EvaluatorRegistry();
13673
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
13695
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
13674
13696
  const fn = config[INLINE_ASSERT_FN];
13675
13697
  if (!fn) {
13676
13698
  throw new Error(
@@ -16388,7 +16410,7 @@ function filterEvalCases(evalCases, filter) {
16388
16410
  return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
16389
16411
  }
16390
16412
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
16391
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
16413
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
16392
16414
  resolveGraderProvider: async (context) => {
16393
16415
  if (context.graderProvider) {
16394
16416
  return context.graderProvider;
@@ -16820,8 +16842,6 @@ var AgentVConfigSchema = z4.object({
16820
16842
  agentTimeoutMs: z4.number().int().min(0).optional(),
16821
16843
  /** Enable verbose logging */
16822
16844
  verbose: z4.boolean().optional(),
16823
- /** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
16824
- traceFile: z4.string().optional(),
16825
16845
  /** Always keep temp workspaces after eval */
16826
16846
  keepWorkspaces: z4.boolean().optional(),
16827
16847
  /** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
@@ -17121,12 +17141,6 @@ var OtelTraceExporter = class {
17121
17141
  new SimpleSpanProcessor(new OtlpJsonFileExporter2(this.options.otlpFilePath))
17122
17142
  );
17123
17143
  }
17124
- if (this.options.traceFilePath) {
17125
- const { SimpleTraceFileExporter: SimpleTraceFileExporter2 } = await import("./simple-trace-file-exporter-CRIO5HDZ.js");
17126
- processors.push(
17127
- new SimpleSpanProcessor(new SimpleTraceFileExporter2(this.options.traceFilePath))
17128
- );
17129
- }
17130
17144
  if (processors.length === 0) {
17131
17145
  return false;
17132
17146
  }
@@ -17240,10 +17254,10 @@ var OtelTraceExporter = class {
17240
17254
  }
17241
17255
  if (result.scores) {
17242
17256
  for (const score of result.scores) {
17243
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
17244
- "agentv.evaluator.score": score.score,
17245
- "agentv.evaluator.type": score.type,
17246
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
17257
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
17258
+ "agentv.grader.score": score.score,
17259
+ "agentv.grader.type": score.type,
17260
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
17247
17261
  });
17248
17262
  }
17249
17263
  }
@@ -17577,7 +17591,6 @@ export {
17577
17591
  ProviderRegistry,
17578
17592
  RepoManager,
17579
17593
  ResponseCache,
17580
- SimpleTraceFileExporter,
17581
17594
  SkillTriggerEvaluator,
17582
17595
  TEST_MESSAGE_ROLES,
17583
17596
  TemplateNotDirectoryError,