@agentv/core 3.5.0 → 3.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -251,10 +251,8 @@ interface TokenUsage {
251
251
  interface TraceSummary {
252
252
  /** Total number of events in trace */
253
253
  readonly eventCount: number;
254
- /** Unique tool names, sorted alphabetically */
255
- readonly toolNames: readonly string[];
256
254
  /** Map of tool name to call count */
257
- readonly toolCallsByName: Readonly<Record<string, number>>;
255
+ readonly toolCalls: Readonly<Record<string, number>>;
258
256
  /** Number of error events */
259
257
  readonly errorCount: number;
260
258
  /** Per-tool duration arrays in milliseconds (optional) */
@@ -1134,7 +1132,7 @@ interface EvaluationResult {
1134
1132
  readonly conversationId?: string;
1135
1133
  readonly score: number;
1136
1134
  readonly assertions: readonly AssertionEntry[];
1137
- readonly answer: string;
1135
+ readonly outputText: string;
1138
1136
  readonly target: string;
1139
1137
  /** Token usage metrics from provider (optional) */
1140
1138
  readonly tokenUsage?: TokenUsage;
@@ -2412,6 +2410,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
2412
2410
  constructor(config: SkillTriggerEvaluatorConfig);
2413
2411
  private resolveMatcher;
2414
2412
  evaluate(context: EvaluationContext): EvaluationScore;
2413
+ private readPathFromInput;
2415
2414
  }
2416
2415
 
2417
2416
  interface LlmGraderPromptAssembly {
package/dist/index.d.ts CHANGED
@@ -251,10 +251,8 @@ interface TokenUsage {
251
251
  interface TraceSummary {
252
252
  /** Total number of events in trace */
253
253
  readonly eventCount: number;
254
- /** Unique tool names, sorted alphabetically */
255
- readonly toolNames: readonly string[];
256
254
  /** Map of tool name to call count */
257
- readonly toolCallsByName: Readonly<Record<string, number>>;
255
+ readonly toolCalls: Readonly<Record<string, number>>;
258
256
  /** Number of error events */
259
257
  readonly errorCount: number;
260
258
  /** Per-tool duration arrays in milliseconds (optional) */
@@ -1134,7 +1132,7 @@ interface EvaluationResult {
1134
1132
  readonly conversationId?: string;
1135
1133
  readonly score: number;
1136
1134
  readonly assertions: readonly AssertionEntry[];
1137
- readonly answer: string;
1135
+ readonly outputText: string;
1138
1136
  readonly target: string;
1139
1137
  /** Token usage metrics from provider (optional) */
1140
1138
  readonly tokenUsage?: TokenUsage;
@@ -2412,6 +2410,7 @@ declare class SkillTriggerEvaluator implements Evaluator {
2412
2410
  constructor(config: SkillTriggerEvaluatorConfig);
2413
2411
  private resolveMatcher;
2414
2412
  evaluate(context: EvaluationContext): EvaluationScore;
2413
+ private readPathFromInput;
2415
2414
  }
2416
2415
 
2417
2416
  interface LlmGraderPromptAssembly {
package/dist/index.js CHANGED
@@ -16,7 +16,7 @@ import {
16
16
  readTextFile,
17
17
  resolveFileReference,
18
18
  resolveTargetDefinition
19
- } from "./chunk-EFR4JHPL.js";
19
+ } from "./chunk-2IZOTQ25.js";
20
20
  import {
21
21
  AgentvProvider
22
22
  } from "./chunk-W5YDZWT4.js";
@@ -83,12 +83,10 @@ function computeTraceSummary(messages) {
83
83
  }
84
84
  }
85
85
  }
86
- const toolNames = Object.keys(toolCallCounts).sort();
87
86
  return {
88
87
  trace: {
89
88
  eventCount: totalToolCalls,
90
- toolNames,
91
- toolCallsByName: toolCallCounts,
89
+ toolCalls: toolCallCounts,
92
90
  errorCount: 0,
93
91
  llmCallCount,
94
92
  ...hasAnyDuration ? { toolDurations } : {}
@@ -112,7 +110,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
112
110
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
113
111
  if (summary.eventCount === 0) return void 0;
114
112
  const explorationCalls = explorationTools.reduce(
115
- (sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
113
+ (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
116
114
  0
117
115
  );
118
116
  return explorationCalls / summary.eventCount;
@@ -742,14 +740,8 @@ import { readFile as readFile4 } from "node:fs/promises";
742
740
 
743
741
  // src/evaluation/template-variables.ts
744
742
  var TEMPLATE_VARIABLES = {
745
- /** @deprecated Use OUTPUT_TEXT instead */
746
- ANSWER: "answer",
747
743
  EXPECTED_OUTPUT: "expected_output",
748
- /** @deprecated Use INPUT_TEXT instead */
749
- QUESTION: "question",
750
744
  CRITERIA: "criteria",
751
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
752
- REFERENCE_ANSWER: "reference_answer",
753
745
  INPUT: "input",
754
746
  OUTPUT: "output",
755
747
  FILE_CHANGES: "file_changes",
@@ -759,9 +751,8 @@ var TEMPLATE_VARIABLES = {
759
751
  };
760
752
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
761
753
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
762
- TEMPLATE_VARIABLES.ANSWER,
763
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
764
- TEMPLATE_VARIABLES.OUTPUT_TEXT
754
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
755
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
765
756
  ]);
766
757
 
767
758
  // src/evaluation/validation/prompt-validator.ts
@@ -784,13 +775,13 @@ function validateTemplateVariables(content, source) {
784
775
  }
785
776
  match = variablePattern.exec(content);
786
777
  }
787
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
778
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
788
779
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
789
780
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
790
781
  if (!hasRequiredFields) {
791
782
  throw new Error(
792
783
  `Missing required fields. Must include at least one of:
793
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
784
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
794
785
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
795
786
  );
796
787
  }
@@ -3974,6 +3965,8 @@ async function invokeModel(options) {
3974
3965
  const { model, request, defaults, retryConfig, providerOptions } = options;
3975
3966
  const chatPrompt = buildChatPrompt(request);
3976
3967
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
3968
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
3969
+ const startMs = Date.now();
3977
3970
  const result = await withRetry(
3978
3971
  () => generateText({
3979
3972
  model,
@@ -3987,9 +3980,11 @@ async function invokeModel(options) {
3987
3980
  retryConfig,
3988
3981
  request.signal
3989
3982
  );
3990
- return mapResponse(result);
3983
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
3984
+ const durationMs = Date.now() - startMs;
3985
+ return mapResponse(result, { durationMs, startTime, endTime });
3991
3986
  }
3992
- function mapResponse(result) {
3987
+ function mapResponse(result, timing) {
3993
3988
  const content = result.text ?? "";
3994
3989
  const rawUsage = result.totalUsage ?? result.usage;
3995
3990
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -4004,7 +3999,10 @@ function mapResponse(result) {
4004
3999
  raw: result,
4005
4000
  usage: toJsonObject(rawUsage),
4006
4001
  output: [{ role: "assistant", content }],
4007
- tokenUsage
4002
+ tokenUsage,
4003
+ durationMs: timing?.durationMs,
4004
+ startTime: timing?.startTime,
4005
+ endTime: timing?.endTime
4008
4006
  };
4009
4007
  }
4010
4008
  function toJsonObject(value) {
@@ -4882,10 +4880,12 @@ var ClaudeSdkProvider = class {
4882
4880
  if (usage) {
4883
4881
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
4884
4882
  const outputTokens = usage.output_tokens ?? 0;
4883
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
4885
4884
  tokenUsage = {
4886
4885
  input: inputTokens,
4887
4886
  output: outputTokens,
4888
- cached: usage.cache_read_input_tokens ?? void 0
4887
+ cached: usage.cache_read_input_tokens ?? void 0,
4888
+ reasoning: reasoningTokens
4889
4889
  };
4890
4890
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
4891
4891
  }
@@ -5899,7 +5899,8 @@ ${basePrompt}` : basePrompt;
5899
5899
  onUsage({
5900
5900
  input: usage.input_tokens ?? 0,
5901
5901
  output: usage.output_tokens ?? 0,
5902
- cached: usage.cached_input_tokens ?? void 0
5902
+ cached: usage.cached_input_tokens ?? void 0,
5903
+ reasoning: usage.reasoning_tokens ?? void 0
5903
5904
  });
5904
5905
  }
5905
5906
  }
@@ -7913,10 +7914,12 @@ function extractTokenUsage(events) {
7913
7914
  output: output ?? 0
7914
7915
  };
7915
7916
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
7916
- if (cached !== void 0) {
7917
- return { ...result, cached };
7918
- }
7919
- return result;
7917
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
7918
+ return {
7919
+ ...result,
7920
+ ...cached !== void 0 ? { cached } : {},
7921
+ ...reasoning !== void 0 ? { reasoning } : {}
7922
+ };
7920
7923
  }
7921
7924
  }
7922
7925
  const messages = record.messages;
@@ -10245,11 +10248,9 @@ var CodeEvaluator = class {
10245
10248
  }
10246
10249
  }
10247
10250
  const payload = {
10248
- question: context.evalCase.question,
10249
10251
  criteria: context.evalCase.criteria,
10250
10252
  expectedOutput: context.evalCase.expected_output,
10251
- referenceAnswer: context.evalCase.reference_answer,
10252
- answer: context.candidate,
10253
+ outputText: context.candidate,
10253
10254
  output: outputForPayload,
10254
10255
  outputPath,
10255
10256
  guidelineFiles: context.evalCase.guideline_paths,
@@ -10266,9 +10267,7 @@ var CodeEvaluator = class {
10266
10267
  fileChanges: context.fileChanges ?? null,
10267
10268
  workspacePath: context.workspacePath ?? null,
10268
10269
  config: this.config ?? null,
10269
- // Text convenience accessors (new names, always strings)
10270
10270
  inputText: context.evalCase.question,
10271
- outputText: context.candidate,
10272
10271
  expectedOutputText: context.evalCase.reference_answer ?? ""
10273
10272
  };
10274
10273
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -10436,13 +10435,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
10436
10435
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
10437
10436
 
10438
10437
  [[ ## question ## ]]
10439
- {{${TEMPLATE_VARIABLES.QUESTION}}}
10438
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
10440
10439
 
10441
10440
  [[ ## reference_answer ## ]]
10442
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
10441
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
10443
10442
 
10444
10443
  [[ ## answer ## ]]
10445
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
10444
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
10446
10445
  var freeformEvaluationSchema = z3.object({
10447
10446
  score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
10448
10447
  assertions: z3.array(
@@ -10520,12 +10519,8 @@ var LlmGraderEvaluator = class {
10520
10519
  2
10521
10520
  ),
10522
10521
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
10523
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10524
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10525
10522
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10526
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10527
10523
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
10528
- // Text convenience accessors (new names, always strings)
10529
10524
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10530
10525
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10531
10526
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
@@ -10830,10 +10825,10 @@ ${context.fileChanges}`;
10830
10825
  buildAgentUserPrompt(context) {
10831
10826
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10832
10827
  const variables = {
10833
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10834
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10835
10828
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10836
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10829
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10830
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10831
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
10837
10832
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
10838
10833
  };
10839
10834
  if (this.evaluatorTemplate) {
@@ -10886,10 +10881,10 @@ ${context.fileChanges}`;
10886
10881
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10887
10882
  if (this.evaluatorTemplate) {
10888
10883
  const variables = {
10889
- [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
10890
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
10891
10884
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10892
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
10885
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10886
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10887
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
10893
10888
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
10894
10889
  };
10895
10890
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -12317,7 +12312,10 @@ var COPILOT_MATCHER = {
12317
12312
  skillTools: ["Skill", "skill"],
12318
12313
  skillInputField: "skill",
12319
12314
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
12320
- readInputField: "file_path"
12315
+ readInputField: "file_path",
12316
+ skillToolPrefixes: ["Using skill: "],
12317
+ readToolPrefixes: ["Viewing "],
12318
+ readInputFields: ["file_path", "path"]
12321
12319
  };
12322
12320
  var PROVIDER_TOOL_SEMANTICS = {
12323
12321
  claude: CLAUDE_MATCHER,
@@ -12359,12 +12357,22 @@ var SkillTriggerEvaluator = class {
12359
12357
  triggered = true;
12360
12358
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
12361
12359
  }
12360
+ } else if (matcher.skillToolPrefixes?.some(
12361
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
12362
+ )) {
12363
+ triggered = true;
12364
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
12362
12365
  } else if (matcher.readTools.includes(firstTool.tool)) {
12363
- const filePath = String(input[matcher.readInputField] ?? "");
12366
+ const filePath = this.readPathFromInput(input, matcher);
12364
12367
  if (filePath.includes(skillName)) {
12365
12368
  triggered = true;
12366
12369
  evidence = `Read tool loaded skill file: ${filePath}`;
12367
12370
  }
12371
+ } else if (matcher.readToolPrefixes?.some(
12372
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
12373
+ )) {
12374
+ triggered = true;
12375
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
12368
12376
  }
12369
12377
  }
12370
12378
  const pass = triggered === shouldTrigger;
@@ -12393,6 +12401,16 @@ var SkillTriggerEvaluator = class {
12393
12401
  expectedAspectCount: 1
12394
12402
  };
12395
12403
  }
12404
+ readPathFromInput(input, matcher) {
12405
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
12406
+ for (const field of fields) {
12407
+ const value = input[field];
12408
+ if (value !== void 0 && value !== null) {
12409
+ return String(value);
12410
+ }
12411
+ }
12412
+ return "";
12413
+ }
12396
12414
  };
12397
12415
 
12398
12416
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -12427,12 +12445,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
12427
12445
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
12428
12446
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
12429
12447
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
12430
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
12431
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
12432
12448
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
12433
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
12434
12449
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
12435
- // Text convenience accessors (new names, always strings)
12436
12450
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
12437
12451
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
12438
12452
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -12764,11 +12778,9 @@ var ToolTrajectoryEvaluator = class {
12764
12778
  for (const call of toolCalls) {
12765
12779
  toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
12766
12780
  }
12767
- const toolNames = Object.keys(toolCallsByName).sort();
12768
12781
  return {
12769
12782
  eventCount: toolCalls.length,
12770
- toolNames,
12771
- toolCallsByName,
12783
+ toolCalls: toolCallsByName,
12772
12784
  errorCount: 0
12773
12785
  };
12774
12786
  }
@@ -12786,7 +12798,7 @@ var ToolTrajectoryEvaluator = class {
12786
12798
  const assertions = [];
12787
12799
  for (const toolName of toolNames) {
12788
12800
  const required = minimums[toolName];
12789
- const actual = summary.toolCallsByName[toolName] ?? 0;
12801
+ const actual = summary.toolCalls[toolName] ?? 0;
12790
12802
  if (actual >= required) {
12791
12803
  assertions.push({
12792
12804
  text: `${toolName}: called ${actual} times (required >=${required})`,
@@ -13489,11 +13501,9 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
13489
13501
  }
13490
13502
  async function executePromptTemplate(script, context, config, timeoutMs) {
13491
13503
  const payload = {
13492
- question: context.evalCase.question,
13493
13504
  criteria: context.evalCase.criteria,
13494
13505
  expectedOutput: context.evalCase.expected_output,
13495
- referenceAnswer: context.evalCase.reference_answer,
13496
- answer: context.candidate,
13506
+ outputText: context.candidate,
13497
13507
  output: context.output ?? null,
13498
13508
  guidelineFiles: context.evalCase.guideline_paths,
13499
13509
  inputFiles: context.evalCase.file_paths.filter(
@@ -13504,9 +13514,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
13504
13514
  fileChanges: context.fileChanges ?? null,
13505
13515
  workspacePath: context.workspacePath ?? null,
13506
13516
  config: config ?? context.config ?? null,
13507
- // Text convenience accessors (new names, always strings)
13508
13517
  inputText: context.evalCase.question,
13509
- outputText: context.candidate,
13510
13518
  expectedOutputText: context.evalCase.reference_answer ?? ""
13511
13519
  };
13512
13520
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -15170,7 +15178,7 @@ async function runEvaluation(options) {
15170
15178
  dataset: evalCase.dataset,
15171
15179
  score: 0,
15172
15180
  assertions: [],
15173
- answer: "",
15181
+ outputText: "",
15174
15182
  target: target.name,
15175
15183
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15176
15184
  budgetExceeded: true,
@@ -15206,7 +15214,7 @@ async function runEvaluation(options) {
15206
15214
  dataset: evalCase.dataset,
15207
15215
  score: 0,
15208
15216
  assertions: [],
15209
- answer: "",
15217
+ outputText: "",
15210
15218
  target: target.name,
15211
15219
  error: errorMsg,
15212
15220
  executionStatus: "execution_error",
@@ -15471,7 +15479,7 @@ async function runBatchEvaluation(options) {
15471
15479
  const providerResponse = batchResponse[i];
15472
15480
  const output = providerResponse.output;
15473
15481
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
15474
- const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
15482
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
15475
15483
  const merged = computed ? mergeExecutionMetrics(computed, {
15476
15484
  tokenUsage: providerResponse.tokenUsage,
15477
15485
  costUsd: providerResponse.costUsd,
@@ -15868,7 +15876,7 @@ async function runEvalCase(options) {
15868
15876
  }
15869
15877
  const output = providerResponse.output;
15870
15878
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
15871
- const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
15879
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
15872
15880
  const merged = computed ? mergeExecutionMetrics(computed, {
15873
15881
  tokenUsage: providerResponse.tokenUsage,
15874
15882
  costUsd: providerResponse.costUsd,
@@ -16173,7 +16181,7 @@ async function evaluateCandidate(options) {
16173
16181
  conversationId: evalCase.conversation_id,
16174
16182
  score: score.score,
16175
16183
  assertions: score.assertions,
16176
- answer: candidate,
16184
+ outputText: candidate,
16177
16185
  target: target.name,
16178
16186
  tokenUsage,
16179
16187
  costUsd,
@@ -16529,7 +16537,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16529
16537
  conversationId: evalCase.conversation_id,
16530
16538
  score: 0,
16531
16539
  assertions: [{ text: `Error: ${message}`, passed: false }],
16532
- answer: `Error occurred: ${message}`,
16540
+ outputText: `Error occurred: ${message}`,
16533
16541
  target: targetName,
16534
16542
  requests,
16535
16543
  input,
@@ -17067,7 +17075,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17067
17075
 
17068
17076
  // src/evaluation/baseline.ts
17069
17077
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
17070
- "answer",
17078
+ "outputText",
17071
17079
  "requests",
17072
17080
  "trace",
17073
17081
  "workspacePath",
@@ -17241,14 +17249,17 @@ var OtelTraceExporter = class {
17241
17249
  rootSpan.setAttribute("agentv.target", result.target);
17242
17250
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
17243
17251
  rootSpan.setAttribute("agentv.score", result.score);
17244
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
17252
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
17245
17253
  if (result.durationMs != null)
17246
17254
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
17247
17255
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
17248
17256
  if (result.trace) {
17249
17257
  const t = result.trace;
17250
17258
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
17251
- rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
17259
+ rootSpan.setAttribute(
17260
+ "agentv.trace.tool_names",
17261
+ Object.keys(t.toolCalls).sort().join(",")
17262
+ );
17252
17263
  if (t.llmCallCount != null)
17253
17264
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
17254
17265
  }