@agentv/core 3.5.0 → 3.7.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  isEvaluatorKind,
8
8
  resolveFileReference
9
- } from "../../chunk-EFR4JHPL.js";
9
+ } from "../../chunk-2IZOTQ25.js";
10
10
 
11
11
  // src/evaluation/validation/file-type.ts
12
12
  import { readFile } from "node:fs/promises";
package/dist/index.cjs CHANGED
@@ -1786,12 +1786,10 @@ function computeTraceSummary(messages) {
1786
1786
  }
1787
1787
  }
1788
1788
  }
1789
- const toolNames = Object.keys(toolCallCounts).sort();
1790
1789
  return {
1791
1790
  trace: {
1792
1791
  eventCount: totalToolCalls,
1793
- toolNames,
1794
- toolCallsByName: toolCallCounts,
1792
+ toolCalls: toolCallCounts,
1795
1793
  errorCount: 0,
1796
1794
  llmCallCount,
1797
1795
  ...hasAnyDuration ? { toolDurations } : {}
@@ -1815,7 +1813,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
1815
1813
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
1816
1814
  if (summary.eventCount === 0) return void 0;
1817
1815
  const explorationCalls = explorationTools.reduce(
1818
- (sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
1816
+ (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
1819
1817
  0
1820
1818
  );
1821
1819
  return explorationCalls / summary.eventCount;
@@ -2445,14 +2443,8 @@ var import_promises5 = require("fs/promises");
2445
2443
 
2446
2444
  // src/evaluation/template-variables.ts
2447
2445
  var TEMPLATE_VARIABLES = {
2448
- /** @deprecated Use OUTPUT_TEXT instead */
2449
- ANSWER: "answer",
2450
2446
  EXPECTED_OUTPUT: "expected_output",
2451
- /** @deprecated Use INPUT_TEXT instead */
2452
- QUESTION: "question",
2453
2447
  CRITERIA: "criteria",
2454
- /** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
2455
- REFERENCE_ANSWER: "reference_answer",
2456
2448
  INPUT: "input",
2457
2449
  OUTPUT: "output",
2458
2450
  FILE_CHANGES: "file_changes",
@@ -2462,9 +2454,8 @@ var TEMPLATE_VARIABLES = {
2462
2454
  };
2463
2455
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
2464
2456
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
2465
- TEMPLATE_VARIABLES.ANSWER,
2466
- TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
2467
- TEMPLATE_VARIABLES.OUTPUT_TEXT
2457
+ TEMPLATE_VARIABLES.OUTPUT_TEXT,
2458
+ TEMPLATE_VARIABLES.EXPECTED_OUTPUT
2468
2459
  ]);
2469
2460
 
2470
2461
  // src/evaluation/validation/prompt-validator.ts
@@ -2487,13 +2478,13 @@ function validateTemplateVariables(content, source) {
2487
2478
  }
2488
2479
  match = variablePattern.exec(content);
2489
2480
  }
2490
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.ANSWER) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2481
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2491
2482
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
2492
2483
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
2493
2484
  if (!hasRequiredFields) {
2494
2485
  throw new Error(
2495
2486
  `Missing required fields. Must include at least one of:
2496
- - {{ ${TEMPLATE_VARIABLES.ANSWER} }} or {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2487
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2497
2488
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
2498
2489
  );
2499
2490
  }
@@ -5795,6 +5786,8 @@ async function invokeModel(options) {
5795
5786
  const { model, request, defaults, retryConfig, providerOptions } = options;
5796
5787
  const chatPrompt = buildChatPrompt(request);
5797
5788
  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
5789
+ const startTime = (/* @__PURE__ */ new Date()).toISOString();
5790
+ const startMs = Date.now();
5798
5791
  const result = await withRetry(
5799
5792
  () => (0, import_ai.generateText)({
5800
5793
  model,
@@ -5808,9 +5801,11 @@ async function invokeModel(options) {
5808
5801
  retryConfig,
5809
5802
  request.signal
5810
5803
  );
5811
- return mapResponse(result);
5804
+ const endTime = (/* @__PURE__ */ new Date()).toISOString();
5805
+ const durationMs = Date.now() - startMs;
5806
+ return mapResponse(result, { durationMs, startTime, endTime });
5812
5807
  }
5813
- function mapResponse(result) {
5808
+ function mapResponse(result, timing) {
5814
5809
  const content = result.text ?? "";
5815
5810
  const rawUsage = result.totalUsage ?? result.usage;
5816
5811
  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
@@ -5825,7 +5820,10 @@ function mapResponse(result) {
5825
5820
  raw: result,
5826
5821
  usage: toJsonObject(rawUsage),
5827
5822
  output: [{ role: "assistant", content }],
5828
- tokenUsage
5823
+ tokenUsage,
5824
+ durationMs: timing?.durationMs,
5825
+ startTime: timing?.startTime,
5826
+ endTime: timing?.endTime
5829
5827
  };
5830
5828
  }
5831
5829
  function toJsonObject(value) {
@@ -6703,10 +6701,12 @@ var ClaudeSdkProvider = class {
6703
6701
  if (usage) {
6704
6702
  const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
6705
6703
  const outputTokens = usage.output_tokens ?? 0;
6704
+ const reasoningTokens = usage.reasoning_tokens ?? void 0;
6706
6705
  tokenUsage = {
6707
6706
  input: inputTokens,
6708
6707
  output: outputTokens,
6709
- cached: usage.cache_read_input_tokens ?? void 0
6708
+ cached: usage.cache_read_input_tokens ?? void 0,
6709
+ reasoning: reasoningTokens
6710
6710
  };
6711
6711
  request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
6712
6712
  }
@@ -7720,7 +7720,8 @@ ${basePrompt}` : basePrompt;
7720
7720
  onUsage({
7721
7721
  input: usage.input_tokens ?? 0,
7722
7722
  output: usage.output_tokens ?? 0,
7723
- cached: usage.cached_input_tokens ?? void 0
7723
+ cached: usage.cached_input_tokens ?? void 0,
7724
+ reasoning: usage.reasoning_tokens ?? void 0
7724
7725
  });
7725
7726
  }
7726
7727
  }
@@ -9735,10 +9736,12 @@ function extractTokenUsage(events) {
9735
9736
  output: output ?? 0
9736
9737
  };
9737
9738
  const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
9738
- if (cached !== void 0) {
9739
- return { ...result, cached };
9740
- }
9741
- return result;
9739
+ const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
9740
+ return {
9741
+ ...result,
9742
+ ...cached !== void 0 ? { cached } : {},
9743
+ ...reasoning !== void 0 ? { reasoning } : {}
9744
+ };
9742
9745
  }
9743
9746
  }
9744
9747
  const messages = record.messages;
@@ -13265,11 +13268,9 @@ var CodeEvaluator = class {
13265
13268
  }
13266
13269
  }
13267
13270
  const payload = {
13268
- question: context2.evalCase.question,
13269
13271
  criteria: context2.evalCase.criteria,
13270
13272
  expectedOutput: context2.evalCase.expected_output,
13271
- referenceAnswer: context2.evalCase.reference_answer,
13272
- answer: context2.candidate,
13273
+ outputText: context2.candidate,
13273
13274
  output: outputForPayload,
13274
13275
  outputPath,
13275
13276
  guidelineFiles: context2.evalCase.guideline_paths,
@@ -13286,9 +13287,7 @@ var CodeEvaluator = class {
13286
13287
  fileChanges: context2.fileChanges ?? null,
13287
13288
  workspacePath: context2.workspacePath ?? null,
13288
13289
  config: this.config ?? null,
13289
- // Text convenience accessors (new names, always strings)
13290
13290
  inputText: context2.evalCase.question,
13291
- outputText: context2.candidate,
13292
13291
  expectedOutputText: context2.evalCase.reference_answer ?? ""
13293
13292
  };
13294
13293
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -13488,13 +13487,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
13488
13487
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
13489
13488
 
13490
13489
  [[ ## question ## ]]
13491
- {{${TEMPLATE_VARIABLES.QUESTION}}}
13490
+ {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
13492
13491
 
13493
13492
  [[ ## reference_answer ## ]]
13494
- {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
13493
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
13495
13494
 
13496
13495
  [[ ## answer ## ]]
13497
- {{${TEMPLATE_VARIABLES.ANSWER}}}`;
13496
+ {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
13498
13497
  var freeformEvaluationSchema = import_zod4.z.object({
13499
13498
  score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
13500
13499
  assertions: import_zod4.z.array(
@@ -13572,12 +13571,8 @@ var LlmGraderEvaluator = class {
13572
13571
  2
13573
13572
  ),
13574
13573
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
13575
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13576
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13577
13574
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13578
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13579
13575
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
13580
- // Text convenience accessors (new names, always strings)
13581
13576
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13582
13577
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13583
13578
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
@@ -13882,10 +13877,10 @@ ${context2.fileChanges}`;
13882
13877
  buildAgentUserPrompt(context2) {
13883
13878
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13884
13879
  const variables = {
13885
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13886
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13887
13880
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13888
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13881
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13882
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13883
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
13889
13884
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
13890
13885
  };
13891
13886
  if (this.evaluatorTemplate) {
@@ -13938,10 +13933,10 @@ ${context2.fileChanges}`;
13938
13933
  const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
13939
13934
  if (this.evaluatorTemplate) {
13940
13935
  const variables = {
13941
- [TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
13942
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
13943
13936
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13944
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
13937
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13938
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13939
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
13945
13940
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
13946
13941
  };
13947
13942
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
@@ -15369,7 +15364,10 @@ var COPILOT_MATCHER = {
15369
15364
  skillTools: ["Skill", "skill"],
15370
15365
  skillInputField: "skill",
15371
15366
  readTools: ["Read File", "readFile", "Read", "readTextFile"],
15372
- readInputField: "file_path"
15367
+ readInputField: "file_path",
15368
+ skillToolPrefixes: ["Using skill: "],
15369
+ readToolPrefixes: ["Viewing "],
15370
+ readInputFields: ["file_path", "path"]
15373
15371
  };
15374
15372
  var PROVIDER_TOOL_SEMANTICS = {
15375
15373
  claude: CLAUDE_MATCHER,
@@ -15411,12 +15409,22 @@ var SkillTriggerEvaluator = class {
15411
15409
  triggered = true;
15412
15410
  evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
15413
15411
  }
15412
+ } else if (matcher.skillToolPrefixes?.some(
15413
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
15414
+ )) {
15415
+ triggered = true;
15416
+ evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
15414
15417
  } else if (matcher.readTools.includes(firstTool.tool)) {
15415
- const filePath = String(input[matcher.readInputField] ?? "");
15418
+ const filePath = this.readPathFromInput(input, matcher);
15416
15419
  if (filePath.includes(skillName)) {
15417
15420
  triggered = true;
15418
15421
  evidence = `Read tool loaded skill file: ${filePath}`;
15419
15422
  }
15423
+ } else if (matcher.readToolPrefixes?.some(
15424
+ (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
15425
+ )) {
15426
+ triggered = true;
15427
+ evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
15420
15428
  }
15421
15429
  }
15422
15430
  const pass = triggered === shouldTrigger;
@@ -15445,6 +15453,16 @@ var SkillTriggerEvaluator = class {
15445
15453
  expectedAspectCount: 1
15446
15454
  };
15447
15455
  }
15456
+ readPathFromInput(input, matcher) {
15457
+ const fields = matcher.readInputFields ?? [matcher.readInputField];
15458
+ for (const field of fields) {
15459
+ const value = input[field];
15460
+ if (value !== void 0 && value !== null) {
15461
+ return String(value);
15462
+ }
15463
+ }
15464
+ return "";
15465
+ }
15448
15466
  };
15449
15467
 
15450
15468
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -15479,12 +15497,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
15479
15497
  [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
15480
15498
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
15481
15499
  [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
15482
- [TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
15483
- [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
15484
15500
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
15485
- [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
15486
15501
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
15487
- // Text convenience accessors (new names, always strings)
15488
15502
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
15489
15503
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
15490
15504
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -15816,11 +15830,9 @@ var ToolTrajectoryEvaluator = class {
15816
15830
  for (const call of toolCalls) {
15817
15831
  toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
15818
15832
  }
15819
- const toolNames = Object.keys(toolCallsByName).sort();
15820
15833
  return {
15821
15834
  eventCount: toolCalls.length,
15822
- toolNames,
15823
- toolCallsByName,
15835
+ toolCalls: toolCallsByName,
15824
15836
  errorCount: 0
15825
15837
  };
15826
15838
  }
@@ -15838,7 +15850,7 @@ var ToolTrajectoryEvaluator = class {
15838
15850
  const assertions = [];
15839
15851
  for (const toolName of toolNames) {
15840
15852
  const required = minimums[toolName];
15841
- const actual = summary.toolCallsByName[toolName] ?? 0;
15853
+ const actual = summary.toolCalls[toolName] ?? 0;
15842
15854
  if (actual >= required) {
15843
15855
  assertions.push({
15844
15856
  text: `${toolName}: called ${actual} times (required >=${required})`,
@@ -16541,11 +16553,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
16541
16553
  }
16542
16554
  async function executePromptTemplate(script, context2, config, timeoutMs) {
16543
16555
  const payload = {
16544
- question: context2.evalCase.question,
16545
16556
  criteria: context2.evalCase.criteria,
16546
16557
  expectedOutput: context2.evalCase.expected_output,
16547
- referenceAnswer: context2.evalCase.reference_answer,
16548
- answer: context2.candidate,
16558
+ outputText: context2.candidate,
16549
16559
  output: context2.output ?? null,
16550
16560
  guidelineFiles: context2.evalCase.guideline_paths,
16551
16561
  inputFiles: context2.evalCase.file_paths.filter(
@@ -16556,9 +16566,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
16556
16566
  fileChanges: context2.fileChanges ?? null,
16557
16567
  workspacePath: context2.workspacePath ?? null,
16558
16568
  config: config ?? context2.config ?? null,
16559
- // Text convenience accessors (new names, always strings)
16560
16569
  inputText: context2.evalCase.question,
16561
- outputText: context2.candidate,
16562
16570
  expectedOutputText: context2.evalCase.reference_answer ?? ""
16563
16571
  };
16564
16572
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
@@ -18222,7 +18230,7 @@ async function runEvaluation(options) {
18222
18230
  dataset: evalCase.dataset,
18223
18231
  score: 0,
18224
18232
  assertions: [],
18225
- answer: "",
18233
+ outputText: "",
18226
18234
  target: target.name,
18227
18235
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
18228
18236
  budgetExceeded: true,
@@ -18258,7 +18266,7 @@ async function runEvaluation(options) {
18258
18266
  dataset: evalCase.dataset,
18259
18267
  score: 0,
18260
18268
  assertions: [],
18261
- answer: "",
18269
+ outputText: "",
18262
18270
  target: target.name,
18263
18271
  error: errorMsg,
18264
18272
  executionStatus: "execution_error",
@@ -18523,7 +18531,7 @@ async function runBatchEvaluation(options) {
18523
18531
  const providerResponse = batchResponse[i];
18524
18532
  const output = providerResponse.output;
18525
18533
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
18526
- const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
18534
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
18527
18535
  const merged = computed ? mergeExecutionMetrics(computed, {
18528
18536
  tokenUsage: providerResponse.tokenUsage,
18529
18537
  costUsd: providerResponse.costUsd,
@@ -18920,7 +18928,7 @@ async function runEvalCase(options) {
18920
18928
  }
18921
18929
  const output = providerResponse.output;
18922
18930
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
18923
- const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
18931
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
18924
18932
  const merged = computed ? mergeExecutionMetrics(computed, {
18925
18933
  tokenUsage: providerResponse.tokenUsage,
18926
18934
  costUsd: providerResponse.costUsd,
@@ -19225,7 +19233,7 @@ async function evaluateCandidate(options) {
19225
19233
  conversationId: evalCase.conversation_id,
19226
19234
  score: score.score,
19227
19235
  assertions: score.assertions,
19228
- answer: candidate,
19236
+ outputText: candidate,
19229
19237
  target: target.name,
19230
19238
  tokenUsage,
19231
19239
  costUsd,
@@ -19581,7 +19589,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19581
19589
  conversationId: evalCase.conversation_id,
19582
19590
  score: 0,
19583
19591
  assertions: [{ text: `Error: ${message}`, passed: false }],
19584
- answer: `Error occurred: ${message}`,
19592
+ outputText: `Error occurred: ${message}`,
19585
19593
  target: targetName,
19586
19594
  requests,
19587
19595
  input,
@@ -20119,7 +20127,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
20119
20127
 
20120
20128
  // src/evaluation/baseline.ts
20121
20129
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
20122
- "answer",
20130
+ "outputText",
20123
20131
  "requests",
20124
20132
  "trace",
20125
20133
  "workspacePath",
@@ -20293,14 +20301,17 @@ var OtelTraceExporter = class {
20293
20301
  rootSpan.setAttribute("agentv.target", result.target);
20294
20302
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
20295
20303
  rootSpan.setAttribute("agentv.score", result.score);
20296
- if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
20304
+ if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
20297
20305
  if (result.durationMs != null)
20298
20306
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
20299
20307
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
20300
20308
  if (result.trace) {
20301
20309
  const t = result.trace;
20302
20310
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
20303
- rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
20311
+ rootSpan.setAttribute(
20312
+ "agentv.trace.tool_names",
20313
+ Object.keys(t.toolCalls).sort().join(",")
20314
+ );
20304
20315
  if (t.llmCallCount != null)
20305
20316
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
20306
20317
  }