@agentv/core 3.6.0 → 3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1786,12 +1786,10 @@ function computeTraceSummary(messages) {
1786
1786
  }
1787
1787
  }
1788
1788
  }
1789
- const toolNames = Object.keys(toolCallCounts).sort();
1790
1789
  return {
1791
1790
  trace: {
1792
1791
  eventCount: totalToolCalls,
1793
- toolNames,
1794
- toolCallsByName: toolCallCounts,
1792
+ toolCalls: toolCallCounts,
1795
1793
  errorCount: 0,
1796
1794
  llmCallCount,
1797
1795
  ...hasAnyDuration ? { toolDurations } : {}
@@ -1815,7 +1813,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
1815
1813
  function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
1816
1814
  if (summary.eventCount === 0) return void 0;
1817
1815
  const explorationCalls = explorationTools.reduce(
1818
- (sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
1816
+ (sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
1819
1817
  0
1820
1818
  );
1821
1819
  return explorationCalls / summary.eventCount;
@@ -15832,11 +15830,9 @@ var ToolTrajectoryEvaluator = class {
15832
15830
  for (const call of toolCalls) {
15833
15831
  toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
15834
15832
  }
15835
- const toolNames = Object.keys(toolCallsByName).sort();
15836
15833
  return {
15837
15834
  eventCount: toolCalls.length,
15838
- toolNames,
15839
- toolCallsByName,
15835
+ toolCalls: toolCallsByName,
15840
15836
  errorCount: 0
15841
15837
  };
15842
15838
  }
@@ -15854,7 +15850,7 @@ var ToolTrajectoryEvaluator = class {
15854
15850
  const assertions = [];
15855
15851
  for (const toolName of toolNames) {
15856
15852
  const required = minimums[toolName];
15857
- const actual = summary.toolCallsByName[toolName] ?? 0;
15853
+ const actual = summary.toolCalls[toolName] ?? 0;
15858
15854
  if (actual >= required) {
15859
15855
  assertions.push({
15860
15856
  text: `${toolName}: called ${actual} times (required >=${required})`,
@@ -18234,7 +18230,7 @@ async function runEvaluation(options) {
18234
18230
  dataset: evalCase.dataset,
18235
18231
  score: 0,
18236
18232
  assertions: [],
18237
- outputText: "",
18233
+ output: [],
18238
18234
  target: target.name,
18239
18235
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
18240
18236
  budgetExceeded: true,
@@ -18270,7 +18266,7 @@ async function runEvaluation(options) {
18270
18266
  dataset: evalCase.dataset,
18271
18267
  score: 0,
18272
18268
  assertions: [],
18273
- outputText: "",
18269
+ output: [],
18274
18270
  target: target.name,
18275
18271
  error: errorMsg,
18276
18272
  executionStatus: "execution_error",
@@ -18535,7 +18531,7 @@ async function runBatchEvaluation(options) {
18535
18531
  const providerResponse = batchResponse[i];
18536
18532
  const output = providerResponse.output;
18537
18533
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
18538
- const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
18534
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
18539
18535
  const merged = computed ? mergeExecutionMetrics(computed, {
18540
18536
  tokenUsage: providerResponse.tokenUsage,
18541
18537
  costUsd: providerResponse.costUsd,
@@ -18932,7 +18928,7 @@ async function runEvalCase(options) {
18932
18928
  }
18933
18929
  const output = providerResponse.output;
18934
18930
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
18935
- const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
18931
+ const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
18936
18932
  const merged = computed ? mergeExecutionMetrics(computed, {
18937
18933
  tokenUsage: providerResponse.tokenUsage,
18938
18934
  costUsd: providerResponse.costUsd,
@@ -19237,7 +19233,6 @@ async function evaluateCandidate(options) {
19237
19233
  conversationId: evalCase.conversation_id,
19238
19234
  score: score.score,
19239
19235
  assertions: score.assertions,
19240
- outputText: candidate,
19241
19236
  target: target.name,
19242
19237
  tokenUsage,
19243
19238
  costUsd,
@@ -19248,7 +19243,7 @@ async function evaluateCandidate(options) {
19248
19243
  input,
19249
19244
  scores,
19250
19245
  trace: trace2,
19251
- output,
19246
+ output: output ?? [{ role: "assistant", content: candidate }],
19252
19247
  fileChanges,
19253
19248
  executionStatus: classifyQualityStatus(score.score)
19254
19249
  };
@@ -19413,7 +19408,7 @@ async function runEvaluatorList(options) {
19413
19408
  weight,
19414
19409
  verdict: score2.verdict,
19415
19410
  assertions: score2.assertions,
19416
- evaluatorProviderRequest: score2.evaluatorRawRequest,
19411
+ input: score2.evaluatorRawRequest,
19417
19412
  details: score2.details,
19418
19413
  scores: mapChildResults(score2.scores),
19419
19414
  tokenUsage: score2.tokenUsage,
@@ -19593,7 +19588,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
19593
19588
  conversationId: evalCase.conversation_id,
19594
19589
  score: 0,
19595
19590
  assertions: [{ text: `Error: ${message}`, passed: false }],
19596
- outputText: `Error occurred: ${message}`,
19591
+ output: [{ role: "assistant", content: `Error occurred: ${message}` }],
19597
19592
  target: targetName,
19598
19593
  requests,
19599
19594
  input,
@@ -19637,7 +19632,7 @@ function buildResultInput(promptInputs) {
19637
19632
  content: message.content
19638
19633
  }));
19639
19634
  }
19640
- return promptInputs.question;
19635
+ return [{ role: "user", content: promptInputs.question }];
19641
19636
  }
19642
19637
  function aggregateEvaluatorTokenUsage(scores) {
19643
19638
  if (!scores || scores.length === 0) return void 0;
@@ -19703,7 +19698,7 @@ function mapChildResults(children) {
19703
19698
  weight: child.weight,
19704
19699
  verdict: child.verdict,
19705
19700
  assertions: child.assertions,
19706
- evaluatorProviderRequest: child.evaluatorRawRequest,
19701
+ input: child.evaluatorRawRequest,
19707
19702
  scores: mapChildResults(child.scores),
19708
19703
  details: child.details,
19709
19704
  tokenUsage: child.tokenUsage
@@ -20131,7 +20126,6 @@ function shouldSkipCacheForTemperature(targetConfig) {
20131
20126
 
20132
20127
  // src/evaluation/baseline.ts
20133
20128
  var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
20134
- "outputText",
20135
20129
  "requests",
20136
20130
  "trace",
20137
20131
  "workspacePath",
@@ -20148,7 +20142,7 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
20148
20142
  "startTime",
20149
20143
  "endTime"
20150
20144
  ]);
20151
- var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
20145
+ var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "input"]);
20152
20146
  function trimEvaluatorResult(result) {
20153
20147
  const trimmed = {};
20154
20148
  for (const [key, value] of Object.entries(result)) {
@@ -20305,14 +20299,21 @@ var OtelTraceExporter = class {
20305
20299
  rootSpan.setAttribute("agentv.target", result.target);
20306
20300
  if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
20307
20301
  rootSpan.setAttribute("agentv.score", result.score);
20308
- if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
20302
+ if (captureContent && result.output.length > 0) {
20303
+ const lastMsg = result.output[result.output.length - 1];
20304
+ const text = typeof lastMsg.content === "string" ? lastMsg.content : JSON.stringify(lastMsg.content);
20305
+ rootSpan.setAttribute("agentv.output_text", text);
20306
+ }
20309
20307
  if (result.durationMs != null)
20310
20308
  rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
20311
20309
  if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
20312
20310
  if (result.trace) {
20313
20311
  const t = result.trace;
20314
20312
  rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
20315
- rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
20313
+ rootSpan.setAttribute(
20314
+ "agentv.trace.tool_names",
20315
+ Object.keys(t.toolCalls).sort().join(",")
20316
+ );
20316
20317
  if (t.llmCallCount != null)
20317
20318
  rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
20318
20319
  }