@agentv/core 3.6.0 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-2IZOTQ25.js → chunk-3ZS3GCMI.js} +143 -3
- package/dist/chunk-3ZS3GCMI.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +227 -39
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +84 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +23 -22
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +6 -9
- package/dist/index.d.ts +6 -9
- package/dist/index.js +429 -562
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-2IZOTQ25.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1786,12 +1786,10 @@ function computeTraceSummary(messages) {
|
|
|
1786
1786
|
}
|
|
1787
1787
|
}
|
|
1788
1788
|
}
|
|
1789
|
-
const toolNames = Object.keys(toolCallCounts).sort();
|
|
1790
1789
|
return {
|
|
1791
1790
|
trace: {
|
|
1792
1791
|
eventCount: totalToolCalls,
|
|
1793
|
-
|
|
1794
|
-
toolCallsByName: toolCallCounts,
|
|
1792
|
+
toolCalls: toolCallCounts,
|
|
1795
1793
|
errorCount: 0,
|
|
1796
1794
|
llmCallCount,
|
|
1797
1795
|
...hasAnyDuration ? { toolDurations } : {}
|
|
@@ -1815,7 +1813,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
|
|
|
1815
1813
|
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
1816
1814
|
if (summary.eventCount === 0) return void 0;
|
|
1817
1815
|
const explorationCalls = explorationTools.reduce(
|
|
1818
|
-
(sum, tool2) => sum + (summary.toolCallsByName[tool2] ?? 0),
|
|
1816
|
+
(sum, tool2) => sum + (summary.toolCalls[tool2] ?? 0),
|
|
1819
1817
|
0
|
|
1820
1818
|
);
|
|
1821
1819
|
return explorationCalls / summary.eventCount;
|
|
@@ -15832,11 +15830,9 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15832
15830
|
for (const call of toolCalls) {
|
|
15833
15831
|
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
15834
15832
|
}
|
|
15835
|
-
const toolNames = Object.keys(toolCallsByName).sort();
|
|
15836
15833
|
return {
|
|
15837
15834
|
eventCount: toolCalls.length,
|
|
15838
|
-
|
|
15839
|
-
toolCallsByName,
|
|
15835
|
+
toolCalls: toolCallsByName,
|
|
15840
15836
|
errorCount: 0
|
|
15841
15837
|
};
|
|
15842
15838
|
}
|
|
@@ -15854,7 +15850,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15854
15850
|
const assertions = [];
|
|
15855
15851
|
for (const toolName of toolNames) {
|
|
15856
15852
|
const required = minimums[toolName];
|
|
15857
|
-
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
15853
|
+
const actual = summary.toolCalls[toolName] ?? 0;
|
|
15858
15854
|
if (actual >= required) {
|
|
15859
15855
|
assertions.push({
|
|
15860
15856
|
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
@@ -18234,7 +18230,7 @@ async function runEvaluation(options) {
|
|
|
18234
18230
|
dataset: evalCase.dataset,
|
|
18235
18231
|
score: 0,
|
|
18236
18232
|
assertions: [],
|
|
18237
|
-
|
|
18233
|
+
output: [],
|
|
18238
18234
|
target: target.name,
|
|
18239
18235
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
18240
18236
|
budgetExceeded: true,
|
|
@@ -18270,7 +18266,7 @@ async function runEvaluation(options) {
|
|
|
18270
18266
|
dataset: evalCase.dataset,
|
|
18271
18267
|
score: 0,
|
|
18272
18268
|
assertions: [],
|
|
18273
|
-
|
|
18269
|
+
output: [],
|
|
18274
18270
|
target: target.name,
|
|
18275
18271
|
error: errorMsg,
|
|
18276
18272
|
executionStatus: "execution_error",
|
|
@@ -18535,7 +18531,7 @@ async function runBatchEvaluation(options) {
|
|
|
18535
18531
|
const providerResponse = batchResponse[i];
|
|
18536
18532
|
const output = providerResponse.output;
|
|
18537
18533
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
18538
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
18534
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
18539
18535
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
18540
18536
|
tokenUsage: providerResponse.tokenUsage,
|
|
18541
18537
|
costUsd: providerResponse.costUsd,
|
|
@@ -18932,7 +18928,7 @@ async function runEvalCase(options) {
|
|
|
18932
18928
|
}
|
|
18933
18929
|
const output = providerResponse.output;
|
|
18934
18930
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
18935
|
-
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
18931
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolCalls: {}, errorCount: 0 } } : void 0;
|
|
18936
18932
|
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
18937
18933
|
tokenUsage: providerResponse.tokenUsage,
|
|
18938
18934
|
costUsd: providerResponse.costUsd,
|
|
@@ -19237,7 +19233,6 @@ async function evaluateCandidate(options) {
|
|
|
19237
19233
|
conversationId: evalCase.conversation_id,
|
|
19238
19234
|
score: score.score,
|
|
19239
19235
|
assertions: score.assertions,
|
|
19240
|
-
outputText: candidate,
|
|
19241
19236
|
target: target.name,
|
|
19242
19237
|
tokenUsage,
|
|
19243
19238
|
costUsd,
|
|
@@ -19248,7 +19243,7 @@ async function evaluateCandidate(options) {
|
|
|
19248
19243
|
input,
|
|
19249
19244
|
scores,
|
|
19250
19245
|
trace: trace2,
|
|
19251
|
-
output,
|
|
19246
|
+
output: output ?? [{ role: "assistant", content: candidate }],
|
|
19252
19247
|
fileChanges,
|
|
19253
19248
|
executionStatus: classifyQualityStatus(score.score)
|
|
19254
19249
|
};
|
|
@@ -19413,7 +19408,7 @@ async function runEvaluatorList(options) {
|
|
|
19413
19408
|
weight,
|
|
19414
19409
|
verdict: score2.verdict,
|
|
19415
19410
|
assertions: score2.assertions,
|
|
19416
|
-
|
|
19411
|
+
input: score2.evaluatorRawRequest,
|
|
19417
19412
|
details: score2.details,
|
|
19418
19413
|
scores: mapChildResults(score2.scores),
|
|
19419
19414
|
tokenUsage: score2.tokenUsage,
|
|
@@ -19593,7 +19588,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19593
19588
|
conversationId: evalCase.conversation_id,
|
|
19594
19589
|
score: 0,
|
|
19595
19590
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19596
|
-
|
|
19591
|
+
output: [{ role: "assistant", content: `Error occurred: ${message}` }],
|
|
19597
19592
|
target: targetName,
|
|
19598
19593
|
requests,
|
|
19599
19594
|
input,
|
|
@@ -19637,7 +19632,7 @@ function buildResultInput(promptInputs) {
|
|
|
19637
19632
|
content: message.content
|
|
19638
19633
|
}));
|
|
19639
19634
|
}
|
|
19640
|
-
return promptInputs.question;
|
|
19635
|
+
return [{ role: "user", content: promptInputs.question }];
|
|
19641
19636
|
}
|
|
19642
19637
|
function aggregateEvaluatorTokenUsage(scores) {
|
|
19643
19638
|
if (!scores || scores.length === 0) return void 0;
|
|
@@ -19703,7 +19698,7 @@ function mapChildResults(children) {
|
|
|
19703
19698
|
weight: child.weight,
|
|
19704
19699
|
verdict: child.verdict,
|
|
19705
19700
|
assertions: child.assertions,
|
|
19706
|
-
|
|
19701
|
+
input: child.evaluatorRawRequest,
|
|
19707
19702
|
scores: mapChildResults(child.scores),
|
|
19708
19703
|
details: child.details,
|
|
19709
19704
|
tokenUsage: child.tokenUsage
|
|
@@ -20131,7 +20126,6 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
20131
20126
|
|
|
20132
20127
|
// src/evaluation/baseline.ts
|
|
20133
20128
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
20134
|
-
"outputText",
|
|
20135
20129
|
"requests",
|
|
20136
20130
|
"trace",
|
|
20137
20131
|
"workspacePath",
|
|
@@ -20148,7 +20142,7 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
|
20148
20142
|
"startTime",
|
|
20149
20143
|
"endTime"
|
|
20150
20144
|
]);
|
|
20151
|
-
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "
|
|
20145
|
+
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "input"]);
|
|
20152
20146
|
function trimEvaluatorResult(result) {
|
|
20153
20147
|
const trimmed = {};
|
|
20154
20148
|
for (const [key, value] of Object.entries(result)) {
|
|
@@ -20305,14 +20299,21 @@ var OtelTraceExporter = class {
|
|
|
20305
20299
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
20306
20300
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
20307
20301
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
20308
|
-
if (captureContent
|
|
20302
|
+
if (captureContent && result.output.length > 0) {
|
|
20303
|
+
const lastMsg = result.output[result.output.length - 1];
|
|
20304
|
+
const text = typeof lastMsg.content === "string" ? lastMsg.content : JSON.stringify(lastMsg.content);
|
|
20305
|
+
rootSpan.setAttribute("agentv.output_text", text);
|
|
20306
|
+
}
|
|
20309
20307
|
if (result.durationMs != null)
|
|
20310
20308
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
20311
20309
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
20312
20310
|
if (result.trace) {
|
|
20313
20311
|
const t = result.trace;
|
|
20314
20312
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
20315
|
-
rootSpan.setAttribute(
|
|
20313
|
+
rootSpan.setAttribute(
|
|
20314
|
+
"agentv.trace.tool_names",
|
|
20315
|
+
Object.keys(t.toolCalls).sort().join(",")
|
|
20316
|
+
);
|
|
20316
20317
|
if (t.llmCallCount != null)
|
|
20317
20318
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
20318
20319
|
}
|