@agentv/core 2.8.0-next.1 → 2.9.0-next.1
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/dist/{chunk-P2465XAH.js → chunk-7Q4PH265.js} +1 -1
- package/dist/chunk-7Q4PH265.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +163 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -20
- package/dist/index.d.ts +48 -20
- package/dist/index.js +164 -73
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
- package/dist/chunk-P2465XAH.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1686,14 +1686,16 @@ function computeTraceSummary(messages) {
|
|
|
1686
1686
|
}
|
|
1687
1687
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
1688
1688
|
return {
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1689
|
+
trace: {
|
|
1690
|
+
eventCount: totalToolCalls,
|
|
1691
|
+
toolNames,
|
|
1692
|
+
toolCallsByName: toolCallCounts,
|
|
1693
|
+
errorCount: 0,
|
|
1694
|
+
llmCallCount,
|
|
1695
|
+
...hasAnyDuration ? { toolDurations } : {}
|
|
1696
|
+
},
|
|
1693
1697
|
startTime: earliestStart?.toISOString(),
|
|
1694
|
-
endTime: latestEnd?.toISOString()
|
|
1695
|
-
llmCallCount,
|
|
1696
|
-
...hasAnyDuration ? { toolDurations } : {}
|
|
1698
|
+
endTime: latestEnd?.toISOString()
|
|
1697
1699
|
};
|
|
1698
1700
|
}
|
|
1699
1701
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
@@ -1716,9 +1718,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
|
|
|
1716
1718
|
);
|
|
1717
1719
|
return explorationCalls / summary.eventCount;
|
|
1718
1720
|
}
|
|
1719
|
-
function tokensPerTool(summary) {
|
|
1720
|
-
if (!
|
|
1721
|
-
const totalTokens =
|
|
1721
|
+
function tokensPerTool(summary, tokenUsage) {
|
|
1722
|
+
if (!tokenUsage || summary.eventCount === 0) return void 0;
|
|
1723
|
+
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
1722
1724
|
return totalTokens / summary.eventCount;
|
|
1723
1725
|
}
|
|
1724
1726
|
function avgToolDurationMs(summary) {
|
|
@@ -1734,16 +1736,15 @@ function avgToolDurationMs(summary) {
|
|
|
1734
1736
|
if (totalCalls === 0) return void 0;
|
|
1735
1737
|
return totalDuration / totalCalls;
|
|
1736
1738
|
}
|
|
1737
|
-
function mergeExecutionMetrics(
|
|
1738
|
-
if (!metrics) return
|
|
1739
|
+
function mergeExecutionMetrics(computed, metrics) {
|
|
1740
|
+
if (!metrics) return computed;
|
|
1739
1741
|
return {
|
|
1740
|
-
|
|
1742
|
+
trace: computed.trace,
|
|
1741
1743
|
tokenUsage: metrics.tokenUsage,
|
|
1742
1744
|
costUsd: metrics.costUsd,
|
|
1743
1745
|
durationMs: metrics.durationMs,
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
endTime: metrics.endTime ?? summary.endTime
|
|
1746
|
+
startTime: metrics.startTime ?? computed.startTime,
|
|
1747
|
+
endTime: metrics.endTime ?? computed.endTime
|
|
1747
1748
|
};
|
|
1748
1749
|
}
|
|
1749
1750
|
|
|
@@ -4815,10 +4816,13 @@ async function invokeModel(options) {
|
|
|
4815
4816
|
}
|
|
4816
4817
|
function mapResponse(result) {
|
|
4817
4818
|
const content = result.text ?? "";
|
|
4819
|
+
const rawUsage = result.totalUsage ?? result.usage;
|
|
4820
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
4818
4821
|
return {
|
|
4819
4822
|
raw: result,
|
|
4820
|
-
usage: toJsonObject(
|
|
4821
|
-
output: [{ role: "assistant", content }]
|
|
4823
|
+
usage: toJsonObject(rawUsage),
|
|
4824
|
+
output: [{ role: "assistant", content }],
|
|
4825
|
+
tokenUsage
|
|
4822
4826
|
};
|
|
4823
4827
|
}
|
|
4824
4828
|
function toJsonObject(value) {
|
|
@@ -11208,6 +11212,8 @@ async function createTargetProxy(options) {
|
|
|
11208
11212
|
const token = (0, import_node_crypto7.randomBytes)(32).toString("hex");
|
|
11209
11213
|
let callCount = 0;
|
|
11210
11214
|
let isShutdown = false;
|
|
11215
|
+
let totalInputTokens = 0;
|
|
11216
|
+
let totalOutputTokens = 0;
|
|
11211
11217
|
const targetsList = availableTargets ?? [defaultProvider.targetName];
|
|
11212
11218
|
function resolveProvider(targetName) {
|
|
11213
11219
|
if (targetName === void 0 || targetName === defaultProvider.targetName) {
|
|
@@ -11286,11 +11292,16 @@ async function createTargetProxy(options) {
|
|
|
11286
11292
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
11287
11293
|
attempt: request.attempt ?? 1
|
|
11288
11294
|
});
|
|
11295
|
+
if (response.tokenUsage) {
|
|
11296
|
+
totalInputTokens += response.tokenUsage.input;
|
|
11297
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
11298
|
+
}
|
|
11289
11299
|
const output = response.output ?? [];
|
|
11290
11300
|
const rawText = extractLastAssistantContent(output);
|
|
11291
11301
|
const result = {
|
|
11292
11302
|
output,
|
|
11293
|
-
rawText
|
|
11303
|
+
rawText,
|
|
11304
|
+
tokenUsage: response.tokenUsage
|
|
11294
11305
|
};
|
|
11295
11306
|
sendJson(res, 200, result);
|
|
11296
11307
|
} catch (error) {
|
|
@@ -11337,10 +11348,15 @@ async function createTargetProxy(options) {
|
|
|
11337
11348
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
11338
11349
|
attempt: request.attempt ?? 1
|
|
11339
11350
|
});
|
|
11351
|
+
if (response.tokenUsage) {
|
|
11352
|
+
totalInputTokens += response.tokenUsage.input;
|
|
11353
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
11354
|
+
}
|
|
11340
11355
|
const output = response.output ?? [];
|
|
11341
11356
|
responses.push({
|
|
11342
11357
|
output,
|
|
11343
|
-
rawText: extractLastAssistantContent(output)
|
|
11358
|
+
rawText: extractLastAssistantContent(output),
|
|
11359
|
+
tokenUsage: response.tokenUsage
|
|
11344
11360
|
});
|
|
11345
11361
|
} catch (error) {
|
|
11346
11362
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -11379,7 +11395,8 @@ async function createTargetProxy(options) {
|
|
|
11379
11395
|
},
|
|
11380
11396
|
getUsageMetadata: () => ({
|
|
11381
11397
|
callCount,
|
|
11382
|
-
maxCalls
|
|
11398
|
+
maxCalls,
|
|
11399
|
+
tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
|
|
11383
11400
|
})
|
|
11384
11401
|
};
|
|
11385
11402
|
}
|
|
@@ -11504,6 +11521,11 @@ var CodeEvaluator = class {
|
|
|
11504
11521
|
),
|
|
11505
11522
|
input: context2.evalCase.input,
|
|
11506
11523
|
trace: context2.trace ?? null,
|
|
11524
|
+
tokenUsage: context2.tokenUsage ?? null,
|
|
11525
|
+
costUsd: context2.costUsd ?? null,
|
|
11526
|
+
durationMs: context2.durationMs ?? null,
|
|
11527
|
+
startTime: context2.startTime ?? null,
|
|
11528
|
+
endTime: context2.endTime ?? null,
|
|
11507
11529
|
fileChanges: context2.fileChanges ?? null,
|
|
11508
11530
|
workspacePath: context2.workspacePath ?? null,
|
|
11509
11531
|
config: this.config ?? null
|
|
@@ -11562,7 +11584,8 @@ var CodeEvaluator = class {
|
|
|
11562
11584
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
11563
11585
|
reasoning,
|
|
11564
11586
|
evaluatorRawRequest,
|
|
11565
|
-
...details ? { details } : {}
|
|
11587
|
+
...details ? { details } : {},
|
|
11588
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
11566
11589
|
};
|
|
11567
11590
|
} catch (error) {
|
|
11568
11591
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -11584,7 +11607,8 @@ var CodeEvaluator = class {
|
|
|
11584
11607
|
}
|
|
11585
11608
|
} : {},
|
|
11586
11609
|
error: message
|
|
11587
|
-
}
|
|
11610
|
+
},
|
|
11611
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
11588
11612
|
};
|
|
11589
11613
|
} finally {
|
|
11590
11614
|
if (proxyShutdown) {
|
|
@@ -11748,7 +11772,7 @@ ${context2.fileChanges}`;
|
|
|
11748
11772
|
target: judgeProvider.targetName
|
|
11749
11773
|
};
|
|
11750
11774
|
try {
|
|
11751
|
-
const { data } = await this.runWithRetry({
|
|
11775
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
11752
11776
|
context: context2,
|
|
11753
11777
|
judgeProvider,
|
|
11754
11778
|
systemPrompt,
|
|
@@ -11767,7 +11791,8 @@ ${context2.fileChanges}`;
|
|
|
11767
11791
|
misses,
|
|
11768
11792
|
expectedAspectCount,
|
|
11769
11793
|
reasoning,
|
|
11770
|
-
evaluatorRawRequest
|
|
11794
|
+
evaluatorRawRequest,
|
|
11795
|
+
tokenUsage
|
|
11771
11796
|
};
|
|
11772
11797
|
} catch {
|
|
11773
11798
|
return {
|
|
@@ -11797,7 +11822,7 @@ ${context2.fileChanges}`;
|
|
|
11797
11822
|
systemPrompt,
|
|
11798
11823
|
target: judgeProvider.targetName
|
|
11799
11824
|
};
|
|
11800
|
-
const { data } = await this.runWithRetry({
|
|
11825
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
11801
11826
|
context: context2,
|
|
11802
11827
|
judgeProvider,
|
|
11803
11828
|
systemPrompt,
|
|
@@ -11812,7 +11837,8 @@ ${context2.fileChanges}`;
|
|
|
11812
11837
|
misses,
|
|
11813
11838
|
expectedAspectCount: rubrics.length,
|
|
11814
11839
|
reasoning: data.overall_reasoning,
|
|
11815
|
-
evaluatorRawRequest
|
|
11840
|
+
evaluatorRawRequest,
|
|
11841
|
+
tokenUsage
|
|
11816
11842
|
};
|
|
11817
11843
|
}
|
|
11818
11844
|
/**
|
|
@@ -11827,7 +11853,7 @@ ${context2.fileChanges}`;
|
|
|
11827
11853
|
systemPrompt,
|
|
11828
11854
|
target: judgeProvider.targetName
|
|
11829
11855
|
};
|
|
11830
|
-
const { data } = await this.runWithRetry({
|
|
11856
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
11831
11857
|
context: context2,
|
|
11832
11858
|
judgeProvider,
|
|
11833
11859
|
systemPrompt,
|
|
@@ -11843,7 +11869,8 @@ ${context2.fileChanges}`;
|
|
|
11843
11869
|
expectedAspectCount: rubrics.length,
|
|
11844
11870
|
reasoning: data.overall_reasoning,
|
|
11845
11871
|
evaluatorRawRequest,
|
|
11846
|
-
details
|
|
11872
|
+
details,
|
|
11873
|
+
tokenUsage
|
|
11847
11874
|
};
|
|
11848
11875
|
}
|
|
11849
11876
|
/**
|
|
@@ -11927,15 +11954,17 @@ ${context2.fileChanges}`;
|
|
|
11927
11954
|
try {
|
|
11928
11955
|
const model = judgeProvider.asLanguageModel?.();
|
|
11929
11956
|
if (model) {
|
|
11930
|
-
const
|
|
11957
|
+
const result = await (0, import_ai2.generateText)({
|
|
11931
11958
|
model,
|
|
11932
11959
|
system: systemPrompt,
|
|
11933
11960
|
prompt: userPrompt,
|
|
11934
11961
|
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
11935
11962
|
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
11936
11963
|
});
|
|
11937
|
-
const data2 = schema.parse(parseJsonFromText(text));
|
|
11938
|
-
|
|
11964
|
+
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
11965
|
+
const rawUsage = result.usage;
|
|
11966
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
11967
|
+
return { data: data2, tokenUsage };
|
|
11939
11968
|
}
|
|
11940
11969
|
const response = await judgeProvider.invoke({
|
|
11941
11970
|
question: userPrompt,
|
|
@@ -11946,7 +11975,7 @@ ${context2.fileChanges}`;
|
|
|
11946
11975
|
temperature: this.temperature
|
|
11947
11976
|
});
|
|
11948
11977
|
const data = schema.parse(parseJsonFromText(extractLastAssistantContent2(response.output)));
|
|
11949
|
-
return { data, providerResponse: response };
|
|
11978
|
+
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
|
|
11950
11979
|
} catch (e) {
|
|
11951
11980
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
11952
11981
|
}
|
|
@@ -12152,7 +12181,8 @@ var CompositeEvaluator = class {
|
|
|
12152
12181
|
reasoning: member.result.reasoning,
|
|
12153
12182
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
12154
12183
|
scores: member.result.scores,
|
|
12155
|
-
details: member.result.details
|
|
12184
|
+
details: member.result.details,
|
|
12185
|
+
tokenUsage: member.result.tokenUsage
|
|
12156
12186
|
});
|
|
12157
12187
|
}
|
|
12158
12188
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
@@ -12200,7 +12230,8 @@ var CompositeEvaluator = class {
|
|
|
12200
12230
|
reasoning: member.result.reasoning,
|
|
12201
12231
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
12202
12232
|
scores: member.result.scores,
|
|
12203
|
-
details: member.result.details
|
|
12233
|
+
details: member.result.details,
|
|
12234
|
+
tokenUsage: member.result.tokenUsage
|
|
12204
12235
|
});
|
|
12205
12236
|
}
|
|
12206
12237
|
const totalCount = results.length;
|
|
@@ -12379,7 +12410,7 @@ var CostEvaluator = class {
|
|
|
12379
12410
|
}
|
|
12380
12411
|
evaluate(context2) {
|
|
12381
12412
|
const { budget } = this.config;
|
|
12382
|
-
const costUsd = context2.
|
|
12413
|
+
const costUsd = context2.costUsd;
|
|
12383
12414
|
if (costUsd === void 0) {
|
|
12384
12415
|
return {
|
|
12385
12416
|
score: 0,
|
|
@@ -12422,7 +12453,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12422
12453
|
this.config = options.config;
|
|
12423
12454
|
}
|
|
12424
12455
|
evaluate(context2) {
|
|
12425
|
-
const { trace: trace2 } = context2;
|
|
12456
|
+
const { trace: trace2, tokenUsage, costUsd, durationMs } = context2;
|
|
12426
12457
|
const {
|
|
12427
12458
|
max_tool_calls,
|
|
12428
12459
|
max_llm_calls,
|
|
@@ -12432,7 +12463,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12432
12463
|
target_exploration_ratio,
|
|
12433
12464
|
exploration_tolerance = 0.2
|
|
12434
12465
|
} = this.config;
|
|
12435
|
-
|
|
12466
|
+
const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
|
|
12467
|
+
if (needsTrace && !trace2) {
|
|
12436
12468
|
return {
|
|
12437
12469
|
score: 0,
|
|
12438
12470
|
verdict: "fail",
|
|
@@ -12447,11 +12479,12 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12447
12479
|
}
|
|
12448
12480
|
};
|
|
12449
12481
|
}
|
|
12482
|
+
const narrowedTrace = trace2;
|
|
12450
12483
|
const hits = [];
|
|
12451
12484
|
const misses = [];
|
|
12452
12485
|
const actualMetrics = {};
|
|
12453
|
-
if (max_tool_calls !== void 0) {
|
|
12454
|
-
const toolCalls =
|
|
12486
|
+
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
12487
|
+
const toolCalls = narrowedTrace.eventCount;
|
|
12455
12488
|
actualMetrics.tool_calls = toolCalls;
|
|
12456
12489
|
if (toolCalls <= max_tool_calls) {
|
|
12457
12490
|
hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
|
|
@@ -12459,8 +12492,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12459
12492
|
misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
|
|
12460
12493
|
}
|
|
12461
12494
|
}
|
|
12462
|
-
if (max_llm_calls !== void 0) {
|
|
12463
|
-
const llmCalls =
|
|
12495
|
+
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
12496
|
+
const llmCalls = narrowedTrace.llmCallCount;
|
|
12464
12497
|
if (llmCalls === void 0) {
|
|
12465
12498
|
misses.push("LLM call count data not available");
|
|
12466
12499
|
} else {
|
|
@@ -12473,7 +12506,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12473
12506
|
}
|
|
12474
12507
|
}
|
|
12475
12508
|
if (max_tokens !== void 0) {
|
|
12476
|
-
const tokenUsage = trace2.tokenUsage;
|
|
12477
12509
|
if (!tokenUsage) {
|
|
12478
12510
|
misses.push("Token usage data not available");
|
|
12479
12511
|
} else {
|
|
@@ -12487,7 +12519,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12487
12519
|
}
|
|
12488
12520
|
}
|
|
12489
12521
|
if (max_cost_usd !== void 0) {
|
|
12490
|
-
const costUsd = trace2.costUsd;
|
|
12491
12522
|
if (costUsd === void 0) {
|
|
12492
12523
|
misses.push("Cost data not available");
|
|
12493
12524
|
} else {
|
|
@@ -12501,7 +12532,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12501
12532
|
}
|
|
12502
12533
|
}
|
|
12503
12534
|
if (max_duration_ms !== void 0) {
|
|
12504
|
-
const durationMs = trace2.durationMs;
|
|
12505
12535
|
if (durationMs === void 0) {
|
|
12506
12536
|
misses.push("Duration data not available");
|
|
12507
12537
|
} else {
|
|
@@ -12513,8 +12543,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12513
12543
|
}
|
|
12514
12544
|
}
|
|
12515
12545
|
}
|
|
12516
|
-
if (target_exploration_ratio !== void 0) {
|
|
12517
|
-
const ratio = explorationRatio(
|
|
12546
|
+
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
12547
|
+
const ratio = explorationRatio(narrowedTrace);
|
|
12518
12548
|
if (ratio === void 0) {
|
|
12519
12549
|
misses.push("Exploration ratio not available (no tool calls)");
|
|
12520
12550
|
} else {
|
|
@@ -13028,7 +13058,7 @@ var LatencyEvaluator = class {
|
|
|
13028
13058
|
}
|
|
13029
13059
|
evaluate(context2) {
|
|
13030
13060
|
const { threshold } = this.config;
|
|
13031
|
-
const durationMs = context2.
|
|
13061
|
+
const durationMs = context2.durationMs;
|
|
13032
13062
|
if (durationMs === void 0) {
|
|
13033
13063
|
return {
|
|
13034
13064
|
score: 0,
|
|
@@ -13673,7 +13703,7 @@ var TokenUsageEvaluator = class {
|
|
|
13673
13703
|
this.config = options.config;
|
|
13674
13704
|
}
|
|
13675
13705
|
evaluate(context2) {
|
|
13676
|
-
const usage = context2.
|
|
13706
|
+
const usage = context2.tokenUsage;
|
|
13677
13707
|
const maxTotal = this.config.max_total;
|
|
13678
13708
|
const maxInput = this.config.max_input;
|
|
13679
13709
|
const maxOutput = this.config.max_output;
|
|
@@ -15366,7 +15396,7 @@ async function runEvaluation(options) {
|
|
|
15366
15396
|
caseCost = trialCostSum;
|
|
15367
15397
|
}
|
|
15368
15398
|
} else {
|
|
15369
|
-
caseCost = result.
|
|
15399
|
+
caseCost = result.costUsd;
|
|
15370
15400
|
}
|
|
15371
15401
|
if (caseCost !== void 0) {
|
|
15372
15402
|
cumulativeBudgetCost += caseCost;
|
|
@@ -15527,17 +15557,18 @@ async function runBatchEvaluation(options) {
|
|
|
15527
15557
|
const providerResponse = batchResponse[i];
|
|
15528
15558
|
const output = providerResponse.output;
|
|
15529
15559
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
15530
|
-
const
|
|
15531
|
-
|
|
15532
|
-
toolNames: [],
|
|
15533
|
-
toolCallsByName: {},
|
|
15534
|
-
errorCount: 0
|
|
15535
|
-
} : void 0;
|
|
15536
|
-
const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
15560
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
15561
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
15537
15562
|
tokenUsage: providerResponse.tokenUsage,
|
|
15538
15563
|
costUsd: providerResponse.costUsd,
|
|
15539
15564
|
durationMs: providerResponse.durationMs
|
|
15540
15565
|
}) : void 0;
|
|
15566
|
+
const trace2 = merged?.trace;
|
|
15567
|
+
const costUsd = merged?.costUsd;
|
|
15568
|
+
const durationMs = merged?.durationMs;
|
|
15569
|
+
const tokenUsage = merged?.tokenUsage;
|
|
15570
|
+
const startTime = merged?.startTime;
|
|
15571
|
+
const endTime = merged?.endTime;
|
|
15541
15572
|
const candidate = extractLastAssistantContent2(output);
|
|
15542
15573
|
const providerError = extractProviderError(providerResponse);
|
|
15543
15574
|
let result;
|
|
@@ -15556,6 +15587,11 @@ async function runBatchEvaluation(options) {
|
|
|
15556
15587
|
agentTimeoutMs,
|
|
15557
15588
|
output,
|
|
15558
15589
|
trace: trace2,
|
|
15590
|
+
costUsd,
|
|
15591
|
+
durationMs,
|
|
15592
|
+
tokenUsage,
|
|
15593
|
+
startTime,
|
|
15594
|
+
endTime,
|
|
15559
15595
|
targetResolver,
|
|
15560
15596
|
availableTargets
|
|
15561
15597
|
});
|
|
@@ -15792,17 +15828,18 @@ async function runEvalCase(options) {
|
|
|
15792
15828
|
}
|
|
15793
15829
|
const output = providerResponse.output;
|
|
15794
15830
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
15795
|
-
const
|
|
15796
|
-
|
|
15797
|
-
toolNames: [],
|
|
15798
|
-
toolCallsByName: {},
|
|
15799
|
-
errorCount: 0
|
|
15800
|
-
} : void 0;
|
|
15801
|
-
const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
15831
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
15832
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
15802
15833
|
tokenUsage: providerResponse.tokenUsage,
|
|
15803
15834
|
costUsd: providerResponse.costUsd,
|
|
15804
15835
|
durationMs: providerResponse.durationMs
|
|
15805
15836
|
}) : void 0;
|
|
15837
|
+
const trace2 = merged?.trace;
|
|
15838
|
+
const costUsd = merged?.costUsd;
|
|
15839
|
+
const durationMs = merged?.durationMs;
|
|
15840
|
+
const tokenUsage = merged?.tokenUsage;
|
|
15841
|
+
const startTime = merged?.startTime;
|
|
15842
|
+
const endTime = merged?.endTime;
|
|
15806
15843
|
const candidate = extractLastAssistantContent2(output);
|
|
15807
15844
|
let fileChanges;
|
|
15808
15845
|
if (baselineCommit && workspacePath) {
|
|
@@ -15847,6 +15884,11 @@ async function runEvalCase(options) {
|
|
|
15847
15884
|
agentTimeoutMs,
|
|
15848
15885
|
output,
|
|
15849
15886
|
trace: trace2,
|
|
15887
|
+
costUsd,
|
|
15888
|
+
durationMs,
|
|
15889
|
+
tokenUsage,
|
|
15890
|
+
startTime,
|
|
15891
|
+
endTime,
|
|
15850
15892
|
targetResolver,
|
|
15851
15893
|
availableTargets,
|
|
15852
15894
|
fileChanges,
|
|
@@ -15903,7 +15945,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
15903
15945
|
};
|
|
15904
15946
|
const result = await runEvalCase(trialOptions);
|
|
15905
15947
|
allResults.push(result);
|
|
15906
|
-
const trialCost = result.
|
|
15948
|
+
const trialCost = result.costUsd;
|
|
15907
15949
|
const trialVerdict = scoreToVerdict(result.score);
|
|
15908
15950
|
const trial = {
|
|
15909
15951
|
attempt,
|
|
@@ -15959,6 +16001,11 @@ async function evaluateCandidate(options) {
|
|
|
15959
16001
|
agentTimeoutMs,
|
|
15960
16002
|
output,
|
|
15961
16003
|
trace: trace2,
|
|
16004
|
+
costUsd,
|
|
16005
|
+
durationMs,
|
|
16006
|
+
tokenUsage,
|
|
16007
|
+
startTime,
|
|
16008
|
+
endTime,
|
|
15962
16009
|
targetResolver,
|
|
15963
16010
|
availableTargets,
|
|
15964
16011
|
fileChanges,
|
|
@@ -15979,6 +16026,11 @@ async function evaluateCandidate(options) {
|
|
|
15979
16026
|
agentTimeoutMs,
|
|
15980
16027
|
output,
|
|
15981
16028
|
trace: trace2,
|
|
16029
|
+
costUsd,
|
|
16030
|
+
durationMs,
|
|
16031
|
+
tokenUsage,
|
|
16032
|
+
startTime,
|
|
16033
|
+
endTime,
|
|
15982
16034
|
targetResolver,
|
|
15983
16035
|
availableTargets,
|
|
15984
16036
|
fileChanges,
|
|
@@ -16022,6 +16074,11 @@ async function evaluateCandidate(options) {
|
|
|
16022
16074
|
answer: candidate,
|
|
16023
16075
|
target: target.name,
|
|
16024
16076
|
reasoning: score.reasoning,
|
|
16077
|
+
tokenUsage,
|
|
16078
|
+
costUsd,
|
|
16079
|
+
durationMs,
|
|
16080
|
+
startTime,
|
|
16081
|
+
endTime,
|
|
16025
16082
|
requests,
|
|
16026
16083
|
input,
|
|
16027
16084
|
scores,
|
|
@@ -16045,6 +16102,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
16045
16102
|
agentTimeoutMs,
|
|
16046
16103
|
output,
|
|
16047
16104
|
trace: trace2,
|
|
16105
|
+
costUsd,
|
|
16106
|
+
durationMs,
|
|
16107
|
+
tokenUsage,
|
|
16108
|
+
startTime,
|
|
16109
|
+
endTime,
|
|
16048
16110
|
targetResolver,
|
|
16049
16111
|
availableTargets,
|
|
16050
16112
|
fileChanges,
|
|
@@ -16066,6 +16128,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
16066
16128
|
agentTimeoutMs,
|
|
16067
16129
|
output,
|
|
16068
16130
|
trace: trace2,
|
|
16131
|
+
costUsd,
|
|
16132
|
+
durationMs,
|
|
16133
|
+
tokenUsage,
|
|
16134
|
+
startTime,
|
|
16135
|
+
endTime,
|
|
16069
16136
|
targetResolver,
|
|
16070
16137
|
availableTargets,
|
|
16071
16138
|
fileChanges,
|
|
@@ -16088,6 +16155,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
16088
16155
|
judgeProvider,
|
|
16089
16156
|
output,
|
|
16090
16157
|
trace: trace2,
|
|
16158
|
+
tokenUsage,
|
|
16159
|
+
costUsd,
|
|
16160
|
+
durationMs,
|
|
16161
|
+
startTime,
|
|
16162
|
+
endTime,
|
|
16091
16163
|
targetResolver,
|
|
16092
16164
|
availableTargets,
|
|
16093
16165
|
fileChanges,
|
|
@@ -16111,6 +16183,11 @@ async function runEvaluatorList(options) {
|
|
|
16111
16183
|
agentTimeoutMs,
|
|
16112
16184
|
output,
|
|
16113
16185
|
trace: trace2,
|
|
16186
|
+
costUsd,
|
|
16187
|
+
durationMs,
|
|
16188
|
+
tokenUsage,
|
|
16189
|
+
startTime,
|
|
16190
|
+
endTime,
|
|
16114
16191
|
targetResolver,
|
|
16115
16192
|
availableTargets,
|
|
16116
16193
|
fileChanges,
|
|
@@ -16129,6 +16206,11 @@ async function runEvaluatorList(options) {
|
|
|
16129
16206
|
judgeProvider,
|
|
16130
16207
|
output,
|
|
16131
16208
|
trace: trace2,
|
|
16209
|
+
tokenUsage,
|
|
16210
|
+
costUsd,
|
|
16211
|
+
durationMs,
|
|
16212
|
+
startTime,
|
|
16213
|
+
endTime,
|
|
16132
16214
|
targetResolver,
|
|
16133
16215
|
availableTargets,
|
|
16134
16216
|
fileChanges,
|
|
@@ -16168,7 +16250,8 @@ async function runEvaluatorList(options) {
|
|
|
16168
16250
|
reasoning: score2.reasoning,
|
|
16169
16251
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
16170
16252
|
details: score2.details,
|
|
16171
|
-
scores: mapChildResults(score2.scores)
|
|
16253
|
+
scores: mapChildResults(score2.scores),
|
|
16254
|
+
tokenUsage: score2.tokenUsage
|
|
16172
16255
|
});
|
|
16173
16256
|
} catch (error) {
|
|
16174
16257
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -16416,7 +16499,8 @@ function mapChildResults(children) {
|
|
|
16416
16499
|
reasoning: child.reasoning,
|
|
16417
16500
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
16418
16501
|
scores: mapChildResults(child.scores),
|
|
16419
|
-
details: child.details
|
|
16502
|
+
details: child.details,
|
|
16503
|
+
tokenUsage: child.tokenUsage
|
|
16420
16504
|
}));
|
|
16421
16505
|
}
|
|
16422
16506
|
function computeWeightedMean(entries) {
|
|
@@ -16796,7 +16880,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
|
16796
16880
|
"beforeEachOutput",
|
|
16797
16881
|
"afterAllOutput",
|
|
16798
16882
|
"afterEachOutput",
|
|
16799
|
-
"fileChanges"
|
|
16883
|
+
"fileChanges",
|
|
16884
|
+
// Promoted execution metrics (debug, not needed for regression comparison)
|
|
16885
|
+
"tokenUsage",
|
|
16886
|
+
"costUsd",
|
|
16887
|
+
"durationMs",
|
|
16888
|
+
"startTime",
|
|
16889
|
+
"endTime"
|
|
16800
16890
|
]);
|
|
16801
16891
|
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
|
|
16802
16892
|
function trimEvaluatorResult(result) {
|
|
@@ -16919,8 +17009,8 @@ var OtelTraceExporter = class {
|
|
|
16919
17009
|
const api = this.api;
|
|
16920
17010
|
const tracer = this.tracer;
|
|
16921
17011
|
const captureContent = this.options.captureContent ?? false;
|
|
16922
|
-
const startHr = toHrTime(result.
|
|
16923
|
-
const endHr = toHrTime(result.
|
|
17012
|
+
const startHr = toHrTime(result.startTime ?? result.timestamp);
|
|
17013
|
+
const endHr = toHrTime(result.endTime ?? result.timestamp);
|
|
16924
17014
|
let parentCtx = api.ROOT_CONTEXT;
|
|
16925
17015
|
const traceparent = process.env.TRACEPARENT;
|
|
16926
17016
|
if (traceparent && this.W3CPropagator) {
|
|
@@ -16949,12 +17039,13 @@ var OtelTraceExporter = class {
|
|
|
16949
17039
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
16950
17040
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
16951
17041
|
if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
|
|
17042
|
+
if (result.durationMs != null)
|
|
17043
|
+
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
17044
|
+
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
16952
17045
|
if (result.trace) {
|
|
16953
17046
|
const t = result.trace;
|
|
16954
17047
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
16955
17048
|
rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
|
|
16956
|
-
if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
|
|
16957
|
-
if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
|
|
16958
17049
|
if (t.llmCallCount != null)
|
|
16959
17050
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
16960
17051
|
}
|