@agentv/core 2.8.0-next.1 → 2.9.0-next.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-P2465XAH.js → chunk-7Q4PH265.js} +1 -1
- package/dist/chunk-7Q4PH265.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +163 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -20
- package/dist/index.d.ts +48 -20
- package/dist/index.js +164 -73
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
- package/dist/chunk-P2465XAH.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-P2465XAH.js";
|
|
20
|
+
} from "./chunk-7Q4PH265.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -83,14 +83,16 @@ function computeTraceSummary(messages) {
|
|
|
83
83
|
}
|
|
84
84
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
85
85
|
return {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
86
|
+
trace: {
|
|
87
|
+
eventCount: totalToolCalls,
|
|
88
|
+
toolNames,
|
|
89
|
+
toolCallsByName: toolCallCounts,
|
|
90
|
+
errorCount: 0,
|
|
91
|
+
llmCallCount,
|
|
92
|
+
...hasAnyDuration ? { toolDurations } : {}
|
|
93
|
+
},
|
|
90
94
|
startTime: earliestStart?.toISOString(),
|
|
91
|
-
endTime: latestEnd?.toISOString()
|
|
92
|
-
llmCallCount,
|
|
93
|
-
...hasAnyDuration ? { toolDurations } : {}
|
|
95
|
+
endTime: latestEnd?.toISOString()
|
|
94
96
|
};
|
|
95
97
|
}
|
|
96
98
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
@@ -113,9 +115,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
|
|
|
113
115
|
);
|
|
114
116
|
return explorationCalls / summary.eventCount;
|
|
115
117
|
}
|
|
116
|
-
function tokensPerTool(summary) {
|
|
117
|
-
if (!
|
|
118
|
-
const totalTokens =
|
|
118
|
+
function tokensPerTool(summary, tokenUsage) {
|
|
119
|
+
if (!tokenUsage || summary.eventCount === 0) return void 0;
|
|
120
|
+
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
119
121
|
return totalTokens / summary.eventCount;
|
|
120
122
|
}
|
|
121
123
|
function avgToolDurationMs(summary) {
|
|
@@ -131,16 +133,15 @@ function avgToolDurationMs(summary) {
|
|
|
131
133
|
if (totalCalls === 0) return void 0;
|
|
132
134
|
return totalDuration / totalCalls;
|
|
133
135
|
}
|
|
134
|
-
function mergeExecutionMetrics(summary, metrics) {
|
|
135
|
-
if (!metrics) return summary;
|
|
136
|
+
function mergeExecutionMetrics(computed, metrics) {
|
|
137
|
+
if (!metrics) return computed;
|
|
136
138
|
return {
|
|
137
|
-
|
|
139
|
+
trace: computed.trace,
|
|
138
140
|
tokenUsage: metrics.tokenUsage,
|
|
139
141
|
costUsd: metrics.costUsd,
|
|
140
142
|
durationMs: metrics.durationMs,
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
endTime: metrics.endTime ?? summary.endTime
|
|
143
|
+
startTime: metrics.startTime ?? computed.startTime,
|
|
144
|
+
endTime: metrics.endTime ?? computed.endTime
|
|
144
145
|
};
|
|
145
146
|
}
|
|
146
147
|
|
|
@@ -3097,10 +3098,13 @@ async function invokeModel(options) {
|
|
|
3097
3098
|
}
|
|
3098
3099
|
function mapResponse(result) {
|
|
3099
3100
|
const content = result.text ?? "";
|
|
3101
|
+
const rawUsage = result.totalUsage ?? result.usage;
|
|
3102
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
3100
3103
|
return {
|
|
3101
3104
|
raw: result,
|
|
3102
|
-
usage: toJsonObject(
|
|
3103
|
-
output: [{ role: "assistant", content }]
|
|
3105
|
+
usage: toJsonObject(rawUsage),
|
|
3106
|
+
output: [{ role: "assistant", content }],
|
|
3107
|
+
tokenUsage
|
|
3104
3108
|
};
|
|
3105
3109
|
}
|
|
3106
3110
|
function toJsonObject(value) {
|
|
@@ -8393,6 +8397,8 @@ async function createTargetProxy(options) {
|
|
|
8393
8397
|
const token = randomBytes(32).toString("hex");
|
|
8394
8398
|
let callCount = 0;
|
|
8395
8399
|
let isShutdown = false;
|
|
8400
|
+
let totalInputTokens = 0;
|
|
8401
|
+
let totalOutputTokens = 0;
|
|
8396
8402
|
const targetsList = availableTargets ?? [defaultProvider.targetName];
|
|
8397
8403
|
function resolveProvider(targetName) {
|
|
8398
8404
|
if (targetName === void 0 || targetName === defaultProvider.targetName) {
|
|
@@ -8471,11 +8477,16 @@ async function createTargetProxy(options) {
|
|
|
8471
8477
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
8472
8478
|
attempt: request.attempt ?? 1
|
|
8473
8479
|
});
|
|
8480
|
+
if (response.tokenUsage) {
|
|
8481
|
+
totalInputTokens += response.tokenUsage.input;
|
|
8482
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
8483
|
+
}
|
|
8474
8484
|
const output = response.output ?? [];
|
|
8475
8485
|
const rawText = extractLastAssistantContent2(output);
|
|
8476
8486
|
const result = {
|
|
8477
8487
|
output,
|
|
8478
|
-
rawText
|
|
8488
|
+
rawText,
|
|
8489
|
+
tokenUsage: response.tokenUsage
|
|
8479
8490
|
};
|
|
8480
8491
|
sendJson(res, 200, result);
|
|
8481
8492
|
} catch (error) {
|
|
@@ -8522,10 +8533,15 @@ async function createTargetProxy(options) {
|
|
|
8522
8533
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
8523
8534
|
attempt: request.attempt ?? 1
|
|
8524
8535
|
});
|
|
8536
|
+
if (response.tokenUsage) {
|
|
8537
|
+
totalInputTokens += response.tokenUsage.input;
|
|
8538
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
8539
|
+
}
|
|
8525
8540
|
const output = response.output ?? [];
|
|
8526
8541
|
responses.push({
|
|
8527
8542
|
output,
|
|
8528
|
-
rawText: extractLastAssistantContent2(output)
|
|
8543
|
+
rawText: extractLastAssistantContent2(output),
|
|
8544
|
+
tokenUsage: response.tokenUsage
|
|
8529
8545
|
});
|
|
8530
8546
|
} catch (error) {
|
|
8531
8547
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -8564,7 +8580,8 @@ async function createTargetProxy(options) {
|
|
|
8564
8580
|
},
|
|
8565
8581
|
getUsageMetadata: () => ({
|
|
8566
8582
|
callCount,
|
|
8567
|
-
maxCalls
|
|
8583
|
+
maxCalls,
|
|
8584
|
+
tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
|
|
8568
8585
|
})
|
|
8569
8586
|
};
|
|
8570
8587
|
}
|
|
@@ -8689,6 +8706,11 @@ var CodeEvaluator = class {
|
|
|
8689
8706
|
),
|
|
8690
8707
|
input: context.evalCase.input,
|
|
8691
8708
|
trace: context.trace ?? null,
|
|
8709
|
+
tokenUsage: context.tokenUsage ?? null,
|
|
8710
|
+
costUsd: context.costUsd ?? null,
|
|
8711
|
+
durationMs: context.durationMs ?? null,
|
|
8712
|
+
startTime: context.startTime ?? null,
|
|
8713
|
+
endTime: context.endTime ?? null,
|
|
8692
8714
|
fileChanges: context.fileChanges ?? null,
|
|
8693
8715
|
workspacePath: context.workspacePath ?? null,
|
|
8694
8716
|
config: this.config ?? null
|
|
@@ -8747,7 +8769,8 @@ var CodeEvaluator = class {
|
|
|
8747
8769
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
8748
8770
|
reasoning,
|
|
8749
8771
|
evaluatorRawRequest,
|
|
8750
|
-
...details ? { details } : {}
|
|
8772
|
+
...details ? { details } : {},
|
|
8773
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
8751
8774
|
};
|
|
8752
8775
|
} catch (error) {
|
|
8753
8776
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -8769,7 +8792,8 @@ var CodeEvaluator = class {
|
|
|
8769
8792
|
}
|
|
8770
8793
|
} : {},
|
|
8771
8794
|
error: message
|
|
8772
|
-
}
|
|
8795
|
+
},
|
|
8796
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
8773
8797
|
};
|
|
8774
8798
|
} finally {
|
|
8775
8799
|
if (proxyShutdown) {
|
|
@@ -8904,7 +8928,7 @@ ${context.fileChanges}`;
|
|
|
8904
8928
|
target: judgeProvider.targetName
|
|
8905
8929
|
};
|
|
8906
8930
|
try {
|
|
8907
|
-
const { data } = await this.runWithRetry({
|
|
8931
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
8908
8932
|
context,
|
|
8909
8933
|
judgeProvider,
|
|
8910
8934
|
systemPrompt,
|
|
@@ -8923,7 +8947,8 @@ ${context.fileChanges}`;
|
|
|
8923
8947
|
misses,
|
|
8924
8948
|
expectedAspectCount,
|
|
8925
8949
|
reasoning,
|
|
8926
|
-
evaluatorRawRequest
|
|
8950
|
+
evaluatorRawRequest,
|
|
8951
|
+
tokenUsage
|
|
8927
8952
|
};
|
|
8928
8953
|
} catch {
|
|
8929
8954
|
return {
|
|
@@ -8953,7 +8978,7 @@ ${context.fileChanges}`;
|
|
|
8953
8978
|
systemPrompt,
|
|
8954
8979
|
target: judgeProvider.targetName
|
|
8955
8980
|
};
|
|
8956
|
-
const { data } = await this.runWithRetry({
|
|
8981
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
8957
8982
|
context,
|
|
8958
8983
|
judgeProvider,
|
|
8959
8984
|
systemPrompt,
|
|
@@ -8968,7 +8993,8 @@ ${context.fileChanges}`;
|
|
|
8968
8993
|
misses,
|
|
8969
8994
|
expectedAspectCount: rubrics.length,
|
|
8970
8995
|
reasoning: data.overall_reasoning,
|
|
8971
|
-
evaluatorRawRequest
|
|
8996
|
+
evaluatorRawRequest,
|
|
8997
|
+
tokenUsage
|
|
8972
8998
|
};
|
|
8973
8999
|
}
|
|
8974
9000
|
/**
|
|
@@ -8983,7 +9009,7 @@ ${context.fileChanges}`;
|
|
|
8983
9009
|
systemPrompt,
|
|
8984
9010
|
target: judgeProvider.targetName
|
|
8985
9011
|
};
|
|
8986
|
-
const { data } = await this.runWithRetry({
|
|
9012
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
8987
9013
|
context,
|
|
8988
9014
|
judgeProvider,
|
|
8989
9015
|
systemPrompt,
|
|
@@ -8999,7 +9025,8 @@ ${context.fileChanges}`;
|
|
|
8999
9025
|
expectedAspectCount: rubrics.length,
|
|
9000
9026
|
reasoning: data.overall_reasoning,
|
|
9001
9027
|
evaluatorRawRequest,
|
|
9002
|
-
details
|
|
9028
|
+
details,
|
|
9029
|
+
tokenUsage
|
|
9003
9030
|
};
|
|
9004
9031
|
}
|
|
9005
9032
|
/**
|
|
@@ -9083,15 +9110,17 @@ ${context.fileChanges}`;
|
|
|
9083
9110
|
try {
|
|
9084
9111
|
const model = judgeProvider.asLanguageModel?.();
|
|
9085
9112
|
if (model) {
|
|
9086
|
-
const
|
|
9113
|
+
const result = await generateText2({
|
|
9087
9114
|
model,
|
|
9088
9115
|
system: systemPrompt,
|
|
9089
9116
|
prompt: userPrompt,
|
|
9090
9117
|
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
9091
9118
|
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
9092
9119
|
});
|
|
9093
|
-
const data2 = schema.parse(parseJsonFromText(text));
|
|
9094
|
-
|
|
9120
|
+
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
9121
|
+
const rawUsage = result.usage;
|
|
9122
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
9123
|
+
return { data: data2, tokenUsage };
|
|
9095
9124
|
}
|
|
9096
9125
|
const response = await judgeProvider.invoke({
|
|
9097
9126
|
question: userPrompt,
|
|
@@ -9102,7 +9131,7 @@ ${context.fileChanges}`;
|
|
|
9102
9131
|
temperature: this.temperature
|
|
9103
9132
|
});
|
|
9104
9133
|
const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
|
|
9105
|
-
return { data, providerResponse: response };
|
|
9134
|
+
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
|
|
9106
9135
|
} catch (e) {
|
|
9107
9136
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
9108
9137
|
}
|
|
@@ -9308,7 +9337,8 @@ var CompositeEvaluator = class {
|
|
|
9308
9337
|
reasoning: member.result.reasoning,
|
|
9309
9338
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
9310
9339
|
scores: member.result.scores,
|
|
9311
|
-
details: member.result.details
|
|
9340
|
+
details: member.result.details,
|
|
9341
|
+
tokenUsage: member.result.tokenUsage
|
|
9312
9342
|
});
|
|
9313
9343
|
}
|
|
9314
9344
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
@@ -9356,7 +9386,8 @@ var CompositeEvaluator = class {
|
|
|
9356
9386
|
reasoning: member.result.reasoning,
|
|
9357
9387
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
9358
9388
|
scores: member.result.scores,
|
|
9359
|
-
details: member.result.details
|
|
9389
|
+
details: member.result.details,
|
|
9390
|
+
tokenUsage: member.result.tokenUsage
|
|
9360
9391
|
});
|
|
9361
9392
|
}
|
|
9362
9393
|
const totalCount = results.length;
|
|
@@ -9535,7 +9566,7 @@ var CostEvaluator = class {
|
|
|
9535
9566
|
}
|
|
9536
9567
|
evaluate(context) {
|
|
9537
9568
|
const { budget } = this.config;
|
|
9538
|
-
const costUsd = context.
|
|
9569
|
+
const costUsd = context.costUsd;
|
|
9539
9570
|
if (costUsd === void 0) {
|
|
9540
9571
|
return {
|
|
9541
9572
|
score: 0,
|
|
@@ -9578,7 +9609,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9578
9609
|
this.config = options.config;
|
|
9579
9610
|
}
|
|
9580
9611
|
evaluate(context) {
|
|
9581
|
-
const { trace } = context;
|
|
9612
|
+
const { trace, tokenUsage, costUsd, durationMs } = context;
|
|
9582
9613
|
const {
|
|
9583
9614
|
max_tool_calls,
|
|
9584
9615
|
max_llm_calls,
|
|
@@ -9588,7 +9619,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9588
9619
|
target_exploration_ratio,
|
|
9589
9620
|
exploration_tolerance = 0.2
|
|
9590
9621
|
} = this.config;
|
|
9591
|
-
|
|
9622
|
+
const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
|
|
9623
|
+
if (needsTrace && !trace) {
|
|
9592
9624
|
return {
|
|
9593
9625
|
score: 0,
|
|
9594
9626
|
verdict: "fail",
|
|
@@ -9603,11 +9635,12 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9603
9635
|
}
|
|
9604
9636
|
};
|
|
9605
9637
|
}
|
|
9638
|
+
const narrowedTrace = trace;
|
|
9606
9639
|
const hits = [];
|
|
9607
9640
|
const misses = [];
|
|
9608
9641
|
const actualMetrics = {};
|
|
9609
|
-
if (max_tool_calls !== void 0) {
|
|
9610
|
-
const toolCalls =
|
|
9642
|
+
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
9643
|
+
const toolCalls = narrowedTrace.eventCount;
|
|
9611
9644
|
actualMetrics.tool_calls = toolCalls;
|
|
9612
9645
|
if (toolCalls <= max_tool_calls) {
|
|
9613
9646
|
hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
|
|
@@ -9615,8 +9648,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9615
9648
|
misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
|
|
9616
9649
|
}
|
|
9617
9650
|
}
|
|
9618
|
-
if (max_llm_calls !== void 0) {
|
|
9619
|
-
const llmCalls =
|
|
9651
|
+
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
9652
|
+
const llmCalls = narrowedTrace.llmCallCount;
|
|
9620
9653
|
if (llmCalls === void 0) {
|
|
9621
9654
|
misses.push("LLM call count data not available");
|
|
9622
9655
|
} else {
|
|
@@ -9629,7 +9662,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9629
9662
|
}
|
|
9630
9663
|
}
|
|
9631
9664
|
if (max_tokens !== void 0) {
|
|
9632
|
-
const tokenUsage = trace.tokenUsage;
|
|
9633
9665
|
if (!tokenUsage) {
|
|
9634
9666
|
misses.push("Token usage data not available");
|
|
9635
9667
|
} else {
|
|
@@ -9643,7 +9675,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9643
9675
|
}
|
|
9644
9676
|
}
|
|
9645
9677
|
if (max_cost_usd !== void 0) {
|
|
9646
|
-
const costUsd = trace.costUsd;
|
|
9647
9678
|
if (costUsd === void 0) {
|
|
9648
9679
|
misses.push("Cost data not available");
|
|
9649
9680
|
} else {
|
|
@@ -9657,7 +9688,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9657
9688
|
}
|
|
9658
9689
|
}
|
|
9659
9690
|
if (max_duration_ms !== void 0) {
|
|
9660
|
-
const durationMs = trace.durationMs;
|
|
9661
9691
|
if (durationMs === void 0) {
|
|
9662
9692
|
misses.push("Duration data not available");
|
|
9663
9693
|
} else {
|
|
@@ -9669,8 +9699,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9669
9699
|
}
|
|
9670
9700
|
}
|
|
9671
9701
|
}
|
|
9672
|
-
if (target_exploration_ratio !== void 0) {
|
|
9673
|
-
const ratio = explorationRatio(
|
|
9702
|
+
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
9703
|
+
const ratio = explorationRatio(narrowedTrace);
|
|
9674
9704
|
if (ratio === void 0) {
|
|
9675
9705
|
misses.push("Exploration ratio not available (no tool calls)");
|
|
9676
9706
|
} else {
|
|
@@ -10184,7 +10214,7 @@ var LatencyEvaluator = class {
|
|
|
10184
10214
|
}
|
|
10185
10215
|
evaluate(context) {
|
|
10186
10216
|
const { threshold } = this.config;
|
|
10187
|
-
const durationMs = context.
|
|
10217
|
+
const durationMs = context.durationMs;
|
|
10188
10218
|
if (durationMs === void 0) {
|
|
10189
10219
|
return {
|
|
10190
10220
|
score: 0,
|
|
@@ -10829,7 +10859,7 @@ var TokenUsageEvaluator = class {
|
|
|
10829
10859
|
this.config = options.config;
|
|
10830
10860
|
}
|
|
10831
10861
|
evaluate(context) {
|
|
10832
|
-
const usage = context.
|
|
10862
|
+
const usage = context.tokenUsage;
|
|
10833
10863
|
const maxTotal = this.config.max_total;
|
|
10834
10864
|
const maxInput = this.config.max_input;
|
|
10835
10865
|
const maxOutput = this.config.max_output;
|
|
@@ -12522,7 +12552,7 @@ async function runEvaluation(options) {
|
|
|
12522
12552
|
caseCost = trialCostSum;
|
|
12523
12553
|
}
|
|
12524
12554
|
} else {
|
|
12525
|
-
caseCost = result.
|
|
12555
|
+
caseCost = result.costUsd;
|
|
12526
12556
|
}
|
|
12527
12557
|
if (caseCost !== void 0) {
|
|
12528
12558
|
cumulativeBudgetCost += caseCost;
|
|
@@ -12683,17 +12713,18 @@ async function runBatchEvaluation(options) {
|
|
|
12683
12713
|
const providerResponse = batchResponse[i];
|
|
12684
12714
|
const output = providerResponse.output;
|
|
12685
12715
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
12686
|
-
const
|
|
12687
|
-
|
|
12688
|
-
toolNames: [],
|
|
12689
|
-
toolCallsByName: {},
|
|
12690
|
-
errorCount: 0
|
|
12691
|
-
} : void 0;
|
|
12692
|
-
const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
12716
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
12717
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
12693
12718
|
tokenUsage: providerResponse.tokenUsage,
|
|
12694
12719
|
costUsd: providerResponse.costUsd,
|
|
12695
12720
|
durationMs: providerResponse.durationMs
|
|
12696
12721
|
}) : void 0;
|
|
12722
|
+
const trace = merged?.trace;
|
|
12723
|
+
const costUsd = merged?.costUsd;
|
|
12724
|
+
const durationMs = merged?.durationMs;
|
|
12725
|
+
const tokenUsage = merged?.tokenUsage;
|
|
12726
|
+
const startTime = merged?.startTime;
|
|
12727
|
+
const endTime = merged?.endTime;
|
|
12697
12728
|
const candidate = extractLastAssistantContent(output);
|
|
12698
12729
|
const providerError = extractProviderError(providerResponse);
|
|
12699
12730
|
let result;
|
|
@@ -12712,6 +12743,11 @@ async function runBatchEvaluation(options) {
|
|
|
12712
12743
|
agentTimeoutMs,
|
|
12713
12744
|
output,
|
|
12714
12745
|
trace,
|
|
12746
|
+
costUsd,
|
|
12747
|
+
durationMs,
|
|
12748
|
+
tokenUsage,
|
|
12749
|
+
startTime,
|
|
12750
|
+
endTime,
|
|
12715
12751
|
targetResolver,
|
|
12716
12752
|
availableTargets
|
|
12717
12753
|
});
|
|
@@ -12948,17 +12984,18 @@ async function runEvalCase(options) {
|
|
|
12948
12984
|
}
|
|
12949
12985
|
const output = providerResponse.output;
|
|
12950
12986
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
12951
|
-
const
|
|
12952
|
-
|
|
12953
|
-
toolNames: [],
|
|
12954
|
-
toolCallsByName: {},
|
|
12955
|
-
errorCount: 0
|
|
12956
|
-
} : void 0;
|
|
12957
|
-
const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
12987
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
12988
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
12958
12989
|
tokenUsage: providerResponse.tokenUsage,
|
|
12959
12990
|
costUsd: providerResponse.costUsd,
|
|
12960
12991
|
durationMs: providerResponse.durationMs
|
|
12961
12992
|
}) : void 0;
|
|
12993
|
+
const trace = merged?.trace;
|
|
12994
|
+
const costUsd = merged?.costUsd;
|
|
12995
|
+
const durationMs = merged?.durationMs;
|
|
12996
|
+
const tokenUsage = merged?.tokenUsage;
|
|
12997
|
+
const startTime = merged?.startTime;
|
|
12998
|
+
const endTime = merged?.endTime;
|
|
12962
12999
|
const candidate = extractLastAssistantContent(output);
|
|
12963
13000
|
let fileChanges;
|
|
12964
13001
|
if (baselineCommit && workspacePath) {
|
|
@@ -13003,6 +13040,11 @@ async function runEvalCase(options) {
|
|
|
13003
13040
|
agentTimeoutMs,
|
|
13004
13041
|
output,
|
|
13005
13042
|
trace,
|
|
13043
|
+
costUsd,
|
|
13044
|
+
durationMs,
|
|
13045
|
+
tokenUsage,
|
|
13046
|
+
startTime,
|
|
13047
|
+
endTime,
|
|
13006
13048
|
targetResolver,
|
|
13007
13049
|
availableTargets,
|
|
13008
13050
|
fileChanges,
|
|
@@ -13059,7 +13101,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
13059
13101
|
};
|
|
13060
13102
|
const result = await runEvalCase(trialOptions);
|
|
13061
13103
|
allResults.push(result);
|
|
13062
|
-
const trialCost = result.
|
|
13104
|
+
const trialCost = result.costUsd;
|
|
13063
13105
|
const trialVerdict = scoreToVerdict(result.score);
|
|
13064
13106
|
const trial = {
|
|
13065
13107
|
attempt,
|
|
@@ -13115,6 +13157,11 @@ async function evaluateCandidate(options) {
|
|
|
13115
13157
|
agentTimeoutMs,
|
|
13116
13158
|
output,
|
|
13117
13159
|
trace,
|
|
13160
|
+
costUsd,
|
|
13161
|
+
durationMs,
|
|
13162
|
+
tokenUsage,
|
|
13163
|
+
startTime,
|
|
13164
|
+
endTime,
|
|
13118
13165
|
targetResolver,
|
|
13119
13166
|
availableTargets,
|
|
13120
13167
|
fileChanges,
|
|
@@ -13135,6 +13182,11 @@ async function evaluateCandidate(options) {
|
|
|
13135
13182
|
agentTimeoutMs,
|
|
13136
13183
|
output,
|
|
13137
13184
|
trace,
|
|
13185
|
+
costUsd,
|
|
13186
|
+
durationMs,
|
|
13187
|
+
tokenUsage,
|
|
13188
|
+
startTime,
|
|
13189
|
+
endTime,
|
|
13138
13190
|
targetResolver,
|
|
13139
13191
|
availableTargets,
|
|
13140
13192
|
fileChanges,
|
|
@@ -13178,6 +13230,11 @@ async function evaluateCandidate(options) {
|
|
|
13178
13230
|
answer: candidate,
|
|
13179
13231
|
target: target.name,
|
|
13180
13232
|
reasoning: score.reasoning,
|
|
13233
|
+
tokenUsage,
|
|
13234
|
+
costUsd,
|
|
13235
|
+
durationMs,
|
|
13236
|
+
startTime,
|
|
13237
|
+
endTime,
|
|
13181
13238
|
requests,
|
|
13182
13239
|
input,
|
|
13183
13240
|
scores,
|
|
@@ -13201,6 +13258,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
13201
13258
|
agentTimeoutMs,
|
|
13202
13259
|
output,
|
|
13203
13260
|
trace,
|
|
13261
|
+
costUsd,
|
|
13262
|
+
durationMs,
|
|
13263
|
+
tokenUsage,
|
|
13264
|
+
startTime,
|
|
13265
|
+
endTime,
|
|
13204
13266
|
targetResolver,
|
|
13205
13267
|
availableTargets,
|
|
13206
13268
|
fileChanges,
|
|
@@ -13222,6 +13284,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
13222
13284
|
agentTimeoutMs,
|
|
13223
13285
|
output,
|
|
13224
13286
|
trace,
|
|
13287
|
+
costUsd,
|
|
13288
|
+
durationMs,
|
|
13289
|
+
tokenUsage,
|
|
13290
|
+
startTime,
|
|
13291
|
+
endTime,
|
|
13225
13292
|
targetResolver,
|
|
13226
13293
|
availableTargets,
|
|
13227
13294
|
fileChanges,
|
|
@@ -13244,6 +13311,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
13244
13311
|
judgeProvider,
|
|
13245
13312
|
output,
|
|
13246
13313
|
trace,
|
|
13314
|
+
tokenUsage,
|
|
13315
|
+
costUsd,
|
|
13316
|
+
durationMs,
|
|
13317
|
+
startTime,
|
|
13318
|
+
endTime,
|
|
13247
13319
|
targetResolver,
|
|
13248
13320
|
availableTargets,
|
|
13249
13321
|
fileChanges,
|
|
@@ -13267,6 +13339,11 @@ async function runEvaluatorList(options) {
|
|
|
13267
13339
|
agentTimeoutMs,
|
|
13268
13340
|
output,
|
|
13269
13341
|
trace,
|
|
13342
|
+
costUsd,
|
|
13343
|
+
durationMs,
|
|
13344
|
+
tokenUsage,
|
|
13345
|
+
startTime,
|
|
13346
|
+
endTime,
|
|
13270
13347
|
targetResolver,
|
|
13271
13348
|
availableTargets,
|
|
13272
13349
|
fileChanges,
|
|
@@ -13285,6 +13362,11 @@ async function runEvaluatorList(options) {
|
|
|
13285
13362
|
judgeProvider,
|
|
13286
13363
|
output,
|
|
13287
13364
|
trace,
|
|
13365
|
+
tokenUsage,
|
|
13366
|
+
costUsd,
|
|
13367
|
+
durationMs,
|
|
13368
|
+
startTime,
|
|
13369
|
+
endTime,
|
|
13288
13370
|
targetResolver,
|
|
13289
13371
|
availableTargets,
|
|
13290
13372
|
fileChanges,
|
|
@@ -13324,7 +13406,8 @@ async function runEvaluatorList(options) {
|
|
|
13324
13406
|
reasoning: score2.reasoning,
|
|
13325
13407
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
13326
13408
|
details: score2.details,
|
|
13327
|
-
scores: mapChildResults(score2.scores)
|
|
13409
|
+
scores: mapChildResults(score2.scores),
|
|
13410
|
+
tokenUsage: score2.tokenUsage
|
|
13328
13411
|
});
|
|
13329
13412
|
} catch (error) {
|
|
13330
13413
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -13572,7 +13655,8 @@ function mapChildResults(children) {
|
|
|
13572
13655
|
reasoning: child.reasoning,
|
|
13573
13656
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
13574
13657
|
scores: mapChildResults(child.scores),
|
|
13575
|
-
details: child.details
|
|
13658
|
+
details: child.details,
|
|
13659
|
+
tokenUsage: child.tokenUsage
|
|
13576
13660
|
}));
|
|
13577
13661
|
}
|
|
13578
13662
|
function computeWeightedMean(entries) {
|
|
@@ -13952,7 +14036,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
|
13952
14036
|
"beforeEachOutput",
|
|
13953
14037
|
"afterAllOutput",
|
|
13954
14038
|
"afterEachOutput",
|
|
13955
|
-
"fileChanges"
|
|
14039
|
+
"fileChanges",
|
|
14040
|
+
// Promoted execution metrics (debug, not needed for regression comparison)
|
|
14041
|
+
"tokenUsage",
|
|
14042
|
+
"costUsd",
|
|
14043
|
+
"durationMs",
|
|
14044
|
+
"startTime",
|
|
14045
|
+
"endTime"
|
|
13956
14046
|
]);
|
|
13957
14047
|
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
|
|
13958
14048
|
function trimEvaluatorResult(result) {
|
|
@@ -14075,8 +14165,8 @@ var OtelTraceExporter = class {
|
|
|
14075
14165
|
const api = this.api;
|
|
14076
14166
|
const tracer = this.tracer;
|
|
14077
14167
|
const captureContent = this.options.captureContent ?? false;
|
|
14078
|
-
const startHr = toHrTime(result.
|
|
14079
|
-
const endHr = toHrTime(result.
|
|
14168
|
+
const startHr = toHrTime(result.startTime ?? result.timestamp);
|
|
14169
|
+
const endHr = toHrTime(result.endTime ?? result.timestamp);
|
|
14080
14170
|
let parentCtx = api.ROOT_CONTEXT;
|
|
14081
14171
|
const traceparent = process.env.TRACEPARENT;
|
|
14082
14172
|
if (traceparent && this.W3CPropagator) {
|
|
@@ -14105,12 +14195,13 @@ var OtelTraceExporter = class {
|
|
|
14105
14195
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
14106
14196
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
14107
14197
|
if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
|
|
14198
|
+
if (result.durationMs != null)
|
|
14199
|
+
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
14200
|
+
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
14108
14201
|
if (result.trace) {
|
|
14109
14202
|
const t = result.trace;
|
|
14110
14203
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
14111
14204
|
rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
|
|
14112
|
-
if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
|
|
14113
|
-
if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
|
|
14114
14205
|
if (t.llmCallCount != null)
|
|
14115
14206
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
14116
14207
|
}
|