@agentv/core 2.7.1-next.6 → 2.9.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5SV2QC6V.js → chunk-7Q4PH265.js} +6 -18
- package/dist/chunk-7Q4PH265.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +4 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +2 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +234 -89
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -22
- package/dist/index.d.ts +54 -22
- package/dist/index.js +230 -73
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
- package/dist/chunk-5SV2QC6V.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-5SV2QC6V.js";
|
|
20
|
+
} from "./chunk-7Q4PH265.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -83,14 +83,16 @@ function computeTraceSummary(messages) {
|
|
|
83
83
|
}
|
|
84
84
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
85
85
|
return {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
86
|
+
trace: {
|
|
87
|
+
eventCount: totalToolCalls,
|
|
88
|
+
toolNames,
|
|
89
|
+
toolCallsByName: toolCallCounts,
|
|
90
|
+
errorCount: 0,
|
|
91
|
+
llmCallCount,
|
|
92
|
+
...hasAnyDuration ? { toolDurations } : {}
|
|
93
|
+
},
|
|
90
94
|
startTime: earliestStart?.toISOString(),
|
|
91
|
-
endTime: latestEnd?.toISOString()
|
|
92
|
-
llmCallCount,
|
|
93
|
-
...hasAnyDuration ? { toolDurations } : {}
|
|
95
|
+
endTime: latestEnd?.toISOString()
|
|
94
96
|
};
|
|
95
97
|
}
|
|
96
98
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
@@ -113,9 +115,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
|
|
|
113
115
|
);
|
|
114
116
|
return explorationCalls / summary.eventCount;
|
|
115
117
|
}
|
|
116
|
-
function tokensPerTool(summary) {
|
|
117
|
-
if (!summary.tokenUsage || summary.eventCount === 0) return void 0;
|
|
118
|
-
const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output;
|
|
118
|
+
function tokensPerTool(summary, tokenUsage) {
|
|
119
|
+
if (!tokenUsage || summary.eventCount === 0) return void 0;
|
|
120
|
+
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
119
121
|
return totalTokens / summary.eventCount;
|
|
120
122
|
}
|
|
121
123
|
function avgToolDurationMs(summary) {
|
|
@@ -131,16 +133,15 @@ function avgToolDurationMs(summary) {
|
|
|
131
133
|
if (totalCalls === 0) return void 0;
|
|
132
134
|
return totalDuration / totalCalls;
|
|
133
135
|
}
|
|
134
|
-
function mergeExecutionMetrics(summary, metrics) {
|
|
135
|
-
if (!metrics) return summary;
|
|
136
|
+
function mergeExecutionMetrics(computed, metrics) {
|
|
137
|
+
if (!metrics) return computed;
|
|
136
138
|
return {
|
|
137
|
-
|
|
139
|
+
trace: computed.trace,
|
|
138
140
|
tokenUsage: metrics.tokenUsage,
|
|
139
141
|
costUsd: metrics.costUsd,
|
|
140
142
|
durationMs: metrics.durationMs,
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
endTime: metrics.endTime ?? summary.endTime
|
|
143
|
+
startTime: metrics.startTime ?? computed.startTime,
|
|
144
|
+
endTime: metrics.endTime ?? computed.endTime
|
|
144
145
|
};
|
|
145
146
|
}
|
|
146
147
|
|
|
@@ -538,6 +539,24 @@ function extractCacheConfig(suite) {
|
|
|
538
539
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
539
540
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
540
541
|
}
|
|
542
|
+
function extractTotalBudgetUsd(suite) {
|
|
543
|
+
const execution = suite.execution;
|
|
544
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
545
|
+
return void 0;
|
|
546
|
+
}
|
|
547
|
+
const executionObj = execution;
|
|
548
|
+
const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
|
|
549
|
+
if (rawBudget === void 0 || rawBudget === null) {
|
|
550
|
+
return void 0;
|
|
551
|
+
}
|
|
552
|
+
if (typeof rawBudget === "number" && rawBudget > 0) {
|
|
553
|
+
return rawBudget;
|
|
554
|
+
}
|
|
555
|
+
logWarning(
|
|
556
|
+
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
|
|
557
|
+
);
|
|
558
|
+
return void 0;
|
|
559
|
+
}
|
|
541
560
|
function logWarning(message) {
|
|
542
561
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
543
562
|
}
|
|
@@ -2595,6 +2614,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2595
2614
|
trials: extractTrialsConfig(parsed),
|
|
2596
2615
|
targets: extractTargetsFromSuite(parsed),
|
|
2597
2616
|
cacheConfig: extractCacheConfig(parsed),
|
|
2617
|
+
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2598
2618
|
...metadata !== void 0 && { metadata }
|
|
2599
2619
|
};
|
|
2600
2620
|
}
|
|
@@ -3078,10 +3098,13 @@ async function invokeModel(options) {
|
|
|
3078
3098
|
}
|
|
3079
3099
|
function mapResponse(result) {
|
|
3080
3100
|
const content = result.text ?? "";
|
|
3101
|
+
const rawUsage = result.totalUsage ?? result.usage;
|
|
3102
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
3081
3103
|
return {
|
|
3082
3104
|
raw: result,
|
|
3083
|
-
usage: toJsonObject(
|
|
3084
|
-
output: [{ role: "assistant", content }]
|
|
3105
|
+
usage: toJsonObject(rawUsage),
|
|
3106
|
+
output: [{ role: "assistant", content }],
|
|
3107
|
+
tokenUsage
|
|
3085
3108
|
};
|
|
3086
3109
|
}
|
|
3087
3110
|
function toJsonObject(value) {
|
|
@@ -8374,6 +8397,8 @@ async function createTargetProxy(options) {
|
|
|
8374
8397
|
const token = randomBytes(32).toString("hex");
|
|
8375
8398
|
let callCount = 0;
|
|
8376
8399
|
let isShutdown = false;
|
|
8400
|
+
let totalInputTokens = 0;
|
|
8401
|
+
let totalOutputTokens = 0;
|
|
8377
8402
|
const targetsList = availableTargets ?? [defaultProvider.targetName];
|
|
8378
8403
|
function resolveProvider(targetName) {
|
|
8379
8404
|
if (targetName === void 0 || targetName === defaultProvider.targetName) {
|
|
@@ -8452,11 +8477,16 @@ async function createTargetProxy(options) {
|
|
|
8452
8477
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
8453
8478
|
attempt: request.attempt ?? 1
|
|
8454
8479
|
});
|
|
8480
|
+
if (response.tokenUsage) {
|
|
8481
|
+
totalInputTokens += response.tokenUsage.input;
|
|
8482
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
8483
|
+
}
|
|
8455
8484
|
const output = response.output ?? [];
|
|
8456
8485
|
const rawText = extractLastAssistantContent2(output);
|
|
8457
8486
|
const result = {
|
|
8458
8487
|
output,
|
|
8459
|
-
rawText
|
|
8488
|
+
rawText,
|
|
8489
|
+
tokenUsage: response.tokenUsage
|
|
8460
8490
|
};
|
|
8461
8491
|
sendJson(res, 200, result);
|
|
8462
8492
|
} catch (error) {
|
|
@@ -8503,10 +8533,15 @@ async function createTargetProxy(options) {
|
|
|
8503
8533
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
8504
8534
|
attempt: request.attempt ?? 1
|
|
8505
8535
|
});
|
|
8536
|
+
if (response.tokenUsage) {
|
|
8537
|
+
totalInputTokens += response.tokenUsage.input;
|
|
8538
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
8539
|
+
}
|
|
8506
8540
|
const output = response.output ?? [];
|
|
8507
8541
|
responses.push({
|
|
8508
8542
|
output,
|
|
8509
|
-
rawText: extractLastAssistantContent2(output)
|
|
8543
|
+
rawText: extractLastAssistantContent2(output),
|
|
8544
|
+
tokenUsage: response.tokenUsage
|
|
8510
8545
|
});
|
|
8511
8546
|
} catch (error) {
|
|
8512
8547
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -8545,7 +8580,8 @@ async function createTargetProxy(options) {
|
|
|
8545
8580
|
},
|
|
8546
8581
|
getUsageMetadata: () => ({
|
|
8547
8582
|
callCount,
|
|
8548
|
-
maxCalls
|
|
8583
|
+
maxCalls,
|
|
8584
|
+
tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
|
|
8549
8585
|
})
|
|
8550
8586
|
};
|
|
8551
8587
|
}
|
|
@@ -8670,6 +8706,11 @@ var CodeEvaluator = class {
|
|
|
8670
8706
|
),
|
|
8671
8707
|
input: context.evalCase.input,
|
|
8672
8708
|
trace: context.trace ?? null,
|
|
8709
|
+
tokenUsage: context.tokenUsage ?? null,
|
|
8710
|
+
costUsd: context.costUsd ?? null,
|
|
8711
|
+
durationMs: context.durationMs ?? null,
|
|
8712
|
+
startTime: context.startTime ?? null,
|
|
8713
|
+
endTime: context.endTime ?? null,
|
|
8673
8714
|
fileChanges: context.fileChanges ?? null,
|
|
8674
8715
|
workspacePath: context.workspacePath ?? null,
|
|
8675
8716
|
config: this.config ?? null
|
|
@@ -8728,7 +8769,8 @@ var CodeEvaluator = class {
|
|
|
8728
8769
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
8729
8770
|
reasoning,
|
|
8730
8771
|
evaluatorRawRequest,
|
|
8731
|
-
...details ? { details } : {}
|
|
8772
|
+
...details ? { details } : {},
|
|
8773
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
8732
8774
|
};
|
|
8733
8775
|
} catch (error) {
|
|
8734
8776
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -8750,7 +8792,8 @@ var CodeEvaluator = class {
|
|
|
8750
8792
|
}
|
|
8751
8793
|
} : {},
|
|
8752
8794
|
error: message
|
|
8753
|
-
}
|
|
8795
|
+
},
|
|
8796
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
8754
8797
|
};
|
|
8755
8798
|
} finally {
|
|
8756
8799
|
if (proxyShutdown) {
|
|
@@ -8885,7 +8928,7 @@ ${context.fileChanges}`;
|
|
|
8885
8928
|
target: judgeProvider.targetName
|
|
8886
8929
|
};
|
|
8887
8930
|
try {
|
|
8888
|
-
const { data } = await this.runWithRetry({
|
|
8931
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
8889
8932
|
context,
|
|
8890
8933
|
judgeProvider,
|
|
8891
8934
|
systemPrompt,
|
|
@@ -8904,7 +8947,8 @@ ${context.fileChanges}`;
|
|
|
8904
8947
|
misses,
|
|
8905
8948
|
expectedAspectCount,
|
|
8906
8949
|
reasoning,
|
|
8907
|
-
evaluatorRawRequest
|
|
8950
|
+
evaluatorRawRequest,
|
|
8951
|
+
tokenUsage
|
|
8908
8952
|
};
|
|
8909
8953
|
} catch {
|
|
8910
8954
|
return {
|
|
@@ -8934,7 +8978,7 @@ ${context.fileChanges}`;
|
|
|
8934
8978
|
systemPrompt,
|
|
8935
8979
|
target: judgeProvider.targetName
|
|
8936
8980
|
};
|
|
8937
|
-
const { data } = await this.runWithRetry({
|
|
8981
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
8938
8982
|
context,
|
|
8939
8983
|
judgeProvider,
|
|
8940
8984
|
systemPrompt,
|
|
@@ -8949,7 +8993,8 @@ ${context.fileChanges}`;
|
|
|
8949
8993
|
misses,
|
|
8950
8994
|
expectedAspectCount: rubrics.length,
|
|
8951
8995
|
reasoning: data.overall_reasoning,
|
|
8952
|
-
evaluatorRawRequest
|
|
8996
|
+
evaluatorRawRequest,
|
|
8997
|
+
tokenUsage
|
|
8953
8998
|
};
|
|
8954
8999
|
}
|
|
8955
9000
|
/**
|
|
@@ -8964,7 +9009,7 @@ ${context.fileChanges}`;
|
|
|
8964
9009
|
systemPrompt,
|
|
8965
9010
|
target: judgeProvider.targetName
|
|
8966
9011
|
};
|
|
8967
|
-
const { data } = await this.runWithRetry({
|
|
9012
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
8968
9013
|
context,
|
|
8969
9014
|
judgeProvider,
|
|
8970
9015
|
systemPrompt,
|
|
@@ -8980,7 +9025,8 @@ ${context.fileChanges}`;
|
|
|
8980
9025
|
expectedAspectCount: rubrics.length,
|
|
8981
9026
|
reasoning: data.overall_reasoning,
|
|
8982
9027
|
evaluatorRawRequest,
|
|
8983
|
-
details
|
|
9028
|
+
details,
|
|
9029
|
+
tokenUsage
|
|
8984
9030
|
};
|
|
8985
9031
|
}
|
|
8986
9032
|
/**
|
|
@@ -9064,15 +9110,17 @@ ${context.fileChanges}`;
|
|
|
9064
9110
|
try {
|
|
9065
9111
|
const model = judgeProvider.asLanguageModel?.();
|
|
9066
9112
|
if (model) {
|
|
9067
|
-
const { text } = await generateText2({
|
|
9113
|
+
const result = await generateText2({
|
|
9068
9114
|
model,
|
|
9069
9115
|
system: systemPrompt,
|
|
9070
9116
|
prompt: userPrompt,
|
|
9071
9117
|
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
9072
9118
|
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
9073
9119
|
});
|
|
9074
|
-
const data2 = schema.parse(parseJsonFromText(text));
|
|
9075
|
-
|
|
9120
|
+
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
9121
|
+
const rawUsage = result.usage;
|
|
9122
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
9123
|
+
return { data: data2, tokenUsage };
|
|
9076
9124
|
}
|
|
9077
9125
|
const response = await judgeProvider.invoke({
|
|
9078
9126
|
question: userPrompt,
|
|
@@ -9083,7 +9131,7 @@ ${context.fileChanges}`;
|
|
|
9083
9131
|
temperature: this.temperature
|
|
9084
9132
|
});
|
|
9085
9133
|
const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
|
|
9086
|
-
return { data, providerResponse: response };
|
|
9134
|
+
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
|
|
9087
9135
|
} catch (e) {
|
|
9088
9136
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
9089
9137
|
}
|
|
@@ -9289,7 +9337,8 @@ var CompositeEvaluator = class {
|
|
|
9289
9337
|
reasoning: member.result.reasoning,
|
|
9290
9338
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
9291
9339
|
scores: member.result.scores,
|
|
9292
|
-
details: member.result.details
|
|
9340
|
+
details: member.result.details,
|
|
9341
|
+
tokenUsage: member.result.tokenUsage
|
|
9293
9342
|
});
|
|
9294
9343
|
}
|
|
9295
9344
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
@@ -9337,7 +9386,8 @@ var CompositeEvaluator = class {
|
|
|
9337
9386
|
reasoning: member.result.reasoning,
|
|
9338
9387
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
9339
9388
|
scores: member.result.scores,
|
|
9340
|
-
details: member.result.details
|
|
9389
|
+
details: member.result.details,
|
|
9390
|
+
tokenUsage: member.result.tokenUsage
|
|
9341
9391
|
});
|
|
9342
9392
|
}
|
|
9343
9393
|
const totalCount = results.length;
|
|
@@ -9516,7 +9566,7 @@ var CostEvaluator = class {
|
|
|
9516
9566
|
}
|
|
9517
9567
|
evaluate(context) {
|
|
9518
9568
|
const { budget } = this.config;
|
|
9519
|
-
const costUsd = context.
|
|
9569
|
+
const costUsd = context.costUsd;
|
|
9520
9570
|
if (costUsd === void 0) {
|
|
9521
9571
|
return {
|
|
9522
9572
|
score: 0,
|
|
@@ -9559,7 +9609,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9559
9609
|
this.config = options.config;
|
|
9560
9610
|
}
|
|
9561
9611
|
evaluate(context) {
|
|
9562
|
-
const { trace } = context;
|
|
9612
|
+
const { trace, tokenUsage, costUsd, durationMs } = context;
|
|
9563
9613
|
const {
|
|
9564
9614
|
max_tool_calls,
|
|
9565
9615
|
max_llm_calls,
|
|
@@ -9569,7 +9619,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9569
9619
|
target_exploration_ratio,
|
|
9570
9620
|
exploration_tolerance = 0.2
|
|
9571
9621
|
} = this.config;
|
|
9572
|
-
|
|
9622
|
+
const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
|
|
9623
|
+
if (needsTrace && !trace) {
|
|
9573
9624
|
return {
|
|
9574
9625
|
score: 0,
|
|
9575
9626
|
verdict: "fail",
|
|
@@ -9584,11 +9635,12 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9584
9635
|
}
|
|
9585
9636
|
};
|
|
9586
9637
|
}
|
|
9638
|
+
const narrowedTrace = trace;
|
|
9587
9639
|
const hits = [];
|
|
9588
9640
|
const misses = [];
|
|
9589
9641
|
const actualMetrics = {};
|
|
9590
|
-
if (max_tool_calls !== void 0) {
|
|
9591
|
-
const toolCalls = trace.eventCount;
|
|
9642
|
+
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
9643
|
+
const toolCalls = narrowedTrace.eventCount;
|
|
9592
9644
|
actualMetrics.tool_calls = toolCalls;
|
|
9593
9645
|
if (toolCalls <= max_tool_calls) {
|
|
9594
9646
|
hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
|
|
@@ -9596,8 +9648,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9596
9648
|
misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
|
|
9597
9649
|
}
|
|
9598
9650
|
}
|
|
9599
|
-
if (max_llm_calls !== void 0) {
|
|
9600
|
-
const llmCalls = trace.llmCallCount;
|
|
9651
|
+
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
9652
|
+
const llmCalls = narrowedTrace.llmCallCount;
|
|
9601
9653
|
if (llmCalls === void 0) {
|
|
9602
9654
|
misses.push("LLM call count data not available");
|
|
9603
9655
|
} else {
|
|
@@ -9610,7 +9662,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9610
9662
|
}
|
|
9611
9663
|
}
|
|
9612
9664
|
if (max_tokens !== void 0) {
|
|
9613
|
-
const tokenUsage = trace.tokenUsage;
|
|
9614
9665
|
if (!tokenUsage) {
|
|
9615
9666
|
misses.push("Token usage data not available");
|
|
9616
9667
|
} else {
|
|
@@ -9624,7 +9675,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9624
9675
|
}
|
|
9625
9676
|
}
|
|
9626
9677
|
if (max_cost_usd !== void 0) {
|
|
9627
|
-
const costUsd = trace.costUsd;
|
|
9628
9678
|
if (costUsd === void 0) {
|
|
9629
9679
|
misses.push("Cost data not available");
|
|
9630
9680
|
} else {
|
|
@@ -9638,7 +9688,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9638
9688
|
}
|
|
9639
9689
|
}
|
|
9640
9690
|
if (max_duration_ms !== void 0) {
|
|
9641
|
-
const durationMs = trace.durationMs;
|
|
9642
9691
|
if (durationMs === void 0) {
|
|
9643
9692
|
misses.push("Duration data not available");
|
|
9644
9693
|
} else {
|
|
@@ -9650,8 +9699,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
9650
9699
|
}
|
|
9651
9700
|
}
|
|
9652
9701
|
}
|
|
9653
|
-
if (target_exploration_ratio !== void 0) {
|
|
9654
|
-
const ratio = explorationRatio(trace);
|
|
9702
|
+
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
9703
|
+
const ratio = explorationRatio(narrowedTrace);
|
|
9655
9704
|
if (ratio === void 0) {
|
|
9656
9705
|
misses.push("Exploration ratio not available (no tool calls)");
|
|
9657
9706
|
} else {
|
|
@@ -10165,7 +10214,7 @@ var LatencyEvaluator = class {
|
|
|
10165
10214
|
}
|
|
10166
10215
|
evaluate(context) {
|
|
10167
10216
|
const { threshold } = this.config;
|
|
10168
|
-
const durationMs = context.
|
|
10217
|
+
const durationMs = context.durationMs;
|
|
10169
10218
|
if (durationMs === void 0) {
|
|
10170
10219
|
return {
|
|
10171
10220
|
score: 0,
|
|
@@ -10810,7 +10859,7 @@ var TokenUsageEvaluator = class {
|
|
|
10810
10859
|
this.config = options.config;
|
|
10811
10860
|
}
|
|
10812
10861
|
evaluate(context) {
|
|
10813
|
-
const usage = context.
|
|
10862
|
+
const usage = context.tokenUsage;
|
|
10814
10863
|
const maxTotal = this.config.max_total;
|
|
10815
10864
|
const maxInput = this.config.max_input;
|
|
10816
10865
|
const maxOutput = this.config.max_output;
|
|
@@ -12255,7 +12304,8 @@ async function runEvaluation(options) {
|
|
|
12255
12304
|
keepWorkspaces,
|
|
12256
12305
|
cleanupWorkspaces,
|
|
12257
12306
|
trials,
|
|
12258
|
-
streamCallbacks
|
|
12307
|
+
streamCallbacks,
|
|
12308
|
+
totalBudgetUsd
|
|
12259
12309
|
} = options;
|
|
12260
12310
|
let useCache = options.useCache;
|
|
12261
12311
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -12428,10 +12478,39 @@ async function runEvaluation(options) {
|
|
|
12428
12478
|
let nextWorkerId = 1;
|
|
12429
12479
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
12430
12480
|
let beforeAllOutputAttached = false;
|
|
12481
|
+
let cumulativeBudgetCost = 0;
|
|
12482
|
+
let budgetExhausted = false;
|
|
12431
12483
|
const promises = filteredEvalCases.map(
|
|
12432
12484
|
(evalCase) => limit(async () => {
|
|
12433
12485
|
const workerId = nextWorkerId++;
|
|
12434
12486
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
12487
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
12488
|
+
const budgetResult = {
|
|
12489
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
12490
|
+
testId: evalCase.id,
|
|
12491
|
+
dataset: evalCase.dataset,
|
|
12492
|
+
score: 0,
|
|
12493
|
+
hits: [],
|
|
12494
|
+
misses: [],
|
|
12495
|
+
answer: "",
|
|
12496
|
+
target: target.name,
|
|
12497
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
12498
|
+
budgetExceeded: true
|
|
12499
|
+
};
|
|
12500
|
+
if (onProgress) {
|
|
12501
|
+
await onProgress({
|
|
12502
|
+
workerId,
|
|
12503
|
+
testId: evalCase.id,
|
|
12504
|
+
status: "failed",
|
|
12505
|
+
completedAt: Date.now(),
|
|
12506
|
+
error: budgetResult.error
|
|
12507
|
+
});
|
|
12508
|
+
}
|
|
12509
|
+
if (onResult) {
|
|
12510
|
+
await onResult(budgetResult);
|
|
12511
|
+
}
|
|
12512
|
+
return budgetResult;
|
|
12513
|
+
}
|
|
12435
12514
|
if (onProgress) {
|
|
12436
12515
|
await onProgress({
|
|
12437
12516
|
workerId,
|
|
@@ -12465,6 +12544,23 @@ async function runEvaluation(options) {
|
|
|
12465
12544
|
typeRegistry
|
|
12466
12545
|
};
|
|
12467
12546
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
12547
|
+
if (totalBudgetUsd !== void 0) {
|
|
12548
|
+
let caseCost;
|
|
12549
|
+
if (result.trials && result.trials.length > 0) {
|
|
12550
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
12551
|
+
if (trialCostSum > 0) {
|
|
12552
|
+
caseCost = trialCostSum;
|
|
12553
|
+
}
|
|
12554
|
+
} else {
|
|
12555
|
+
caseCost = result.costUsd;
|
|
12556
|
+
}
|
|
12557
|
+
if (caseCost !== void 0) {
|
|
12558
|
+
cumulativeBudgetCost += caseCost;
|
|
12559
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
12560
|
+
budgetExhausted = true;
|
|
12561
|
+
}
|
|
12562
|
+
}
|
|
12563
|
+
}
|
|
12468
12564
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
12469
12565
|
result = { ...result, beforeAllOutput };
|
|
12470
12566
|
beforeAllOutputAttached = true;
|
|
@@ -12617,17 +12713,18 @@ async function runBatchEvaluation(options) {
|
|
|
12617
12713
|
const providerResponse = batchResponse[i];
|
|
12618
12714
|
const output = providerResponse.output;
|
|
12619
12715
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
12620
|
-
const
|
|
12621
|
-
|
|
12622
|
-
toolNames: [],
|
|
12623
|
-
toolCallsByName: {},
|
|
12624
|
-
errorCount: 0
|
|
12625
|
-
} : void 0;
|
|
12626
|
-
const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
12716
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
12717
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
12627
12718
|
tokenUsage: providerResponse.tokenUsage,
|
|
12628
12719
|
costUsd: providerResponse.costUsd,
|
|
12629
12720
|
durationMs: providerResponse.durationMs
|
|
12630
12721
|
}) : void 0;
|
|
12722
|
+
const trace = merged?.trace;
|
|
12723
|
+
const costUsd = merged?.costUsd;
|
|
12724
|
+
const durationMs = merged?.durationMs;
|
|
12725
|
+
const tokenUsage = merged?.tokenUsage;
|
|
12726
|
+
const startTime = merged?.startTime;
|
|
12727
|
+
const endTime = merged?.endTime;
|
|
12631
12728
|
const candidate = extractLastAssistantContent(output);
|
|
12632
12729
|
const providerError = extractProviderError(providerResponse);
|
|
12633
12730
|
let result;
|
|
@@ -12646,6 +12743,11 @@ async function runBatchEvaluation(options) {
|
|
|
12646
12743
|
agentTimeoutMs,
|
|
12647
12744
|
output,
|
|
12648
12745
|
trace,
|
|
12746
|
+
costUsd,
|
|
12747
|
+
durationMs,
|
|
12748
|
+
tokenUsage,
|
|
12749
|
+
startTime,
|
|
12750
|
+
endTime,
|
|
12649
12751
|
targetResolver,
|
|
12650
12752
|
availableTargets
|
|
12651
12753
|
});
|
|
@@ -12882,17 +12984,18 @@ async function runEvalCase(options) {
|
|
|
12882
12984
|
}
|
|
12883
12985
|
const output = providerResponse.output;
|
|
12884
12986
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
12885
|
-
const
|
|
12886
|
-
|
|
12887
|
-
toolNames: [],
|
|
12888
|
-
toolCallsByName: {},
|
|
12889
|
-
errorCount: 0
|
|
12890
|
-
} : void 0;
|
|
12891
|
-
const trace = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
12987
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
12988
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
12892
12989
|
tokenUsage: providerResponse.tokenUsage,
|
|
12893
12990
|
costUsd: providerResponse.costUsd,
|
|
12894
12991
|
durationMs: providerResponse.durationMs
|
|
12895
12992
|
}) : void 0;
|
|
12993
|
+
const trace = merged?.trace;
|
|
12994
|
+
const costUsd = merged?.costUsd;
|
|
12995
|
+
const durationMs = merged?.durationMs;
|
|
12996
|
+
const tokenUsage = merged?.tokenUsage;
|
|
12997
|
+
const startTime = merged?.startTime;
|
|
12998
|
+
const endTime = merged?.endTime;
|
|
12896
12999
|
const candidate = extractLastAssistantContent(output);
|
|
12897
13000
|
let fileChanges;
|
|
12898
13001
|
if (baselineCommit && workspacePath) {
|
|
@@ -12937,6 +13040,11 @@ async function runEvalCase(options) {
|
|
|
12937
13040
|
agentTimeoutMs,
|
|
12938
13041
|
output,
|
|
12939
13042
|
trace,
|
|
13043
|
+
costUsd,
|
|
13044
|
+
durationMs,
|
|
13045
|
+
tokenUsage,
|
|
13046
|
+
startTime,
|
|
13047
|
+
endTime,
|
|
12940
13048
|
targetResolver,
|
|
12941
13049
|
availableTargets,
|
|
12942
13050
|
fileChanges,
|
|
@@ -12993,7 +13101,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
12993
13101
|
};
|
|
12994
13102
|
const result = await runEvalCase(trialOptions);
|
|
12995
13103
|
allResults.push(result);
|
|
12996
|
-
const trialCost = result.
|
|
13104
|
+
const trialCost = result.costUsd;
|
|
12997
13105
|
const trialVerdict = scoreToVerdict(result.score);
|
|
12998
13106
|
const trial = {
|
|
12999
13107
|
attempt,
|
|
@@ -13049,6 +13157,11 @@ async function evaluateCandidate(options) {
|
|
|
13049
13157
|
agentTimeoutMs,
|
|
13050
13158
|
output,
|
|
13051
13159
|
trace,
|
|
13160
|
+
costUsd,
|
|
13161
|
+
durationMs,
|
|
13162
|
+
tokenUsage,
|
|
13163
|
+
startTime,
|
|
13164
|
+
endTime,
|
|
13052
13165
|
targetResolver,
|
|
13053
13166
|
availableTargets,
|
|
13054
13167
|
fileChanges,
|
|
@@ -13069,6 +13182,11 @@ async function evaluateCandidate(options) {
|
|
|
13069
13182
|
agentTimeoutMs,
|
|
13070
13183
|
output,
|
|
13071
13184
|
trace,
|
|
13185
|
+
costUsd,
|
|
13186
|
+
durationMs,
|
|
13187
|
+
tokenUsage,
|
|
13188
|
+
startTime,
|
|
13189
|
+
endTime,
|
|
13072
13190
|
targetResolver,
|
|
13073
13191
|
availableTargets,
|
|
13074
13192
|
fileChanges,
|
|
@@ -13112,6 +13230,11 @@ async function evaluateCandidate(options) {
|
|
|
13112
13230
|
answer: candidate,
|
|
13113
13231
|
target: target.name,
|
|
13114
13232
|
reasoning: score.reasoning,
|
|
13233
|
+
tokenUsage,
|
|
13234
|
+
costUsd,
|
|
13235
|
+
durationMs,
|
|
13236
|
+
startTime,
|
|
13237
|
+
endTime,
|
|
13115
13238
|
requests,
|
|
13116
13239
|
input,
|
|
13117
13240
|
scores,
|
|
@@ -13135,6 +13258,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
13135
13258
|
agentTimeoutMs,
|
|
13136
13259
|
output,
|
|
13137
13260
|
trace,
|
|
13261
|
+
costUsd,
|
|
13262
|
+
durationMs,
|
|
13263
|
+
tokenUsage,
|
|
13264
|
+
startTime,
|
|
13265
|
+
endTime,
|
|
13138
13266
|
targetResolver,
|
|
13139
13267
|
availableTargets,
|
|
13140
13268
|
fileChanges,
|
|
@@ -13156,6 +13284,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
13156
13284
|
agentTimeoutMs,
|
|
13157
13285
|
output,
|
|
13158
13286
|
trace,
|
|
13287
|
+
costUsd,
|
|
13288
|
+
durationMs,
|
|
13289
|
+
tokenUsage,
|
|
13290
|
+
startTime,
|
|
13291
|
+
endTime,
|
|
13159
13292
|
targetResolver,
|
|
13160
13293
|
availableTargets,
|
|
13161
13294
|
fileChanges,
|
|
@@ -13178,6 +13311,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
13178
13311
|
judgeProvider,
|
|
13179
13312
|
output,
|
|
13180
13313
|
trace,
|
|
13314
|
+
tokenUsage,
|
|
13315
|
+
costUsd,
|
|
13316
|
+
durationMs,
|
|
13317
|
+
startTime,
|
|
13318
|
+
endTime,
|
|
13181
13319
|
targetResolver,
|
|
13182
13320
|
availableTargets,
|
|
13183
13321
|
fileChanges,
|
|
@@ -13201,6 +13339,11 @@ async function runEvaluatorList(options) {
|
|
|
13201
13339
|
agentTimeoutMs,
|
|
13202
13340
|
output,
|
|
13203
13341
|
trace,
|
|
13342
|
+
costUsd,
|
|
13343
|
+
durationMs,
|
|
13344
|
+
tokenUsage,
|
|
13345
|
+
startTime,
|
|
13346
|
+
endTime,
|
|
13204
13347
|
targetResolver,
|
|
13205
13348
|
availableTargets,
|
|
13206
13349
|
fileChanges,
|
|
@@ -13219,6 +13362,11 @@ async function runEvaluatorList(options) {
|
|
|
13219
13362
|
judgeProvider,
|
|
13220
13363
|
output,
|
|
13221
13364
|
trace,
|
|
13365
|
+
tokenUsage,
|
|
13366
|
+
costUsd,
|
|
13367
|
+
durationMs,
|
|
13368
|
+
startTime,
|
|
13369
|
+
endTime,
|
|
13222
13370
|
targetResolver,
|
|
13223
13371
|
availableTargets,
|
|
13224
13372
|
fileChanges,
|
|
@@ -13258,7 +13406,8 @@ async function runEvaluatorList(options) {
|
|
|
13258
13406
|
reasoning: score2.reasoning,
|
|
13259
13407
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
13260
13408
|
details: score2.details,
|
|
13261
|
-
scores: mapChildResults(score2.scores)
|
|
13409
|
+
scores: mapChildResults(score2.scores),
|
|
13410
|
+
tokenUsage: score2.tokenUsage
|
|
13262
13411
|
});
|
|
13263
13412
|
} catch (error) {
|
|
13264
13413
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -13506,7 +13655,8 @@ function mapChildResults(children) {
|
|
|
13506
13655
|
reasoning: child.reasoning,
|
|
13507
13656
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
13508
13657
|
scores: mapChildResults(child.scores),
|
|
13509
|
-
details: child.details
|
|
13658
|
+
details: child.details,
|
|
13659
|
+
tokenUsage: child.tokenUsage
|
|
13510
13660
|
}));
|
|
13511
13661
|
}
|
|
13512
13662
|
function computeWeightedMean(entries) {
|
|
@@ -13886,7 +14036,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
|
13886
14036
|
"beforeEachOutput",
|
|
13887
14037
|
"afterAllOutput",
|
|
13888
14038
|
"afterEachOutput",
|
|
13889
|
-
"fileChanges"
|
|
14039
|
+
"fileChanges",
|
|
14040
|
+
// Promoted execution metrics (debug, not needed for regression comparison)
|
|
14041
|
+
"tokenUsage",
|
|
14042
|
+
"costUsd",
|
|
14043
|
+
"durationMs",
|
|
14044
|
+
"startTime",
|
|
14045
|
+
"endTime"
|
|
13890
14046
|
]);
|
|
13891
14047
|
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
|
|
13892
14048
|
function trimEvaluatorResult(result) {
|
|
@@ -14009,8 +14165,8 @@ var OtelTraceExporter = class {
|
|
|
14009
14165
|
const api = this.api;
|
|
14010
14166
|
const tracer = this.tracer;
|
|
14011
14167
|
const captureContent = this.options.captureContent ?? false;
|
|
14012
|
-
const startHr = toHrTime(result.
|
|
14013
|
-
const endHr = toHrTime(result.
|
|
14168
|
+
const startHr = toHrTime(result.startTime ?? result.timestamp);
|
|
14169
|
+
const endHr = toHrTime(result.endTime ?? result.timestamp);
|
|
14014
14170
|
let parentCtx = api.ROOT_CONTEXT;
|
|
14015
14171
|
const traceparent = process.env.TRACEPARENT;
|
|
14016
14172
|
if (traceparent && this.W3CPropagator) {
|
|
@@ -14039,12 +14195,13 @@ var OtelTraceExporter = class {
|
|
|
14039
14195
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
14040
14196
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
14041
14197
|
if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
|
|
14198
|
+
if (result.durationMs != null)
|
|
14199
|
+
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
14200
|
+
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
14042
14201
|
if (result.trace) {
|
|
14043
14202
|
const t = result.trace;
|
|
14044
14203
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
14045
14204
|
rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
|
|
14046
|
-
if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
|
|
14047
|
-
if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
|
|
14048
14205
|
if (t.llmCallCount != null)
|
|
14049
14206
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
14050
14207
|
}
|