@agentv/core 2.7.1-next.6 → 2.9.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5SV2QC6V.js → chunk-7Q4PH265.js} +6 -18
- package/dist/chunk-7Q4PH265.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +4 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +2 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +234 -89
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -22
- package/dist/index.d.ts +54 -22
- package/dist/index.js +230 -73
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
- package/dist/chunk-5SV2QC6V.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1686,14 +1686,16 @@ function computeTraceSummary(messages) {
|
|
|
1686
1686
|
}
|
|
1687
1687
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
1688
1688
|
return {
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1689
|
+
trace: {
|
|
1690
|
+
eventCount: totalToolCalls,
|
|
1691
|
+
toolNames,
|
|
1692
|
+
toolCallsByName: toolCallCounts,
|
|
1693
|
+
errorCount: 0,
|
|
1694
|
+
llmCallCount,
|
|
1695
|
+
...hasAnyDuration ? { toolDurations } : {}
|
|
1696
|
+
},
|
|
1693
1697
|
startTime: earliestStart?.toISOString(),
|
|
1694
|
-
endTime: latestEnd?.toISOString()
|
|
1695
|
-
llmCallCount,
|
|
1696
|
-
...hasAnyDuration ? { toolDurations } : {}
|
|
1698
|
+
endTime: latestEnd?.toISOString()
|
|
1697
1699
|
};
|
|
1698
1700
|
}
|
|
1699
1701
|
var DEFAULT_EXPLORATION_TOOLS = [
|
|
@@ -1716,9 +1718,9 @@ function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS)
|
|
|
1716
1718
|
);
|
|
1717
1719
|
return explorationCalls / summary.eventCount;
|
|
1718
1720
|
}
|
|
1719
|
-
function tokensPerTool(summary) {
|
|
1720
|
-
if (!
|
|
1721
|
-
const totalTokens =
|
|
1721
|
+
function tokensPerTool(summary, tokenUsage) {
|
|
1722
|
+
if (!tokenUsage || summary.eventCount === 0) return void 0;
|
|
1723
|
+
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
1722
1724
|
return totalTokens / summary.eventCount;
|
|
1723
1725
|
}
|
|
1724
1726
|
function avgToolDurationMs(summary) {
|
|
@@ -1734,16 +1736,15 @@ function avgToolDurationMs(summary) {
|
|
|
1734
1736
|
if (totalCalls === 0) return void 0;
|
|
1735
1737
|
return totalDuration / totalCalls;
|
|
1736
1738
|
}
|
|
1737
|
-
function mergeExecutionMetrics(
|
|
1738
|
-
if (!metrics) return
|
|
1739
|
+
function mergeExecutionMetrics(computed, metrics) {
|
|
1740
|
+
if (!metrics) return computed;
|
|
1739
1741
|
return {
|
|
1740
|
-
|
|
1742
|
+
trace: computed.trace,
|
|
1741
1743
|
tokenUsage: metrics.tokenUsage,
|
|
1742
1744
|
costUsd: metrics.costUsd,
|
|
1743
1745
|
durationMs: metrics.durationMs,
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
endTime: metrics.endTime ?? summary.endTime
|
|
1746
|
+
startTime: metrics.startTime ?? computed.startTime,
|
|
1747
|
+
endTime: metrics.endTime ?? computed.endTime
|
|
1747
1748
|
};
|
|
1748
1749
|
}
|
|
1749
1750
|
|
|
@@ -2141,6 +2142,24 @@ function extractCacheConfig(suite) {
|
|
|
2141
2142
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
2142
2143
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
2143
2144
|
}
|
|
2145
|
+
function extractTotalBudgetUsd(suite) {
|
|
2146
|
+
const execution = suite.execution;
|
|
2147
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
2148
|
+
return void 0;
|
|
2149
|
+
}
|
|
2150
|
+
const executionObj = execution;
|
|
2151
|
+
const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
|
|
2152
|
+
if (rawBudget === void 0 || rawBudget === null) {
|
|
2153
|
+
return void 0;
|
|
2154
|
+
}
|
|
2155
|
+
if (typeof rawBudget === "number" && rawBudget > 0) {
|
|
2156
|
+
return rawBudget;
|
|
2157
|
+
}
|
|
2158
|
+
logWarning(
|
|
2159
|
+
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
|
|
2160
|
+
);
|
|
2161
|
+
return void 0;
|
|
2162
|
+
}
|
|
2144
2163
|
function logWarning(message) {
|
|
2145
2164
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
2146
2165
|
}
|
|
@@ -4198,6 +4217,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
4198
4217
|
trials: extractTrialsConfig(parsed),
|
|
4199
4218
|
targets: extractTargetsFromSuite(parsed),
|
|
4200
4219
|
cacheConfig: extractCacheConfig(parsed),
|
|
4220
|
+
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
4201
4221
|
...metadata !== void 0 && { metadata }
|
|
4202
4222
|
};
|
|
4203
4223
|
}
|
|
@@ -4796,10 +4816,13 @@ async function invokeModel(options) {
|
|
|
4796
4816
|
}
|
|
4797
4817
|
function mapResponse(result) {
|
|
4798
4818
|
const content = result.text ?? "";
|
|
4819
|
+
const rawUsage = result.totalUsage ?? result.usage;
|
|
4820
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
4799
4821
|
return {
|
|
4800
4822
|
raw: result,
|
|
4801
|
-
usage: toJsonObject(
|
|
4802
|
-
output: [{ role: "assistant", content }]
|
|
4823
|
+
usage: toJsonObject(rawUsage),
|
|
4824
|
+
output: [{ role: "assistant", content }],
|
|
4825
|
+
tokenUsage
|
|
4803
4826
|
};
|
|
4804
4827
|
}
|
|
4805
4828
|
function toJsonObject(value) {
|
|
@@ -8335,10 +8358,8 @@ var CliHealthcheckInputSchema = import_zod3.z.union([
|
|
|
8335
8358
|
var CliTargetInputSchema = import_zod3.z.object({
|
|
8336
8359
|
name: import_zod3.z.string().min(1, "target name is required"),
|
|
8337
8360
|
provider: import_zod3.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
|
|
8338
|
-
// Command - required
|
|
8339
|
-
command: import_zod3.z.string()
|
|
8340
|
-
command_template: import_zod3.z.string().optional(),
|
|
8341
|
-
commandTemplate: import_zod3.z.string().optional(),
|
|
8361
|
+
// Command - required
|
|
8362
|
+
command: import_zod3.z.string(),
|
|
8342
8363
|
// Files format - optional
|
|
8343
8364
|
files_format: import_zod3.z.string().optional(),
|
|
8344
8365
|
filesFormat: import_zod3.z.string().optional(),
|
|
@@ -8368,12 +8389,7 @@ var CliTargetInputSchema = import_zod3.z.object({
|
|
|
8368
8389
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
8369
8390
|
provider_batching: import_zod3.z.boolean().optional(),
|
|
8370
8391
|
providerBatching: import_zod3.z.boolean().optional()
|
|
8371
|
-
})
|
|
8372
|
-
(data) => data.command !== void 0 || data.command_template !== void 0 || data.commandTemplate !== void 0,
|
|
8373
|
-
{
|
|
8374
|
-
message: "'command' is required"
|
|
8375
|
-
}
|
|
8376
|
-
);
|
|
8392
|
+
});
|
|
8377
8393
|
var CliHealthcheckHttpSchema = import_zod3.z.object({
|
|
8378
8394
|
url: import_zod3.z.string().min(1),
|
|
8379
8395
|
timeoutMs: import_zod3.z.number().positive().optional()
|
|
@@ -8431,11 +8447,7 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
8431
8447
|
}
|
|
8432
8448
|
function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
8433
8449
|
const targetName = input.name;
|
|
8434
|
-
const
|
|
8435
|
-
if (commandSource === void 0) {
|
|
8436
|
-
throw new Error(`${targetName}: 'command' is required`);
|
|
8437
|
-
}
|
|
8438
|
-
const command = resolveString(commandSource, env, `${targetName} CLI command`, true);
|
|
8450
|
+
const command = resolveString(input.command, env, `${targetName} CLI command`, true);
|
|
8439
8451
|
const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
|
|
8440
8452
|
const filesFormat = resolveOptionalLiteralString(filesFormatSource);
|
|
8441
8453
|
const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
|
|
@@ -9228,8 +9240,7 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
9228
9240
|
return normalized;
|
|
9229
9241
|
}
|
|
9230
9242
|
function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
|
|
9231
|
-
const
|
|
9232
|
-
const command = commandSource ? resolveString(commandSource, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
|
|
9243
|
+
const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
|
|
9233
9244
|
const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
|
|
9234
9245
|
const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
|
|
9235
9246
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
@@ -11201,6 +11212,8 @@ async function createTargetProxy(options) {
|
|
|
11201
11212
|
const token = (0, import_node_crypto7.randomBytes)(32).toString("hex");
|
|
11202
11213
|
let callCount = 0;
|
|
11203
11214
|
let isShutdown = false;
|
|
11215
|
+
let totalInputTokens = 0;
|
|
11216
|
+
let totalOutputTokens = 0;
|
|
11204
11217
|
const targetsList = availableTargets ?? [defaultProvider.targetName];
|
|
11205
11218
|
function resolveProvider(targetName) {
|
|
11206
11219
|
if (targetName === void 0 || targetName === defaultProvider.targetName) {
|
|
@@ -11279,11 +11292,16 @@ async function createTargetProxy(options) {
|
|
|
11279
11292
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
11280
11293
|
attempt: request.attempt ?? 1
|
|
11281
11294
|
});
|
|
11295
|
+
if (response.tokenUsage) {
|
|
11296
|
+
totalInputTokens += response.tokenUsage.input;
|
|
11297
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
11298
|
+
}
|
|
11282
11299
|
const output = response.output ?? [];
|
|
11283
11300
|
const rawText = extractLastAssistantContent(output);
|
|
11284
11301
|
const result = {
|
|
11285
11302
|
output,
|
|
11286
|
-
rawText
|
|
11303
|
+
rawText,
|
|
11304
|
+
tokenUsage: response.tokenUsage
|
|
11287
11305
|
};
|
|
11288
11306
|
sendJson(res, 200, result);
|
|
11289
11307
|
} catch (error) {
|
|
@@ -11330,10 +11348,15 @@ async function createTargetProxy(options) {
|
|
|
11330
11348
|
evalCaseId: request.evalCaseId ?? "proxy",
|
|
11331
11349
|
attempt: request.attempt ?? 1
|
|
11332
11350
|
});
|
|
11351
|
+
if (response.tokenUsage) {
|
|
11352
|
+
totalInputTokens += response.tokenUsage.input;
|
|
11353
|
+
totalOutputTokens += response.tokenUsage.output;
|
|
11354
|
+
}
|
|
11333
11355
|
const output = response.output ?? [];
|
|
11334
11356
|
responses.push({
|
|
11335
11357
|
output,
|
|
11336
|
-
rawText: extractLastAssistantContent(output)
|
|
11358
|
+
rawText: extractLastAssistantContent(output),
|
|
11359
|
+
tokenUsage: response.tokenUsage
|
|
11337
11360
|
});
|
|
11338
11361
|
} catch (error) {
|
|
11339
11362
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -11372,7 +11395,8 @@ async function createTargetProxy(options) {
|
|
|
11372
11395
|
},
|
|
11373
11396
|
getUsageMetadata: () => ({
|
|
11374
11397
|
callCount,
|
|
11375
|
-
maxCalls
|
|
11398
|
+
maxCalls,
|
|
11399
|
+
tokenUsage: totalInputTokens > 0 || totalOutputTokens > 0 ? { input: totalInputTokens, output: totalOutputTokens } : void 0
|
|
11376
11400
|
})
|
|
11377
11401
|
};
|
|
11378
11402
|
}
|
|
@@ -11497,6 +11521,11 @@ var CodeEvaluator = class {
|
|
|
11497
11521
|
),
|
|
11498
11522
|
input: context2.evalCase.input,
|
|
11499
11523
|
trace: context2.trace ?? null,
|
|
11524
|
+
tokenUsage: context2.tokenUsage ?? null,
|
|
11525
|
+
costUsd: context2.costUsd ?? null,
|
|
11526
|
+
durationMs: context2.durationMs ?? null,
|
|
11527
|
+
startTime: context2.startTime ?? null,
|
|
11528
|
+
endTime: context2.endTime ?? null,
|
|
11500
11529
|
fileChanges: context2.fileChanges ?? null,
|
|
11501
11530
|
workspacePath: context2.workspacePath ?? null,
|
|
11502
11531
|
config: this.config ?? null
|
|
@@ -11555,7 +11584,8 @@ var CodeEvaluator = class {
|
|
|
11555
11584
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
11556
11585
|
reasoning,
|
|
11557
11586
|
evaluatorRawRequest,
|
|
11558
|
-
...details ? { details } : {}
|
|
11587
|
+
...details ? { details } : {},
|
|
11588
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
11559
11589
|
};
|
|
11560
11590
|
} catch (error) {
|
|
11561
11591
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -11577,7 +11607,8 @@ var CodeEvaluator = class {
|
|
|
11577
11607
|
}
|
|
11578
11608
|
} : {},
|
|
11579
11609
|
error: message
|
|
11580
|
-
}
|
|
11610
|
+
},
|
|
11611
|
+
tokenUsage: proxyUsage?.tokenUsage
|
|
11581
11612
|
};
|
|
11582
11613
|
} finally {
|
|
11583
11614
|
if (proxyShutdown) {
|
|
@@ -11741,7 +11772,7 @@ ${context2.fileChanges}`;
|
|
|
11741
11772
|
target: judgeProvider.targetName
|
|
11742
11773
|
};
|
|
11743
11774
|
try {
|
|
11744
|
-
const { data } = await this.runWithRetry({
|
|
11775
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
11745
11776
|
context: context2,
|
|
11746
11777
|
judgeProvider,
|
|
11747
11778
|
systemPrompt,
|
|
@@ -11760,7 +11791,8 @@ ${context2.fileChanges}`;
|
|
|
11760
11791
|
misses,
|
|
11761
11792
|
expectedAspectCount,
|
|
11762
11793
|
reasoning,
|
|
11763
|
-
evaluatorRawRequest
|
|
11794
|
+
evaluatorRawRequest,
|
|
11795
|
+
tokenUsage
|
|
11764
11796
|
};
|
|
11765
11797
|
} catch {
|
|
11766
11798
|
return {
|
|
@@ -11790,7 +11822,7 @@ ${context2.fileChanges}`;
|
|
|
11790
11822
|
systemPrompt,
|
|
11791
11823
|
target: judgeProvider.targetName
|
|
11792
11824
|
};
|
|
11793
|
-
const { data } = await this.runWithRetry({
|
|
11825
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
11794
11826
|
context: context2,
|
|
11795
11827
|
judgeProvider,
|
|
11796
11828
|
systemPrompt,
|
|
@@ -11805,7 +11837,8 @@ ${context2.fileChanges}`;
|
|
|
11805
11837
|
misses,
|
|
11806
11838
|
expectedAspectCount: rubrics.length,
|
|
11807
11839
|
reasoning: data.overall_reasoning,
|
|
11808
|
-
evaluatorRawRequest
|
|
11840
|
+
evaluatorRawRequest,
|
|
11841
|
+
tokenUsage
|
|
11809
11842
|
};
|
|
11810
11843
|
}
|
|
11811
11844
|
/**
|
|
@@ -11820,7 +11853,7 @@ ${context2.fileChanges}`;
|
|
|
11820
11853
|
systemPrompt,
|
|
11821
11854
|
target: judgeProvider.targetName
|
|
11822
11855
|
};
|
|
11823
|
-
const { data } = await this.runWithRetry({
|
|
11856
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
11824
11857
|
context: context2,
|
|
11825
11858
|
judgeProvider,
|
|
11826
11859
|
systemPrompt,
|
|
@@ -11836,7 +11869,8 @@ ${context2.fileChanges}`;
|
|
|
11836
11869
|
expectedAspectCount: rubrics.length,
|
|
11837
11870
|
reasoning: data.overall_reasoning,
|
|
11838
11871
|
evaluatorRawRequest,
|
|
11839
|
-
details
|
|
11872
|
+
details,
|
|
11873
|
+
tokenUsage
|
|
11840
11874
|
};
|
|
11841
11875
|
}
|
|
11842
11876
|
/**
|
|
@@ -11920,15 +11954,17 @@ ${context2.fileChanges}`;
|
|
|
11920
11954
|
try {
|
|
11921
11955
|
const model = judgeProvider.asLanguageModel?.();
|
|
11922
11956
|
if (model) {
|
|
11923
|
-
const
|
|
11957
|
+
const result = await (0, import_ai2.generateText)({
|
|
11924
11958
|
model,
|
|
11925
11959
|
system: systemPrompt,
|
|
11926
11960
|
prompt: userPrompt,
|
|
11927
11961
|
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
11928
11962
|
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
11929
11963
|
});
|
|
11930
|
-
const data2 = schema.parse(parseJsonFromText(text));
|
|
11931
|
-
|
|
11964
|
+
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
11965
|
+
const rawUsage = result.usage;
|
|
11966
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
11967
|
+
return { data: data2, tokenUsage };
|
|
11932
11968
|
}
|
|
11933
11969
|
const response = await judgeProvider.invoke({
|
|
11934
11970
|
question: userPrompt,
|
|
@@ -11939,7 +11975,7 @@ ${context2.fileChanges}`;
|
|
|
11939
11975
|
temperature: this.temperature
|
|
11940
11976
|
});
|
|
11941
11977
|
const data = schema.parse(parseJsonFromText(extractLastAssistantContent2(response.output)));
|
|
11942
|
-
return { data, providerResponse: response };
|
|
11978
|
+
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
|
|
11943
11979
|
} catch (e) {
|
|
11944
11980
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
11945
11981
|
}
|
|
@@ -12145,7 +12181,8 @@ var CompositeEvaluator = class {
|
|
|
12145
12181
|
reasoning: member.result.reasoning,
|
|
12146
12182
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
12147
12183
|
scores: member.result.scores,
|
|
12148
|
-
details: member.result.details
|
|
12184
|
+
details: member.result.details,
|
|
12185
|
+
tokenUsage: member.result.tokenUsage
|
|
12149
12186
|
});
|
|
12150
12187
|
}
|
|
12151
12188
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
@@ -12193,7 +12230,8 @@ var CompositeEvaluator = class {
|
|
|
12193
12230
|
reasoning: member.result.reasoning,
|
|
12194
12231
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
12195
12232
|
scores: member.result.scores,
|
|
12196
|
-
details: member.result.details
|
|
12233
|
+
details: member.result.details,
|
|
12234
|
+
tokenUsage: member.result.tokenUsage
|
|
12197
12235
|
});
|
|
12198
12236
|
}
|
|
12199
12237
|
const totalCount = results.length;
|
|
@@ -12372,7 +12410,7 @@ var CostEvaluator = class {
|
|
|
12372
12410
|
}
|
|
12373
12411
|
evaluate(context2) {
|
|
12374
12412
|
const { budget } = this.config;
|
|
12375
|
-
const costUsd = context2.
|
|
12413
|
+
const costUsd = context2.costUsd;
|
|
12376
12414
|
if (costUsd === void 0) {
|
|
12377
12415
|
return {
|
|
12378
12416
|
score: 0,
|
|
@@ -12415,7 +12453,7 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12415
12453
|
this.config = options.config;
|
|
12416
12454
|
}
|
|
12417
12455
|
evaluate(context2) {
|
|
12418
|
-
const { trace: trace2 } = context2;
|
|
12456
|
+
const { trace: trace2, tokenUsage, costUsd, durationMs } = context2;
|
|
12419
12457
|
const {
|
|
12420
12458
|
max_tool_calls,
|
|
12421
12459
|
max_llm_calls,
|
|
@@ -12425,7 +12463,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12425
12463
|
target_exploration_ratio,
|
|
12426
12464
|
exploration_tolerance = 0.2
|
|
12427
12465
|
} = this.config;
|
|
12428
|
-
|
|
12466
|
+
const needsTrace = max_tool_calls !== void 0 || max_llm_calls !== void 0 || target_exploration_ratio !== void 0;
|
|
12467
|
+
if (needsTrace && !trace2) {
|
|
12429
12468
|
return {
|
|
12430
12469
|
score: 0,
|
|
12431
12470
|
verdict: "fail",
|
|
@@ -12440,11 +12479,12 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12440
12479
|
}
|
|
12441
12480
|
};
|
|
12442
12481
|
}
|
|
12482
|
+
const narrowedTrace = trace2;
|
|
12443
12483
|
const hits = [];
|
|
12444
12484
|
const misses = [];
|
|
12445
12485
|
const actualMetrics = {};
|
|
12446
|
-
if (max_tool_calls !== void 0) {
|
|
12447
|
-
const toolCalls =
|
|
12486
|
+
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
12487
|
+
const toolCalls = narrowedTrace.eventCount;
|
|
12448
12488
|
actualMetrics.tool_calls = toolCalls;
|
|
12449
12489
|
if (toolCalls <= max_tool_calls) {
|
|
12450
12490
|
hits.push(`Tool calls ${toolCalls} <= ${max_tool_calls} max`);
|
|
@@ -12452,8 +12492,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12452
12492
|
misses.push(`Tool calls ${toolCalls} > ${max_tool_calls} max`);
|
|
12453
12493
|
}
|
|
12454
12494
|
}
|
|
12455
|
-
if (max_llm_calls !== void 0) {
|
|
12456
|
-
const llmCalls =
|
|
12495
|
+
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
12496
|
+
const llmCalls = narrowedTrace.llmCallCount;
|
|
12457
12497
|
if (llmCalls === void 0) {
|
|
12458
12498
|
misses.push("LLM call count data not available");
|
|
12459
12499
|
} else {
|
|
@@ -12466,7 +12506,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12466
12506
|
}
|
|
12467
12507
|
}
|
|
12468
12508
|
if (max_tokens !== void 0) {
|
|
12469
|
-
const tokenUsage = trace2.tokenUsage;
|
|
12470
12509
|
if (!tokenUsage) {
|
|
12471
12510
|
misses.push("Token usage data not available");
|
|
12472
12511
|
} else {
|
|
@@ -12480,7 +12519,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12480
12519
|
}
|
|
12481
12520
|
}
|
|
12482
12521
|
if (max_cost_usd !== void 0) {
|
|
12483
|
-
const costUsd = trace2.costUsd;
|
|
12484
12522
|
if (costUsd === void 0) {
|
|
12485
12523
|
misses.push("Cost data not available");
|
|
12486
12524
|
} else {
|
|
@@ -12494,7 +12532,6 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12494
12532
|
}
|
|
12495
12533
|
}
|
|
12496
12534
|
if (max_duration_ms !== void 0) {
|
|
12497
|
-
const durationMs = trace2.durationMs;
|
|
12498
12535
|
if (durationMs === void 0) {
|
|
12499
12536
|
misses.push("Duration data not available");
|
|
12500
12537
|
} else {
|
|
@@ -12506,8 +12543,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
12506
12543
|
}
|
|
12507
12544
|
}
|
|
12508
12545
|
}
|
|
12509
|
-
if (target_exploration_ratio !== void 0) {
|
|
12510
|
-
const ratio = explorationRatio(
|
|
12546
|
+
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
12547
|
+
const ratio = explorationRatio(narrowedTrace);
|
|
12511
12548
|
if (ratio === void 0) {
|
|
12512
12549
|
misses.push("Exploration ratio not available (no tool calls)");
|
|
12513
12550
|
} else {
|
|
@@ -13021,7 +13058,7 @@ var LatencyEvaluator = class {
|
|
|
13021
13058
|
}
|
|
13022
13059
|
evaluate(context2) {
|
|
13023
13060
|
const { threshold } = this.config;
|
|
13024
|
-
const durationMs = context2.
|
|
13061
|
+
const durationMs = context2.durationMs;
|
|
13025
13062
|
if (durationMs === void 0) {
|
|
13026
13063
|
return {
|
|
13027
13064
|
score: 0,
|
|
@@ -13666,7 +13703,7 @@ var TokenUsageEvaluator = class {
|
|
|
13666
13703
|
this.config = options.config;
|
|
13667
13704
|
}
|
|
13668
13705
|
evaluate(context2) {
|
|
13669
|
-
const usage = context2.
|
|
13706
|
+
const usage = context2.tokenUsage;
|
|
13670
13707
|
const maxTotal = this.config.max_total;
|
|
13671
13708
|
const maxInput = this.config.max_input;
|
|
13672
13709
|
const maxOutput = this.config.max_output;
|
|
@@ -15111,7 +15148,8 @@ async function runEvaluation(options) {
|
|
|
15111
15148
|
keepWorkspaces,
|
|
15112
15149
|
cleanupWorkspaces,
|
|
15113
15150
|
trials,
|
|
15114
|
-
streamCallbacks
|
|
15151
|
+
streamCallbacks,
|
|
15152
|
+
totalBudgetUsd
|
|
15115
15153
|
} = options;
|
|
15116
15154
|
let useCache = options.useCache;
|
|
15117
15155
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -15284,10 +15322,39 @@ async function runEvaluation(options) {
|
|
|
15284
15322
|
let nextWorkerId = 1;
|
|
15285
15323
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
15286
15324
|
let beforeAllOutputAttached = false;
|
|
15325
|
+
let cumulativeBudgetCost = 0;
|
|
15326
|
+
let budgetExhausted = false;
|
|
15287
15327
|
const promises = filteredEvalCases.map(
|
|
15288
15328
|
(evalCase) => limit(async () => {
|
|
15289
15329
|
const workerId = nextWorkerId++;
|
|
15290
15330
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
15331
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
15332
|
+
const budgetResult = {
|
|
15333
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
15334
|
+
testId: evalCase.id,
|
|
15335
|
+
dataset: evalCase.dataset,
|
|
15336
|
+
score: 0,
|
|
15337
|
+
hits: [],
|
|
15338
|
+
misses: [],
|
|
15339
|
+
answer: "",
|
|
15340
|
+
target: target.name,
|
|
15341
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
15342
|
+
budgetExceeded: true
|
|
15343
|
+
};
|
|
15344
|
+
if (onProgress) {
|
|
15345
|
+
await onProgress({
|
|
15346
|
+
workerId,
|
|
15347
|
+
testId: evalCase.id,
|
|
15348
|
+
status: "failed",
|
|
15349
|
+
completedAt: Date.now(),
|
|
15350
|
+
error: budgetResult.error
|
|
15351
|
+
});
|
|
15352
|
+
}
|
|
15353
|
+
if (onResult) {
|
|
15354
|
+
await onResult(budgetResult);
|
|
15355
|
+
}
|
|
15356
|
+
return budgetResult;
|
|
15357
|
+
}
|
|
15291
15358
|
if (onProgress) {
|
|
15292
15359
|
await onProgress({
|
|
15293
15360
|
workerId,
|
|
@@ -15321,6 +15388,23 @@ async function runEvaluation(options) {
|
|
|
15321
15388
|
typeRegistry
|
|
15322
15389
|
};
|
|
15323
15390
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
15391
|
+
if (totalBudgetUsd !== void 0) {
|
|
15392
|
+
let caseCost;
|
|
15393
|
+
if (result.trials && result.trials.length > 0) {
|
|
15394
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
15395
|
+
if (trialCostSum > 0) {
|
|
15396
|
+
caseCost = trialCostSum;
|
|
15397
|
+
}
|
|
15398
|
+
} else {
|
|
15399
|
+
caseCost = result.costUsd;
|
|
15400
|
+
}
|
|
15401
|
+
if (caseCost !== void 0) {
|
|
15402
|
+
cumulativeBudgetCost += caseCost;
|
|
15403
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
15404
|
+
budgetExhausted = true;
|
|
15405
|
+
}
|
|
15406
|
+
}
|
|
15407
|
+
}
|
|
15324
15408
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
15325
15409
|
result = { ...result, beforeAllOutput };
|
|
15326
15410
|
beforeAllOutputAttached = true;
|
|
@@ -15473,17 +15557,18 @@ async function runBatchEvaluation(options) {
|
|
|
15473
15557
|
const providerResponse = batchResponse[i];
|
|
15474
15558
|
const output = providerResponse.output;
|
|
15475
15559
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
15476
|
-
const
|
|
15477
|
-
|
|
15478
|
-
toolNames: [],
|
|
15479
|
-
toolCallsByName: {},
|
|
15480
|
-
errorCount: 0
|
|
15481
|
-
} : void 0;
|
|
15482
|
-
const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
15560
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
15561
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
15483
15562
|
tokenUsage: providerResponse.tokenUsage,
|
|
15484
15563
|
costUsd: providerResponse.costUsd,
|
|
15485
15564
|
durationMs: providerResponse.durationMs
|
|
15486
15565
|
}) : void 0;
|
|
15566
|
+
const trace2 = merged?.trace;
|
|
15567
|
+
const costUsd = merged?.costUsd;
|
|
15568
|
+
const durationMs = merged?.durationMs;
|
|
15569
|
+
const tokenUsage = merged?.tokenUsage;
|
|
15570
|
+
const startTime = merged?.startTime;
|
|
15571
|
+
const endTime = merged?.endTime;
|
|
15487
15572
|
const candidate = extractLastAssistantContent2(output);
|
|
15488
15573
|
const providerError = extractProviderError(providerResponse);
|
|
15489
15574
|
let result;
|
|
@@ -15502,6 +15587,11 @@ async function runBatchEvaluation(options) {
|
|
|
15502
15587
|
agentTimeoutMs,
|
|
15503
15588
|
output,
|
|
15504
15589
|
trace: trace2,
|
|
15590
|
+
costUsd,
|
|
15591
|
+
durationMs,
|
|
15592
|
+
tokenUsage,
|
|
15593
|
+
startTime,
|
|
15594
|
+
endTime,
|
|
15505
15595
|
targetResolver,
|
|
15506
15596
|
availableTargets
|
|
15507
15597
|
});
|
|
@@ -15738,17 +15828,18 @@ async function runEvalCase(options) {
|
|
|
15738
15828
|
}
|
|
15739
15829
|
const output = providerResponse.output;
|
|
15740
15830
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
15741
|
-
const
|
|
15742
|
-
|
|
15743
|
-
toolNames: [],
|
|
15744
|
-
toolCallsByName: {},
|
|
15745
|
-
errorCount: 0
|
|
15746
|
-
} : void 0;
|
|
15747
|
-
const trace2 = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
15831
|
+
const computed = output ? computeTraceSummary(output) : hasExecutionMetrics ? { trace: { eventCount: 0, toolNames: [], toolCallsByName: {}, errorCount: 0 } } : void 0;
|
|
15832
|
+
const merged = computed ? mergeExecutionMetrics(computed, {
|
|
15748
15833
|
tokenUsage: providerResponse.tokenUsage,
|
|
15749
15834
|
costUsd: providerResponse.costUsd,
|
|
15750
15835
|
durationMs: providerResponse.durationMs
|
|
15751
15836
|
}) : void 0;
|
|
15837
|
+
const trace2 = merged?.trace;
|
|
15838
|
+
const costUsd = merged?.costUsd;
|
|
15839
|
+
const durationMs = merged?.durationMs;
|
|
15840
|
+
const tokenUsage = merged?.tokenUsage;
|
|
15841
|
+
const startTime = merged?.startTime;
|
|
15842
|
+
const endTime = merged?.endTime;
|
|
15752
15843
|
const candidate = extractLastAssistantContent2(output);
|
|
15753
15844
|
let fileChanges;
|
|
15754
15845
|
if (baselineCommit && workspacePath) {
|
|
@@ -15793,6 +15884,11 @@ async function runEvalCase(options) {
|
|
|
15793
15884
|
agentTimeoutMs,
|
|
15794
15885
|
output,
|
|
15795
15886
|
trace: trace2,
|
|
15887
|
+
costUsd,
|
|
15888
|
+
durationMs,
|
|
15889
|
+
tokenUsage,
|
|
15890
|
+
startTime,
|
|
15891
|
+
endTime,
|
|
15796
15892
|
targetResolver,
|
|
15797
15893
|
availableTargets,
|
|
15798
15894
|
fileChanges,
|
|
@@ -15849,7 +15945,7 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
15849
15945
|
};
|
|
15850
15946
|
const result = await runEvalCase(trialOptions);
|
|
15851
15947
|
allResults.push(result);
|
|
15852
|
-
const trialCost = result.
|
|
15948
|
+
const trialCost = result.costUsd;
|
|
15853
15949
|
const trialVerdict = scoreToVerdict(result.score);
|
|
15854
15950
|
const trial = {
|
|
15855
15951
|
attempt,
|
|
@@ -15905,6 +16001,11 @@ async function evaluateCandidate(options) {
|
|
|
15905
16001
|
agentTimeoutMs,
|
|
15906
16002
|
output,
|
|
15907
16003
|
trace: trace2,
|
|
16004
|
+
costUsd,
|
|
16005
|
+
durationMs,
|
|
16006
|
+
tokenUsage,
|
|
16007
|
+
startTime,
|
|
16008
|
+
endTime,
|
|
15908
16009
|
targetResolver,
|
|
15909
16010
|
availableTargets,
|
|
15910
16011
|
fileChanges,
|
|
@@ -15925,6 +16026,11 @@ async function evaluateCandidate(options) {
|
|
|
15925
16026
|
agentTimeoutMs,
|
|
15926
16027
|
output,
|
|
15927
16028
|
trace: trace2,
|
|
16029
|
+
costUsd,
|
|
16030
|
+
durationMs,
|
|
16031
|
+
tokenUsage,
|
|
16032
|
+
startTime,
|
|
16033
|
+
endTime,
|
|
15928
16034
|
targetResolver,
|
|
15929
16035
|
availableTargets,
|
|
15930
16036
|
fileChanges,
|
|
@@ -15968,6 +16074,11 @@ async function evaluateCandidate(options) {
|
|
|
15968
16074
|
answer: candidate,
|
|
15969
16075
|
target: target.name,
|
|
15970
16076
|
reasoning: score.reasoning,
|
|
16077
|
+
tokenUsage,
|
|
16078
|
+
costUsd,
|
|
16079
|
+
durationMs,
|
|
16080
|
+
startTime,
|
|
16081
|
+
endTime,
|
|
15971
16082
|
requests,
|
|
15972
16083
|
input,
|
|
15973
16084
|
scores,
|
|
@@ -15991,6 +16102,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
15991
16102
|
agentTimeoutMs,
|
|
15992
16103
|
output,
|
|
15993
16104
|
trace: trace2,
|
|
16105
|
+
costUsd,
|
|
16106
|
+
durationMs,
|
|
16107
|
+
tokenUsage,
|
|
16108
|
+
startTime,
|
|
16109
|
+
endTime,
|
|
15994
16110
|
targetResolver,
|
|
15995
16111
|
availableTargets,
|
|
15996
16112
|
fileChanges,
|
|
@@ -16012,6 +16128,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
16012
16128
|
agentTimeoutMs,
|
|
16013
16129
|
output,
|
|
16014
16130
|
trace: trace2,
|
|
16131
|
+
costUsd,
|
|
16132
|
+
durationMs,
|
|
16133
|
+
tokenUsage,
|
|
16134
|
+
startTime,
|
|
16135
|
+
endTime,
|
|
16015
16136
|
targetResolver,
|
|
16016
16137
|
availableTargets,
|
|
16017
16138
|
fileChanges,
|
|
@@ -16034,6 +16155,11 @@ async function runEvaluatorsForCase(options) {
|
|
|
16034
16155
|
judgeProvider,
|
|
16035
16156
|
output,
|
|
16036
16157
|
trace: trace2,
|
|
16158
|
+
tokenUsage,
|
|
16159
|
+
costUsd,
|
|
16160
|
+
durationMs,
|
|
16161
|
+
startTime,
|
|
16162
|
+
endTime,
|
|
16037
16163
|
targetResolver,
|
|
16038
16164
|
availableTargets,
|
|
16039
16165
|
fileChanges,
|
|
@@ -16057,6 +16183,11 @@ async function runEvaluatorList(options) {
|
|
|
16057
16183
|
agentTimeoutMs,
|
|
16058
16184
|
output,
|
|
16059
16185
|
trace: trace2,
|
|
16186
|
+
costUsd,
|
|
16187
|
+
durationMs,
|
|
16188
|
+
tokenUsage,
|
|
16189
|
+
startTime,
|
|
16190
|
+
endTime,
|
|
16060
16191
|
targetResolver,
|
|
16061
16192
|
availableTargets,
|
|
16062
16193
|
fileChanges,
|
|
@@ -16075,6 +16206,11 @@ async function runEvaluatorList(options) {
|
|
|
16075
16206
|
judgeProvider,
|
|
16076
16207
|
output,
|
|
16077
16208
|
trace: trace2,
|
|
16209
|
+
tokenUsage,
|
|
16210
|
+
costUsd,
|
|
16211
|
+
durationMs,
|
|
16212
|
+
startTime,
|
|
16213
|
+
endTime,
|
|
16078
16214
|
targetResolver,
|
|
16079
16215
|
availableTargets,
|
|
16080
16216
|
fileChanges,
|
|
@@ -16114,7 +16250,8 @@ async function runEvaluatorList(options) {
|
|
|
16114
16250
|
reasoning: score2.reasoning,
|
|
16115
16251
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
16116
16252
|
details: score2.details,
|
|
16117
|
-
scores: mapChildResults(score2.scores)
|
|
16253
|
+
scores: mapChildResults(score2.scores),
|
|
16254
|
+
tokenUsage: score2.tokenUsage
|
|
16118
16255
|
});
|
|
16119
16256
|
} catch (error) {
|
|
16120
16257
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -16362,7 +16499,8 @@ function mapChildResults(children) {
|
|
|
16362
16499
|
reasoning: child.reasoning,
|
|
16363
16500
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
16364
16501
|
scores: mapChildResults(child.scores),
|
|
16365
|
-
details: child.details
|
|
16502
|
+
details: child.details,
|
|
16503
|
+
tokenUsage: child.tokenUsage
|
|
16366
16504
|
}));
|
|
16367
16505
|
}
|
|
16368
16506
|
function computeWeightedMean(entries) {
|
|
@@ -16742,7 +16880,13 @@ var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
|
16742
16880
|
"beforeEachOutput",
|
|
16743
16881
|
"afterAllOutput",
|
|
16744
16882
|
"afterEachOutput",
|
|
16745
|
-
"fileChanges"
|
|
16883
|
+
"fileChanges",
|
|
16884
|
+
// Promoted execution metrics (debug, not needed for regression comparison)
|
|
16885
|
+
"tokenUsage",
|
|
16886
|
+
"costUsd",
|
|
16887
|
+
"durationMs",
|
|
16888
|
+
"startTime",
|
|
16889
|
+
"endTime"
|
|
16746
16890
|
]);
|
|
16747
16891
|
var STRIPPED_EVALUATOR_FIELDS = /* @__PURE__ */ new Set(["rawRequest", "evaluatorProviderRequest"]);
|
|
16748
16892
|
function trimEvaluatorResult(result) {
|
|
@@ -16865,8 +17009,8 @@ var OtelTraceExporter = class {
|
|
|
16865
17009
|
const api = this.api;
|
|
16866
17010
|
const tracer = this.tracer;
|
|
16867
17011
|
const captureContent = this.options.captureContent ?? false;
|
|
16868
|
-
const startHr = toHrTime(result.
|
|
16869
|
-
const endHr = toHrTime(result.
|
|
17012
|
+
const startHr = toHrTime(result.startTime ?? result.timestamp);
|
|
17013
|
+
const endHr = toHrTime(result.endTime ?? result.timestamp);
|
|
16870
17014
|
let parentCtx = api.ROOT_CONTEXT;
|
|
16871
17015
|
const traceparent = process.env.TRACEPARENT;
|
|
16872
17016
|
if (traceparent && this.W3CPropagator) {
|
|
@@ -16895,12 +17039,13 @@ var OtelTraceExporter = class {
|
|
|
16895
17039
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
16896
17040
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
16897
17041
|
if (captureContent) rootSpan.setAttribute("agentv.answer", result.answer);
|
|
17042
|
+
if (result.durationMs != null)
|
|
17043
|
+
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
17044
|
+
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
16898
17045
|
if (result.trace) {
|
|
16899
17046
|
const t = result.trace;
|
|
16900
17047
|
rootSpan.setAttribute("agentv.trace.event_count", t.eventCount);
|
|
16901
17048
|
rootSpan.setAttribute("agentv.trace.tool_names", t.toolNames.join(","));
|
|
16902
|
-
if (t.durationMs != null) rootSpan.setAttribute("agentv.trace.duration_ms", t.durationMs);
|
|
16903
|
-
if (t.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", t.costUsd);
|
|
16904
17049
|
if (t.llmCallCount != null)
|
|
16905
17050
|
rootSpan.setAttribute("agentv.trace.llm_call_count", t.llmCallCount);
|
|
16906
17051
|
}
|