agentv 4.38.1 → 4.40.1-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-MK5X5MSO.js → artifact-writer-GIAIMGPQ.js} +14 -11
- package/dist/{chunk-QOBQ5XYF.js → chunk-76FOHROU.js} +16 -4
- package/dist/chunk-76FOHROU.js.map +1 -0
- package/dist/{chunk-VBHHZQS6.js → chunk-BLXYBUU4.js} +1825 -333
- package/dist/chunk-BLXYBUU4.js.map +1 -0
- package/dist/{chunk-NLTIK3LV.js → chunk-I3SC4FOT.js} +499 -347
- package/dist/chunk-I3SC4FOT.js.map +1 -0
- package/dist/{chunk-OIN3MVOD.js → chunk-S2JJCLHV.js} +67 -68
- package/dist/chunk-S2JJCLHV.js.map +1 -0
- package/dist/chunk-TWQP7JYQ.js +494 -0
- package/dist/chunk-TWQP7JYQ.js.map +1 -0
- package/dist/{chunk-6M5S4IJW.js → chunk-WKA5QDNQ.js} +586 -183
- package/dist/chunk-WKA5QDNQ.js.map +1 -0
- package/dist/cli.js +6 -6
- package/dist/dashboard/assets/index-BnYCCJ7O.css +1 -0
- package/dist/dashboard/assets/index-DaueD7GO.js +118 -0
- package/dist/dashboard/assets/{index-SIl6NbIJ.js → index-_jpKSzIf.js} +1 -1
- package/dist/dashboard/index.html +2 -2
- package/dist/{dist-HVLBDG5F.js → dist-6Z4OSITR.js} +54 -16
- package/dist/index.js +6 -6
- package/dist/{interactive-45LPG2YJ.js → interactive-OUB3GZRC.js} +6 -6
- package/dist/{otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js → otlp-json-file-exporter-RY63S3IG-PZBQPVYY.js} +2 -2
- package/dist/skills/agentv-eval-writer/SKILL.md +49 -24
- package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +21 -15
- package/dist/{ts-eval-loader-TJT6BGFF-DI7XNSO4.js → ts-eval-loader-NWH3B4HG-UXXCZKLP.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-6M5S4IJW.js.map +0 -1
- package/dist/chunk-DKUAETXE.js +0 -1362
- package/dist/chunk-DKUAETXE.js.map +0 -1
- package/dist/chunk-NLTIK3LV.js.map +0 -1
- package/dist/chunk-OIN3MVOD.js.map +0 -1
- package/dist/chunk-QOBQ5XYF.js.map +0 -1
- package/dist/chunk-VBHHZQS6.js.map +0 -1
- package/dist/dashboard/assets/index-BpnllKET.css +0 -1
- package/dist/dashboard/assets/index-Cm9SUopp.js +0 -118
- /package/dist/{artifact-writer-MK5X5MSO.js.map → artifact-writer-GIAIMGPQ.js.map} +0 -0
- /package/dist/{dist-HVLBDG5F.js.map → dist-6Z4OSITR.js.map} +0 -0
- /package/dist/{interactive-45LPG2YJ.js.map → interactive-OUB3GZRC.js.map} +0 -0
- /package/dist/{otlp-json-file-exporter-RJFPCKVK-T6N4OGWG.js.map → otlp-json-file-exporter-RY63S3IG-PZBQPVYY.js.map} +0 -0
- /package/dist/{ts-eval-loader-TJT6BGFF-DI7XNSO4.js.map → ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map} +0 -0
|
@@ -493,8 +493,8 @@ function getErrorMap() {
|
|
|
493
493
|
|
|
494
494
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
495
495
|
var makeIssue = (params) => {
|
|
496
|
-
const { data, path:
|
|
497
|
-
const fullPath = [...
|
|
496
|
+
const { data, path: path50, errorMaps, issueData } = params;
|
|
497
|
+
const fullPath = [...path50, ...issueData.path || []];
|
|
498
498
|
const fullIssue = {
|
|
499
499
|
...issueData,
|
|
500
500
|
path: fullPath
|
|
@@ -610,11 +610,11 @@ var errorUtil;
|
|
|
610
610
|
|
|
611
611
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
612
612
|
var ParseInputLazyPath = class {
|
|
613
|
-
constructor(parent, value,
|
|
613
|
+
constructor(parent, value, path50, key) {
|
|
614
614
|
this._cachedPath = [];
|
|
615
615
|
this.parent = parent;
|
|
616
616
|
this.data = value;
|
|
617
|
-
this._path =
|
|
617
|
+
this._path = path50;
|
|
618
618
|
this._key = key;
|
|
619
619
|
}
|
|
620
620
|
get path() {
|
|
@@ -4056,7 +4056,7 @@ var coerce = {
|
|
|
4056
4056
|
};
|
|
4057
4057
|
var NEVER = INVALID;
|
|
4058
4058
|
|
|
4059
|
-
// ../../packages/core/dist/chunk-
|
|
4059
|
+
// ../../packages/core/dist/chunk-5JNFEE7J.js
|
|
4060
4060
|
import { parse } from "yaml";
|
|
4061
4061
|
import os from "node:os";
|
|
4062
4062
|
import path from "node:path";
|
|
@@ -5146,6 +5146,7 @@ function resolveStreamLog(target, envFallback) {
|
|
|
5146
5146
|
function resolveCopilotSdkConfig(target, env, _evalFilePath) {
|
|
5147
5147
|
const cliUrlSource = target.cli_url;
|
|
5148
5148
|
const cliPathSource = target.cli_path;
|
|
5149
|
+
const argsSource = target.args ?? target.arguments;
|
|
5149
5150
|
const githubTokenSource = target.github_token;
|
|
5150
5151
|
const modelSource = target.model;
|
|
5151
5152
|
const cwdSource = target.cwd;
|
|
@@ -5166,6 +5167,7 @@ function resolveCopilotSdkConfig(target, env, _evalFilePath) {
|
|
|
5166
5167
|
allowLiteral: true,
|
|
5167
5168
|
optionalEnv: true
|
|
5168
5169
|
});
|
|
5170
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} copilot-sdk args`);
|
|
5169
5171
|
const githubToken = resolveOptionalString(
|
|
5170
5172
|
githubTokenSource,
|
|
5171
5173
|
env,
|
|
@@ -5195,12 +5197,11 @@ function resolveCopilotSdkConfig(target, env, _evalFilePath) {
|
|
|
5195
5197
|
);
|
|
5196
5198
|
const logFormat = normalizeCopilotLogFormat(logFormatSource);
|
|
5197
5199
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5198
|
-
const customProvider =
|
|
5199
|
-
includeByokAlias: true
|
|
5200
|
-
});
|
|
5200
|
+
const customProvider = resolveCopilotFlatProviderConfig(target, env);
|
|
5201
5201
|
return {
|
|
5202
5202
|
cliUrl,
|
|
5203
5203
|
cliPath,
|
|
5204
|
+
args,
|
|
5204
5205
|
githubToken,
|
|
5205
5206
|
model,
|
|
5206
5207
|
cwd,
|
|
@@ -5209,86 +5210,52 @@ function resolveCopilotSdkConfig(target, env, _evalFilePath) {
|
|
|
5209
5210
|
logFormat,
|
|
5210
5211
|
streamLog: streamLogResult.streamLog,
|
|
5211
5212
|
systemPrompt,
|
|
5212
|
-
...customProvider ? {
|
|
5213
|
-
|
|
5214
|
-
|
|
5215
|
-
|
|
5216
|
-
|
|
5217
|
-
|
|
5218
|
-
|
|
5219
|
-
byokWireApi: customProvider.wireApi
|
|
5220
|
-
} : {}
|
|
5221
|
-
};
|
|
5222
|
-
}
|
|
5223
|
-
function resolveCopilotCustomProviderConfig(target, env, options = {}) {
|
|
5224
|
-
const hasCustomProvider = target.custom_provider !== void 0;
|
|
5225
|
-
const hasByokAlias = options.includeByokAlias === true && target.byok !== void 0;
|
|
5226
|
-
if (!hasCustomProvider && !hasByokAlias) {
|
|
5227
|
-
return void 0;
|
|
5228
|
-
}
|
|
5229
|
-
const sourceName = hasCustomProvider ? "custom_provider" : "byok";
|
|
5230
|
-
const raw = sourceName === "custom_provider" ? target.custom_provider : target.byok;
|
|
5231
|
-
if (raw === null) {
|
|
5232
|
-
return void 0;
|
|
5233
|
-
}
|
|
5234
|
-
if (typeof raw !== "object" || Array.isArray(raw)) {
|
|
5235
|
-
throw new Error(`${target.name}: '${sourceName}' must be an object`);
|
|
5236
|
-
}
|
|
5237
|
-
const provider = raw;
|
|
5238
|
-
const type = resolveOptionalString(provider.type, env, `${target.name} ${sourceName} type`, {
|
|
5213
|
+
...customProvider ? { customProvider } : {}
|
|
5214
|
+
};
|
|
5215
|
+
}
|
|
5216
|
+
function resolveCopilotFlatProviderConfig(target, env) {
|
|
5217
|
+
const baseUrlSource = target.base_url;
|
|
5218
|
+
if (!baseUrlSource) return void 0;
|
|
5219
|
+
const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} copilot base URL`, {
|
|
5239
5220
|
allowLiteral: true,
|
|
5240
5221
|
optionalEnv: true
|
|
5241
5222
|
});
|
|
5242
|
-
|
|
5243
|
-
|
|
5223
|
+
if (!baseUrl) return void 0;
|
|
5224
|
+
const type = resolveOptionalString(
|
|
5225
|
+
target.subprovider,
|
|
5244
5226
|
env,
|
|
5245
|
-
`${target.name}
|
|
5227
|
+
`${target.name} copilot provider type`,
|
|
5246
5228
|
{
|
|
5247
5229
|
allowLiteral: true,
|
|
5248
5230
|
optionalEnv: true
|
|
5249
5231
|
}
|
|
5250
5232
|
);
|
|
5251
|
-
const apiKey = resolveOptionalString(
|
|
5252
|
-
|
|
5253
|
-
|
|
5254
|
-
|
|
5255
|
-
{
|
|
5256
|
-
allowLiteral: false,
|
|
5257
|
-
optionalEnv: true
|
|
5258
|
-
}
|
|
5259
|
-
);
|
|
5233
|
+
const apiKey = resolveOptionalString(target.api_key, env, `${target.name} copilot API key`, {
|
|
5234
|
+
allowLiteral: false,
|
|
5235
|
+
optionalEnv: true
|
|
5236
|
+
});
|
|
5260
5237
|
const bearerToken = resolveOptionalString(
|
|
5261
|
-
|
|
5238
|
+
target.bearer_token,
|
|
5262
5239
|
env,
|
|
5263
|
-
`${target.name}
|
|
5240
|
+
`${target.name} copilot bearer token`,
|
|
5264
5241
|
{
|
|
5265
5242
|
allowLiteral: false,
|
|
5266
5243
|
optionalEnv: true
|
|
5267
5244
|
}
|
|
5268
5245
|
);
|
|
5269
5246
|
const apiVersion = resolveOptionalString(
|
|
5270
|
-
|
|
5247
|
+
target.api_version,
|
|
5271
5248
|
env,
|
|
5272
|
-
`${target.name}
|
|
5249
|
+
`${target.name} copilot API version`,
|
|
5273
5250
|
{
|
|
5274
5251
|
allowLiteral: true,
|
|
5275
5252
|
optionalEnv: true
|
|
5276
5253
|
}
|
|
5277
5254
|
);
|
|
5278
|
-
const wireApi = resolveOptionalString(
|
|
5279
|
-
|
|
5280
|
-
|
|
5281
|
-
|
|
5282
|
-
{
|
|
5283
|
-
allowLiteral: true,
|
|
5284
|
-
optionalEnv: true
|
|
5285
|
-
}
|
|
5286
|
-
);
|
|
5287
|
-
if (!baseUrl) {
|
|
5288
|
-
throw new Error(
|
|
5289
|
-
`${target.name}: '${sourceName}.base_url' is required when '${sourceName}' is specified`
|
|
5290
|
-
);
|
|
5291
|
-
}
|
|
5255
|
+
const wireApi = resolveOptionalString(target.wire_api, env, `${target.name} copilot wire API`, {
|
|
5256
|
+
allowLiteral: true,
|
|
5257
|
+
optionalEnv: true
|
|
5258
|
+
});
|
|
5292
5259
|
return {
|
|
5293
5260
|
...type ? { type } : {},
|
|
5294
5261
|
baseUrl,
|
|
@@ -5337,7 +5304,7 @@ function resolveCopilotCliConfig(target, env, _evalFilePath) {
|
|
|
5337
5304
|
);
|
|
5338
5305
|
const logFormat = normalizeCopilotLogFormat(logFormatSource);
|
|
5339
5306
|
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
5340
|
-
const customProvider =
|
|
5307
|
+
const customProvider = resolveCopilotFlatProviderConfig(target, env);
|
|
5341
5308
|
return {
|
|
5342
5309
|
executable,
|
|
5343
5310
|
model,
|
|
@@ -5600,22 +5567,22 @@ function resolveReplayConfig(target, env, evalFilePath) {
|
|
|
5600
5567
|
const fixtures = resolveOptionalString(target.fixtures, env, `${target.name} replay fixtures`, {
|
|
5601
5568
|
allowLiteral: true
|
|
5602
5569
|
});
|
|
5603
|
-
const
|
|
5604
|
-
target.
|
|
5570
|
+
const executionTraces = resolveOptionalString(
|
|
5571
|
+
target.execution_traces,
|
|
5605
5572
|
env,
|
|
5606
|
-
`${target.name} replay
|
|
5573
|
+
`${target.name} replay execution_traces`,
|
|
5607
5574
|
{
|
|
5608
5575
|
allowLiteral: true
|
|
5609
5576
|
}
|
|
5610
5577
|
);
|
|
5611
|
-
if ((fixtures ? 1 : 0) + (
|
|
5578
|
+
if ((fixtures ? 1 : 0) + (executionTraces ? 1 : 0) !== 1) {
|
|
5612
5579
|
throw new Error(
|
|
5613
|
-
`Target "${target.name}" (provider: replay) requires exactly one replay source: "fixtures" or "
|
|
5580
|
+
`Target "${target.name}" (provider: replay) requires exactly one replay source: "fixtures" or "execution_traces"`
|
|
5614
5581
|
);
|
|
5615
5582
|
}
|
|
5616
5583
|
const fixturesPath = fixtures ? resolveReplaySourcePath(fixtures, evalFilePath) : void 0;
|
|
5617
|
-
const
|
|
5618
|
-
const source = fixturesPath ? { kind: "fixtures", path: fixturesPath } : { kind: "
|
|
5584
|
+
const executionTracesPath = executionTraces ? resolveReplaySourcePath(executionTraces, evalFilePath) : void 0;
|
|
5585
|
+
const source = fixturesPath ? { kind: "fixtures", path: fixturesPath } : { kind: "execution_traces", path: executionTracesPath };
|
|
5619
5586
|
const sourceTarget = resolveString(
|
|
5620
5587
|
target.source_target,
|
|
5621
5588
|
env,
|
|
@@ -6184,11 +6151,11 @@ async function expandFileReferences(tests, evalFileDir) {
|
|
|
6184
6151
|
return expanded;
|
|
6185
6152
|
}
|
|
6186
6153
|
|
|
6187
|
-
// ../../packages/core/dist/chunk-
|
|
6188
|
-
import
|
|
6154
|
+
// ../../packages/core/dist/chunk-M6LF2BEU.js
|
|
6155
|
+
import path49 from "node:path";
|
|
6189
6156
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
6190
6157
|
import { existsSync as existsSync7 } from "node:fs";
|
|
6191
|
-
import
|
|
6158
|
+
import path48 from "node:path";
|
|
6192
6159
|
import micromatch4 from "micromatch";
|
|
6193
6160
|
import { mkdir, readFile as readFile3, writeFile } from "node:fs/promises";
|
|
6194
6161
|
import path5 from "node:path";
|
|
@@ -6916,10 +6883,10 @@ function assignProp(target, prop, value) {
|
|
|
6916
6883
|
configurable: true
|
|
6917
6884
|
});
|
|
6918
6885
|
}
|
|
6919
|
-
function getElementAtPath(obj,
|
|
6920
|
-
if (!
|
|
6886
|
+
function getElementAtPath(obj, path50) {
|
|
6887
|
+
if (!path50)
|
|
6921
6888
|
return obj;
|
|
6922
|
-
return
|
|
6889
|
+
return path50.reduce((acc, key) => acc?.[key], obj);
|
|
6923
6890
|
}
|
|
6924
6891
|
function promiseAllObject(promisesObj) {
|
|
6925
6892
|
const keys = Object.keys(promisesObj);
|
|
@@ -7239,11 +7206,11 @@ function aborted(x, startIndex = 0) {
|
|
|
7239
7206
|
}
|
|
7240
7207
|
return false;
|
|
7241
7208
|
}
|
|
7242
|
-
function prefixIssues(
|
|
7209
|
+
function prefixIssues(path50, issues) {
|
|
7243
7210
|
return issues.map((iss) => {
|
|
7244
7211
|
var _a;
|
|
7245
7212
|
(_a = iss).path ?? (_a.path = []);
|
|
7246
|
-
iss.path.unshift(
|
|
7213
|
+
iss.path.unshift(path50);
|
|
7247
7214
|
return iss;
|
|
7248
7215
|
});
|
|
7249
7216
|
}
|
|
@@ -7380,7 +7347,7 @@ function treeifyError(error40, _mapper) {
|
|
|
7380
7347
|
return issue2.message;
|
|
7381
7348
|
};
|
|
7382
7349
|
const result = { errors: [] };
|
|
7383
|
-
const processError = (error41,
|
|
7350
|
+
const processError = (error41, path50 = []) => {
|
|
7384
7351
|
var _a, _b;
|
|
7385
7352
|
for (const issue2 of error41.issues) {
|
|
7386
7353
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -7390,7 +7357,7 @@ function treeifyError(error40, _mapper) {
|
|
|
7390
7357
|
} else if (issue2.code === "invalid_element") {
|
|
7391
7358
|
processError({ issues: issue2.issues }, issue2.path);
|
|
7392
7359
|
} else {
|
|
7393
|
-
const fullpath = [...
|
|
7360
|
+
const fullpath = [...path50, ...issue2.path];
|
|
7394
7361
|
if (fullpath.length === 0) {
|
|
7395
7362
|
result.errors.push(mapper(issue2));
|
|
7396
7363
|
continue;
|
|
@@ -7420,9 +7387,9 @@ function treeifyError(error40, _mapper) {
|
|
|
7420
7387
|
processError(error40);
|
|
7421
7388
|
return result;
|
|
7422
7389
|
}
|
|
7423
|
-
function toDotPath(
|
|
7390
|
+
function toDotPath(path50) {
|
|
7424
7391
|
const segs = [];
|
|
7425
|
-
for (const seg of
|
|
7392
|
+
for (const seg of path50) {
|
|
7426
7393
|
if (typeof seg === "number")
|
|
7427
7394
|
segs.push(`[${seg}]`);
|
|
7428
7395
|
else if (typeof seg === "symbol")
|
|
@@ -18852,7 +18819,7 @@ var RequestError = class _RequestError extends Error {
|
|
|
18852
18819
|
}
|
|
18853
18820
|
};
|
|
18854
18821
|
|
|
18855
|
-
// ../../packages/core/dist/chunk-
|
|
18822
|
+
// ../../packages/core/dist/chunk-M6LF2BEU.js
|
|
18856
18823
|
import { exec as execCallback } from "node:child_process";
|
|
18857
18824
|
import { readdirSync, statSync } from "node:fs";
|
|
18858
18825
|
import { readFile as readFile32, readdir as readdir2, stat as stat2 } from "node:fs/promises";
|
|
@@ -18962,6 +18929,9 @@ import path44 from "node:path";
|
|
|
18962
18929
|
import micromatch from "micromatch";
|
|
18963
18930
|
import { readFile as readFile16 } from "node:fs/promises";
|
|
18964
18931
|
import path43 from "node:path";
|
|
18932
|
+
import { mkdir as mkdir18, readFile as readFile20, writeFile as writeFile10 } from "node:fs/promises";
|
|
18933
|
+
import path47 from "node:path";
|
|
18934
|
+
import { readFile as readFile19 } from "node:fs/promises";
|
|
18965
18935
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
18966
18936
|
var ResponseCache = class {
|
|
18967
18937
|
cachePath;
|
|
@@ -19602,7 +19572,6 @@ var CodeGrader = class {
|
|
|
19602
19572
|
getImageDir
|
|
19603
19573
|
),
|
|
19604
19574
|
output: outputForPayload,
|
|
19605
|
-
answer: context.candidate,
|
|
19606
19575
|
messages: materializedMessages ?? [],
|
|
19607
19576
|
outputPath,
|
|
19608
19577
|
inputFiles: context.evalCase.file_paths,
|
|
@@ -19950,24 +19919,13 @@ var TEMPLATE_VARIABLES = {
|
|
|
19950
19919
|
INPUT: "input",
|
|
19951
19920
|
OUTPUT: "output",
|
|
19952
19921
|
FILE_CHANGES: "file_changes",
|
|
19953
|
-
TOOL_CALLS: "tool_calls"
|
|
19954
|
-
/** @deprecated Use INPUT instead — resolves to the same text value. */
|
|
19955
|
-
INPUT_TEXT: "input_text",
|
|
19956
|
-
/** @deprecated Use OUTPUT instead — resolves to the same text value. */
|
|
19957
|
-
OUTPUT_TEXT: "output_text",
|
|
19958
|
-
/** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
|
|
19959
|
-
EXPECTED_OUTPUT_TEXT: "expected_output_text"
|
|
19922
|
+
TOOL_CALLS: "tool_calls"
|
|
19960
19923
|
};
|
|
19961
19924
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
19962
19925
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
19963
19926
|
TEMPLATE_VARIABLES.OUTPUT,
|
|
19964
19927
|
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
19965
19928
|
]);
|
|
19966
|
-
var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
|
|
19967
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
|
|
19968
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
|
|
19969
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
|
|
19970
|
-
]);
|
|
19971
19929
|
var OPERATOR_GUIDANCE = {
|
|
19972
19930
|
correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
|
|
19973
19931
|
contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
|
|
@@ -20087,11 +20045,7 @@ function buildTemplateVariables(context) {
|
|
|
20087
20045
|
[TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
|
|
20088
20046
|
[TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
|
|
20089
20047
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
20090
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? ""
|
|
20091
|
-
// Deprecated aliases — same values as the primary variables above
|
|
20092
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
20093
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
20094
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
20048
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? ""
|
|
20095
20049
|
};
|
|
20096
20050
|
}
|
|
20097
20051
|
function resolveContentBasePath(context) {
|
|
@@ -20168,7 +20122,6 @@ var LlmGrader = class {
|
|
|
20168
20122
|
const variables = buildTemplateVariables(context);
|
|
20169
20123
|
const systemPrompt = buildOutputSchema();
|
|
20170
20124
|
const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
|
|
20171
|
-
warnDeprecatedTemplateVars(graderTemplate);
|
|
20172
20125
|
let userPrompt = substituteVariables(graderTemplate, variables);
|
|
20173
20126
|
if (context.fileChanges && !context.graderTemplateOverride && !this.graderTemplate) {
|
|
20174
20127
|
userPrompt += `
|
|
@@ -20499,7 +20452,6 @@ ${context.toolCalls}`;
|
|
|
20499
20452
|
const variables = buildTemplateVariables(context);
|
|
20500
20453
|
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
20501
20454
|
if (template) {
|
|
20502
|
-
warnDeprecatedTemplateVars(template);
|
|
20503
20455
|
return substituteVariables(template, variables);
|
|
20504
20456
|
}
|
|
20505
20457
|
const config2 = context.evaluator;
|
|
@@ -20553,7 +20505,6 @@ ${context.toolCalls}`;
|
|
|
20553
20505
|
const template = context.graderTemplateOverride ?? this.graderTemplate;
|
|
20554
20506
|
if (template) {
|
|
20555
20507
|
const variables = buildTemplateVariables(context);
|
|
20556
|
-
warnDeprecatedTemplateVars(template);
|
|
20557
20508
|
const customPrompt = substituteVariables(template, variables);
|
|
20558
20509
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
20559
20510
|
return `${customPrompt}
|
|
@@ -20707,7 +20658,6 @@ ${outputSchema}`;
|
|
|
20707
20658
|
}
|
|
20708
20659
|
buildCustomPrompt(context) {
|
|
20709
20660
|
const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
|
|
20710
|
-
warnDeprecatedTemplateVars(template);
|
|
20711
20661
|
return substituteVariables(template, buildTemplateVariables(context));
|
|
20712
20662
|
}
|
|
20713
20663
|
buildRubricPrompt(context, rubrics) {
|
|
@@ -20890,26 +20840,6 @@ function substituteVariables(template, variables) {
|
|
|
20890
20840
|
return variables[varName] ?? match;
|
|
20891
20841
|
});
|
|
20892
20842
|
}
|
|
20893
|
-
var ANSI_YELLOW2 = "\x1B[33m";
|
|
20894
|
-
var ANSI_RESET2 = "\x1B[0m";
|
|
20895
|
-
var warnedTemplateStrings = /* @__PURE__ */ new Set();
|
|
20896
|
-
function warnDeprecatedTemplateVars(template) {
|
|
20897
|
-
if (warnedTemplateStrings.has(template)) return;
|
|
20898
|
-
const used = [];
|
|
20899
|
-
for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
|
|
20900
|
-
if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
|
|
20901
|
-
used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
|
|
20902
|
-
}
|
|
20903
|
-
}
|
|
20904
|
-
if (used.length > 0) {
|
|
20905
|
-
warnedTemplateStrings.add(template);
|
|
20906
|
-
console.warn(
|
|
20907
|
-
`${ANSI_YELLOW2}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
|
|
20908
|
-
${used.join("\n ")}
|
|
20909
|
-
Update your custom grader template to use the new names.${ANSI_RESET2}`
|
|
20910
|
-
);
|
|
20911
|
-
}
|
|
20912
|
-
}
|
|
20913
20843
|
function calculateRubricScore(result, rubrics) {
|
|
20914
20844
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
20915
20845
|
const assertions = [];
|
|
@@ -21473,7 +21403,7 @@ var CostGrader = class {
|
|
|
21473
21403
|
};
|
|
21474
21404
|
}
|
|
21475
21405
|
};
|
|
21476
|
-
var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.
|
|
21406
|
+
var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trajectory.v1";
|
|
21477
21407
|
var NORMALIZED_TRACE_SOURCE_KINDS = [
|
|
21478
21408
|
"agentv_run",
|
|
21479
21409
|
"otlp",
|
|
@@ -22532,115 +22462,115 @@ var FieldAccuracyGrader = class {
|
|
|
22532
22462
|
* Evaluate a single field against the expected value.
|
|
22533
22463
|
*/
|
|
22534
22464
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
22535
|
-
const { path:
|
|
22536
|
-
const candidateValue = resolvePath(candidateData,
|
|
22537
|
-
const expectedValue = resolvePath(expectedData,
|
|
22465
|
+
const { path: path50, match, required: required2 = true, weight = 1 } = fieldConfig;
|
|
22466
|
+
const candidateValue = resolvePath(candidateData, path50);
|
|
22467
|
+
const expectedValue = resolvePath(expectedData, path50);
|
|
22538
22468
|
if (expectedValue === void 0) {
|
|
22539
22469
|
return {
|
|
22540
|
-
path:
|
|
22470
|
+
path: path50,
|
|
22541
22471
|
score: 1,
|
|
22542
22472
|
// No expected value means no comparison needed
|
|
22543
22473
|
weight,
|
|
22544
22474
|
hit: true,
|
|
22545
|
-
message: `${
|
|
22475
|
+
message: `${path50}: no expected value`
|
|
22546
22476
|
};
|
|
22547
22477
|
}
|
|
22548
22478
|
if (candidateValue === void 0) {
|
|
22549
22479
|
if (required2) {
|
|
22550
22480
|
return {
|
|
22551
|
-
path:
|
|
22481
|
+
path: path50,
|
|
22552
22482
|
score: 0,
|
|
22553
22483
|
weight,
|
|
22554
22484
|
hit: false,
|
|
22555
|
-
message: `${
|
|
22485
|
+
message: `${path50} (required, missing)`
|
|
22556
22486
|
};
|
|
22557
22487
|
}
|
|
22558
22488
|
return {
|
|
22559
|
-
path:
|
|
22489
|
+
path: path50,
|
|
22560
22490
|
score: 1,
|
|
22561
22491
|
// Don't penalize missing optional fields
|
|
22562
22492
|
weight: 0,
|
|
22563
22493
|
// Zero weight means it won't affect the score
|
|
22564
22494
|
hit: true,
|
|
22565
|
-
message: `${
|
|
22495
|
+
message: `${path50}: optional field missing`
|
|
22566
22496
|
};
|
|
22567
22497
|
}
|
|
22568
22498
|
switch (match) {
|
|
22569
22499
|
case "exact":
|
|
22570
|
-
return this.compareExact(
|
|
22500
|
+
return this.compareExact(path50, candidateValue, expectedValue, weight);
|
|
22571
22501
|
case "numeric_tolerance":
|
|
22572
22502
|
return this.compareNumericTolerance(
|
|
22573
|
-
|
|
22503
|
+
path50,
|
|
22574
22504
|
candidateValue,
|
|
22575
22505
|
expectedValue,
|
|
22576
22506
|
fieldConfig,
|
|
22577
22507
|
weight
|
|
22578
22508
|
);
|
|
22579
22509
|
case "date":
|
|
22580
|
-
return this.compareDate(
|
|
22510
|
+
return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
|
|
22581
22511
|
default:
|
|
22582
22512
|
return {
|
|
22583
|
-
path:
|
|
22513
|
+
path: path50,
|
|
22584
22514
|
score: 0,
|
|
22585
22515
|
weight,
|
|
22586
22516
|
hit: false,
|
|
22587
|
-
message: `${
|
|
22517
|
+
message: `${path50}: unknown match type "${match}"`
|
|
22588
22518
|
};
|
|
22589
22519
|
}
|
|
22590
22520
|
}
|
|
22591
22521
|
/**
|
|
22592
22522
|
* Exact equality comparison.
|
|
22593
22523
|
*/
|
|
22594
|
-
compareExact(
|
|
22524
|
+
compareExact(path50, candidateValue, expectedValue, weight) {
|
|
22595
22525
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
22596
22526
|
return {
|
|
22597
|
-
path:
|
|
22527
|
+
path: path50,
|
|
22598
22528
|
score: 1,
|
|
22599
22529
|
weight,
|
|
22600
22530
|
hit: true,
|
|
22601
|
-
message:
|
|
22531
|
+
message: path50
|
|
22602
22532
|
};
|
|
22603
22533
|
}
|
|
22604
22534
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
22605
22535
|
return {
|
|
22606
|
-
path:
|
|
22536
|
+
path: path50,
|
|
22607
22537
|
score: 0,
|
|
22608
22538
|
weight,
|
|
22609
22539
|
hit: false,
|
|
22610
|
-
message: `${
|
|
22540
|
+
message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
22611
22541
|
};
|
|
22612
22542
|
}
|
|
22613
22543
|
return {
|
|
22614
|
-
path:
|
|
22544
|
+
path: path50,
|
|
22615
22545
|
score: 0,
|
|
22616
22546
|
weight,
|
|
22617
22547
|
hit: false,
|
|
22618
|
-
message: `${
|
|
22548
|
+
message: `${path50} (value mismatch)`
|
|
22619
22549
|
};
|
|
22620
22550
|
}
|
|
22621
22551
|
/**
|
|
22622
22552
|
* Numeric comparison with absolute or relative tolerance.
|
|
22623
22553
|
*/
|
|
22624
|
-
compareNumericTolerance(
|
|
22554
|
+
compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
22625
22555
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
22626
22556
|
const candidateNum = toNumber(candidateValue);
|
|
22627
22557
|
const expectedNum = toNumber(expectedValue);
|
|
22628
22558
|
if (candidateNum === null || expectedNum === null) {
|
|
22629
22559
|
return {
|
|
22630
|
-
path:
|
|
22560
|
+
path: path50,
|
|
22631
22561
|
score: 0,
|
|
22632
22562
|
weight,
|
|
22633
22563
|
hit: false,
|
|
22634
|
-
message: `${
|
|
22564
|
+
message: `${path50} (non-numeric value)`
|
|
22635
22565
|
};
|
|
22636
22566
|
}
|
|
22637
22567
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
22638
22568
|
return {
|
|
22639
|
-
path:
|
|
22569
|
+
path: path50,
|
|
22640
22570
|
score: 0,
|
|
22641
22571
|
weight,
|
|
22642
22572
|
hit: false,
|
|
22643
|
-
message: `${
|
|
22573
|
+
message: `${path50} (invalid numeric value)`
|
|
22644
22574
|
};
|
|
22645
22575
|
}
|
|
22646
22576
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -22653,61 +22583,61 @@ var FieldAccuracyGrader = class {
|
|
|
22653
22583
|
}
|
|
22654
22584
|
if (withinTolerance) {
|
|
22655
22585
|
return {
|
|
22656
|
-
path:
|
|
22586
|
+
path: path50,
|
|
22657
22587
|
score: 1,
|
|
22658
22588
|
weight,
|
|
22659
22589
|
hit: true,
|
|
22660
|
-
message: `${
|
|
22590
|
+
message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
|
|
22661
22591
|
};
|
|
22662
22592
|
}
|
|
22663
22593
|
return {
|
|
22664
|
-
path:
|
|
22594
|
+
path: path50,
|
|
22665
22595
|
score: 0,
|
|
22666
22596
|
weight,
|
|
22667
22597
|
hit: false,
|
|
22668
|
-
message: `${
|
|
22598
|
+
message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
22669
22599
|
};
|
|
22670
22600
|
}
|
|
22671
22601
|
/**
|
|
22672
22602
|
* Date comparison with format normalization.
|
|
22673
22603
|
*/
|
|
22674
|
-
compareDate(
|
|
22604
|
+
compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
22675
22605
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
22676
22606
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
22677
22607
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
22678
22608
|
if (candidateDate === null) {
|
|
22679
22609
|
return {
|
|
22680
|
-
path:
|
|
22610
|
+
path: path50,
|
|
22681
22611
|
score: 0,
|
|
22682
22612
|
weight,
|
|
22683
22613
|
hit: false,
|
|
22684
|
-
message: `${
|
|
22614
|
+
message: `${path50} (unparseable candidate date)`
|
|
22685
22615
|
};
|
|
22686
22616
|
}
|
|
22687
22617
|
if (expectedDate === null) {
|
|
22688
22618
|
return {
|
|
22689
|
-
path:
|
|
22619
|
+
path: path50,
|
|
22690
22620
|
score: 0,
|
|
22691
22621
|
weight,
|
|
22692
22622
|
hit: false,
|
|
22693
|
-
message: `${
|
|
22623
|
+
message: `${path50} (unparseable expected date)`
|
|
22694
22624
|
};
|
|
22695
22625
|
}
|
|
22696
22626
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
22697
22627
|
return {
|
|
22698
|
-
path:
|
|
22628
|
+
path: path50,
|
|
22699
22629
|
score: 1,
|
|
22700
22630
|
weight,
|
|
22701
22631
|
hit: true,
|
|
22702
|
-
message:
|
|
22632
|
+
message: path50
|
|
22703
22633
|
};
|
|
22704
22634
|
}
|
|
22705
22635
|
return {
|
|
22706
|
-
path:
|
|
22636
|
+
path: path50,
|
|
22707
22637
|
score: 0,
|
|
22708
22638
|
weight,
|
|
22709
22639
|
hit: false,
|
|
22710
|
-
message: `${
|
|
22640
|
+
message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
22711
22641
|
};
|
|
22712
22642
|
}
|
|
22713
22643
|
/**
|
|
@@ -22740,11 +22670,11 @@ var FieldAccuracyGrader = class {
|
|
|
22740
22670
|
};
|
|
22741
22671
|
}
|
|
22742
22672
|
};
|
|
22743
|
-
function resolvePath(obj,
|
|
22744
|
-
if (!
|
|
22673
|
+
function resolvePath(obj, path50) {
|
|
22674
|
+
if (!path50 || !obj) {
|
|
22745
22675
|
return void 0;
|
|
22746
22676
|
}
|
|
22747
|
-
const parts =
|
|
22677
|
+
const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
22748
22678
|
let current = obj;
|
|
22749
22679
|
for (const part of parts) {
|
|
22750
22680
|
if (current === null || current === void 0) {
|
|
@@ -22999,10 +22929,7 @@ function buildTemplateVariables2(input) {
|
|
|
22999
22929
|
[TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(input.rubrics),
|
|
23000
22930
|
[TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(input.rubrics),
|
|
23001
22931
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? "",
|
|
23002
|
-
[TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? ""
|
|
23003
|
-
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
23004
|
-
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(),
|
|
23005
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? "").trim()
|
|
22932
|
+
[TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? ""
|
|
23006
22933
|
};
|
|
23007
22934
|
}
|
|
23008
22935
|
function assembleLlmGraderPrompt(input) {
|
|
@@ -23278,8 +23205,8 @@ var TokenUsageGrader = class {
|
|
|
23278
23205
|
};
|
|
23279
23206
|
}
|
|
23280
23207
|
};
|
|
23281
|
-
function getNestedValue(obj,
|
|
23282
|
-
const parts =
|
|
23208
|
+
function getNestedValue(obj, path50) {
|
|
23209
|
+
const parts = path50.split(".");
|
|
23283
23210
|
let current = obj;
|
|
23284
23211
|
for (const part of parts) {
|
|
23285
23212
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -26420,11 +26347,13 @@ var CopilotCliProvider = class {
|
|
|
26420
26347
|
supportsBatch = false;
|
|
26421
26348
|
config;
|
|
26422
26349
|
runPromptMode;
|
|
26423
|
-
|
|
26350
|
+
spawnAcpProcess;
|
|
26351
|
+
constructor(targetName, config2, promptRunner = defaultCopilotCliPromptRunner, spawnAcpProcess = spawn2) {
|
|
26424
26352
|
this.id = `copilot-cli:${targetName}`;
|
|
26425
26353
|
this.targetName = targetName;
|
|
26426
26354
|
this.config = config2;
|
|
26427
26355
|
this.runPromptMode = promptRunner;
|
|
26356
|
+
this.spawnAcpProcess = spawnAcpProcess;
|
|
26428
26357
|
}
|
|
26429
26358
|
async invoke(request) {
|
|
26430
26359
|
if (request.signal?.aborted) {
|
|
@@ -26432,14 +26361,12 @@ var CopilotCliProvider = class {
|
|
|
26432
26361
|
}
|
|
26433
26362
|
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
26434
26363
|
const startMs = Date.now();
|
|
26435
|
-
if (this.config.customProvider) {
|
|
26436
|
-
return await this.invokePromptMode(request, startTime, startMs);
|
|
26437
|
-
}
|
|
26438
26364
|
const logger = await this.createStreamLogger(request, "acp").catch(() => void 0);
|
|
26439
26365
|
const executable = this.resolveExecutable();
|
|
26440
26366
|
const args = this.buildCliArgs();
|
|
26441
|
-
const agentProcess =
|
|
26367
|
+
const agentProcess = this.spawnAcpProcess(executable, args, {
|
|
26442
26368
|
env: buildCopilotCliProviderEnv(process.env, this.config.customProvider),
|
|
26369
|
+
cwd: this.resolveCwd(request.cwd) ?? process.cwd(),
|
|
26443
26370
|
stdio: ["pipe", "pipe", "inherit"]
|
|
26444
26371
|
});
|
|
26445
26372
|
trackChild(agentProcess);
|
|
@@ -26455,6 +26382,7 @@ var CopilotCliProvider = class {
|
|
|
26455
26382
|
const input = Writable.toWeb(agentProcess.stdin);
|
|
26456
26383
|
const output = Readable.toWeb(agentProcess.stdout);
|
|
26457
26384
|
const stream = ndJsonStream(input, output);
|
|
26385
|
+
const customProvider = this.config.customProvider;
|
|
26458
26386
|
const client = {
|
|
26459
26387
|
async requestPermission() {
|
|
26460
26388
|
return {
|
|
@@ -26464,7 +26392,7 @@ var CopilotCliProvider = class {
|
|
|
26464
26392
|
async sessionUpdate(params) {
|
|
26465
26393
|
const update = params.update;
|
|
26466
26394
|
const sessionUpdate = update.sessionUpdate;
|
|
26467
|
-
logger?.handleEvent(sessionUpdate, update);
|
|
26395
|
+
logger?.handleEvent(sessionUpdate, sanitizeSensitiveValue(update, customProvider));
|
|
26468
26396
|
if (sessionUpdate === "tool_call") {
|
|
26469
26397
|
const callId = update.toolCallId ?? randomUUID5();
|
|
26470
26398
|
if (!update.status || update.status === "pending" || update.status === "in_progress") {
|
|
@@ -26956,6 +26884,26 @@ function sanitizeSensitiveText(text, customProvider) {
|
|
|
26956
26884
|
}
|
|
26957
26885
|
return sanitized;
|
|
26958
26886
|
}
|
|
26887
|
+
function sanitizeSensitiveValue(value, customProvider) {
|
|
26888
|
+
if (!customProvider) {
|
|
26889
|
+
return value;
|
|
26890
|
+
}
|
|
26891
|
+
if (typeof value === "string") {
|
|
26892
|
+
return sanitizeSensitiveText(value, customProvider);
|
|
26893
|
+
}
|
|
26894
|
+
if (Array.isArray(value)) {
|
|
26895
|
+
return value.map((item) => sanitizeSensitiveValue(item, customProvider));
|
|
26896
|
+
}
|
|
26897
|
+
if (value && typeof value === "object") {
|
|
26898
|
+
return Object.fromEntries(
|
|
26899
|
+
Object.entries(value).map(([key, entry]) => [
|
|
26900
|
+
key,
|
|
26901
|
+
sanitizeSensitiveValue(entry, customProvider)
|
|
26902
|
+
])
|
|
26903
|
+
);
|
|
26904
|
+
}
|
|
26905
|
+
return value;
|
|
26906
|
+
}
|
|
26959
26907
|
async function defaultCopilotCliPromptRunner(options) {
|
|
26960
26908
|
return await new Promise((resolve, reject) => {
|
|
26961
26909
|
const child = spawn2(options.executable, options.args, {
|
|
@@ -27322,12 +27270,14 @@ async function loadCopilotSdk() {
|
|
|
27322
27270
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
27323
27271
|
if (message.includes("vscode-jsonrpc")) {
|
|
27324
27272
|
throw new Error(
|
|
27325
|
-
|
|
27273
|
+
`@github/copilot-sdk failed to load: vscode-jsonrpc ESM import specifier mismatch.
|
|
27274
|
+
The package imports 'vscode-jsonrpc/node' but the installed version exposes 'node.js'.
|
|
27326
27275
|
|
|
27327
|
-
|
|
27328
|
-
-
|
|
27329
|
-
|
|
27330
|
-
|
|
27276
|
+
Repair (run once in your project root):
|
|
27277
|
+
node -e "const p=require.resolve('vscode-jsonrpc/package.json').replace('/package.json',''); require('fs').symlinkSync(p+'/node.js',p+'/node','file')" 2>/dev/null || true
|
|
27278
|
+
|
|
27279
|
+
Or switch to the copilot-cli target (no SDK dependency):
|
|
27280
|
+
Set provider: copilot-cli in your eval YAML`
|
|
27331
27281
|
);
|
|
27332
27282
|
}
|
|
27333
27283
|
throw new Error(
|
|
@@ -27358,7 +27308,8 @@ var CopilotSdkProvider = class {
|
|
|
27358
27308
|
throw new Error("Copilot SDK request was aborted before execution");
|
|
27359
27309
|
}
|
|
27360
27310
|
const sdk = await loadCopilotSdk();
|
|
27361
|
-
const
|
|
27311
|
+
const evalCwd = this.resolveCwd(request.cwd);
|
|
27312
|
+
const client = await this.getOrCreateClient(sdk, evalCwd ?? void 0);
|
|
27362
27313
|
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
27363
27314
|
const startMs = Date.now();
|
|
27364
27315
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
@@ -27368,10 +27319,9 @@ var CopilotSdkProvider = class {
|
|
|
27368
27319
|
if (this.config.model) {
|
|
27369
27320
|
sessionOptions.model = this.config.model;
|
|
27370
27321
|
}
|
|
27371
|
-
|
|
27372
|
-
|
|
27373
|
-
sessionOptions.
|
|
27374
|
-
sessionOptions.skillDirectories = resolveSkillDirectories(cwd);
|
|
27322
|
+
if (evalCwd) {
|
|
27323
|
+
sessionOptions.workingDirectory = evalCwd;
|
|
27324
|
+
sessionOptions.skillDirectories = resolveSkillDirectories(evalCwd);
|
|
27375
27325
|
}
|
|
27376
27326
|
const systemPrompt = this.config.systemPrompt;
|
|
27377
27327
|
if (systemPrompt) {
|
|
@@ -27380,12 +27330,12 @@ var CopilotSdkProvider = class {
|
|
|
27380
27330
|
content: systemPrompt
|
|
27381
27331
|
};
|
|
27382
27332
|
}
|
|
27383
|
-
const customProvider =
|
|
27333
|
+
const customProvider = this.config.customProvider;
|
|
27384
27334
|
if (customProvider) {
|
|
27385
27335
|
const providerType = customProvider.type ?? "openai";
|
|
27386
27336
|
const provider = {
|
|
27387
27337
|
type: providerType,
|
|
27388
|
-
baseUrl:
|
|
27338
|
+
baseUrl: normalizeProviderBaseUrl(customProvider.baseUrl, providerType)
|
|
27389
27339
|
};
|
|
27390
27340
|
if (customProvider.bearerToken) {
|
|
27391
27341
|
provider.bearerToken = customProvider.bearerToken;
|
|
@@ -27532,7 +27482,7 @@ var CopilotSdkProvider = class {
|
|
|
27532
27482
|
}
|
|
27533
27483
|
}
|
|
27534
27484
|
// biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
|
|
27535
|
-
async getOrCreateClient(sdk) {
|
|
27485
|
+
async getOrCreateClient(sdk, evalCwd) {
|
|
27536
27486
|
if (!this.client) {
|
|
27537
27487
|
const clientOptions = {};
|
|
27538
27488
|
if (this.config.cliUrl) {
|
|
@@ -27546,6 +27496,11 @@ var CopilotSdkProvider = class {
|
|
|
27546
27496
|
clientOptions.cliPath = nativePath;
|
|
27547
27497
|
}
|
|
27548
27498
|
}
|
|
27499
|
+
const resolvedCwd = evalCwd ?? process.cwd();
|
|
27500
|
+
clientOptions.cwd = resolvedCwd;
|
|
27501
|
+
if (this.config.args && this.config.args.length > 0) {
|
|
27502
|
+
clientOptions.cliArgs = [...this.config.args];
|
|
27503
|
+
}
|
|
27549
27504
|
if (this.config.githubToken) {
|
|
27550
27505
|
clientOptions.githubToken = this.config.githubToken;
|
|
27551
27506
|
}
|
|
@@ -27629,22 +27584,6 @@ var CopilotSdkProvider = class {
|
|
|
27629
27584
|
}
|
|
27630
27585
|
}
|
|
27631
27586
|
};
|
|
27632
|
-
function resolveCustomProviderConfig(config2) {
|
|
27633
|
-
if (config2.customProvider) {
|
|
27634
|
-
return config2.customProvider;
|
|
27635
|
-
}
|
|
27636
|
-
if (!config2.byokBaseUrl) {
|
|
27637
|
-
return void 0;
|
|
27638
|
-
}
|
|
27639
|
-
return {
|
|
27640
|
-
...config2.byokType ? { type: config2.byokType } : {},
|
|
27641
|
-
baseUrl: config2.byokBaseUrl,
|
|
27642
|
-
...config2.byokApiKey ? { apiKey: config2.byokApiKey } : {},
|
|
27643
|
-
...config2.byokBearerToken ? { bearerToken: config2.byokBearerToken } : {},
|
|
27644
|
-
...config2.byokApiVersion ? { apiVersion: config2.byokApiVersion } : {},
|
|
27645
|
-
...config2.byokWireApi ? { wireApi: config2.byokWireApi } : {}
|
|
27646
|
-
};
|
|
27647
|
-
}
|
|
27648
27587
|
function resolveSkillDirectories(cwd) {
|
|
27649
27588
|
const candidates = [
|
|
27650
27589
|
path14.join(cwd, ".claude", "skills"),
|
|
@@ -27653,7 +27592,7 @@ function resolveSkillDirectories(cwd) {
|
|
|
27653
27592
|
];
|
|
27654
27593
|
return candidates.filter((dir) => existsSync22(dir));
|
|
27655
27594
|
}
|
|
27656
|
-
function
|
|
27595
|
+
function normalizeProviderBaseUrl(baseUrl, type) {
|
|
27657
27596
|
const trimmed = baseUrl.trim().replace(/\/+$/, "");
|
|
27658
27597
|
if (/^https?:\/\//i.test(trimmed)) {
|
|
27659
27598
|
return trimmed;
|
|
@@ -29690,8 +29629,9 @@ function extractTranscript(raw) {
|
|
|
29690
29629
|
const transcript = raw.transcript;
|
|
29691
29630
|
return transcript;
|
|
29692
29631
|
}
|
|
29693
|
-
var
|
|
29632
|
+
var EXECUTION_TRACE_SCHEMA_VERSION = "agentv.trace.v1";
|
|
29694
29633
|
var TRACE_ENVELOPE_FORMAT = "otlp_openinference_spans";
|
|
29634
|
+
var TRANSCRIPT_MESSAGE_EVENT_NAME = "agentv.transcript.message";
|
|
29695
29635
|
var CAPTURE_CONTENT_VALUES = ["none", "metadata", "full"];
|
|
29696
29636
|
var REDACTION_LEVEL_VALUES = ["none", "partial", "full"];
|
|
29697
29637
|
var WARNING_SEVERITY_VALUES = ["info", "warning", "error"];
|
|
@@ -29793,8 +29733,8 @@ var TraceEnvelopeScoreWireSchema = external_exports.object({
|
|
|
29793
29733
|
evidence: AttributeMapWireSchema.optional()
|
|
29794
29734
|
}).strict();
|
|
29795
29735
|
var TraceEnvelopeWireSchema = external_exports.object({
|
|
29796
|
-
schema_version: external_exports.literal(
|
|
29797
|
-
|
|
29736
|
+
schema_version: external_exports.literal(EXECUTION_TRACE_SCHEMA_VERSION),
|
|
29737
|
+
artifact_id: external_exports.string(),
|
|
29798
29738
|
created_at: external_exports.string(),
|
|
29799
29739
|
eval: TraceEnvelopeEvalWireSchema,
|
|
29800
29740
|
replay: TraceEnvelopeReplayWireSchema.optional(),
|
|
@@ -29808,6 +29748,9 @@ var TraceEnvelopeWireSchema = external_exports.object({
|
|
|
29808
29748
|
function dropUndefined2(value) {
|
|
29809
29749
|
return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== void 0));
|
|
29810
29750
|
}
|
|
29751
|
+
function isRecord(value) {
|
|
29752
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
29753
|
+
}
|
|
29811
29754
|
function definedStringRecord(value) {
|
|
29812
29755
|
if (!value) {
|
|
29813
29756
|
return void 0;
|
|
@@ -29831,6 +29774,32 @@ function parseTimeMs(timestamp) {
|
|
|
29831
29774
|
function msToUnixNano(ms) {
|
|
29832
29775
|
return String(BigInt(Math.round(ms)) * 1000000n);
|
|
29833
29776
|
}
|
|
29777
|
+
function compareUnixNanoStrings(first, second) {
|
|
29778
|
+
try {
|
|
29779
|
+
const left = BigInt(first);
|
|
29780
|
+
const right = BigInt(second);
|
|
29781
|
+
return left < right ? -1 : left > right ? 1 : 0;
|
|
29782
|
+
} catch {
|
|
29783
|
+
return first.localeCompare(second);
|
|
29784
|
+
}
|
|
29785
|
+
}
|
|
29786
|
+
function compareSpanTime(first, second) {
|
|
29787
|
+
const byStart = compareUnixNanoStrings(first.startTimeUnixNano, second.startTimeUnixNano);
|
|
29788
|
+
if (byStart !== 0) {
|
|
29789
|
+
return byStart;
|
|
29790
|
+
}
|
|
29791
|
+
if (first.spanId === second.parentSpanId) {
|
|
29792
|
+
return -1;
|
|
29793
|
+
}
|
|
29794
|
+
if (second.spanId === first.parentSpanId) {
|
|
29795
|
+
return 1;
|
|
29796
|
+
}
|
|
29797
|
+
const byEnd = compareUnixNanoStrings(first.endTimeUnixNano, second.endTimeUnixNano);
|
|
29798
|
+
return byEnd !== 0 ? byEnd : first.spanId.localeCompare(second.spanId);
|
|
29799
|
+
}
|
|
29800
|
+
function orderedSpans(spans) {
|
|
29801
|
+
return [...spans].sort(compareSpanTime);
|
|
29802
|
+
}
|
|
29834
29803
|
function unixNanoToIso(value) {
|
|
29835
29804
|
if (!value) {
|
|
29836
29805
|
return void 0;
|
|
@@ -29919,6 +29888,41 @@ function maybeToolContentAttributes(toolCall, capture) {
|
|
|
29919
29888
|
"gen_ai.tool.call.result": toolCall.output
|
|
29920
29889
|
});
|
|
29921
29890
|
}
|
|
29891
|
+
function toTranscriptToolCallWire(toolCall, capture) {
|
|
29892
|
+
return dropUndefined2({
|
|
29893
|
+
tool: toolCall.tool,
|
|
29894
|
+
input: capture.content === "full" ? toolCall.input : void 0,
|
|
29895
|
+
output: capture.content === "full" ? toolCall.output : void 0,
|
|
29896
|
+
id: toolCall.id,
|
|
29897
|
+
start_time: toolCall.startTime,
|
|
29898
|
+
end_time: toolCall.endTime,
|
|
29899
|
+
duration_ms: toolCall.durationMs
|
|
29900
|
+
});
|
|
29901
|
+
}
|
|
29902
|
+
function toTranscriptMessageWire(message, capture) {
|
|
29903
|
+
return dropUndefined2({
|
|
29904
|
+
role: message.role,
|
|
29905
|
+
name: message.name,
|
|
29906
|
+
content: capture.content === "full" ? message.content : void 0,
|
|
29907
|
+
tool_calls: message.toolCalls?.map((toolCall) => toTranscriptToolCallWire(toolCall, capture)),
|
|
29908
|
+
start_time: message.startTime,
|
|
29909
|
+
end_time: message.endTime,
|
|
29910
|
+
duration_ms: message.durationMs,
|
|
29911
|
+
metadata: message.metadata,
|
|
29912
|
+
token_usage: message.tokenUsage
|
|
29913
|
+
});
|
|
29914
|
+
}
|
|
29915
|
+
function transcriptMessageEvent(message, index, capture) {
|
|
29916
|
+
const startMs = parseTimeMs(message.startTime);
|
|
29917
|
+
return {
|
|
29918
|
+
name: TRANSCRIPT_MESSAGE_EVENT_NAME,
|
|
29919
|
+
timeUnixNano: startMs !== void 0 ? msToUnixNano(startMs) : void 0,
|
|
29920
|
+
attributes: dropUndefined2({
|
|
29921
|
+
"agentv.transcript.message.index": index,
|
|
29922
|
+
"agentv.transcript.message": toTranscriptMessageWire(message, capture)
|
|
29923
|
+
})
|
|
29924
|
+
};
|
|
29925
|
+
}
|
|
29922
29926
|
function spanStatusFromResult(result) {
|
|
29923
29927
|
if (result.executionStatus === "execution_error" || result.error) {
|
|
29924
29928
|
return { code: "ERROR", message: result.error };
|
|
@@ -29965,7 +29969,14 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
|
|
|
29965
29969
|
const capture = capturePolicy(options);
|
|
29966
29970
|
const source = sourceFromResult(result, options);
|
|
29967
29971
|
const traceId = hashHex(
|
|
29968
|
-
[
|
|
29972
|
+
[
|
|
29973
|
+
"execution-trace",
|
|
29974
|
+
result.timestamp,
|
|
29975
|
+
result.suite,
|
|
29976
|
+
result.testId,
|
|
29977
|
+
result.target,
|
|
29978
|
+
options.runId
|
|
29979
|
+
],
|
|
29969
29980
|
32
|
|
29970
29981
|
);
|
|
29971
29982
|
const rootSpanId = hashHex([traceId, "root"], 16);
|
|
@@ -29974,6 +29985,9 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
|
|
|
29974
29985
|
const rootStatus = spanStatusFromResult(result);
|
|
29975
29986
|
const conversionWarnings = [];
|
|
29976
29987
|
const spans = [];
|
|
29988
|
+
const rootEvents = result.trace.messages.map(
|
|
29989
|
+
(message, index) => transcriptMessageEvent(message, index, capture)
|
|
29990
|
+
);
|
|
29977
29991
|
const rootAttributes = dropUndefined2({
|
|
29978
29992
|
"gen_ai.operation.name": "invoke_agent",
|
|
29979
29993
|
"gen_ai.provider.name": "agentv",
|
|
@@ -30005,12 +30019,13 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
|
|
|
30005
30019
|
status: rootStatus,
|
|
30006
30020
|
attributes: rootAttributes,
|
|
30007
30021
|
events: result.error ? [
|
|
30022
|
+
...rootEvents,
|
|
30008
30023
|
{
|
|
30009
30024
|
name: "exception",
|
|
30010
30025
|
timeUnixNano: msToUnixNano(Math.max(rootStartMs, rootEndMs)),
|
|
30011
30026
|
attributes: { "exception.message": result.error }
|
|
30012
30027
|
}
|
|
30013
|
-
] :
|
|
30028
|
+
] : rootEvents
|
|
30014
30029
|
});
|
|
30015
30030
|
const assistantEntries = assistantMessages(result.trace.messages);
|
|
30016
30031
|
const chatEntries = assistantEntries.length > 0 ? assistantEntries : result.output.length > 0 ? [{ message: { role: "assistant", content: result.output }, index: 0 }] : [];
|
|
@@ -30094,7 +30109,7 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
|
|
|
30094
30109
|
});
|
|
30095
30110
|
}
|
|
30096
30111
|
}
|
|
30097
|
-
const
|
|
30112
|
+
const artifactId = `execution-trace-${hashHex([traceId, result.timestamp, result.score], 20)}`;
|
|
30098
30113
|
const evalIdentity = {
|
|
30099
30114
|
evalId: options.evalId,
|
|
30100
30115
|
evalPath: options.evalPath,
|
|
@@ -30109,8 +30124,8 @@ function buildTraceEnvelopeFromEvaluationResult(result, options = {}) {
|
|
|
30109
30124
|
experiment: options.experiment
|
|
30110
30125
|
};
|
|
30111
30126
|
return {
|
|
30112
|
-
schemaVersion:
|
|
30113
|
-
|
|
30127
|
+
schemaVersion: EXECUTION_TRACE_SCHEMA_VERSION,
|
|
30128
|
+
artifactId,
|
|
30114
30129
|
createdAt: now.toISOString(),
|
|
30115
30130
|
eval: evalIdentity,
|
|
30116
30131
|
replay: options.replay,
|
|
@@ -30133,7 +30148,7 @@ function toTraceEnvelopeWire(envelope) {
|
|
|
30133
30148
|
return TraceEnvelopeWireSchema.parse(
|
|
30134
30149
|
dropUndefined2({
|
|
30135
30150
|
schema_version: envelope.schemaVersion,
|
|
30136
|
-
|
|
30151
|
+
artifact_id: envelope.artifactId,
|
|
30137
30152
|
created_at: envelope.createdAt,
|
|
30138
30153
|
eval: toTraceEnvelopeEvalWire(envelope.eval),
|
|
30139
30154
|
replay: envelope.replay ? toTraceEnvelopeReplayWire(envelope.replay) : void 0,
|
|
@@ -30150,7 +30165,7 @@ function fromTraceEnvelopeWire(input) {
|
|
|
30150
30165
|
const wire = TraceEnvelopeWireSchema.parse(input);
|
|
30151
30166
|
return {
|
|
30152
30167
|
schemaVersion: wire.schema_version,
|
|
30153
|
-
|
|
30168
|
+
artifactId: wire.artifact_id,
|
|
30154
30169
|
createdAt: wire.created_at,
|
|
30155
30170
|
eval: fromTraceEnvelopeEvalWire(wire.eval),
|
|
30156
30171
|
replay: wire.replay ? fromTraceEnvelopeReplayWire(wire.replay) : void 0,
|
|
@@ -30433,10 +30448,68 @@ function toolCallFromSpan(span) {
|
|
|
30433
30448
|
durationMs: durationMsFromSpan(span)
|
|
30434
30449
|
};
|
|
30435
30450
|
}
|
|
30436
|
-
function
|
|
30437
|
-
|
|
30438
|
-
|
|
30439
|
-
|
|
30451
|
+
function buildSpanMap(spans) {
|
|
30452
|
+
return new Map(spans.map((span) => [span.spanId, span]));
|
|
30453
|
+
}
|
|
30454
|
+
function ancestorSpanIds(span, spansById) {
|
|
30455
|
+
const ancestors = [];
|
|
30456
|
+
const seen = /* @__PURE__ */ new Set();
|
|
30457
|
+
let parentSpanId = span.parentSpanId ?? void 0;
|
|
30458
|
+
while (parentSpanId && !seen.has(parentSpanId)) {
|
|
30459
|
+
seen.add(parentSpanId);
|
|
30460
|
+
ancestors.push(parentSpanId);
|
|
30461
|
+
parentSpanId = spansById.get(parentSpanId)?.parentSpanId ?? void 0;
|
|
30462
|
+
}
|
|
30463
|
+
return ancestors;
|
|
30464
|
+
}
|
|
30465
|
+
function nearestAncestorToolCallId(ancestorIds, spansById) {
|
|
30466
|
+
for (const ancestorId of ancestorIds) {
|
|
30467
|
+
const ancestor = spansById.get(ancestorId);
|
|
30468
|
+
if (ancestor && isToolSpan(ancestor)) {
|
|
30469
|
+
return toolCallFromSpan(ancestor).id;
|
|
30470
|
+
}
|
|
30471
|
+
}
|
|
30472
|
+
return void 0;
|
|
30473
|
+
}
|
|
30474
|
+
function fromTranscriptToolCallWire(wire) {
|
|
30475
|
+
if (!isRecord(wire) || typeof wire.tool !== "string") {
|
|
30476
|
+
return void 0;
|
|
30477
|
+
}
|
|
30478
|
+
return {
|
|
30479
|
+
tool: wire.tool,
|
|
30480
|
+
input: wire.input,
|
|
30481
|
+
output: wire.output,
|
|
30482
|
+
id: typeof wire.id === "string" ? wire.id : void 0,
|
|
30483
|
+
startTime: typeof wire.start_time === "string" ? wire.start_time : void 0,
|
|
30484
|
+
endTime: typeof wire.end_time === "string" ? wire.end_time : void 0,
|
|
30485
|
+
durationMs: numberAttribute(wire, "duration_ms")
|
|
30486
|
+
};
|
|
30487
|
+
}
|
|
30488
|
+
function fromTranscriptMessageWire(wire) {
|
|
30489
|
+
if (!isRecord(wire) || typeof wire.role !== "string") {
|
|
30490
|
+
return void 0;
|
|
30491
|
+
}
|
|
30492
|
+
const toolCalls = Array.isArray(wire.tool_calls) ? wire.tool_calls.map(fromTranscriptToolCallWire).filter((toolCall) => toolCall !== void 0) : void 0;
|
|
30493
|
+
return dropUndefined2({
|
|
30494
|
+
role: wire.role,
|
|
30495
|
+
name: typeof wire.name === "string" ? wire.name : void 0,
|
|
30496
|
+
content: wire.content,
|
|
30497
|
+
toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : void 0,
|
|
30498
|
+
startTime: typeof wire.start_time === "string" ? wire.start_time : void 0,
|
|
30499
|
+
endTime: typeof wire.end_time === "string" ? wire.end_time : void 0,
|
|
30500
|
+
durationMs: numberAttribute(wire, "duration_ms"),
|
|
30501
|
+
metadata: isRecord(wire.metadata) ? wire.metadata : void 0,
|
|
30502
|
+
tokenUsage: isRecord(wire.token_usage) ? tokenUsageFromAttributes({
|
|
30503
|
+
"gen_ai.usage.input_tokens": wire.token_usage.input,
|
|
30504
|
+
"gen_ai.usage.output_tokens": wire.token_usage.output,
|
|
30505
|
+
"gen_ai.usage.cache_read.input_tokens": wire.token_usage.cached,
|
|
30506
|
+
"gen_ai.usage.reasoning.output_tokens": wire.token_usage.reasoning
|
|
30507
|
+
}) : void 0
|
|
30508
|
+
});
|
|
30509
|
+
}
|
|
30510
|
+
function traceEnvelopeToMessageEntries(envelope) {
|
|
30511
|
+
const spans = orderedSpans(envelope.trace.spans);
|
|
30512
|
+
const spansById = buildSpanMap(spans);
|
|
30440
30513
|
const toolSpansByParent = /* @__PURE__ */ new Map();
|
|
30441
30514
|
for (const span of spans.filter(isToolSpan)) {
|
|
30442
30515
|
const parentSpanId = span.parentSpanId ?? envelope.trace.rootSpanId;
|
|
@@ -30444,20 +30517,97 @@ function traceEnvelopeToMessages(envelope) {
|
|
|
30444
30517
|
existing.push(span);
|
|
30445
30518
|
toolSpansByParent.set(parentSpanId, existing);
|
|
30446
30519
|
}
|
|
30447
|
-
return spans.filter(isChatSpan).map((span) => ({
|
|
30448
|
-
|
|
30449
|
-
|
|
30450
|
-
|
|
30451
|
-
|
|
30452
|
-
|
|
30453
|
-
|
|
30454
|
-
|
|
30455
|
-
|
|
30456
|
-
|
|
30457
|
-
|
|
30520
|
+
return spans.filter(isChatSpan).map((span, fallbackIndex) => ({
|
|
30521
|
+
index: numberAttribute(span.attributes, "agentv.message.index") ?? fallbackIndex,
|
|
30522
|
+
timeUnixNano: span.startTimeUnixNano,
|
|
30523
|
+
message: {
|
|
30524
|
+
role: "assistant",
|
|
30525
|
+
content: span.attributes["gen_ai.output.messages"],
|
|
30526
|
+
toolCalls: toolSpansByParent.get(span.spanId)?.map(toolCallFromSpan),
|
|
30527
|
+
startTime: unixNanoToIso(span.startTimeUnixNano),
|
|
30528
|
+
endTime: unixNanoToIso(span.endTimeUnixNano),
|
|
30529
|
+
durationMs: durationMsFromSpan(span),
|
|
30530
|
+
tokenUsage: tokenUsageFromAttributes(span.attributes),
|
|
30531
|
+
metadata: dropUndefined2({
|
|
30532
|
+
span_id: span.spanId,
|
|
30533
|
+
trace_id: span.traceId,
|
|
30534
|
+
parent_span_id: span.parentSpanId ?? void 0,
|
|
30535
|
+
parent_tool_call_id: nearestAncestorToolCallId(ancestorSpanIds(span, spansById), spansById)
|
|
30536
|
+
})
|
|
30458
30537
|
}
|
|
30459
30538
|
}));
|
|
30460
30539
|
}
|
|
30540
|
+
function traceEnvelopeToMessages(envelope) {
|
|
30541
|
+
return traceEnvelopeToMessageEntries(envelope).map((entry) => entry.message);
|
|
30542
|
+
}
|
|
30543
|
+
function transcriptMessageEntries(envelope) {
|
|
30544
|
+
const entries = [];
|
|
30545
|
+
for (const span of orderedSpans(envelope.trace.spans)) {
|
|
30546
|
+
for (const event of span.events ?? []) {
|
|
30547
|
+
if (event.name !== TRANSCRIPT_MESSAGE_EVENT_NAME) {
|
|
30548
|
+
continue;
|
|
30549
|
+
}
|
|
30550
|
+
const attributes = event.attributes ?? {};
|
|
30551
|
+
const message = fromTranscriptMessageWire(attributes["agentv.transcript.message"]);
|
|
30552
|
+
if (!message) {
|
|
30553
|
+
continue;
|
|
30554
|
+
}
|
|
30555
|
+
entries.push({
|
|
30556
|
+
index: numberAttribute(attributes, "agentv.transcript.message.index") ?? entries.length,
|
|
30557
|
+
timeUnixNano: event.timeUnixNano,
|
|
30558
|
+
message
|
|
30559
|
+
});
|
|
30560
|
+
}
|
|
30561
|
+
}
|
|
30562
|
+
return entries;
|
|
30563
|
+
}
|
|
30564
|
+
function traceEnvelopeToTranscriptMessages(envelope) {
|
|
30565
|
+
const entries = transcriptMessageEntries(envelope);
|
|
30566
|
+
if (entries.length === 0) {
|
|
30567
|
+
return traceEnvelopeToMessages(envelope);
|
|
30568
|
+
}
|
|
30569
|
+
return [...entries].sort((first, second) => {
|
|
30570
|
+
const byIndex = first.index - second.index;
|
|
30571
|
+
if (byIndex !== 0) {
|
|
30572
|
+
return byIndex;
|
|
30573
|
+
}
|
|
30574
|
+
if (first.timeUnixNano && second.timeUnixNano) {
|
|
30575
|
+
return compareUnixNanoStrings(first.timeUnixNano, second.timeUnixNano);
|
|
30576
|
+
}
|
|
30577
|
+
return 0;
|
|
30578
|
+
}).map((entry) => entry.message);
|
|
30579
|
+
}
|
|
30580
|
+
function traceEnvelopeToToolTrajectoryView(envelope) {
|
|
30581
|
+
const spans = orderedSpans(envelope.trace.spans);
|
|
30582
|
+
const spansById = buildSpanMap(spans);
|
|
30583
|
+
const tools = spans.filter(isToolSpan).map((span, position) => {
|
|
30584
|
+
const toolCall = toolCallFromSpan(span);
|
|
30585
|
+
const toolCallId = toolCall.id ?? span.spanId;
|
|
30586
|
+
const ancestorIds = ancestorSpanIds(span, spansById);
|
|
30587
|
+
return {
|
|
30588
|
+
position,
|
|
30589
|
+
traceId: span.traceId,
|
|
30590
|
+
spanId: span.spanId,
|
|
30591
|
+
parentSpanId: span.parentSpanId ?? void 0,
|
|
30592
|
+
ancestorSpanIds: ancestorIds,
|
|
30593
|
+
tool: toolCall.tool,
|
|
30594
|
+
toolCallId,
|
|
30595
|
+
parentToolCallId: nearestAncestorToolCallId(ancestorIds, spansById),
|
|
30596
|
+
input: toolCall.input,
|
|
30597
|
+
output: toolCall.output,
|
|
30598
|
+
status: span.status.code === "ERROR" ? "error" : "ok",
|
|
30599
|
+
startTime: toolCall.startTime,
|
|
30600
|
+
endTime: toolCall.endTime,
|
|
30601
|
+
durationMs: toolCall.durationMs
|
|
30602
|
+
};
|
|
30603
|
+
});
|
|
30604
|
+
return {
|
|
30605
|
+
schemaVersion: NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
|
|
30606
|
+
traceId: envelope.trace.traceId,
|
|
30607
|
+
rootSpanId: envelope.trace.rootSpanId,
|
|
30608
|
+
tools
|
|
30609
|
+
};
|
|
30610
|
+
}
|
|
30461
30611
|
function traceEnvelopeToTraceSummary(envelope) {
|
|
30462
30612
|
const toolCallCounts = {};
|
|
30463
30613
|
const toolDurations = {};
|
|
@@ -30509,7 +30659,7 @@ function traceEnvelopeToTraceSummary(envelope) {
|
|
|
30509
30659
|
function traceEnvelopeToTraceArtifact(envelope) {
|
|
30510
30660
|
const events = [];
|
|
30511
30661
|
let ordinal = 0;
|
|
30512
|
-
for (const span of envelope.trace.spans) {
|
|
30662
|
+
for (const span of orderedSpans(envelope.trace.spans)) {
|
|
30513
30663
|
if (isChatSpan(span)) {
|
|
30514
30664
|
events.push({
|
|
30515
30665
|
eventId: `span-${span.spanId}`,
|
|
@@ -30578,6 +30728,95 @@ function traceEnvelopeToTraceArtifact(envelope) {
|
|
|
30578
30728
|
function getTraceEnvelopeSummary(envelope) {
|
|
30579
30729
|
return traceEnvelopeToTraceSummary(envelope).trace;
|
|
30580
30730
|
}
|
|
30731
|
+
function traceEnvelopeToOtlpJson(envelope) {
|
|
30732
|
+
return {
|
|
30733
|
+
resourceSpans: [
|
|
30734
|
+
{
|
|
30735
|
+
resource: {
|
|
30736
|
+
attributes: attributesToOtlp(envelope.trace.resource?.attributes)
|
|
30737
|
+
},
|
|
30738
|
+
scopeSpans: [
|
|
30739
|
+
{
|
|
30740
|
+
scope: dropUndefined2({
|
|
30741
|
+
name: envelope.trace.scope?.name,
|
|
30742
|
+
version: envelope.trace.scope?.version
|
|
30743
|
+
}),
|
|
30744
|
+
spans: orderedSpans(envelope.trace.spans).map(
|
|
30745
|
+
(span) => dropUndefined2({
|
|
30746
|
+
traceId: span.traceId,
|
|
30747
|
+
spanId: span.spanId,
|
|
30748
|
+
parentSpanId: span.parentSpanId ?? void 0,
|
|
30749
|
+
name: span.name,
|
|
30750
|
+
kind: spanKindToOtlp(span.kind),
|
|
30751
|
+
startTimeUnixNano: span.startTimeUnixNano,
|
|
30752
|
+
endTimeUnixNano: span.endTimeUnixNano,
|
|
30753
|
+
attributes: attributesToOtlp(span.attributes),
|
|
30754
|
+
status: spanStatusToOtlp(span.status),
|
|
30755
|
+
events: span.events?.map(
|
|
30756
|
+
(event) => dropUndefined2({
|
|
30757
|
+
name: event.name,
|
|
30758
|
+
timeUnixNano: event.timeUnixNano,
|
|
30759
|
+
attributes: attributesToOtlp(event.attributes)
|
|
30760
|
+
})
|
|
30761
|
+
)
|
|
30762
|
+
})
|
|
30763
|
+
)
|
|
30764
|
+
}
|
|
30765
|
+
]
|
|
30766
|
+
}
|
|
30767
|
+
]
|
|
30768
|
+
};
|
|
30769
|
+
}
|
|
30770
|
+
function spanKindToOtlp(kind) {
|
|
30771
|
+
if (kind === "SERVER") {
|
|
30772
|
+
return 1;
|
|
30773
|
+
}
|
|
30774
|
+
if (kind === "CLIENT") {
|
|
30775
|
+
return 2;
|
|
30776
|
+
}
|
|
30777
|
+
if (kind === "PRODUCER") {
|
|
30778
|
+
return 3;
|
|
30779
|
+
}
|
|
30780
|
+
if (kind === "CONSUMER") {
|
|
30781
|
+
return 4;
|
|
30782
|
+
}
|
|
30783
|
+
return 0;
|
|
30784
|
+
}
|
|
30785
|
+
function spanStatusToOtlp(status) {
|
|
30786
|
+
const code = status.code === "OK" ? 1 : status.code === "ERROR" ? 2 : 0;
|
|
30787
|
+
return dropUndefined2({ code, message: status.message });
|
|
30788
|
+
}
|
|
30789
|
+
function attributesToOtlp(attributes) {
|
|
30790
|
+
return Object.entries(attributes ?? {}).map(([key, value]) => ({
|
|
30791
|
+
key,
|
|
30792
|
+
value: toOtlpAnyValue(value)
|
|
30793
|
+
}));
|
|
30794
|
+
}
|
|
30795
|
+
function toOtlpAnyValue(value) {
|
|
30796
|
+
if (typeof value === "string") {
|
|
30797
|
+
return { stringValue: value };
|
|
30798
|
+
}
|
|
30799
|
+
if (typeof value === "number") {
|
|
30800
|
+
return Number.isInteger(value) ? { intValue: value } : { doubleValue: value };
|
|
30801
|
+
}
|
|
30802
|
+
if (typeof value === "boolean") {
|
|
30803
|
+
return { boolValue: value };
|
|
30804
|
+
}
|
|
30805
|
+
if (Array.isArray(value)) {
|
|
30806
|
+
return { arrayValue: { values: value.map(toOtlpAnyValue) } };
|
|
30807
|
+
}
|
|
30808
|
+
return { stringValue: stringifyOtlpAttribute(value) };
|
|
30809
|
+
}
|
|
30810
|
+
function stringifyOtlpAttribute(value) {
|
|
30811
|
+
if (value === void 0) {
|
|
30812
|
+
return "";
|
|
30813
|
+
}
|
|
30814
|
+
try {
|
|
30815
|
+
return JSON.stringify(value);
|
|
30816
|
+
} catch {
|
|
30817
|
+
return String(value);
|
|
30818
|
+
}
|
|
30819
|
+
}
|
|
30581
30820
|
async function readTraceEnvelopeReplayRecords(sourcePath) {
|
|
30582
30821
|
let raw;
|
|
30583
30822
|
try {
|
|
@@ -30585,7 +30824,7 @@ async function readTraceEnvelopeReplayRecords(sourcePath) {
|
|
|
30585
30824
|
} catch (error40) {
|
|
30586
30825
|
const reason = error40 instanceof Error ? error40.message : String(error40);
|
|
30587
30826
|
throw new Error(
|
|
30588
|
-
`
|
|
30827
|
+
`Execution trace replay source not found or unreadable: ${sourcePath}: ${reason}`
|
|
30589
30828
|
);
|
|
30590
30829
|
}
|
|
30591
30830
|
const documents = parseTraceEnvelopeDocuments(raw, sourcePath);
|
|
@@ -30604,10 +30843,10 @@ function findTraceEnvelopeReplayRecord(records, lookup) {
|
|
|
30604
30843
|
}
|
|
30605
30844
|
const key = formatReplayLookupKey(lookup);
|
|
30606
30845
|
if (matches.length === 0) {
|
|
30607
|
-
throw new Error(`
|
|
30846
|
+
throw new Error(`Execution trace replay lookup found no record for ${key}`);
|
|
30608
30847
|
}
|
|
30609
30848
|
throw new Error(
|
|
30610
|
-
`
|
|
30849
|
+
`Execution trace replay lookup found ${matches.length} duplicate records for ${key}`
|
|
30611
30850
|
);
|
|
30612
30851
|
}
|
|
30613
30852
|
function traceEnvelopeReplayRecordToProviderResponse(record2) {
|
|
@@ -30624,8 +30863,8 @@ function traceEnvelopeReplayRecordToProviderResponse(record2) {
|
|
|
30624
30863
|
startTime: summary.startTime,
|
|
30625
30864
|
endTime: summary.endTime,
|
|
30626
30865
|
raw: {
|
|
30627
|
-
|
|
30628
|
-
|
|
30866
|
+
replay_execution_trace: dropUndefined3({
|
|
30867
|
+
artifact_id: record2.envelope.artifactId,
|
|
30629
30868
|
source_path: record2.sourcePath,
|
|
30630
30869
|
line_number: record2.lineNumber,
|
|
30631
30870
|
suite: identity.suite,
|
|
@@ -30668,7 +30907,7 @@ function parseTraceEnvelopeDocuments(raw, sourcePath) {
|
|
|
30668
30907
|
documents.push({ value: JSON.parse(line), lineNumber: i + 1 });
|
|
30669
30908
|
} catch (error40) {
|
|
30670
30909
|
const reason = error40 instanceof Error ? error40.message : String(error40);
|
|
30671
|
-
throw new Error(`Invalid trace
|
|
30910
|
+
throw new Error(`Invalid execution trace JSONL at ${sourcePath}:${i + 1}: ${reason}`);
|
|
30672
30911
|
}
|
|
30673
30912
|
}
|
|
30674
30913
|
return documents;
|
|
@@ -30680,7 +30919,7 @@ function parseTraceEnvelopeDocument(value, sourcePath, lineNumber) {
|
|
|
30680
30919
|
} catch (error40) {
|
|
30681
30920
|
const location = lineNumber === void 0 ? sourcePath : `${sourcePath}:${lineNumber}`;
|
|
30682
30921
|
const reason = error40 instanceof Error ? error40.message : String(error40);
|
|
30683
|
-
throw new Error(`Invalid trace
|
|
30922
|
+
throw new Error(`Invalid execution trace replay record at ${location}: ${reason}`);
|
|
30684
30923
|
}
|
|
30685
30924
|
}
|
|
30686
30925
|
function traceEnvelopeReplayIdentity(envelope) {
|
|
@@ -30705,13 +30944,13 @@ function lookupKeyNumber(lookupKey, key) {
|
|
|
30705
30944
|
function assertReplayableMessages(output, record2) {
|
|
30706
30945
|
if (output.length === 0) {
|
|
30707
30946
|
throw new Error(
|
|
30708
|
-
`
|
|
30947
|
+
`Execution trace replay source ${formatRecordLocation(record2)} cannot project to provider Message[]: no chat spans found`
|
|
30709
30948
|
);
|
|
30710
30949
|
}
|
|
30711
30950
|
const lastAssistant = [...output].reverse().find((message) => message.role === "assistant");
|
|
30712
30951
|
if (!lastAssistant || lastAssistant.content === void 0) {
|
|
30713
30952
|
throw new Error(
|
|
30714
|
-
`
|
|
30953
|
+
`Execution trace replay source ${formatRecordLocation(record2)} is missing assistant output content; replay requires a full-content execution trace before grading`
|
|
30715
30954
|
);
|
|
30716
30955
|
}
|
|
30717
30956
|
}
|
|
@@ -30740,7 +30979,7 @@ var ReplayProvider = class {
|
|
|
30740
30979
|
const record2 = findReplayFixtureRecord(records, this.lookupForRequest(request));
|
|
30741
30980
|
return replayFixtureRecordToProviderResponse(record2);
|
|
30742
30981
|
}
|
|
30743
|
-
case "
|
|
30982
|
+
case "execution_traces": {
|
|
30744
30983
|
const records = await readTraceEnvelopeReplayRecords(source.path);
|
|
30745
30984
|
const record2 = findTraceEnvelopeReplayRecord(records, this.lookupForRequest(request));
|
|
30746
30985
|
return traceEnvelopeReplayRecordToProviderResponse(record2);
|
|
@@ -30758,7 +30997,7 @@ var ReplayProvider = class {
|
|
|
30758
30997
|
)
|
|
30759
30998
|
);
|
|
30760
30999
|
}
|
|
30761
|
-
case "
|
|
31000
|
+
case "execution_traces": {
|
|
30762
31001
|
const records = await readTraceEnvelopeReplayRecords(source.path);
|
|
30763
31002
|
return requests.map(
|
|
30764
31003
|
(request) => traceEnvelopeReplayRecordToProviderResponse(
|
|
@@ -30791,7 +31030,7 @@ function resolveReplaySource(config2) {
|
|
|
30791
31030
|
return { kind: "fixtures", path: config2.fixturesPath };
|
|
30792
31031
|
}
|
|
30793
31032
|
throw new Error(
|
|
30794
|
-
"Replay provider requires exactly one replay source: fixtures or
|
|
31033
|
+
"Replay provider requires exactly one replay source: fixtures or execution_traces"
|
|
30795
31034
|
);
|
|
30796
31035
|
}
|
|
30797
31036
|
async function pathExists(target) {
|
|
@@ -32132,7 +32371,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
32132
32371
|
};
|
|
32133
32372
|
}
|
|
32134
32373
|
}
|
|
32135
|
-
function
|
|
32374
|
+
function isRecord2(value) {
|
|
32136
32375
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
32137
32376
|
}
|
|
32138
32377
|
function extractTargetsArray(parsed, absolutePath) {
|
|
@@ -32143,7 +32382,7 @@ function extractTargetsArray(parsed, absolutePath) {
|
|
|
32143
32382
|
return targets;
|
|
32144
32383
|
}
|
|
32145
32384
|
function assertTargetDefinition(value, index, filePath) {
|
|
32146
|
-
if (!
|
|
32385
|
+
if (!isRecord2(value)) {
|
|
32147
32386
|
throw new Error(`targets.yaml entry at index ${index} in ${filePath} must be an object`);
|
|
32148
32387
|
}
|
|
32149
32388
|
const name = value.name;
|
|
@@ -32176,7 +32415,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
32176
32415
|
}
|
|
32177
32416
|
const raw = await readFile10(absolutePath, "utf8");
|
|
32178
32417
|
const parsed = parseYamlValue(raw);
|
|
32179
|
-
if (!
|
|
32418
|
+
if (!isRecord2(parsed)) {
|
|
32180
32419
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
32181
32420
|
}
|
|
32182
32421
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
@@ -33248,13 +33487,22 @@ function fromYaml(raw) {
|
|
|
33248
33487
|
}
|
|
33249
33488
|
if (e.results && typeof e.results === "object") {
|
|
33250
33489
|
const r = e.results;
|
|
33251
|
-
|
|
33490
|
+
const repoUrl = typeof r.repo_url === "string" && r.repo_url.trim().length > 0 ? r.repo_url.trim() : void 0;
|
|
33491
|
+
const repoPath = typeof r.repo_path === "string" && r.repo_path.trim().length > 0 ? r.repo_path.trim() : void 0;
|
|
33492
|
+
if (repoUrl || repoPath) {
|
|
33252
33493
|
const sync = r.sync && typeof r.sync === "object" ? r.sync : void 0;
|
|
33253
33494
|
entry.results = {
|
|
33254
|
-
repoUrl:
|
|
33495
|
+
...repoUrl ? { repoUrl } : {},
|
|
33496
|
+
...repoPath ? { repoPath } : {},
|
|
33255
33497
|
...typeof r.branch === "string" && r.branch.trim().length > 0 ? { branch: r.branch.trim() } : {},
|
|
33498
|
+
...typeof r.remote === "string" && r.remote.trim().length > 0 ? { remote: r.remote.trim() } : {},
|
|
33256
33499
|
...typeof r.path === "string" && r.path.trim().length > 0 ? { path: r.path.trim() } : {},
|
|
33257
|
-
...sync && typeof sync.auto_push === "boolean"
|
|
33500
|
+
...sync && (typeof sync.auto_push === "boolean" || typeof sync.require_push === "boolean") ? {
|
|
33501
|
+
sync: {
|
|
33502
|
+
...typeof sync.auto_push === "boolean" ? { autoPush: sync.auto_push } : {},
|
|
33503
|
+
...typeof sync.require_push === "boolean" ? { requirePush: sync.require_push } : {}
|
|
33504
|
+
}
|
|
33505
|
+
} : {},
|
|
33258
33506
|
...typeof r.branch_prefix === "string" && r.branch_prefix.trim().length > 0 ? { branchPrefix: r.branch_prefix.trim() } : {}
|
|
33259
33507
|
};
|
|
33260
33508
|
}
|
|
@@ -33273,11 +33521,20 @@ function toYaml(entry) {
|
|
|
33273
33521
|
};
|
|
33274
33522
|
if (entry.results) {
|
|
33275
33523
|
yaml.results = {
|
|
33276
|
-
repo_url: entry.results.repoUrl,
|
|
33524
|
+
...entry.results.repoUrl !== void 0 && { repo_url: entry.results.repoUrl },
|
|
33525
|
+
...entry.results.repoPath !== void 0 && { repo_path: entry.results.repoPath },
|
|
33277
33526
|
...entry.results.branch !== void 0 && { branch: entry.results.branch },
|
|
33527
|
+
...entry.results.remote !== void 0 && { remote: entry.results.remote },
|
|
33278
33528
|
...entry.results.path !== void 0 && { path: entry.results.path },
|
|
33279
|
-
...entry.results.sync?.autoPush !== void 0 && {
|
|
33280
|
-
sync: {
|
|
33529
|
+
...(entry.results.sync?.autoPush !== void 0 || entry.results.sync?.requirePush !== void 0) && {
|
|
33530
|
+
sync: {
|
|
33531
|
+
...entry.results.sync?.autoPush !== void 0 && {
|
|
33532
|
+
auto_push: entry.results.sync.autoPush
|
|
33533
|
+
},
|
|
33534
|
+
...entry.results.sync?.requirePush !== void 0 && {
|
|
33535
|
+
require_push: entry.results.sync.requirePush
|
|
33536
|
+
}
|
|
33537
|
+
}
|
|
33281
33538
|
},
|
|
33282
33539
|
...entry.results.branchPrefix !== void 0 && {
|
|
33283
33540
|
branch_prefix: entry.results.branchPrefix
|
|
@@ -34031,9 +34288,9 @@ function cloneJsonValue(value) {
|
|
|
34031
34288
|
return value;
|
|
34032
34289
|
}
|
|
34033
34290
|
var ANSI_RED = "\x1B[31m";
|
|
34034
|
-
var
|
|
34291
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
34035
34292
|
function logError(msg) {
|
|
34036
|
-
console.error(`${ANSI_RED}Error: ${msg}${
|
|
34293
|
+
console.error(`${ANSI_RED}Error: ${msg}${ANSI_RESET2}`);
|
|
34037
34294
|
}
|
|
34038
34295
|
function isAgentSkillsFormat(parsed) {
|
|
34039
34296
|
if (typeof parsed !== "object" || parsed === null) return false;
|
|
@@ -34201,8 +34458,8 @@ async function resolveFileReference3(rawValue, searchRoots) {
|
|
|
34201
34458
|
}
|
|
34202
34459
|
return { displayPath, attempted };
|
|
34203
34460
|
}
|
|
34204
|
-
var
|
|
34205
|
-
var
|
|
34461
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
34462
|
+
var ANSI_RESET22 = "\x1B[0m";
|
|
34206
34463
|
var DEFAULT_EVAL_PATTERNS = [
|
|
34207
34464
|
"**/evals/**/*.eval.yaml",
|
|
34208
34465
|
"**/evals/**/eval.yaml",
|
|
@@ -34566,13 +34823,20 @@ function parseResultsConfig(raw, configPath2) {
|
|
|
34566
34823
|
return void 0;
|
|
34567
34824
|
}
|
|
34568
34825
|
const obj = raw;
|
|
34569
|
-
if (obj.mode !== "github") {
|
|
34826
|
+
if (obj.mode !== void 0 && obj.mode !== "github") {
|
|
34570
34827
|
logWarning(`Invalid results.mode in ${configPath2}, expected 'github'`);
|
|
34571
34828
|
return void 0;
|
|
34572
34829
|
}
|
|
34573
|
-
const
|
|
34574
|
-
|
|
34575
|
-
|
|
34830
|
+
const legacyRepo = typeof obj.repo === "string" ? obj.repo.trim() : "";
|
|
34831
|
+
const repoUrl = typeof obj.repo_url === "string" ? obj.repo_url.trim() : "";
|
|
34832
|
+
const repoPath = typeof obj.repo_path === "string" ? obj.repo_path.trim() : "";
|
|
34833
|
+
const repo = legacyRepo || repoUrl;
|
|
34834
|
+
if (!repo && !repoPath) {
|
|
34835
|
+
logWarning(`Invalid results in ${configPath2}, expected repo_url/repo or repo_path`);
|
|
34836
|
+
return void 0;
|
|
34837
|
+
}
|
|
34838
|
+
if (repo && repoPath) {
|
|
34839
|
+
logWarning(`Invalid results in ${configPath2}, set only one of repo_url/repo or repo_path`);
|
|
34576
34840
|
return void 0;
|
|
34577
34841
|
}
|
|
34578
34842
|
let branch;
|
|
@@ -34583,6 +34847,14 @@ function parseResultsConfig(raw, configPath2) {
|
|
|
34583
34847
|
}
|
|
34584
34848
|
branch = obj.branch.trim();
|
|
34585
34849
|
}
|
|
34850
|
+
let remote;
|
|
34851
|
+
if (obj.remote !== void 0) {
|
|
34852
|
+
if (typeof obj.remote !== "string" || obj.remote.trim().length === 0) {
|
|
34853
|
+
logWarning(`Invalid results.remote in ${configPath2}, expected non-empty string`);
|
|
34854
|
+
return void 0;
|
|
34855
|
+
}
|
|
34856
|
+
remote = obj.remote.trim();
|
|
34857
|
+
}
|
|
34586
34858
|
let resultsPath;
|
|
34587
34859
|
if (obj.path !== void 0) {
|
|
34588
34860
|
if (typeof obj.path !== "string" || obj.path.trim().length === 0) {
|
|
@@ -34602,6 +34874,26 @@ function parseResultsConfig(raw, configPath2) {
|
|
|
34602
34874
|
logWarning(`Invalid results.auto_push in ${configPath2}, expected boolean`);
|
|
34603
34875
|
return void 0;
|
|
34604
34876
|
}
|
|
34877
|
+
let sync;
|
|
34878
|
+
if (obj.sync !== void 0) {
|
|
34879
|
+
if (typeof obj.sync !== "object" || obj.sync === null || Array.isArray(obj.sync)) {
|
|
34880
|
+
logWarning(`Invalid results.sync in ${configPath2}, expected object`);
|
|
34881
|
+
return void 0;
|
|
34882
|
+
}
|
|
34883
|
+
const syncObj = obj.sync;
|
|
34884
|
+
if (syncObj.auto_push !== void 0 && typeof syncObj.auto_push !== "boolean") {
|
|
34885
|
+
logWarning(`Invalid results.sync.auto_push in ${configPath2}, expected boolean`);
|
|
34886
|
+
return void 0;
|
|
34887
|
+
}
|
|
34888
|
+
if (syncObj.require_push !== void 0 && typeof syncObj.require_push !== "boolean") {
|
|
34889
|
+
logWarning(`Invalid results.sync.require_push in ${configPath2}, expected boolean`);
|
|
34890
|
+
return void 0;
|
|
34891
|
+
}
|
|
34892
|
+
sync = {
|
|
34893
|
+
...typeof syncObj.auto_push === "boolean" && { auto_push: syncObj.auto_push },
|
|
34894
|
+
...typeof syncObj.require_push === "boolean" && { require_push: syncObj.require_push }
|
|
34895
|
+
};
|
|
34896
|
+
}
|
|
34605
34897
|
let branchPrefix;
|
|
34606
34898
|
if (obj.branch_prefix !== void 0) {
|
|
34607
34899
|
if (typeof obj.branch_prefix !== "string" || obj.branch_prefix.trim().length === 0) {
|
|
@@ -34612,10 +34904,14 @@ function parseResultsConfig(raw, configPath2) {
|
|
|
34612
34904
|
}
|
|
34613
34905
|
return {
|
|
34614
34906
|
mode: "github",
|
|
34615
|
-
repo,
|
|
34907
|
+
...repo && { repo },
|
|
34908
|
+
...repoUrl && { repo_url: repoUrl },
|
|
34909
|
+
...repoPath && { repo_path: repoPath },
|
|
34616
34910
|
...branch !== void 0 && { branch },
|
|
34911
|
+
...remote !== void 0 && { remote },
|
|
34617
34912
|
...resultsPath !== void 0 && { path: resultsPath },
|
|
34618
34913
|
...typeof obj.auto_push === "boolean" && { auto_push: obj.auto_push },
|
|
34914
|
+
...sync && { sync },
|
|
34619
34915
|
...branchPrefix && { branch_prefix: branchPrefix }
|
|
34620
34916
|
};
|
|
34621
34917
|
}
|
|
@@ -34645,10 +34941,8 @@ function parseHooksConfig(raw, configPath2) {
|
|
|
34645
34941
|
return void 0;
|
|
34646
34942
|
}
|
|
34647
34943
|
function logWarning(message) {
|
|
34648
|
-
console.warn(`${
|
|
34944
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET22}`);
|
|
34649
34945
|
}
|
|
34650
|
-
var ANSI_YELLOW3 = "\x1B[33m";
|
|
34651
|
-
var ANSI_RESET4 = "\x1B[0m";
|
|
34652
34946
|
async function validateCustomPromptContent(promptPath) {
|
|
34653
34947
|
const content = await readFile14(promptPath, "utf8");
|
|
34654
34948
|
validateTemplateVariables(content, promptPath);
|
|
@@ -34666,8 +34960,8 @@ function validateTemplateVariables(content, source) {
|
|
|
34666
34960
|
}
|
|
34667
34961
|
match = variablePattern.exec(content);
|
|
34668
34962
|
}
|
|
34669
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT)
|
|
34670
|
-
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT)
|
|
34963
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT);
|
|
34964
|
+
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
34671
34965
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
34672
34966
|
if (!hasRequiredFields) {
|
|
34673
34967
|
throw new Error(
|
|
@@ -34676,28 +34970,15 @@ function validateTemplateVariables(content, source) {
|
|
|
34676
34970
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
34677
34971
|
);
|
|
34678
34972
|
}
|
|
34679
|
-
const deprecatedUsed = [];
|
|
34680
|
-
for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
|
|
34681
|
-
if (foundVariables.has(deprecated)) {
|
|
34682
|
-
deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
|
|
34683
|
-
}
|
|
34684
|
-
}
|
|
34685
|
-
if (deprecatedUsed.length > 0) {
|
|
34686
|
-
console.warn(
|
|
34687
|
-
`${ANSI_YELLOW3}Warning: Template at ${source} uses deprecated variable names:
|
|
34688
|
-
${deprecatedUsed.join("\n ")}
|
|
34689
|
-
These still work but will be removed in a future version.${ANSI_RESET4}`
|
|
34690
|
-
);
|
|
34691
|
-
}
|
|
34692
34973
|
if (invalidVariables.length > 0) {
|
|
34693
|
-
const warningMessage =
|
|
34974
|
+
const warningMessage = `Warning: Custom grader template at ${source}
|
|
34694
34975
|
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
34695
|
-
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}
|
|
34976
|
+
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}`;
|
|
34696
34977
|
console.warn(warningMessage);
|
|
34697
34978
|
}
|
|
34698
34979
|
}
|
|
34699
|
-
var
|
|
34700
|
-
var
|
|
34980
|
+
var ANSI_YELLOW22 = "\x1B[33m";
|
|
34981
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
34701
34982
|
var MAX_ASSERTION_INCLUDE_DEPTH = 3;
|
|
34702
34983
|
var PROMPT_FILE_PREFIX = "file://";
|
|
34703
34984
|
function normalizeGraderType(type) {
|
|
@@ -35027,7 +35308,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
35027
35308
|
let command;
|
|
35028
35309
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
35029
35310
|
console.warn(
|
|
35030
|
-
`${
|
|
35311
|
+
`${ANSI_YELLOW22}Warning: 'script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'command' instead.${ANSI_RESET3}`
|
|
35031
35312
|
);
|
|
35032
35313
|
}
|
|
35033
35314
|
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
@@ -35909,7 +36190,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
|
|
|
35909
36190
|
if (isJsonObject2(rawPrompt)) {
|
|
35910
36191
|
if (rawPrompt.script !== void 0 && rawPrompt.command === void 0) {
|
|
35911
36192
|
console.warn(
|
|
35912
|
-
`${
|
|
36193
|
+
`${ANSI_YELLOW22}Warning: 'prompt.script' is deprecated in evaluator '${name}' in '${evalId}'. Use 'prompt.command' instead.${ANSI_RESET3}`
|
|
35913
36194
|
);
|
|
35914
36195
|
}
|
|
35915
36196
|
const commandArray = asStringArray(
|
|
@@ -36210,10 +36491,10 @@ function warnUnconsumedCriteria(_criteria, _evaluators, _testId) {
|
|
|
36210
36491
|
function logWarning2(message, details) {
|
|
36211
36492
|
if (details && details.length > 0) {
|
|
36212
36493
|
const detailBlock = details.join("\n");
|
|
36213
|
-
console.warn(`${
|
|
36214
|
-
${detailBlock}${
|
|
36494
|
+
console.warn(`${ANSI_YELLOW22}Warning: ${message}
|
|
36495
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
36215
36496
|
} else {
|
|
36216
|
-
console.warn(`${
|
|
36497
|
+
console.warn(`${ANSI_YELLOW22}Warning: ${message}${ANSI_RESET3}`);
|
|
36217
36498
|
}
|
|
36218
36499
|
}
|
|
36219
36500
|
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
@@ -36584,8 +36865,8 @@ function detectImageMediaType(filePath) {
|
|
|
36584
36865
|
const ext = path43.extname(filePath).toLowerCase();
|
|
36585
36866
|
return IMAGE_MEDIA_TYPES[ext];
|
|
36586
36867
|
}
|
|
36587
|
-
var
|
|
36588
|
-
var
|
|
36868
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
36869
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
36589
36870
|
async function processMessages(options) {
|
|
36590
36871
|
const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
|
|
36591
36872
|
const processedMessages = [];
|
|
@@ -36716,10 +36997,10 @@ function asString3(value) {
|
|
|
36716
36997
|
function logWarning3(message, details) {
|
|
36717
36998
|
if (details && details.length > 0) {
|
|
36718
36999
|
const detailBlock = details.join("\n");
|
|
36719
|
-
console.warn(`${
|
|
36720
|
-
${detailBlock}${
|
|
37000
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
37001
|
+
${detailBlock}${ANSI_RESET4}`);
|
|
36721
37002
|
} else {
|
|
36722
|
-
console.warn(`${
|
|
37003
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
|
|
36723
37004
|
}
|
|
36724
37005
|
}
|
|
36725
37006
|
async function processExpectedMessages(options) {
|
|
@@ -36908,9 +37189,9 @@ function resolveInputMessages(raw, suiteInputFiles) {
|
|
|
36908
37189
|
function resolveExpectedMessages(raw) {
|
|
36909
37190
|
return expandExpectedOutputShorthand(raw.expected_output);
|
|
36910
37191
|
}
|
|
36911
|
-
var
|
|
37192
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
36912
37193
|
var ANSI_RED2 = "\x1B[31m";
|
|
36913
|
-
var
|
|
37194
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
36914
37195
|
function matchesFilter(id, filter) {
|
|
36915
37196
|
return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
|
|
36916
37197
|
}
|
|
@@ -37100,19 +37381,19 @@ function asString4(value) {
|
|
|
37100
37381
|
function logWarning4(message, details) {
|
|
37101
37382
|
if (details && details.length > 0) {
|
|
37102
37383
|
const detailBlock = details.join("\n");
|
|
37103
|
-
console.warn(`${
|
|
37104
|
-
${detailBlock}${
|
|
37384
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}
|
|
37385
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
37105
37386
|
} else {
|
|
37106
|
-
console.warn(`${
|
|
37387
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET5}`);
|
|
37107
37388
|
}
|
|
37108
37389
|
}
|
|
37109
37390
|
function logError2(message, details) {
|
|
37110
37391
|
if (details && details.length > 0) {
|
|
37111
37392
|
const detailBlock = details.join("\n");
|
|
37112
37393
|
console.error(`${ANSI_RED2}Error: ${message}
|
|
37113
|
-
${detailBlock}${
|
|
37394
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
37114
37395
|
} else {
|
|
37115
|
-
console.error(`${ANSI_RED2}Error: ${message}${
|
|
37396
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET5}`);
|
|
37116
37397
|
}
|
|
37117
37398
|
}
|
|
37118
37399
|
var MetadataSchema = external_exports.object({
|
|
@@ -37324,9 +37605,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
37324
37605
|
}
|
|
37325
37606
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
37326
37607
|
}
|
|
37327
|
-
var
|
|
37608
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
37328
37609
|
var ANSI_RED3 = "\x1B[31m";
|
|
37329
|
-
var
|
|
37610
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
37330
37611
|
function matchesFilter2(id, filter) {
|
|
37331
37612
|
return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
|
|
37332
37613
|
}
|
|
@@ -37404,7 +37685,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
37404
37685
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
37405
37686
|
}
|
|
37406
37687
|
if (format === "typescript") {
|
|
37407
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
37688
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-NWH3B4HG-UXXCZKLP.js");
|
|
37408
37689
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
37409
37690
|
}
|
|
37410
37691
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -37439,7 +37720,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
37439
37720
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
37440
37721
|
}
|
|
37441
37722
|
if (format === "typescript") {
|
|
37442
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
37723
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-NWH3B4HG-UXXCZKLP.js");
|
|
37443
37724
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
37444
37725
|
return suite.tests;
|
|
37445
37726
|
}
|
|
@@ -38151,19 +38432,19 @@ function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
|
|
|
38151
38432
|
function logWarning5(message, details) {
|
|
38152
38433
|
if (details && details.length > 0) {
|
|
38153
38434
|
const detailBlock = details.join("\n");
|
|
38154
|
-
console.warn(`${
|
|
38155
|
-
${detailBlock}${
|
|
38435
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
38436
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
38156
38437
|
} else {
|
|
38157
|
-
console.warn(`${
|
|
38438
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET6}`);
|
|
38158
38439
|
}
|
|
38159
38440
|
}
|
|
38160
38441
|
function logError3(message, details) {
|
|
38161
38442
|
if (details && details.length > 0) {
|
|
38162
38443
|
const detailBlock = details.join("\n");
|
|
38163
38444
|
console.error(`${ANSI_RED3}Error: ${message}
|
|
38164
|
-
${detailBlock}${
|
|
38445
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
38165
38446
|
} else {
|
|
38166
|
-
console.error(`${ANSI_RED3}Error: ${message}${
|
|
38447
|
+
console.error(`${ANSI_RED3}Error: ${message}${ANSI_RESET6}`);
|
|
38167
38448
|
}
|
|
38168
38449
|
}
|
|
38169
38450
|
var execFileAsync2 = promisify6(execFile2);
|
|
@@ -38266,18 +38547,18 @@ function validateDependencyGraph(tests) {
|
|
|
38266
38547
|
}
|
|
38267
38548
|
const visited = /* @__PURE__ */ new Set();
|
|
38268
38549
|
const visiting = /* @__PURE__ */ new Set();
|
|
38269
|
-
function visit(id,
|
|
38550
|
+
function visit(id, path50) {
|
|
38270
38551
|
if (visiting.has(id)) {
|
|
38271
|
-
const cycle = [...
|
|
38552
|
+
const cycle = [...path50.slice(path50.indexOf(id)), id];
|
|
38272
38553
|
throw new Error(`Circular dependency detected: ${cycle.join(" \u2192 ")}`);
|
|
38273
38554
|
}
|
|
38274
38555
|
if (visited.has(id)) return;
|
|
38275
38556
|
visiting.add(id);
|
|
38276
|
-
|
|
38557
|
+
path50.push(id);
|
|
38277
38558
|
for (const dep of depMap.get(id) ?? []) {
|
|
38278
|
-
visit(dep,
|
|
38559
|
+
visit(dep, path50);
|
|
38279
38560
|
}
|
|
38280
|
-
|
|
38561
|
+
path50.pop();
|
|
38281
38562
|
visiting.delete(id);
|
|
38282
38563
|
visited.add(id);
|
|
38283
38564
|
}
|
|
@@ -41201,6 +41482,1179 @@ function createFunctionProvider(taskFn) {
|
|
|
41201
41482
|
}
|
|
41202
41483
|
};
|
|
41203
41484
|
}
|
|
41485
|
+
function dropUndefined4(value) {
|
|
41486
|
+
return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== void 0));
|
|
41487
|
+
}
|
|
41488
|
+
function toTranscriptTokenUsage(usage) {
|
|
41489
|
+
if (!usage) {
|
|
41490
|
+
return void 0;
|
|
41491
|
+
}
|
|
41492
|
+
return dropUndefined4({
|
|
41493
|
+
input: usage.input,
|
|
41494
|
+
output: usage.output,
|
|
41495
|
+
cached: usage.cached,
|
|
41496
|
+
reasoning: usage.reasoning
|
|
41497
|
+
});
|
|
41498
|
+
}
|
|
41499
|
+
function toTranscriptToolCall(toolCall) {
|
|
41500
|
+
return dropUndefined4({
|
|
41501
|
+
tool: toolCall.tool,
|
|
41502
|
+
input: toolCall.input,
|
|
41503
|
+
output: toolCall.output,
|
|
41504
|
+
id: toolCall.id,
|
|
41505
|
+
start_time: toolCall.startTime,
|
|
41506
|
+
end_time: toolCall.endTime,
|
|
41507
|
+
duration_ms: toolCall.durationMs
|
|
41508
|
+
});
|
|
41509
|
+
}
|
|
41510
|
+
function toTranscriptMessageFields(message) {
|
|
41511
|
+
return dropUndefined4({
|
|
41512
|
+
role: message.role,
|
|
41513
|
+
name: message.name,
|
|
41514
|
+
content: message.content,
|
|
41515
|
+
tool_calls: message.toolCalls?.map(toTranscriptToolCall),
|
|
41516
|
+
start_time: message.startTime,
|
|
41517
|
+
end_time: message.endTime,
|
|
41518
|
+
duration_ms: message.durationMs,
|
|
41519
|
+
metadata: message.metadata,
|
|
41520
|
+
token_usage: toTranscriptTokenUsage(message.tokenUsage)
|
|
41521
|
+
});
|
|
41522
|
+
}
|
|
41523
|
+
function toTranscriptJsonLines(entry, options) {
|
|
41524
|
+
const source = {
|
|
41525
|
+
provider: entry.source.provider,
|
|
41526
|
+
session_id: entry.source.sessionId,
|
|
41527
|
+
model: entry.source.model,
|
|
41528
|
+
timestamp: entry.source.startedAt,
|
|
41529
|
+
git_branch: entry.source.gitBranch,
|
|
41530
|
+
cwd: entry.source.cwd ?? entry.source.projectPath,
|
|
41531
|
+
version: entry.source.version
|
|
41532
|
+
};
|
|
41533
|
+
const transcriptTokenUsage = entry.tokenUsage ? {
|
|
41534
|
+
input: entry.tokenUsage.input,
|
|
41535
|
+
output: entry.tokenUsage.output,
|
|
41536
|
+
cached: entry.tokenUsage.cached,
|
|
41537
|
+
reasoning: entry.tokenUsage.reasoning
|
|
41538
|
+
} : void 0;
|
|
41539
|
+
const testId = options?.testId ?? entry.source.sessionId;
|
|
41540
|
+
const target = options?.target ?? entry.source.provider;
|
|
41541
|
+
return entry.messages.map((message, index) => ({
|
|
41542
|
+
test_id: testId,
|
|
41543
|
+
target,
|
|
41544
|
+
message_index: index,
|
|
41545
|
+
...toTranscriptMessageFields(message),
|
|
41546
|
+
transcript_token_usage: transcriptTokenUsage,
|
|
41547
|
+
transcript_duration_ms: entry.durationMs,
|
|
41548
|
+
transcript_cost_usd: entry.costUsd,
|
|
41549
|
+
source
|
|
41550
|
+
}));
|
|
41551
|
+
}
|
|
41552
|
+
function traceToTranscriptJsonLines(trace, options) {
|
|
41553
|
+
const provider = (typeof trace.metadata?.provider === "string" ? trace.metadata.provider : void 0) ?? options?.target ?? "agentv";
|
|
41554
|
+
const sessionId = (typeof trace.metadata?.provider_session_id === "string" ? trace.metadata.provider_session_id : void 0) ?? (typeof trace.metadata?.eval_case_id === "string" ? trace.metadata.eval_case_id : void 0) ?? options?.testId ?? "trace";
|
|
41555
|
+
return toTranscriptJsonLines(
|
|
41556
|
+
{
|
|
41557
|
+
messages: [...trace.messages],
|
|
41558
|
+
source: {
|
|
41559
|
+
provider,
|
|
41560
|
+
sessionId,
|
|
41561
|
+
startedAt: trace.startTime
|
|
41562
|
+
},
|
|
41563
|
+
tokenUsage: trace.tokenUsage,
|
|
41564
|
+
durationMs: trace.durationMs,
|
|
41565
|
+
costUsd: trace.costUsd
|
|
41566
|
+
},
|
|
41567
|
+
options
|
|
41568
|
+
);
|
|
41569
|
+
}
|
|
41570
|
+
function traceFromTranscriptJsonLines(lines) {
|
|
41571
|
+
const [entry] = groupTranscriptJsonLines(lines);
|
|
41572
|
+
if (!entry) {
|
|
41573
|
+
return buildTraceFromMessages();
|
|
41574
|
+
}
|
|
41575
|
+
return buildTraceFromMessages({
|
|
41576
|
+
output: entry.messages,
|
|
41577
|
+
tokenUsage: entry.tokenUsage,
|
|
41578
|
+
durationMs: entry.durationMs,
|
|
41579
|
+
costUsd: entry.costUsd ?? void 0,
|
|
41580
|
+
startTime: entry.source.startedAt,
|
|
41581
|
+
provider: entry.source.provider,
|
|
41582
|
+
target: entry.target,
|
|
41583
|
+
testId: entry.testId,
|
|
41584
|
+
conversationId: entry.source.sessionId
|
|
41585
|
+
});
|
|
41586
|
+
}
|
|
41587
|
+
function fromTranscriptTokenUsage(usage) {
|
|
41588
|
+
if (!usage) {
|
|
41589
|
+
return void 0;
|
|
41590
|
+
}
|
|
41591
|
+
return {
|
|
41592
|
+
input: usage.input,
|
|
41593
|
+
output: usage.output,
|
|
41594
|
+
cached: usage.cached,
|
|
41595
|
+
reasoning: usage.reasoning
|
|
41596
|
+
};
|
|
41597
|
+
}
|
|
41598
|
+
function readOptionalString(record2, key) {
|
|
41599
|
+
const value = record2[key];
|
|
41600
|
+
return typeof value === "string" ? value : void 0;
|
|
41601
|
+
}
|
|
41602
|
+
function readOptionalNumber(record2, key) {
|
|
41603
|
+
const value = record2[key];
|
|
41604
|
+
return typeof value === "number" && Number.isFinite(value) ? value : void 0;
|
|
41605
|
+
}
|
|
41606
|
+
function fromTranscriptToolCall(wire) {
|
|
41607
|
+
const tool = readOptionalString(wire, "tool");
|
|
41608
|
+
if (!tool) {
|
|
41609
|
+
return void 0;
|
|
41610
|
+
}
|
|
41611
|
+
return {
|
|
41612
|
+
tool,
|
|
41613
|
+
input: wire.input,
|
|
41614
|
+
output: wire.output,
|
|
41615
|
+
id: readOptionalString(wire, "id"),
|
|
41616
|
+
startTime: readOptionalString(wire, "start_time"),
|
|
41617
|
+
endTime: readOptionalString(wire, "end_time"),
|
|
41618
|
+
durationMs: readOptionalNumber(wire, "duration_ms")
|
|
41619
|
+
};
|
|
41620
|
+
}
|
|
41621
|
+
function buildReplayMessage(line) {
|
|
41622
|
+
return {
|
|
41623
|
+
role: line.role,
|
|
41624
|
+
name: line.name,
|
|
41625
|
+
content: line.content,
|
|
41626
|
+
toolCalls: line.tool_calls?.map(fromTranscriptToolCall).filter((toolCall) => toolCall !== void 0),
|
|
41627
|
+
startTime: line.start_time,
|
|
41628
|
+
endTime: line.end_time,
|
|
41629
|
+
durationMs: line.duration_ms,
|
|
41630
|
+
metadata: line.metadata,
|
|
41631
|
+
tokenUsage: fromTranscriptTokenUsage(line.token_usage)
|
|
41632
|
+
};
|
|
41633
|
+
}
|
|
41634
|
+
/**
 * Group transcript JSONL lines by `test_id` into one transcript per test.
 *
 * Transcript-level fields (target, token usage, duration, cost, source) are
 * taken from the first line seen for each test id. Messages are ordered by
 * `message_index`.
 *
 * @param {Iterable<object>} lines Parsed transcript lines.
 * @returns {object[]} One entry per test id, in first-seen order.
 */
function groupTranscriptJsonLines(lines) {
  const byTestId = new Map();
  for (const line of lines) {
    const bucket = byTestId.get(line.test_id);
    const wireSource = line.source;
    const source = {
      provider: wireSource.provider,
      sessionId: wireSource.session_id,
      startedAt: wireSource.timestamp,
      model: wireSource.model,
      gitBranch: wireSource.git_branch,
      cwd: wireSource.cwd,
      version: wireSource.version
    };
    let tokenUsage = void 0;
    if (line.transcript_token_usage) {
      tokenUsage = {
        input: line.transcript_token_usage.input,
        output: line.transcript_token_usage.output,
        cached: line.transcript_token_usage.cached,
        reasoning: line.transcript_token_usage.reasoning
      };
    }
    if (bucket) {
      // Later lines of a known test only contribute their message.
      bucket.messages.push({ index: line.message_index, message: buildReplayMessage(line) });
      continue;
    }
    byTestId.set(line.test_id, {
      target: line.target,
      tokenUsage,
      durationMs: line.transcript_duration_ms,
      costUsd: line.transcript_cost_usd,
      source,
      messages: [{ index: line.message_index, message: buildReplayMessage(line) }]
    });
  }
  return Array.from(byTestId.entries(), ([testId, entry]) => {
    const ordered = entry.messages.sort((left, right) => left.index - right.index);
    return {
      testId,
      target: entry.target,
      tokenUsage: entry.tokenUsage,
      durationMs: entry.durationMs,
      costUsd: entry.costUsd,
      source: entry.source,
      messages: ordered.map((item) => item.message)
    };
  });
}
|
|
41676
|
+
/**
 * Read a JSONL file and parse every non-blank line as JSON.
 *
 * @param {string} filePath Path of the JSONL file.
 * @returns {Promise<Array<*>>} Parsed values in file order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
async function readTranscriptJsonl(filePath) {
  const raw = await readFile19(filePath, "utf8");
  const records = [];
  for (const row of raw.split("\n")) {
    if (row.trim().length > 0) {
      records.push(JSON.parse(row));
    }
  }
  return records;
}
|
|
41680
|
+
/**
 * Read a transcript file as UTF-8 text.
 *
 * @param {string} filePath Path of the file.
 * @returns {Promise<string>} The file contents.
 */
async function readTranscriptFile(filePath) {
  const text = await readFile19(filePath, "utf8");
  return text;
}
|
|
41683
|
+
/**
 * Error type for result rows that do not match the expected schema.
 * Instances carry `name === "ResultRowSchemaError"`.
 */
var ResultRowSchemaError = class extends Error {
  name = "ResultRowSchemaError";

  /** @param {string} message Human-readable description of the problem. */
  constructor(message) {
    super(message);
  }
};
|
|
41689
|
+
// Guidance text appended to "Unsupported result row" errors (see buildSchemaError).
var MIGRATION_GUIDANCE = "Expected an AgentV result row with a numeric score. Eval-case JSONL is input data, not a results artifact. Run `agentv eval <eval-file> --output <run-dir>` and pass the run workspace or its index.jsonl manifest.";
|
|
41690
|
+
/**
 * camelCase -> snake_case aliases for top-level result-row fields.
 * Consumed by normalizeKnownAliases when normalizing a result row.
 */
var RESULT_ROW_ALIASES = {
  "answerPath": "answer_path",
  "artifactDir": "artifact_dir",
  "conversationId": "conversation_id",
  "costUsd": "cost_usd",
  "durationMs": "duration_ms",
  "endTime": "end_time",
  "evalPath": "eval_path",
  "executionStatus": "execution_status",
  "failureReasonCode": "failure_reason_code",
  "failureStage": "failure_stage",
  "filesPath": "files_path",
  "gradersPath": "graders_path",
  "gradingPath": "grading_path",
  "inputPath": "input_path",
  "outputPath": "output_path",
  "responsePath": "response_path",
  "startTime": "start_time",
  "targetsPath": "targets_path",
  "taskDir": "task_dir",
  "testId": "test_id",
  "timingPath": "timing_path",
  "tokenUsage": "token_usage",
  "transcriptPath": "transcript_path",
  "workspacePath": "workspace_path"
};
|
|
41716
|
+
/**
 * camelCase -> snake_case aliases for trace-summary fields.
 * Consumed by normalizeTraceSummary.
 */
var TRACE_SUMMARY_ALIASES = {
  "costUsd": "cost_usd",
  "durationMs": "duration_ms",
  "errorCount": "error_count",
  "eventCount": "event_count",
  "llmCallCount": "llm_call_count",
  "tokenUsage": "token_usage",
  "toolCalls": "tool_calls",
  "toolDurations": "tool_durations"
};
|
|
41726
|
+
/**
 * camelCase -> snake_case aliases for message fields.
 * Consumed by normalizeMessage.
 */
var MESSAGE_ALIASES = {
  "durationMs": "duration_ms",
  "endTime": "end_time",
  "startTime": "start_time",
  "tokenUsage": "token_usage",
  "toolCalls": "tool_calls"
};
|
|
41733
|
+
/**
 * camelCase -> snake_case aliases for tool-call fields.
 * Consumed by normalizeToolCall2.
 */
var TOOL_CALL_ALIASES = {
  "durationMs": "duration_ms",
  "endTime": "end_time",
  "startTime": "start_time"
};
|
|
41738
|
+
/**
 * Tell whether `value` is a non-null, non-array object.
 *
 * @param {*} value Value to inspect.
 * @returns {boolean}
 */
function isRecord3(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
41741
|
+
/**
 * Return a shallow copy of `value` in which every known camelCase alias is
 * folded into its snake_case key.
 *
 * An existing (non-undefined) snake_case value wins over the camelCase one.
 * The camelCase key is always removed from the copy. `value` is not mutated.
 *
 * @param {object} value Record to normalize.
 * @param {Record<string, string>} aliases Map of camelCase key -> snake_case key.
 * @returns {object} The normalized copy.
 */
function normalizeKnownAliases(value, aliases) {
  const result = { ...value };
  Object.entries(aliases).forEach(([camelKey, snakeKey]) => {
    const snakeMissing = result[snakeKey] === void 0;
    const camelPresent = result[camelKey] !== void 0;
    if (snakeMissing && camelPresent) {
      result[snakeKey] = result[camelKey];
    }
    if (camelKey !== snakeKey) {
      delete result[camelKey];
    }
  });
  return result;
}
|
|
41753
|
+
/**
 * Normalize the camelCase aliases of a tool-call record.
 *
 * @param {*} value Candidate tool call.
 * @returns {*} A normalized copy for records; any other value unchanged.
 */
function normalizeToolCall2(value) {
  return isRecord3(value) ? normalizeKnownAliases(value, TOOL_CALL_ALIASES) : value;
}
|
|
41759
|
+
/**
 * Normalize the camelCase aliases of a message record and of each entry in
 * its `tool_calls` array.
 *
 * @param {*} value Candidate message.
 * @returns {*} A normalized copy for records; any other value unchanged.
 */
function normalizeMessage(value) {
  if (!isRecord3(value)) {
    return value;
  }
  const message = normalizeKnownAliases(value, MESSAGE_ALIASES);
  const { tool_calls: toolCalls } = message;
  if (Array.isArray(toolCalls)) {
    message.tool_calls = toolCalls.map(normalizeToolCall2);
  }
  return message;
}
|
|
41769
|
+
/**
 * Normalize the camelCase aliases of a trace summary and of each entry in
 * its `messages` array.
 *
 * @param {*} value Candidate trace summary.
 * @returns {*} A normalized copy for records; any other value unchanged.
 */
function normalizeTraceSummary(value) {
  if (!isRecord3(value)) {
    return value;
  }
  const summary = normalizeKnownAliases(value, TRACE_SUMMARY_ALIASES);
  const { messages } = summary;
  if (Array.isArray(messages)) {
    summary.messages = messages.map(normalizeMessage);
  }
  return summary;
}
|
|
41779
|
+
/**
 * Normalize every message of an output array.
 *
 * @param {*} value Candidate output.
 * @returns {*} A new array of normalized messages; any non-array value unchanged.
 */
function normalizeOutput(value) {
  return Array.isArray(value) ? value.map(normalizeMessage) : value;
}
|
|
41785
|
+
/**
 * Build the error for a value that is not a supported result row.
 *
 * @param {{sourceLabel?: string, lineNumber?: number}} context Optional
 *   location details included in the message when present.
 * @returns {ResultRowSchemaError} The error (not thrown here).
 */
function buildSchemaError(context) {
  const inSource = context.sourceLabel ? ` in ${context.sourceLabel}` : "";
  const atLine = context.lineNumber !== void 0 ? ` at line ${context.lineNumber}` : "";
  return new ResultRowSchemaError(`Unsupported result row${inSource}${atLine}. ${MIGRATION_GUIDANCE}`);
}
|
|
41792
|
+
/**
 * Build the error for a result row whose `score` is missing or not a finite number.
 *
 * @param {{sourceLabel?: string, lineNumber?: number}} context Optional
 *   location details included in the message when present.
 * @returns {ResultRowSchemaError} The error (not thrown here).
 */
function buildInvalidScoreError(context) {
  const inSource = context.sourceLabel ? ` in ${context.sourceLabel}` : "";
  const atLine = context.lineNumber !== void 0 ? ` at line ${context.lineNumber}` : "";
  return new ResultRowSchemaError(`Missing or invalid score in result row${inSource}${atLine}.`);
}
|
|
41799
|
+
/**
 * Heuristic: does this record carry any field typical of a result row?
 *
 * True when `test_id` is a string, or when the record has an own `score`,
 * `trace`, `spans`, `target`, `grading_path` or `timing_path` property.
 *
 * @param {object} value Normalized record.
 * @returns {boolean}
 */
function looksLikeResultRow(value) {
  if (typeof value.test_id === "string") {
    return true;
  }
  const markerKeys = ["score", "trace", "spans", "target", "grading_path", "timing_path"];
  return markerKeys.some((key) => Object.hasOwn(value, key));
}
|
|
41802
|
+
/**
 * Validate a parsed result row and normalize its camelCase aliases,
 * including nested `trace` and `output` content.
 *
 * @param {*} value Parsed row.
 * @param {{sourceLabel?: string, lineNumber?: number}} [context] Location
 *   details used in error messages.
 * @returns {object} The normalized row; its `score` is a finite number.
 * @throws {ResultRowSchemaError} When `value` is not a record, or when the
 *   score is missing/invalid (a more specific message is used if the record
 *   otherwise looks like a result row).
 */
function normalizeResultRow(value, context = {}) {
  if (!isRecord3(value)) {
    throw buildSchemaError(context);
  }
  const row = normalizeKnownAliases(value, RESULT_ROW_ALIASES);
  if (row.trace !== void 0) {
    row.trace = normalizeTraceSummary(row.trace);
  }
  if (row.output !== void 0) {
    row.output = normalizeOutput(row.output);
  }
  const scoreIsValid = typeof row.score === "number" && Number.isFinite(row.score);
  if (scoreIsValid) {
    return row;
  }
  throw looksLikeResultRow(row) ? buildInvalidScoreError(context) : buildSchemaError(context);
}
|
|
41821
|
+
// File name of the JSONL results index that is read from inside a run directory.
var RESULT_INDEX_FILENAME = "index.jsonl";
|
|
41822
|
+
/**
 * Build the composite key "<testId>::<target>".
 * A nullish part is replaced by "unknown".
 *
 * @param {string|null|undefined} testId
 * @param {string|null|undefined} target
 * @returns {string}
 */
function buildTestTargetKey(testId, target) {
  const idPart = testId ?? "unknown";
  const targetPart = target ?? "unknown";
  return `${idPart}::${targetPart}`;
}
|
|
41825
|
+
/**
 * Keep only the last result for each (testId, target) pair.
 * The relative order of the surviving results is preserved.
 *
 * @param {object[]} results Results in append order.
 * @returns {object[]} A new, de-duplicated array.
 */
function deduplicateByTestIdTarget(results) {
  const keys = results.map((result) => buildTestTargetKey(result.testId, result.target));
  // Later entries overwrite earlier ones, so each key maps to its last index.
  const lastIndexByKey = new Map(keys.map((key, index) => [key, index]));
  return results.filter((_, index) => lastIndexByKey.get(keys[index]) === index);
}
|
|
41839
|
+
/**
 * Rebuild `timing.json` and `benchmark.json` of a run directory from its
 * results index.
 *
 * When `options.plannedTestCount` is not given, the value is read from the
 * existing `benchmark.json` before that file is overwritten.
 *
 * @param {string} runDir Run directory containing the results index.
 * @param {{plannedTestCount?: number, evalFile?: string, experiment?: *}} [options]
 * @returns {Promise<{benchmarkPath: string, timingPath: string, testCount: number, targetCount: number}>}
 */
async function aggregateRunDir(runDir, options) {
  const inRunDir = (name) => path47.join(runDir, name);
  const serialize = (artifact) => `${JSON.stringify(artifact, null, 2)}\n`;

  const indexContent = await readFile20(inRunDir(RESULT_INDEX_FILENAME), "utf8");
  const results = deduplicateByTestIdTarget(parseJsonlResults(indexContent));

  const timingPath = inRunDir("timing.json");
  await writeFile10(timingPath, serialize(buildTimingArtifact(results)), "utf8");

  const benchmarkPath = inRunDir("benchmark.json");
  const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(benchmarkPath);
  const benchmark = buildBenchmarkArtifact(
    results,
    options?.evalFile,
    options?.experiment,
    plannedTestCount
  );
  await writeFile10(benchmarkPath, serialize(benchmark), "utf8");

  const distinctTargets = new Set(results.map((result) => result.target ?? "unknown"));
  return {
    benchmarkPath,
    timingPath,
    testCount: results.length,
    targetCount: distinctTargets.size
  };
}
|
|
41861
|
+
/**
 * Read `metadata.planned_test_count` from a benchmark JSON file.
 *
 * Best effort: any read or parse failure yields `undefined`.
 *
 * @param {string} benchmarkPath Path of the benchmark JSON file.
 * @returns {Promise<number|undefined>} The count when it is a finite number.
 */
async function readPlannedTestCount(benchmarkPath) {
  let planned;
  try {
    const parsed = JSON.parse(await readFile20(benchmarkPath, "utf8"));
    planned = parsed.metadata?.planned_test_count;
  } catch {
    return void 0;
  }
  const isFiniteNumber = typeof planned === "number" && Number.isFinite(planned);
  return isFiniteNumber ? planned : void 0;
}
|
|
41871
|
+
/**
 * Compute the mean and population standard deviation of `values`,
 * each rounded to three decimal places.
 *
 * @param {number[]} values Samples.
 * @returns {{mean: number, stddev: number}} Both are 0 for an empty input.
 */
function computeStats(values) {
  const count = values.length;
  if (count === 0) {
    return { mean: 0, stddev: 0 };
  }
  const roundTo3 = (x) => Math.round(x * 1e3) / 1e3;
  let total = 0;
  for (const v of values) {
    total += v;
  }
  const mean = total / count;
  let squaredDeviation = 0;
  for (const v of values) {
    squaredDeviation += (v - mean) ** 2;
  }
  // Divides by N (not N - 1): population variance.
  const variance = squaredDeviation / count;
  return { mean: roundTo3(mean), stddev: roundTo3(Math.sqrt(variance)) };
}
|
|
41882
|
+
/**
 * Compute the pass rate of one result against DEFAULT_THRESHOLD.
 *
 * With per-grader `scores`, this is the fraction of graders whose score
 * reaches the threshold. Otherwise it is 1 or 0 depending on the overall
 * `score` (a missing score counts as 0).
 *
 * @param {{scores?: Array<{score: number}>, score?: number}} result
 * @returns {number} A value in [0, 1].
 */
function computePassRate(result) {
  const graderScores = result.scores;
  if (!graderScores || !(graderScores.length > 0)) {
    return (result.score ?? 0) >= DEFAULT_THRESHOLD ? 1 : 0;
  }
  let passing = 0;
  for (const grader of graderScores) {
    if (grader.score >= DEFAULT_THRESHOLD) {
      passing += 1;
    }
  }
  return passing / graderScores.length;
}
|
|
41890
|
+
/**
 * Tell whether a result's `executionStatus` is "execution_error".
 *
 * @param {{executionStatus?: string}} result
 * @returns {boolean}
 */
function isExecutionError(result) {
  const { executionStatus } = result;
  return executionStatus === "execution_error";
}
|
|
41893
|
+
/**
 * Copy the per-tool call counts of a result's trace and sum them.
 *
 * @param {{trace?: {toolCalls?: Record<string, number>}}} result
 * @returns {{toolCalls: Record<string, number>, total: number}}
 *   An empty map and a total of 0 when the trace has no tool calls.
 */
function countToolCalls(result) {
  const toolCalls = { ...(result.trace?.toolCalls ?? {}) };
  let total = 0;
  for (const count of Object.values(toolCalls)) {
    total += count;
  }
  return { toolCalls, total };
}
|
|
41898
|
+
/**
 * Summarize a unified-diff string of workspace changes.
 *
 * Lines starting with "--- /dev/null" count as created files and lines
 * starting with "--- a/" as modified files. The summary keeps the first
 * 20 lines of the diff and appends a "... (N more lines)" marker when the
 * diff is longer.
 *
 * @param {string|null|undefined} fileChanges Diff text.
 * @returns {{files_modified: number, files_created: number, diff_summary: string}|undefined}
 *   `undefined` when `fileChanges` is empty or missing.
 */
function parseWorkspaceChanges(fileChanges) {
  if (!fileChanges) {
    return void 0;
  }
  const SUMMARY_LINE_LIMIT = 20;
  // Split once and reuse the lines for both counting and the summary.
  const lines = fileChanges.split("\n");
  let filesModified = 0;
  let filesCreated = 0;
  for (const line of lines) {
    if (line.startsWith("--- /dev/null")) {
      filesCreated += 1;
    } else if (line.startsWith("--- a/")) {
      filesModified += 1;
    }
  }
  const overflow = lines.length - SUMMARY_LINE_LIMIT;
  const diffSummary = overflow > 0
    ? `${lines.slice(0, SUMMARY_LINE_LIMIT).join("\n")}\n... (${overflow} more lines)`
    : fileChanges;
  return {
    files_modified: filesModified,
    files_created: filesCreated,
    diff_summary: diffSummary
  };
}
|
|
41921
|
+
/**
 * Project a result's assertions to `{text, passed, evidence}` records.
 * A nullish `evidence` becomes the empty string.
 *
 * @param {{assertions?: Array<{text: *, passed: *, evidence?: *}>}} result
 * @returns {Array<{text: *, passed: *, evidence: *}>} Empty when the result
 *   has no assertions.
 */
function buildAssertions(result) {
  const source = result.assertions;
  if (!source) {
    return [];
  }
  return source.map(({ text, passed, evidence }) => ({
    text,
    passed,
    evidence: evidence ?? ""
  }));
}
|
|
41929
|
+
/**
 * Project grader scores to the shape used in the grading artifact.
 * `reasoning` is always emitted as an empty string.
 *
 * @param {Array<object>|null|undefined} scores Grader scores.
 * @returns {Array<object>|undefined} `undefined` when there are no scores.
 */
function buildEvaluators(scores) {
  if (!scores || scores.length === 0) {
    return void 0;
  }
  return scores.map(({ name, type, score, weight, verdict, assertions, details }) => ({
    name,
    type,
    score,
    reasoning: "",
    weight,
    verdict,
    assertions,
    details
  }));
}
|
|
41944
|
+
/**
 * Reduce an assertion to its `text`, `passed` and `evidence` fields.
 *
 * @param {{text: *, passed: *, evidence?: *}} assertion
 * @returns {{text: *, passed: *, evidence: *}}
 */
function toIndexAssertion(assertion) {
  const { text, passed, evidence } = assertion;
  return { text, passed, evidence };
}
|
|
41951
|
+
/**
 * Convert a grader score to its snake_case index representation.
 * Nested `scores` are converted recursively.
 *
 * @param {object} score Grader score; `score.assertions` must be an array.
 * @returns {object} The index-format score.
 */
function toIndexScore(score) {
  const { name, type, weight, verdict, input, target, details } = score;
  return {
    name,
    type,
    score: score.score,
    weight,
    verdict,
    assertions: score.assertions.map(toIndexAssertion),
    raw_request: score.rawRequest,
    input,
    target,
    scores: score.scores?.map(toIndexScore),
    details,
    token_usage: score.tokenUsage,
    duration_ms: score.durationMs,
    started_at: score.startedAt,
    ended_at: score.endedAt
  };
}
|
|
41970
|
+
/**
 * Convert a list of grader scores to index format.
 *
 * @param {Array<object>|null|undefined} scores
 * @returns {Array<object>|undefined} `undefined` when `scores` is nullish.
 */
function toIndexScores(scores) {
  if (scores === null || scores === void 0) {
    return void 0;
  }
  return scores.map(toIndexScore);
}
|
|
41973
|
+
/**
 * Return a copy of `value` without the properties whose value is `undefined`.
 * `null` and other falsy values are kept.
 *
 * @param {object} value Source object.
 * @returns {object} A new object.
 */
function dropUndefined5(value) {
  const definedPairs = Object.entries(value).filter((pair) => pair[1] !== void 0);
  return Object.fromEntries(definedPairs);
}
|
|
41976
|
+
/**
 * Tell whether `value` is a non-null, non-array object.
 *
 * @param {*} value Value to inspect.
 * @returns {boolean}
 */
function isRecord4(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
41979
|
+
/**
 * Convert a camelCase rerun-source record to snake_case, omitting fields
 * that are undefined.
 *
 * @param {*} value Candidate rerun-source record.
 * @returns {object|undefined} `undefined` when `value` is not a record.
 */
function toIndexRerunSource(value) {
  if (!isRecord4(value)) {
    return void 0;
  }
  const {
    mode,
    sourceRunDir,
    sourceIndexPath,
    sourceArtifactDir,
    sourceTaskDir,
    sourceTestId,
    sourceTarget,
    sourceTimestamp
  } = value;
  return dropUndefined5({
    mode,
    source_run_dir: sourceRunDir,
    source_index_path: sourceIndexPath,
    source_artifact_dir: sourceArtifactDir,
    source_task_dir: sourceTaskDir,
    source_test_id: sourceTestId,
    source_target: sourceTarget,
    source_timestamp: sourceTimestamp
  });
}
|
|
41994
|
+
/**
 * Convert result metadata for the index: `rerunSource` is replaced by a
 * snake_case `rerun_source` entry, all other keys are copied as-is.
 *
 * @param {object|null|undefined} metadata Result metadata.
 * @returns {object|undefined} `undefined` for falsy metadata; a plain shallow
 *   copy when `rerunSource` cannot be converted.
 */
function toIndexMetadata(metadata) {
  if (!metadata) {
    return void 0;
  }
  const rerunSource = toIndexRerunSource(metadata.rerunSource);
  if (!rerunSource) {
    return { ...metadata };
  }
  const remainingPairs = Object.entries(metadata).filter((pair) => pair[0] !== "rerunSource");
  const converted = Object.fromEntries(remainingPairs);
  return { ...converted, rerun_source: rerunSource };
}
|
|
42007
|
+
/**
 * Build the per-test grading artifact for one evaluation result.
 *
 * The artifact contains the assertion list with a pass/fail summary,
 * tool-call execution metrics, grader details, a workspace-change summary
 * and, when the result has a conversation id, the number of assistant turns.
 *
 * @param {object} result Evaluation result.
 * @returns {object} The grading artifact.
 */
function buildGradingArtifact(result) {
  const assertions = buildAssertions(result);
  const passed = assertions.filter((e) => e.passed).length;
  const failed = assertions.filter((e) => !e.passed).length;
  const total = assertions.length;
  const { toolCalls, total: totalToolCalls } = countToolCalls(result);
  const errorsEncountered = result.error ? 1 : 0;
  return {
    assertions,
    summary: {
      passed,
      failed,
      total,
      // Rounded to three decimals; 0 when there are no assertions.
      pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
    },
    execution_metrics: {
      tool_calls: toolCalls,
      total_tool_calls: totalToolCalls,
      errors_encountered: errorsEncountered
    },
    graders: buildEvaluators(result.scores),
    workspace_changes: parseWorkspaceChanges(result.fileChanges),
    conversation: result.conversationId ? {
      // A trace may exist without a messages array; count 0 turns instead of throwing.
      turns: result.trace?.messages?.filter((message) => message.role === "assistant").length ?? 0,
      conversation_id: result.conversationId
    } : void 0
  };
}
|
|
42035
|
+
/**
 * Sum token usage and durations over all results.
 *
 * `total_tokens` is input + output (reasoning tokens are reported separately
 * under `token_usage`). Missing counters and durations count as 0.
 *
 * @param {Array<{tokenUsage?: object, durationMs?: number}>} results
 * @returns {{total_tokens: number, duration_ms: number, total_duration_seconds: number,
 *   token_usage: {input: number, output: number, reasoning: number}}}
 */
function buildTimingArtifact(results) {
  const totals = { input: 0, output: 0, reasoning: 0 };
  let totalDurationMs = 0;
  for (const { tokenUsage, durationMs } of results) {
    if (tokenUsage) {
      totals.input += tokenUsage.input ?? 0;
      totals.output += tokenUsage.output ?? 0;
      totals.reasoning += tokenUsage.reasoning ?? 0;
    }
    if (durationMs != null) {
      totalDurationMs += durationMs;
    }
  }
  return {
    total_tokens: totals.input + totals.output,
    duration_ms: totalDurationMs,
    total_duration_seconds: Math.round(totalDurationMs / 1e3 * 1e3) / 1e3,
    token_usage: {
      input: totals.input,
      output: totals.output,
      reasoning: totals.reasoning
    }
  };
}
|
|
42062
|
+
/**
 * Build the run-level benchmark artifact: per-target statistics, per-grader
 * score statistics and run metadata.
 *
 * Results with an execution error are excluded from pass-rate and per-grader
 * statistics but still count towards timing, token, tool-call and cost stats.
 * Results without a target are grouped under the "unknown" target.
 *
 * @param {object[]} results Evaluation results.
 * @param {string} [evalFile=""] Eval file recorded in the metadata.
 * @param {*} [experiment] Experiment value recorded in the metadata.
 * @param {number} [plannedTestCount] Planned test count recorded in the metadata.
 * @returns {{metadata: object, run_summary: object, per_grader_summary: object|undefined, notes: string[]}}
 */
function buildBenchmarkArtifact(results, evalFile = "", experiment, plannedTestCount) {
  // Single definition of the bucket a result belongs to, used for both
  // collecting the target list and selecting each target's results.
  const targetOf = (result) => result.target ?? "unknown";
  const targetSet = /* @__PURE__ */ new Set();
  const testIdSet = /* @__PURE__ */ new Set();
  for (const result of results) {
    targetSet.add(targetOf(result));
    testIdSet.add(result.testId ?? "unknown");
  }
  const targets = [...targetSet].sort();
  const testIds = [...testIdSet].sort();
  const runSummary = {};
  const notes = [];
  for (const target of targets) {
    // Compare normalized targets so target-less results land in "unknown"
    // instead of leaving that bucket with empty statistics.
    const targetResults = results.filter((r) => targetOf(r) === target);
    const qualityResults = targetResults.filter((r) => !isExecutionError(r));
    const passRates = qualityResults.map(computePassRate);
    const timings = targetResults.filter((r) => r.durationMs != null).map((r) => r.durationMs / 1e3);
    const tokens = targetResults.filter((r) => r.tokenUsage != null).map((r) => {
      const usage = r.tokenUsage;
      return (usage.input ?? 0) + (usage.output ?? 0);
    });
    const entry = {
      pass_rate: computeStats(passRates),
      time_seconds: computeStats(timings),
      tokens: computeStats(tokens)
    };
    // Tool-call and cost stats are only emitted when there is data for them.
    const toolCallCounts = targetResults.map((r) => countToolCalls(r).total);
    if (toolCallCounts.some((count) => count > 0)) {
      entry.tool_calls = computeStats(toolCallCounts);
    }
    const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd);
    if (costs.length > 0) {
      entry.cost_usd = computeStats(costs);
    }
    runSummary[target] = entry;
  }
  // Collect grader scores across all targets, keyed by "<name>:<type>".
  const evaluatorScores = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (isExecutionError(result)) {
      continue;
    }
    for (const score of result.scores ?? []) {
      const key = `${score.name}:${score.type}`;
      if (!evaluatorScores.has(key)) {
        evaluatorScores.set(key, []);
      }
      evaluatorScores.get(key)?.push(score.score);
    }
  }
  let perEvaluatorSummary;
  if (evaluatorScores.size > 0) {
    perEvaluatorSummary = {};
    for (const [key, scores] of evaluatorScores) {
      perEvaluatorSummary[key] = computeStats(scores);
    }
  }
  const errorCount = results.filter((r) => isExecutionError(r)).length;
  if (errorCount > 0) {
    notes.push(
      `${errorCount} test(s) had execution errors and are excluded from quality pass_rate`
    );
  }
  if (results.length === 0) {
    notes.push("No results to summarize");
  }
  // The first result's timestamp identifies the run; fall back to "now".
  const firstResult = results[0];
  const timestamp = firstResult?.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
  return {
    metadata: {
      eval_file: evalFile,
      timestamp,
      targets,
      tests_run: testIds,
      experiment,
      planned_test_count: plannedTestCount
    },
    run_summary: runSummary,
    per_grader_summary: perEvaluatorSummary,
    notes
  };
}
|
|
42142
|
+
/**
 * Create the run directory (if needed) and write a `benchmark.json` stub
 * built from an empty result list.
 *
 * @param {string} runDir Run directory.
 * @param {{evalFile?: string, experiment?: *, plannedTestCount?: number}} options
 * @returns {Promise<void>}
 */
async function writeInitialBenchmarkArtifact(runDir, options) {
  await mkdir18(runDir, { recursive: true });
  const { evalFile, experiment, plannedTestCount } = options;
  const stub = buildBenchmarkArtifact([], evalFile, experiment, plannedTestCount);
  const serialized = `${JSON.stringify(stub, null, 2)}\n`;
  await writeFile10(path47.join(runDir, "benchmark.json"), serialized, "utf8");
}
|
|
42154
|
+
/**
 * Flatten the assertions of all results that did not fail with an execution
 * error into one list tagged with `test_id`, plus a pass/fail summary.
 *
 * @param {object[]} results Evaluation results.
 * @returns {{assertions: object[], summary: {passed: number, failed: number, total: number, pass_rate: number}}}
 */
function buildAggregateGradingArtifact(results) {
  const assertions = results
    .filter((result) => !isExecutionError(result))
    .flatMap((result) => {
      const testId = result.testId ?? "unknown";
      return (result.assertions ?? []).map((assertion) => ({
        test_id: testId,
        text: assertion.text,
        passed: assertion.passed,
        evidence: assertion.evidence ?? ""
      }));
    });
  const total = assertions.length;
  let passed = 0;
  let failed = 0;
  for (const assertion of assertions) {
    if (assertion.passed) {
      passed += 1;
    } else {
      failed += 1;
    }
  }
  const passRate = total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0;
  return {
    assertions,
    summary: { passed, failed, total, pass_rate: passRate }
  };
}
|
|
42180
|
+
/**
 * Turn an arbitrary label into a single, safe path segment.
 *
 * The value is trimmed and every character of `/ \ : * ? " < > |` is
 * replaced with "_". The dot-only segments "." and ".." are rewritten to
 * "_" and "__" so that joining the result onto a directory can never
 * resolve to that directory itself or to its parent.
 *
 * @param {string|null|undefined} value Raw label.
 * @param {string} fallback Returned when `value` is missing or blank.
 * @returns {string} The sanitized segment, or `fallback`.
 */
function safeArtifactPathSegment(value, fallback) {
  const trimmed = value?.trim();
  if (!trimmed) {
    return fallback;
  }
  const sanitized = trimmed.replace(/[/\\:*?"<>|]/g, "_");
  // These two contain no filtered characters but are still traversal segments.
  if (sanitized === ".") {
    return "_";
  }
  if (sanitized === "..") {
    return "__";
  }
  return sanitized;
}
|
|
42187
|
+
/**
 * Sanitize a test id for use as a path segment; falls back to "unknown".
 *
 * @param {string|null|undefined} testId
 * @returns {string}
 */
function safeTestId(testId) {
  const segment = safeArtifactPathSegment(testId, "unknown");
  return segment;
}
|
|
42190
|
+
/**
 * Return the `suite` of a result.
 *
 * @param {{suite?: *}} result
 * @returns {*} The suite value, possibly `undefined`.
 */
function getSuite(result) {
  const { suite } = result;
  return suite;
}
|
|
42193
|
+
/**
 * Build the POSIX-style artifact sub-directory of a result:
 * "<suite>/<testId>" when the result has a suite, otherwise "<testId>".
 * Both parts are sanitized into safe path segments.
 *
 * @param {{suite?: string, testId?: string}} result
 * @returns {string}
 */
function buildArtifactSubdir(result) {
  const suite = getSuite(result);
  const suiteSegments = suite ? [safeArtifactPathSegment(suite, "default")] : [];
  return path47.posix.join(...suiteSegments, safeTestId(result.testId));
}
|
|
42202
|
+
/**
 * Render messages as text: each message becomes "@[role]:" followed by its
 * content on the next line, and messages are separated by a blank line.
 * Nullish content is rendered as an empty string.
 *
 * @param {Array<{role: *, content?: *}>} output Messages.
 * @returns {string}
 */
function formatOutputMarkdown(output) {
  const sections = [];
  for (const msg of output) {
    const body = String(msg.content ?? "");
    sections.push(`@[${msg.role}]:\n${body}`);
  }
  return sections.join("\n\n");
}
|
|
42206
|
+
/**
 * Extract a result's input as text.
 *
 * @param {{input?: *}} result
 * @returns {string|null} A string input as-is, a non-empty message array
 *   rendered via formatOutputMarkdown, and `null` for anything else
 *   (missing, empty string, empty array, other types).
 */
function extractInput(result) {
  const { input } = result;
  if (!input) {
    return null;
  }
  if (typeof input === "string") {
    return input;
  }
  const isNonEmptyList = Array.isArray(input) && input.length > 0;
  return isNonEmptyList ? formatOutputMarkdown(input) : null;
}
|
|
42215
|
+
/**
 * Express `filePath` relative to `outputDir`, using "/" as the separator
 * regardless of the platform's path separator.
 *
 * @param {string} outputDir Base directory.
 * @param {string} filePath Path to relativize.
 * @returns {string}
 */
function toRelativeArtifactPath(outputDir, filePath) {
  const relativePath = path47.relative(outputDir, filePath);
  const segments = relativePath.split(path47.sep);
  return segments.join("/");
}
|
|
42218
|
+
/**
 * Look up the source test of a result by test id ("unknown" when the
 * result has none).
 *
 * @param {{testId?: string}} result
 * @param {Map<string, *>} testByTestId Tests keyed by test id.
 * @returns {*} The matching test, or `undefined`.
 */
function findResultSourceTest(result, testByTestId) {
  const lookupKey = result.testId ?? "unknown";
  return testByTestId.get(lookupKey);
}
|
|
42221
|
+
/**
 * Resolve the eval path for a result's trace envelope. Preference order:
 * the source test's `evalFileRepoPath`, then its `evalFilePath`, then
 * `fallbackEvalFile`.
 *
 * @param {object} result Evaluation result.
 * @param {Map<string, *>} testByTestId Tests keyed by test id.
 * @param {string|undefined} fallbackEvalFile Used when the test has no path.
 * @returns {string|undefined}
 */
function resolveEnvelopeEvalPath(result, testByTestId, fallbackEvalFile) {
  const sourceTest = findResultSourceTest(result, testByTestId);
  const source = sourceTest?.source;
  const fromSource = source?.evalFileRepoPath ?? source?.evalFilePath;
  return fromSource ?? fallbackEvalFile;
}
|
|
42225
|
+
/**
 * Tell whether a result has anything to put in a transcript: a non-empty
 * `output` or at least one trace message.
 *
 * A missing `output`, `trace` or `trace.messages` is treated as empty
 * rather than raising a TypeError.
 *
 * @param {{output?: Array<*>, trace?: {messages?: Array<*>}}} result
 * @returns {boolean}
 */
function resultHasExecutionTraceTranscript(result) {
  const outputCount = result.output?.length ?? 0;
  const traceMessageCount = result.trace?.messages?.length ?? 0;
  return outputCount > 0 || traceMessageCount > 0;
}
|
|
42228
|
+
/**
 * Build the trace envelope of a result and write it to
 * `<outputsDir>/execution-trace.json`.
 *
 * Answer/response artifact paths are recorded only when the result has
 * output; the transcript path only when there is transcript content.
 *
 * @param {{result: object, evalPath: *, outputDir: string, outputsDir: string, experiment: *}} params
 * @returns {Promise<object>} The envelope that was written.
 */
async function writeTraceEnvelopeSidecar(params) {
  const { result, evalPath, experiment, outputDir, outputsDir } = params;
  const hasTranscript = resultHasExecutionTraceTranscript(result);
  const hasOutput = result.output.length > 0;
  const envelope = buildTraceEnvelopeFromEvaluationResult(result, {
    evalPath,
    runId: path47.basename(outputDir),
    experiment,
    source: { path: RESULT_INDEX_FILENAME },
    capture: { content: "full", redactionLevel: "none", redactedFields: [] },
    artifacts: {
      execution_trace_path: "outputs/execution-trace.json",
      answer_path: hasOutput ? "outputs/answer.md" : void 0,
      response_path: hasOutput ? "outputs/response.md" : void 0,
      transcript_path: hasTranscript ? "outputs/transcript.jsonl" : void 0
    }
  });
  const tracePath = path47.join(outputsDir, "execution-trace.json");
  const serialized = `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`;
  await writeFile10(tracePath, serialized, "utf8");
  return envelope;
}
|
|
42251
|
+
/**
 * Build the index record of a result from artifact paths that the caller
 * supplies. Paths are stored relative to `options.outputDir` with "/"
 * separators; optional paths are omitted (undefined) when not supplied.
 *
 * `options.extraIndexFields` is spread after the standard fields and can
 * override them; `metadata` is always written last.
 *
 * @param {object} result Evaluation result.
 * @param {object} options `outputDir`, `gradingPath`, `timingPath` and the
 *   optional `artifactDir`, `outputPath`, `answerPath`, `transcriptPath`,
 *   `inputPath`, `responsePath`, `extraIndexFields`.
 * @returns {object} The snake_case index record.
 */
function buildIndexArtifactEntry(result, options) {
  const relative = (filePath) => toRelativeArtifactPath(options.outputDir, filePath);
  const relativeIfSet = (filePath) => (filePath ? relative(filePath) : void 0);
  return {
    timestamp: result.timestamp,
    test_id: result.testId ?? "unknown",
    suite: getSuite(result),
    category: result.category,
    conversation_id: result.conversationId,
    score: result.score,
    target: result.target ?? "unknown",
    token_usage: result.tokenUsage,
    cost_usd: result.costUsd,
    duration_ms: result.durationMs,
    start_time: result.startTime,
    end_time: result.endTime,
    scores: toIndexScores(result.scores),
    execution_status: result.executionStatus,
    error: result.error,
    failure_stage: result.failureStage,
    failure_reason_code: result.failureReasonCode,
    workspace_path: result.workspacePath,
    artifact_dir: relativeIfSet(options.artifactDir),
    grading_path: relative(options.gradingPath),
    timing_path: relative(options.timingPath),
    output_path: relativeIfSet(options.outputPath),
    answer_path: relativeIfSet(options.answerPath),
    transcript_path: relativeIfSet(options.transcriptPath),
    input_path: relativeIfSet(options.inputPath),
    response_path: relativeIfSet(options.responsePath),
    ...options.extraIndexFields,
    metadata: toIndexMetadata(result.metadata)
  };
}
|
|
42283
|
+
/**
 * Build the index record of a result using the conventional artifact
 * layout below its artifact sub-directory (grading.json, timing.json,
 * input.md, outputs/answer.md, outputs/response.md, outputs/transcript.jsonl).
 *
 * Optional paths are only recorded when the result has the corresponding
 * content. `output_path` and `answer_path` both point at outputs/answer.md.
 * `extraIndexFields` is spread after the standard fields and can override
 * them; `metadata` is always written last.
 *
 * @param {object} result Evaluation result.
 * @param {object} [extraIndexFields] Additional fields to merge in.
 * @returns {object} The snake_case index record.
 */
function buildResultIndexArtifact(result, extraIndexFields) {
  const artifactSubdir = buildArtifactSubdir(result);
  const input = extractInput(result);
  const hasAnswer = result.output.length > 0;
  const hasTranscript = resultHasExecutionTraceTranscript(result);
  const inSubdir = (...parts) => path47.posix.join(artifactSubdir, ...parts);
  return {
    timestamp: result.timestamp,
    test_id: result.testId ?? "unknown",
    suite: getSuite(result),
    category: result.category,
    conversation_id: result.conversationId,
    score: result.score,
    target: result.target ?? "unknown",
    token_usage: result.tokenUsage,
    cost_usd: result.costUsd,
    duration_ms: result.durationMs,
    start_time: result.startTime,
    end_time: result.endTime,
    scores: toIndexScores(result.scores),
    execution_status: result.executionStatus,
    error: result.error,
    failure_stage: result.failureStage,
    failure_reason_code: result.failureReasonCode,
    workspace_path: result.workspacePath,
    artifact_dir: artifactSubdir,
    grading_path: inSubdir("grading.json"),
    timing_path: inSubdir("timing.json"),
    input_path: input ? inSubdir("input.md") : void 0,
    output_path: hasAnswer ? inSubdir("outputs", "answer.md") : void 0,
    answer_path: hasAnswer ? inSubdir("outputs", "answer.md") : void 0,
    transcript_path: hasTranscript ? inSubdir("outputs", "transcript.jsonl") : void 0,
    response_path: hasAnswer ? inSubdir("outputs", "response.md") : void 0,
    ...extraIndexFields,
    metadata: toIndexMetadata(result.metadata)
  };
}
|
|
42319
|
+
/**
 * Write records as JSONL: one JSON document per line with a trailing
 * newline. An empty record list produces an empty file.
 *
 * @param {string} filePath Destination path.
 * @param {Array<*>} records Values to serialize.
 * @returns {Promise<void>}
 */
async function writeJsonlFile(filePath, records) {
  let content = "";
  if (records.length !== 0) {
    const serialized = records.map((record2) => JSON.stringify(record2));
    content = `${serialized.join("\n")}\n`;
  }
  await writeFile10(filePath, content, "utf8");
}
|
|
42324
|
+
/**
 * Return a copy of the result's trace whose `messages` are replaced by the
 * transcript messages derived from the trace envelope.
 *
 * @param {{trace?: object}} result Evaluation result.
 * @param {object} envelope Trace envelope.
 * @returns {object} The projected trace.
 */
function traceProjectionForTranscript(result, envelope) {
  const projection = { ...result.trace };
  projection.messages = traceEnvelopeToTranscriptMessages(envelope);
  return projection;
}
|
|
42330
|
+
function hasTranscriptProjection(result, envelope) {
|
|
42331
|
+
return result.output.length > 0 || traceEnvelopeToTranscriptMessages(envelope).length > 0;
|
|
42332
|
+
}
|
|
42333
|
+
async function writeTranscriptJsonl(filePath, result, envelope) {
|
|
42334
|
+
const lines = traceToTranscriptJsonLines(traceProjectionForTranscript(result, envelope), {
|
|
42335
|
+
testId: result.testId,
|
|
42336
|
+
target: result.target
|
|
42337
|
+
});
|
|
42338
|
+
const content = lines.length > 0 ? `${lines.map((line) => JSON.stringify(line)).join("\n")}
|
|
42339
|
+
` : "";
|
|
42340
|
+
await writeFile10(filePath, content, "utf8");
|
|
42341
|
+
}
|
|
42342
|
+
function indexRecordKey(record2) {
|
|
42343
|
+
if (!isRecord4(record2)) {
|
|
42344
|
+
return void 0;
|
|
42345
|
+
}
|
|
42346
|
+
const testId = typeof record2.test_id === "string" ? record2.test_id : typeof record2.testId === "string" ? record2.testId : void 0;
|
|
42347
|
+
const target = typeof record2.target === "string" ? record2.target : void 0;
|
|
42348
|
+
return testId ? buildTestTargetKey(testId, target) : void 0;
|
|
42349
|
+
}
|
|
42350
|
+
async function rewriteExistingIndexRecords(outputDir, replacements) {
|
|
42351
|
+
if (replacements.length === 0) {
|
|
42352
|
+
return;
|
|
42353
|
+
}
|
|
42354
|
+
const indexPath = path47.join(outputDir, RESULT_INDEX_FILENAME);
|
|
42355
|
+
const content = await readFile20(indexPath, "utf8").catch(() => void 0);
|
|
42356
|
+
if (content === void 0) {
|
|
42357
|
+
return;
|
|
42358
|
+
}
|
|
42359
|
+
const replacementsByKey = new Map(
|
|
42360
|
+
replacements.map((record2) => [buildTestTargetKey(record2.test_id, record2.target), record2])
|
|
42361
|
+
);
|
|
42362
|
+
const seen = /* @__PURE__ */ new Set();
|
|
42363
|
+
const records = [];
|
|
42364
|
+
for (const line of content.split("\n")) {
|
|
42365
|
+
if (line.trim().length === 0) {
|
|
42366
|
+
continue;
|
|
42367
|
+
}
|
|
42368
|
+
try {
|
|
42369
|
+
const parsed = JSON.parse(line);
|
|
42370
|
+
const key = indexRecordKey(parsed);
|
|
42371
|
+
const replacement = key ? replacementsByKey.get(key) : void 0;
|
|
42372
|
+
if (key && replacement) {
|
|
42373
|
+
records.push(replacement);
|
|
42374
|
+
seen.add(key);
|
|
42375
|
+
} else {
|
|
42376
|
+
records.push(parsed);
|
|
42377
|
+
}
|
|
42378
|
+
} catch {
|
|
42379
|
+
}
|
|
42380
|
+
}
|
|
42381
|
+
for (const replacement of replacements) {
|
|
42382
|
+
const key = buildTestTargetKey(replacement.test_id, replacement.target);
|
|
42383
|
+
if (!seen.has(key)) {
|
|
42384
|
+
records.push(replacement);
|
|
42385
|
+
}
|
|
42386
|
+
}
|
|
42387
|
+
await writeJsonlFile(indexPath, records);
|
|
42388
|
+
}
|
|
42389
|
+
function toCamelCase2(str) {
|
|
42390
|
+
return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
|
|
42391
|
+
}
|
|
42392
|
+
function toCamelCaseDeep2(obj) {
|
|
42393
|
+
if (obj === null || obj === void 0) {
|
|
42394
|
+
return obj;
|
|
42395
|
+
}
|
|
42396
|
+
if (Array.isArray(obj)) {
|
|
42397
|
+
return obj.map((item) => toCamelCaseDeep2(item));
|
|
42398
|
+
}
|
|
42399
|
+
if (typeof obj === "object") {
|
|
42400
|
+
const result = {};
|
|
42401
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
42402
|
+
result[toCamelCase2(key)] = toCamelCaseDeep2(value);
|
|
42403
|
+
}
|
|
42404
|
+
return result;
|
|
42405
|
+
}
|
|
42406
|
+
return obj;
|
|
42407
|
+
}
|
|
42408
|
+
var EXECUTION_STATUSES = /* @__PURE__ */ new Set([
|
|
42409
|
+
"ok",
|
|
42410
|
+
"quality_failure",
|
|
42411
|
+
"execution_error"
|
|
42412
|
+
]);
|
|
42413
|
+
function isAssertionEntry(value) {
|
|
42414
|
+
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
42415
|
+
return false;
|
|
42416
|
+
}
|
|
42417
|
+
const candidate = value;
|
|
42418
|
+
return typeof candidate.text === "string" && typeof candidate.passed === "boolean" && (candidate.evidence === void 0 || typeof candidate.evidence === "string");
|
|
42419
|
+
}
|
|
42420
|
+
function isOutputMessage(value) {
|
|
42421
|
+
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
42422
|
+
return false;
|
|
42423
|
+
}
|
|
42424
|
+
return typeof value.role === "string";
|
|
42425
|
+
}
|
|
42426
|
+
function isExecutionStatus(value) {
|
|
42427
|
+
return typeof value === "string" && EXECUTION_STATUSES.has(value);
|
|
42428
|
+
}
|
|
42429
|
+
function isTraceRecord(value) {
|
|
42430
|
+
return !!value && typeof value === "object" && !Array.isArray(value) && Array.isArray(value.messages) && Array.isArray(value.events);
|
|
42431
|
+
}
|
|
42432
|
+
function normalizeParsedResult(value) {
|
|
42433
|
+
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
42434
|
+
return void 0;
|
|
42435
|
+
}
|
|
42436
|
+
const result = value;
|
|
42437
|
+
const legacyOutputMessages = Array.isArray(result.output) ? result.output.filter(isOutputMessage) : void 0;
|
|
42438
|
+
const output = typeof result.output === "string" ? result.output : extractLastAssistantContent(legacyOutputMessages);
|
|
42439
|
+
const legacySummary = result.trace && typeof result.trace === "object" && !Array.isArray(result.trace) ? result.trace : void 0;
|
|
42440
|
+
const trace = isTraceRecord(result.trace) ? result.trace : buildTraceFromMessages({
|
|
42441
|
+
input: Array.isArray(result.input) ? result.input : [],
|
|
42442
|
+
output: legacyOutputMessages,
|
|
42443
|
+
summary: legacySummary,
|
|
42444
|
+
finalOutput: output,
|
|
42445
|
+
tokenUsage: result.tokenUsage,
|
|
42446
|
+
costUsd: typeof result.costUsd === "number" ? result.costUsd : void 0,
|
|
42447
|
+
durationMs: typeof result.durationMs === "number" ? result.durationMs : void 0,
|
|
42448
|
+
target: typeof result.target === "string" ? result.target : void 0,
|
|
42449
|
+
testId: typeof result.testId === "string" ? result.testId : void 0
|
|
42450
|
+
});
|
|
42451
|
+
return {
|
|
42452
|
+
...result,
|
|
42453
|
+
timestamp: typeof result.timestamp === "string" ? result.timestamp : (/* @__PURE__ */ new Date(0)).toISOString(),
|
|
42454
|
+
testId: typeof result.testId === "string" ? result.testId : "unknown",
|
|
42455
|
+
score: typeof result.score === "number" ? result.score : 0,
|
|
42456
|
+
assertions: Array.isArray(result.assertions) ? result.assertions.filter(isAssertionEntry) : [],
|
|
42457
|
+
target: typeof result.target === "string" ? result.target : "unknown",
|
|
42458
|
+
output,
|
|
42459
|
+
trace,
|
|
42460
|
+
executionStatus: isExecutionStatus(result.executionStatus) ? result.executionStatus : "ok"
|
|
42461
|
+
};
|
|
42462
|
+
}
|
|
42463
|
+
function parseJsonlResults(content) {
|
|
42464
|
+
const results = [];
|
|
42465
|
+
const lines = content.split("\n");
|
|
42466
|
+
for (let i = 0; i < lines.length; i++) {
|
|
42467
|
+
const trimmed = lines[i]?.trim();
|
|
42468
|
+
if (!trimmed) {
|
|
42469
|
+
continue;
|
|
42470
|
+
}
|
|
42471
|
+
let parsed;
|
|
42472
|
+
try {
|
|
42473
|
+
parsed = JSON.parse(trimmed);
|
|
42474
|
+
} catch {
|
|
42475
|
+
continue;
|
|
42476
|
+
}
|
|
42477
|
+
const canonicalRow = normalizeResultRow(parsed, { lineNumber: i + 1 });
|
|
42478
|
+
const camelCased = toCamelCaseDeep2(canonicalRow);
|
|
42479
|
+
const normalized = normalizeParsedResult(camelCased);
|
|
42480
|
+
if (normalized) {
|
|
42481
|
+
results.push(normalized);
|
|
42482
|
+
}
|
|
42483
|
+
}
|
|
42484
|
+
return results;
|
|
42485
|
+
}
|
|
42486
|
+
async function writeArtifacts(jsonlPath, outputDir, options) {
|
|
42487
|
+
const content = await readFile20(jsonlPath, "utf8");
|
|
42488
|
+
const results = parseJsonlResults(content);
|
|
42489
|
+
return writeArtifactsFromResults(results, outputDir, options);
|
|
42490
|
+
}
|
|
42491
|
+
function buildTranscriptMessageLines(results) {
|
|
42492
|
+
const lines = [];
|
|
42493
|
+
for (const result of results) {
|
|
42494
|
+
const transcriptLines = traceToTranscriptJsonLines(result.trace, {
|
|
42495
|
+
testId: result.testId,
|
|
42496
|
+
target: result.target
|
|
42497
|
+
});
|
|
42498
|
+
lines.push(...transcriptLines.map((line) => JSON.stringify(line)));
|
|
42499
|
+
}
|
|
42500
|
+
return lines.length > 0 ? `${lines.join("\n")}
|
|
42501
|
+
` : "";
|
|
42502
|
+
}
|
|
42503
|
+
async function collectAdditionalIndexFields(result, outputDir, testDir, testByTestId, additionalArtifacts) {
|
|
42504
|
+
if (!additionalArtifacts) {
|
|
42505
|
+
return void 0;
|
|
42506
|
+
}
|
|
42507
|
+
return additionalArtifacts({
|
|
42508
|
+
result,
|
|
42509
|
+
outputDir,
|
|
42510
|
+
testDir,
|
|
42511
|
+
sourceTest: testByTestId.get(result.testId ?? "unknown"),
|
|
42512
|
+
sourceTestsById: testByTestId
|
|
42513
|
+
});
|
|
42514
|
+
}
|
|
42515
|
+
async function writePerTestArtifacts(results, outputDir, options) {
|
|
42516
|
+
await mkdir18(outputDir, { recursive: true });
|
|
42517
|
+
const testByTestId = new Map((options?.sourceTests ?? []).map((test) => [test.id, test]));
|
|
42518
|
+
const indexRecords = [];
|
|
42519
|
+
for (const result of results) {
|
|
42520
|
+
const grading = buildGradingArtifact(result);
|
|
42521
|
+
const timing = buildTimingArtifact([result]);
|
|
42522
|
+
const artifactSubdir = buildArtifactSubdir(result);
|
|
42523
|
+
const testDir = path47.join(outputDir, artifactSubdir);
|
|
42524
|
+
await mkdir18(testDir, { recursive: true });
|
|
42525
|
+
await writeFile10(
|
|
42526
|
+
path47.join(testDir, "grading.json"),
|
|
42527
|
+
`${JSON.stringify(grading, null, 2)}
|
|
42528
|
+
`,
|
|
42529
|
+
"utf8"
|
|
42530
|
+
);
|
|
42531
|
+
await writeFile10(
|
|
42532
|
+
path47.join(testDir, "timing.json"),
|
|
42533
|
+
`${JSON.stringify(timing, null, 2)}
|
|
42534
|
+
`,
|
|
42535
|
+
"utf8"
|
|
42536
|
+
);
|
|
42537
|
+
const input = extractInput(result);
|
|
42538
|
+
if (input) {
|
|
42539
|
+
await writeFile10(path47.join(testDir, "input.md"), input, "utf8");
|
|
42540
|
+
}
|
|
42541
|
+
const outputsDir = path47.join(testDir, "outputs");
|
|
42542
|
+
await mkdir18(outputsDir, { recursive: true });
|
|
42543
|
+
if (result.output.length > 0) {
|
|
42544
|
+
await writeFile10(path47.join(outputsDir, "answer.md"), result.output, "utf8");
|
|
42545
|
+
await writeFile10(path47.join(outputsDir, "response.md"), result.output, "utf8");
|
|
42546
|
+
}
|
|
42547
|
+
const envelope = await writeTraceEnvelopeSidecar({
|
|
42548
|
+
result,
|
|
42549
|
+
outputDir,
|
|
42550
|
+
outputsDir,
|
|
42551
|
+
evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile),
|
|
42552
|
+
experiment: options?.experiment
|
|
42553
|
+
});
|
|
42554
|
+
if (hasTranscriptProjection(result, envelope)) {
|
|
42555
|
+
await writeTranscriptJsonl(path47.join(outputsDir, "transcript.jsonl"), result, envelope);
|
|
42556
|
+
}
|
|
42557
|
+
const extraIndexFields = await collectAdditionalIndexFields(
|
|
42558
|
+
result,
|
|
42559
|
+
outputDir,
|
|
42560
|
+
testDir,
|
|
42561
|
+
testByTestId,
|
|
42562
|
+
options?.additionalArtifacts
|
|
42563
|
+
);
|
|
42564
|
+
indexRecords.push({
|
|
42565
|
+
...buildResultIndexArtifact(result, extraIndexFields),
|
|
42566
|
+
experiment: options?.experiment
|
|
42567
|
+
});
|
|
42568
|
+
}
|
|
42569
|
+
await rewriteExistingIndexRecords(outputDir, indexRecords);
|
|
42570
|
+
}
|
|
42571
|
+
async function writeArtifactsFromResults(results, outputDir, options) {
|
|
42572
|
+
const testArtifactDir = outputDir;
|
|
42573
|
+
const timingPath = path47.join(outputDir, "timing.json");
|
|
42574
|
+
const benchmarkPath = path47.join(outputDir, "benchmark.json");
|
|
42575
|
+
const indexPath = path47.join(outputDir, RESULT_INDEX_FILENAME);
|
|
42576
|
+
await mkdir18(outputDir, { recursive: true });
|
|
42577
|
+
const indexRecords = [];
|
|
42578
|
+
const testByTestId = new Map((options?.sourceTests ?? []).map((test) => [test.id, test]));
|
|
42579
|
+
for (const result of results) {
|
|
42580
|
+
const grading = buildGradingArtifact(result);
|
|
42581
|
+
const timing2 = buildTimingArtifact([result]);
|
|
42582
|
+
const artifactSubdir = buildArtifactSubdir(result);
|
|
42583
|
+
const testDir = path47.join(outputDir, artifactSubdir);
|
|
42584
|
+
const gradingPath = path47.join(testDir, "grading.json");
|
|
42585
|
+
const perTestTimingPath = path47.join(testDir, "timing.json");
|
|
42586
|
+
await mkdir18(testDir, { recursive: true });
|
|
42587
|
+
await writeFile10(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
42588
|
+
`, "utf8");
|
|
42589
|
+
await writeFile10(perTestTimingPath, `${JSON.stringify(timing2, null, 2)}
|
|
42590
|
+
`, "utf8");
|
|
42591
|
+
const input = extractInput(result);
|
|
42592
|
+
const inputPath = input ? path47.join(testDir, "input.md") : void 0;
|
|
42593
|
+
if (inputPath && input) {
|
|
42594
|
+
await writeFile10(inputPath, input, "utf8");
|
|
42595
|
+
}
|
|
42596
|
+
const outputsDir = path47.join(testDir, "outputs");
|
|
42597
|
+
await mkdir18(outputsDir, { recursive: true });
|
|
42598
|
+
const answerPath = result.output.length > 0 ? path47.join(outputsDir, "answer.md") : void 0;
|
|
42599
|
+
const responsePath = result.output.length > 0 ? path47.join(outputsDir, "response.md") : void 0;
|
|
42600
|
+
if (answerPath && responsePath) {
|
|
42601
|
+
await writeFile10(answerPath, result.output, "utf8");
|
|
42602
|
+
await writeFile10(responsePath, result.output, "utf8");
|
|
42603
|
+
}
|
|
42604
|
+
const envelope = await writeTraceEnvelopeSidecar({
|
|
42605
|
+
result,
|
|
42606
|
+
outputDir,
|
|
42607
|
+
outputsDir,
|
|
42608
|
+
evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile),
|
|
42609
|
+
experiment: options?.experiment
|
|
42610
|
+
});
|
|
42611
|
+
const transcriptPath = hasTranscriptProjection(result, envelope) ? path47.join(outputsDir, "transcript.jsonl") : void 0;
|
|
42612
|
+
if (transcriptPath) {
|
|
42613
|
+
await writeTranscriptJsonl(transcriptPath, result, envelope);
|
|
42614
|
+
}
|
|
42615
|
+
const extraIndexFields = await collectAdditionalIndexFields(
|
|
42616
|
+
result,
|
|
42617
|
+
outputDir,
|
|
42618
|
+
testDir,
|
|
42619
|
+
testByTestId,
|
|
42620
|
+
options?.additionalArtifacts
|
|
42621
|
+
);
|
|
42622
|
+
indexRecords.push({
|
|
42623
|
+
...buildIndexArtifactEntry(result, {
|
|
42624
|
+
outputDir,
|
|
42625
|
+
artifactDir: testDir,
|
|
42626
|
+
gradingPath,
|
|
42627
|
+
timingPath: perTestTimingPath,
|
|
42628
|
+
outputPath: answerPath,
|
|
42629
|
+
answerPath,
|
|
42630
|
+
transcriptPath,
|
|
42631
|
+
inputPath,
|
|
42632
|
+
responsePath,
|
|
42633
|
+
extraIndexFields
|
|
42634
|
+
}),
|
|
42635
|
+
experiment: options?.experiment
|
|
42636
|
+
});
|
|
42637
|
+
}
|
|
42638
|
+
const timing = buildTimingArtifact(results);
|
|
42639
|
+
await writeFile10(timingPath, `${JSON.stringify(timing, null, 2)}
|
|
42640
|
+
`, "utf8");
|
|
42641
|
+
const plannedTestCount = options?.plannedTestCount ?? await readPlannedTestCount(benchmarkPath);
|
|
42642
|
+
const benchmark = buildBenchmarkArtifact(
|
|
42643
|
+
results,
|
|
42644
|
+
options?.evalFile,
|
|
42645
|
+
options?.experiment,
|
|
42646
|
+
plannedTestCount
|
|
42647
|
+
);
|
|
42648
|
+
await writeFile10(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
42649
|
+
`, "utf8");
|
|
42650
|
+
await writeJsonlFile(indexPath, indexRecords);
|
|
42651
|
+
await writeFile10(
|
|
42652
|
+
path47.join(outputDir, "transcript.jsonl"),
|
|
42653
|
+
buildTranscriptMessageLines(results),
|
|
42654
|
+
"utf8"
|
|
42655
|
+
);
|
|
42656
|
+
return { testArtifactDir, timingPath, benchmarkPath, indexPath };
|
|
42657
|
+
}
|
|
41204
42658
|
async function evaluate(config2) {
|
|
41205
42659
|
const startTime = Date.now();
|
|
41206
42660
|
if (config2.tests && config2.specFile) {
|
|
@@ -41246,7 +42700,7 @@ async function evaluate(config2) {
|
|
|
41246
42700
|
cliNoCache: false,
|
|
41247
42701
|
yamlCache: config2.cache === void 0 ? materialized.cache : void 0
|
|
41248
42702
|
});
|
|
41249
|
-
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ?
|
|
42703
|
+
const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path48.resolve(materialized.cachePath) : void 0) : void 0;
|
|
41250
42704
|
const results = await runEvaluation({
|
|
41251
42705
|
testFilePath,
|
|
41252
42706
|
repoRoot,
|
|
@@ -41269,15 +42723,27 @@ async function evaluate(config2) {
|
|
|
41269
42723
|
});
|
|
41270
42724
|
const allResults = collectedResults.length > 0 ? collectedResults : [...results];
|
|
41271
42725
|
const durationMs = Date.now() - startTime;
|
|
42726
|
+
const outputDir = config2.outputDir ? path48.resolve(config2.outputDir) : void 0;
|
|
42727
|
+
const artifacts = outputDir ? await writeArtifactsFromResults(allResults, outputDir, {
|
|
42728
|
+
evalFile: config2.specFile ? testFilePath : "",
|
|
42729
|
+
experiment: config2.experiment,
|
|
42730
|
+
sourceTests: materialized.tests
|
|
42731
|
+
}).then(({ benchmarkPath, indexPath, timingPath }) => ({
|
|
42732
|
+
runDir: outputDir,
|
|
42733
|
+
benchmarkPath,
|
|
42734
|
+
indexPath,
|
|
42735
|
+
timingPath
|
|
42736
|
+
})) : void 0;
|
|
41272
42737
|
return {
|
|
41273
42738
|
results: allResults,
|
|
41274
|
-
summary: computeSummary(allResults, durationMs, config2.threshold)
|
|
42739
|
+
summary: computeSummary(allResults, durationMs, config2.threshold),
|
|
42740
|
+
artifacts
|
|
41275
42741
|
};
|
|
41276
42742
|
}
|
|
41277
42743
|
async function materializeEvalConfig(config2, options) {
|
|
41278
42744
|
const baseDir = options?.baseDir ?? process.cwd();
|
|
41279
42745
|
const repoRoot = options?.repoRoot ?? await findGitRoot(baseDir) ?? baseDir;
|
|
41280
|
-
const testFilePath = config2.specFile ?
|
|
42746
|
+
const testFilePath = config2.specFile ? path48.resolve(baseDir, config2.specFile) : path48.join(baseDir, "__programmatic__.yaml");
|
|
41281
42747
|
const effectiveFilter = options?.filter ?? config2.filter;
|
|
41282
42748
|
if (config2.specFile) {
|
|
41283
42749
|
const suite = await loadTestSuite(testFilePath, repoRoot, {
|
|
@@ -41354,7 +42820,7 @@ function convertAssertions(entries) {
|
|
|
41354
42820
|
}
|
|
41355
42821
|
function buildInlineEvalTests(config2, options) {
|
|
41356
42822
|
const suiteWorkspace = config2.beforeAll ? { hooks: { before_all: toBeforeAllHook(config2.beforeAll) } } : void 0;
|
|
41357
|
-
const derivedSuiteName =
|
|
42823
|
+
const derivedSuiteName = path48.basename(options.testFilePath).replace(/\.eval\.[cm]?ts$/i, "").replace(/\.[cm]?ts$/i, "");
|
|
41358
42824
|
const suiteName = config2.metadata?.name ?? (derivedSuiteName || "eval");
|
|
41359
42825
|
return (config2.tests ?? []).filter((test) => !options.filter || matchesFilter4(test.id, options.filter)).map((test) => {
|
|
41360
42826
|
const isConversation = test.mode === "conversation" || test.turns && test.turns.length > 0;
|
|
@@ -41450,10 +42916,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
|
41450
42916
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
41451
42917
|
async function discoverDefaultTarget(repoRoot) {
|
|
41452
42918
|
const cwd = process.cwd();
|
|
41453
|
-
const chain = buildDirectoryChain(
|
|
42919
|
+
const chain = buildDirectoryChain(path48.join(cwd, "_placeholder"), repoRoot);
|
|
41454
42920
|
for (const dir of chain) {
|
|
41455
42921
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
41456
|
-
const targetsPath =
|
|
42922
|
+
const targetsPath = path48.join(dir, candidate);
|
|
41457
42923
|
if (!existsSync7(targetsPath)) continue;
|
|
41458
42924
|
try {
|
|
41459
42925
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
@@ -41470,7 +42936,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
41470
42936
|
const chain = buildDirectoryChain(startPath, repoRoot);
|
|
41471
42937
|
const envFiles = [];
|
|
41472
42938
|
for (const dir of chain) {
|
|
41473
|
-
const envPath =
|
|
42939
|
+
const envPath = path48.join(dir, ".env");
|
|
41474
42940
|
if (existsSync7(envPath)) envFiles.push(envPath);
|
|
41475
42941
|
}
|
|
41476
42942
|
for (let i = 0; i < envFiles.length; i++) {
|
|
@@ -41496,7 +42962,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
41496
42962
|
}
|
|
41497
42963
|
var EXPORT_NAMES = ["default", "config", "evalConfig"];
|
|
41498
42964
|
async function loadTsEvalFile(filePath) {
|
|
41499
|
-
const absolutePath =
|
|
42965
|
+
const absolutePath = path49.resolve(filePath);
|
|
41500
42966
|
const moduleUrl = pathToFileURL2(absolutePath).href;
|
|
41501
42967
|
const module = await import(moduleUrl);
|
|
41502
42968
|
let config2;
|
|
@@ -41518,7 +42984,7 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
|
|
|
41518
42984
|
const { config: config2, filePath: absolutePath } = await loadTsEvalFile(filePath);
|
|
41519
42985
|
const materialized = await materializeEvalConfig(config2, {
|
|
41520
42986
|
repoRoot,
|
|
41521
|
-
baseDir:
|
|
42987
|
+
baseDir: path49.dirname(absolutePath),
|
|
41522
42988
|
filter: options?.filter,
|
|
41523
42989
|
category: options?.category
|
|
41524
42990
|
});
|
|
@@ -41718,7 +43184,7 @@ export {
|
|
|
41718
43184
|
formatReplayLookupKey,
|
|
41719
43185
|
replayFixtureRecordToProviderResponse,
|
|
41720
43186
|
buildReplayFixtureRecord,
|
|
41721
|
-
|
|
43187
|
+
EXECUTION_TRACE_SCHEMA_VERSION,
|
|
41722
43188
|
TraceEnvelopeEvalWireSchema,
|
|
41723
43189
|
TraceEnvelopeReplayWireSchema,
|
|
41724
43190
|
TraceEnvelopeSpanStatusWireSchema,
|
|
@@ -41735,9 +43201,12 @@ export {
|
|
|
41735
43201
|
toTraceEnvelopeWire,
|
|
41736
43202
|
fromTraceEnvelopeWire,
|
|
41737
43203
|
traceEnvelopeToMessages,
|
|
43204
|
+
traceEnvelopeToTranscriptMessages,
|
|
43205
|
+
traceEnvelopeToToolTrajectoryView,
|
|
41738
43206
|
traceEnvelopeToTraceSummary,
|
|
41739
43207
|
traceEnvelopeToTraceArtifact,
|
|
41740
43208
|
getTraceEnvelopeSummary,
|
|
43209
|
+
traceEnvelopeToOtlpJson,
|
|
41741
43210
|
readTraceEnvelopeReplayRecords,
|
|
41742
43211
|
findTraceEnvelopeReplayRecord,
|
|
41743
43212
|
traceEnvelopeReplayRecordToProviderResponse,
|
|
@@ -41804,8 +43273,31 @@ export {
|
|
|
41804
43273
|
loadEvalCaseById,
|
|
41805
43274
|
runEvaluation,
|
|
41806
43275
|
runEvalCase,
|
|
43276
|
+
toTranscriptJsonLines,
|
|
43277
|
+
traceToTranscriptJsonLines,
|
|
43278
|
+
traceFromTranscriptJsonLines,
|
|
43279
|
+
groupTranscriptJsonLines,
|
|
43280
|
+
readTranscriptJsonl,
|
|
43281
|
+
readTranscriptFile,
|
|
43282
|
+
ResultRowSchemaError,
|
|
43283
|
+
normalizeResultRow,
|
|
43284
|
+
RESULT_INDEX_FILENAME,
|
|
43285
|
+
buildTestTargetKey,
|
|
43286
|
+
deduplicateByTestIdTarget,
|
|
43287
|
+
aggregateRunDir,
|
|
43288
|
+
buildGradingArtifact,
|
|
43289
|
+
buildTimingArtifact,
|
|
43290
|
+
buildBenchmarkArtifact,
|
|
43291
|
+
writeInitialBenchmarkArtifact,
|
|
43292
|
+
buildAggregateGradingArtifact,
|
|
43293
|
+
buildIndexArtifactEntry,
|
|
43294
|
+
buildResultIndexArtifact,
|
|
43295
|
+
parseJsonlResults,
|
|
43296
|
+
writeArtifacts,
|
|
43297
|
+
writePerTestArtifacts,
|
|
43298
|
+
writeArtifactsFromResults,
|
|
41807
43299
|
evaluate,
|
|
41808
43300
|
loadTsEvalFile,
|
|
41809
43301
|
loadTsEvalSuite
|
|
41810
43302
|
};
|
|
41811
|
-
//# sourceMappingURL=chunk-
|
|
43303
|
+
//# sourceMappingURL=chunk-BLXYBUU4.js.map
|