agentv 4.14.0 → 4.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XDIQBOPH.js → chunk-4GFW2O4T.js} +125 -16
- package/dist/chunk-4GFW2O4T.js.map +1 -0
- package/dist/{chunk-IX3FYDXN.js → chunk-HQY6LN6F.js} +4 -4
- package/dist/{chunk-HP3ZUKYB.js → chunk-VSWJTL3T.js} +774 -192
- package/dist/chunk-VSWJTL3T.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-Y3NRIBUT.js → dist-G6XAYD4R.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-TNCUWC46.js → interactive-53RJ7LOT.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-HP3ZUKYB.js.map +0 -1
- package/dist/chunk-XDIQBOPH.js.map +0 -1
- /package/dist/{chunk-IX3FYDXN.js.map → chunk-HQY6LN6F.js.map} +0 -0
- /package/dist/{dist-Y3NRIBUT.js.map → dist-G6XAYD4R.js.map} +0 -0
- /package/dist/{interactive-TNCUWC46.js.map → interactive-53RJ7LOT.js.map} +0 -0
|
@@ -305,7 +305,7 @@ var require_dist = __commonJS({
|
|
|
305
305
|
}
|
|
306
306
|
});
|
|
307
307
|
|
|
308
|
-
// ../../packages/core/dist/chunk-
|
|
308
|
+
// ../../packages/core/dist/chunk-AOOU6PLC.js
|
|
309
309
|
import { constants } from "node:fs";
|
|
310
310
|
import { access, readFile } from "node:fs/promises";
|
|
311
311
|
import path from "node:path";
|
|
@@ -423,7 +423,7 @@ __export(external_exports2, {
|
|
|
423
423
|
void: () => voidType
|
|
424
424
|
});
|
|
425
425
|
|
|
426
|
-
// ../../packages/core/dist/chunk-
|
|
426
|
+
// ../../packages/core/dist/chunk-AOOU6PLC.js
|
|
427
427
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
428
428
|
import path3 from "node:path";
|
|
429
429
|
import fg from "fast-glob";
|
|
@@ -1293,6 +1293,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
1293
1293
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1294
1294
|
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
1295
1295
|
const systemPromptSource = target.system_prompt;
|
|
1296
|
+
const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT);
|
|
1297
|
+
if (streamLogResult.deprecationWarning) {
|
|
1298
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
1299
|
+
`);
|
|
1300
|
+
}
|
|
1296
1301
|
const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
|
|
1297
1302
|
allowLiteral: true,
|
|
1298
1303
|
optionalEnv: true
|
|
@@ -1339,6 +1344,7 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
1339
1344
|
timeoutMs,
|
|
1340
1345
|
logDir,
|
|
1341
1346
|
logFormat,
|
|
1347
|
+
streamLog: streamLogResult.streamLog,
|
|
1342
1348
|
systemPrompt
|
|
1343
1349
|
};
|
|
1344
1350
|
}
|
|
@@ -1355,6 +1361,38 @@ function normalizeCodexLogFormat(value) {
|
|
|
1355
1361
|
}
|
|
1356
1362
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
1357
1363
|
}
|
|
1364
|
+
/**
 * Resolve the stream-log mode for a target.
 *
 * `target.stream_log` wins when it is set (not undefined/null). Otherwise the
 * deprecated settings are consulted in this order: `target.log_format`,
 * `target.log_output_format`, then `envFallback`. When a deprecated setting
 * is used, the result carries a `deprecationWarning` that names the setting
 * the value actually came from.
 *
 * @param {object} target - Target definition; reads `name`, `stream_log`,
 *   `log_format` and `log_output_format`.
 * @param {*} [envFallback] - Legacy log-format value supplied by the caller
 *   (callers in this file pass an environment variable value or nothing).
 * @returns {{streamLog: (false|"raw"|"summary"|undefined),
 *            logFormat: ("json"|"summary"|undefined),
 *            deprecationWarning?: string}}
 * @throws {Error} When `stream_log` is set to an unsupported value, or the
 *   legacy value is not the string 'summary' or 'json' (case-insensitive,
 *   surrounding whitespace ignored).
 */
function resolveStreamLog(target, envFallback) {
  const explicit = target.stream_log;
  if (explicit !== void 0 && explicit !== null) {
    if (explicit === false || explicit === "false") {
      return { streamLog: false, logFormat: void 0 };
    }
    if (explicit === "raw") {
      return { streamLog: "raw", logFormat: "json" };
    }
    if (explicit === "summary") {
      return { streamLog: "summary", logFormat: "summary" };
    }
    throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`);
  }

  // Legacy sources in precedence order. `subject` opens the warning sentence,
  // `key` is used in validation errors and in the migration hint.
  const legacySources = [
    { key: "log_format", subject: "'log_format'", value: target.log_format },
    { key: "log_output_format", subject: "'log_output_format'", value: target.log_output_format },
    {
      key: "log format environment variable",
      subject: "the log format environment variable",
      value: envFallback
    }
  ];
  const legacy = legacySources.find(({ value }) => value !== void 0 && value !== null);
  if (!legacy) {
    return { streamLog: void 0, logFormat: void 0 };
  }

  const invalidMessage = `${target.name}: ${legacy.key} must be 'summary' or 'json'`;
  if (typeof legacy.value !== "string") {
    throw new Error(invalidMessage);
  }
  const normalized = legacy.value.trim().toLowerCase();
  if (normalized !== "json" && normalized !== "summary") {
    throw new Error(invalidMessage);
  }

  // The legacy 'json' format corresponds to the new 'raw' stream-log mode.
  const streamLogEquivalent = normalized === "json" ? "raw" : "summary";
  return {
    streamLog: streamLogEquivalent,
    logFormat: normalized,
    deprecationWarning: `${target.name}: ${legacy.subject} is deprecated and will be removed in v4.16. Use 'stream_log: ${streamLogEquivalent}' instead (${legacy.key}: '${normalized}' \u2192 stream_log: '${streamLogEquivalent}').`
  };
}
|
|
1358
1396
|
function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
1359
1397
|
const cliUrlSource = target.cli_url;
|
|
1360
1398
|
const cliPathSource = target.cli_path;
|
|
@@ -1366,6 +1404,11 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
|
1366
1404
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1367
1405
|
const logFormatSource = target.log_format;
|
|
1368
1406
|
const systemPromptSource = target.system_prompt;
|
|
1407
|
+
const streamLogResult = resolveStreamLog(target);
|
|
1408
|
+
if (streamLogResult.deprecationWarning) {
|
|
1409
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
1410
|
+
`);
|
|
1411
|
+
}
|
|
1369
1412
|
const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
|
|
1370
1413
|
allowLiteral: true,
|
|
1371
1414
|
optionalEnv: true
|
|
@@ -1476,6 +1519,7 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
|
1476
1519
|
timeoutMs,
|
|
1477
1520
|
logDir,
|
|
1478
1521
|
logFormat,
|
|
1522
|
+
streamLog: streamLogResult.streamLog,
|
|
1479
1523
|
systemPrompt,
|
|
1480
1524
|
byokType,
|
|
1481
1525
|
byokBaseUrl,
|
|
@@ -1495,6 +1539,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
1495
1539
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1496
1540
|
const logFormatSource = target.log_format;
|
|
1497
1541
|
const systemPromptSource = target.system_prompt;
|
|
1542
|
+
const streamLogResult = resolveStreamLog(target);
|
|
1543
|
+
if (streamLogResult.deprecationWarning) {
|
|
1544
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
1545
|
+
`);
|
|
1546
|
+
}
|
|
1498
1547
|
const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
|
|
1499
1548
|
allowLiteral: true,
|
|
1500
1549
|
optionalEnv: true
|
|
@@ -1546,6 +1595,7 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
1546
1595
|
timeoutMs,
|
|
1547
1596
|
logDir,
|
|
1548
1597
|
logFormat,
|
|
1598
|
+
streamLog: streamLogResult.streamLog,
|
|
1549
1599
|
systemPrompt
|
|
1550
1600
|
};
|
|
1551
1601
|
}
|
|
@@ -1568,6 +1618,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1568
1618
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1569
1619
|
const logFormatSource = target.log_format;
|
|
1570
1620
|
const systemPromptSource = target.system_prompt;
|
|
1621
|
+
const streamLogResult = resolveStreamLog(target);
|
|
1622
|
+
if (streamLogResult.deprecationWarning) {
|
|
1623
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
1624
|
+
`);
|
|
1625
|
+
}
|
|
1571
1626
|
const subprovider = resolveOptionalString(
|
|
1572
1627
|
subproviderSource,
|
|
1573
1628
|
env,
|
|
@@ -1638,6 +1693,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
1638
1693
|
timeoutMs,
|
|
1639
1694
|
logDir,
|
|
1640
1695
|
logFormat,
|
|
1696
|
+
streamLog: streamLogResult.streamLog,
|
|
1641
1697
|
systemPrompt
|
|
1642
1698
|
};
|
|
1643
1699
|
}
|
|
@@ -1654,6 +1710,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
1654
1710
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1655
1711
|
const logFormatSource = target.log_format;
|
|
1656
1712
|
const systemPromptSource = target.system_prompt;
|
|
1713
|
+
const streamLogResult = resolveStreamLog(target);
|
|
1714
|
+
if (streamLogResult.deprecationWarning) {
|
|
1715
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
1716
|
+
`);
|
|
1717
|
+
}
|
|
1657
1718
|
const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
|
|
1658
1719
|
allowLiteral: true,
|
|
1659
1720
|
optionalEnv: true
|
|
@@ -1724,6 +1785,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
1724
1785
|
timeoutMs,
|
|
1725
1786
|
logDir,
|
|
1726
1787
|
logFormat,
|
|
1788
|
+
streamLog: streamLogResult.streamLog,
|
|
1727
1789
|
systemPrompt
|
|
1728
1790
|
};
|
|
1729
1791
|
}
|
|
@@ -1735,6 +1797,11 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
1735
1797
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
1736
1798
|
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
|
|
1737
1799
|
const systemPromptSource = target.system_prompt;
|
|
1800
|
+
const streamLogResult = resolveStreamLog(target);
|
|
1801
|
+
if (streamLogResult.deprecationWarning) {
|
|
1802
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
1803
|
+
`);
|
|
1804
|
+
}
|
|
1738
1805
|
const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
|
|
1739
1806
|
allowLiteral: true,
|
|
1740
1807
|
optionalEnv: true
|
|
@@ -1778,7 +1845,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
1778
1845
|
maxTurns,
|
|
1779
1846
|
maxBudgetUsd,
|
|
1780
1847
|
logDir,
|
|
1781
|
-
logFormat
|
|
1848
|
+
logFormat,
|
|
1849
|
+
streamLog: streamLogResult.streamLog
|
|
1782
1850
|
};
|
|
1783
1851
|
}
|
|
1784
1852
|
function normalizeClaudeLogFormat(value) {
|
|
@@ -14542,10 +14610,12 @@ import { randomBytes } from "node:crypto";
|
|
|
14542
14610
|
import { createServer } from "node:http";
|
|
14543
14611
|
import fs2 from "node:fs/promises";
|
|
14544
14612
|
import path37 from "node:path";
|
|
14613
|
+
import { execFile as execFile3 } from "node:child_process";
|
|
14545
14614
|
import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
|
|
14546
14615
|
import { existsSync as existsSync5 } from "node:fs";
|
|
14547
14616
|
import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
14548
14617
|
import path45 from "node:path";
|
|
14618
|
+
import { promisify as promisify7 } from "node:util";
|
|
14549
14619
|
import micromatch3 from "micromatch";
|
|
14550
14620
|
import path38 from "node:path";
|
|
14551
14621
|
import path39 from "node:path";
|
|
@@ -14573,12 +14643,12 @@ import path47 from "node:path";
|
|
|
14573
14643
|
import { parse as parse6 } from "yaml";
|
|
14574
14644
|
import { mkdir as mkdir15, readFile as readFile17, writeFile as writeFile8 } from "node:fs/promises";
|
|
14575
14645
|
import path48 from "node:path";
|
|
14576
|
-
import { execFile as
|
|
14646
|
+
import { execFile as execFile4 } from "node:child_process";
|
|
14577
14647
|
import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
|
|
14578
14648
|
import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat10 } from "node:fs/promises";
|
|
14579
14649
|
import os3 from "node:os";
|
|
14580
14650
|
import path49 from "node:path";
|
|
14581
|
-
import { promisify as
|
|
14651
|
+
import { promisify as promisify8 } from "node:util";
|
|
14582
14652
|
import { existsSync as existsSync8, mkdirSync as mkdirSync3, readFileSync as readFileSync4, readdirSync as readdirSync3, statSync as statSync2, writeFileSync as writeFileSync2 } from "node:fs";
|
|
14583
14653
|
import path50 from "node:path";
|
|
14584
14654
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
@@ -18150,10 +18220,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
18150
18220
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
18151
18221
|
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
18152
18222
|
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
18153
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
18223
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
|
|
18154
18224
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
18155
18225
|
logError3(
|
|
18156
|
-
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
18226
|
+
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
|
|
18157
18227
|
);
|
|
18158
18228
|
continue;
|
|
18159
18229
|
}
|
|
@@ -18230,6 +18300,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
18230
18300
|
) : void 0;
|
|
18231
18301
|
const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
|
|
18232
18302
|
const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
|
|
18303
|
+
const modeRaw = asString5(testCaseConfig.mode);
|
|
18304
|
+
const mode = modeRaw === "conversation" ? "conversation" : void 0;
|
|
18305
|
+
const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
|
|
18306
|
+
const aggregationRaw = asString5(testCaseConfig.aggregation);
|
|
18307
|
+
const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
|
|
18308
|
+
const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
|
|
18309
|
+
const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
|
|
18310
|
+
const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
|
|
18233
18311
|
const testCase = {
|
|
18234
18312
|
id,
|
|
18235
18313
|
suite: suiteName,
|
|
@@ -18248,6 +18326,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
18248
18326
|
metadata,
|
|
18249
18327
|
targets: caseTargets,
|
|
18250
18328
|
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
|
|
18329
|
+
...mode ? { mode } : {},
|
|
18330
|
+
...turns && turns.length > 0 ? { turns } : {},
|
|
18331
|
+
...aggregation ? { aggregation } : {},
|
|
18332
|
+
...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
|
|
18333
|
+
...windowSize !== void 0 ? { window_size: windowSize } : {},
|
|
18251
18334
|
...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
|
|
18252
18335
|
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
|
|
18253
18336
|
};
|
|
@@ -18265,6 +18348,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
|
|
|
18265
18348
|
return match;
|
|
18266
18349
|
}
|
|
18267
18350
|
var loadEvalCaseById = loadTestById;
|
|
18351
|
+
/**
 * Convert the raw `turns` array of a test case into turn records.
 *
 * Each output record always carries `input` (possibly undefined),
 * carries `expected_output` only when the raw turn defines it, and carries
 * `assertions` (a shallow copy of the raw array) only when the raw turn has a
 * non-empty assertions array.
 *
 * @param {Array<object>} rawTurns - Raw turn entries.
 * @returns {Array<{input: *, expected_output?: *, assertions?: Array}>}
 * @throws {TypeError} When a turn entry is null or undefined.
 */
function parseTurns(rawTurns) {
  return rawTurns.map((rawTurn, index) => {
    // Fail with a message that points at the offending entry instead of the
    // engine's generic "cannot read properties of null".
    if (rawTurn === null || rawTurn === void 0) {
      throw new TypeError(`turns[${index}] must be an object with an 'input' field`);
    }
    const { input, expected_output: expectedOutput, assertions: rawAssertions } = rawTurn;
    // String and object assertions are passed through unchanged; only the
    // containing array is copied.
    const assertions = Array.isArray(rawAssertions) ? rawAssertions.slice() : void 0;
    return {
      input,
      ...(expectedOutput !== void 0 ? { expected_output: expectedOutput } : {}),
      ...(assertions && assertions.length > 0 ? { assertions } : {})
    };
  });
}
|
|
18268
18370
|
function parseCommandArray(source) {
|
|
18269
18371
|
if (typeof source === "string") {
|
|
18270
18372
|
const parts = source.trim().split(/\s+/);
|
|
@@ -19197,6 +19299,152 @@ function subscribeToClaudeLogEntries(listener) {
|
|
|
19197
19299
|
store.delete(listener);
|
|
19198
19300
|
};
|
|
19199
19301
|
}
|
|
19302
|
+
// Exact-match lookup from `${providerKind}::${nativeToolName}` to the
// canonical tool name (Skill / Read / Write / Edit / Bash). Provider kinds
// that share the same native tool names are expanded from a single alias
// list so the copies cannot drift apart.
var TOOL_NAME_MAP = /* @__PURE__ */ (() => {
  // Build [`${kind}::${native}`, canonical] pairs, provider-major so the
  // insertion order matches a hand-written table.
  const expand = (providerKinds, aliases) =>
    providerKinds.flatMap(
      (kind) => aliases.map(([native, canonical]) => [`${kind}::${native}`, canonical])
    );
  return new Map([
    // --- Claude (already canonical) ---
    ...expand(
      ["claude", "claude-cli", "claude-sdk"],
      [
        ["Skill", "Skill"],
        ["Read", "Read"],
        ["Write", "Write"],
        ["Edit", "Edit"],
        ["Bash", "Bash"]
      ]
    ),
    // --- Copilot ---
    ...expand(
      ["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"],
      [
        ["Skill", "Skill"],
        ["skill", "Skill"],
        ["Read File", "Read"],
        ["readFile", "Read"],
        ["Read", "Read"],
        ["readTextFile", "Read"],
        ["writeTextFile", "Write"],
        ["Write File", "Write"],
        ["editFile", "Edit"],
        ["Edit File", "Edit"],
        ["runTerminalCommand", "Bash"]
      ]
    ),
    // --- Codex ---
    ...expand(
      ["codex"],
      [
        ["command_execution", "Bash"],
        ["file_change", "Edit"]
      ]
    ),
    // --- Pi ---
    ...expand(
      ["pi-coding-agent", "pi-cli"],
      [
        ["read", "Read"],
        ["bash", "Bash"]
      ]
    )
  ]);
})();
// Prefix rules for Copilot-family tool names. `extractSkillFromName` marks
// rules whose name suffix (the text after the prefix) is the skill name.
var COPILOT_PREFIXES = [
  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
  { prefix: "Viewing ", canonical: "Read" }
];
// Prefix rules for Codex tool names (`mcp:`-prefixed names map to Skill).
var CODEX_PREFIXES = [
  { prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }
];
// Provider kind -> ordered prefix rules, consulted when no exact name matches.
var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
  ["copilot-cli", COPILOT_PREFIXES],
  ["copilot-sdk", COPILOT_PREFIXES],
  ["copilot-log", COPILOT_PREFIXES],
  ["vscode", COPILOT_PREFIXES],
  ["vscode-insiders", COPILOT_PREFIXES],
  ["codex", CODEX_PREFIXES]
]);
|
|
19399
|
+
/**
 * Input normalizer for the canonical `Skill` tool.
 *
 * Skill inputs need no key renaming, so the input is returned unchanged
 * (same reference) whether or not it already carries a `skill` field.
 *
 * @param {*} input - Tool-call input.
 * @returns {*} The same `input` value.
 */
var normalizeSkillInput = (input) => input;
|
|
19403
|
+
/**
 * Input normalizer for the canonical `Read` tool.
 *
 * Ensures the input exposes the file location under `file_path`. When
 * `file_path` is already defined the input is returned as-is; otherwise the
 * first defined alias (`path`, then `filePath`) is copied to `file_path` on a
 * shallow copy. With no alias present the original input is returned.
 *
 * @param {object} input - Tool-call input.
 * @returns {object} The original input or a shallow copy with `file_path`.
 */
var normalizeReadInput = (input) => {
  if (input.file_path !== void 0) {
    return input;
  }
  for (const alias of ["path", "filePath"]) {
    if (input[alias] !== void 0) {
      return { ...input, file_path: input[alias] };
    }
  }
  return input;
};
|
|
19409
|
+
// Canonical tool name -> function that normalizes that tool's input object.
// Canonical tools without an entry have their input left untouched.
var INPUT_NORMALIZERS = /* @__PURE__ */ new Map(
  Object.entries({
    Skill: normalizeSkillInput,
    Read: normalizeReadInput
  })
);
|
|
19413
|
+
/**
 * Map a provider-native tool call onto the canonical tool vocabulary.
 *
 * Resolution order:
 *   1. exact lookup of `${providerKind}::${tc.tool}` in TOOL_NAME_MAP;
 *   2. the first prefix rule registered for `providerKind` in TOOL_PREFIX_MAP
 *      whose prefix starts the native name. Rules flagged
 *      `extractSkillFromName` copy the non-empty text after the prefix into
 *      `input.skill`;
 *   3. otherwise the tool call is returned unchanged (same reference).
 * A matched call is returned as a copy with `tool` replaced by the canonical
 * name and then passed through applyInputNormalization.
 *
 * @param {string} providerKind - Provider identifier, e.g. "codex".
 * @param {object} tc - Tool call; reads `tool` and `input`.
 * @returns {object} Normalized tool call, or `tc` itself when nothing matches.
 */
function normalizeToolCall(providerKind, tc) {
  const nativeName = tc.tool;

  const exactMatch = TOOL_NAME_MAP.get(`${providerKind}::${nativeName}`);
  if (exactMatch) {
    return applyInputNormalization(exactMatch, { ...tc, tool: exactMatch });
  }

  const matchedRule = TOOL_PREFIX_MAP.get(providerKind)?.find(
    (candidate) => nativeName.startsWith(candidate.prefix)
  );
  if (!matchedRule) {
    return tc;
  }

  const remainder = nativeName.slice(matchedRule.prefix.length);
  // Only inject `skill` when the rule asks for it and the name has a suffix.
  const input = matchedRule.extractSkillFromName && remainder ? { ...(tc.input ?? {}), skill: remainder } : tc.input;
  return applyInputNormalization(matchedRule.canonical, {
    ...tc,
    tool: matchedRule.canonical,
    input
  });
}
|
|
19441
|
+
/**
 * Run the input normalizer registered for a canonical tool, if any.
 *
 * The tool call is returned untouched (same reference) when no normalizer is
 * registered for `canonical`, when `tc.input` is undefined or null, or when
 * the normalizer hands back the very same input reference. Otherwise a
 * shallow copy of `tc` with the normalized `input` is returned.
 *
 * @param {string} canonical - Canonical tool name used as the lookup key.
 * @param {object} tc - Tool call whose `input` may be normalized.
 * @returns {object} `tc` or a shallow copy with a replaced `input`.
 */
function applyInputNormalization(canonical, tc) {
  const normalize = INPUT_NORMALIZERS.get(canonical);
  if (!normalize) {
    return tc;
  }
  const rawInput = tc.input;
  if (rawInput === void 0 || rawInput === null) {
    return tc;
  }
  const result = normalize(rawInput);
  if (result === rawInput) {
    return tc;
  }
  return { ...tc, input: result };
}
|
|
19200
19448
|
function buildPromptDocument(request, inputFiles) {
|
|
19201
19449
|
const parts = [];
|
|
19202
19450
|
const inputFilesList = collectInputFiles(inputFiles);
|
|
@@ -19660,11 +19908,13 @@ function extractToolCalls(content) {
|
|
|
19660
19908
|
}
|
|
19661
19909
|
const p = part;
|
|
19662
19910
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
19663
|
-
toolCalls.push(
|
|
19664
|
-
|
|
19665
|
-
|
|
19666
|
-
|
|
19667
|
-
|
|
19911
|
+
toolCalls.push(
|
|
19912
|
+
normalizeToolCall("claude-cli", {
|
|
19913
|
+
tool: p.name,
|
|
19914
|
+
input: p.input,
|
|
19915
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
19916
|
+
})
|
|
19917
|
+
);
|
|
19668
19918
|
}
|
|
19669
19919
|
}
|
|
19670
19920
|
return toolCalls;
|
|
@@ -19949,11 +20199,13 @@ function extractToolCalls2(content) {
|
|
|
19949
20199
|
}
|
|
19950
20200
|
const p = part;
|
|
19951
20201
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
19952
|
-
toolCalls.push(
|
|
19953
|
-
|
|
19954
|
-
|
|
19955
|
-
|
|
19956
|
-
|
|
20202
|
+
toolCalls.push(
|
|
20203
|
+
normalizeToolCall("claude-sdk", {
|
|
20204
|
+
tool: p.name,
|
|
20205
|
+
input: p.input,
|
|
20206
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
20207
|
+
})
|
|
20208
|
+
);
|
|
19957
20209
|
}
|
|
19958
20210
|
}
|
|
19959
20211
|
return toolCalls;
|
|
@@ -20850,27 +21102,33 @@ ${basePrompt}` : basePrompt;
|
|
|
20850
21102
|
}
|
|
20851
21103
|
}
|
|
20852
21104
|
if (itemType === "command_execution") {
|
|
20853
|
-
completedToolCalls.push(
|
|
20854
|
-
|
|
20855
|
-
|
|
20856
|
-
|
|
20857
|
-
|
|
20858
|
-
|
|
21105
|
+
completedToolCalls.push(
|
|
21106
|
+
normalizeToolCall("codex", {
|
|
21107
|
+
tool: "command_execution",
|
|
21108
|
+
input: { command: item.command },
|
|
21109
|
+
output: item.aggregated_output,
|
|
21110
|
+
id: item.id
|
|
21111
|
+
})
|
|
21112
|
+
);
|
|
20859
21113
|
}
|
|
20860
21114
|
if (itemType === "file_change") {
|
|
20861
|
-
completedToolCalls.push(
|
|
20862
|
-
|
|
20863
|
-
|
|
20864
|
-
|
|
20865
|
-
|
|
21115
|
+
completedToolCalls.push(
|
|
21116
|
+
normalizeToolCall("codex", {
|
|
21117
|
+
tool: "file_change",
|
|
21118
|
+
input: item.changes,
|
|
21119
|
+
id: item.id
|
|
21120
|
+
})
|
|
21121
|
+
);
|
|
20866
21122
|
}
|
|
20867
21123
|
if (itemType === "mcp_tool_call") {
|
|
20868
|
-
completedToolCalls.push(
|
|
20869
|
-
|
|
20870
|
-
|
|
20871
|
-
|
|
20872
|
-
|
|
20873
|
-
|
|
21124
|
+
completedToolCalls.push(
|
|
21125
|
+
normalizeToolCall("codex", {
|
|
21126
|
+
tool: `mcp:${item.server}/${item.tool}`,
|
|
21127
|
+
input: item.arguments,
|
|
21128
|
+
output: item.result ?? item.error,
|
|
21129
|
+
id: item.id
|
|
21130
|
+
})
|
|
21131
|
+
);
|
|
20874
21132
|
}
|
|
20875
21133
|
}
|
|
20876
21134
|
resolveCwd(cwdOverride) {
|
|
@@ -21380,12 +21638,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
21380
21638
|
return logger;
|
|
21381
21639
|
}
|
|
21382
21640
|
handleEvent(eventType, data) {
|
|
21383
|
-
if (this.format === "json") {
|
|
21384
|
-
const elapsed2 = formatElapsed4(this.startedAt);
|
|
21385
|
-
this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
|
|
21386
|
-
`);
|
|
21387
|
-
return;
|
|
21388
|
-
}
|
|
21389
21641
|
if (this.chunkExtractor) {
|
|
21390
21642
|
const chunkText = this.chunkExtractor(eventType, data);
|
|
21391
21643
|
if (chunkText === null) {
|
|
@@ -21398,6 +21650,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
21398
21650
|
}
|
|
21399
21651
|
this.flushPendingText();
|
|
21400
21652
|
}
|
|
21653
|
+
if (this.format === "json") {
|
|
21654
|
+
const elapsed2 = formatElapsed4(this.startedAt);
|
|
21655
|
+
this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
|
|
21656
|
+
`);
|
|
21657
|
+
return;
|
|
21658
|
+
}
|
|
21401
21659
|
const elapsed = formatElapsed4(this.startedAt);
|
|
21402
21660
|
const summary = this.summarize(eventType, data);
|
|
21403
21661
|
if (summary) {
|
|
@@ -21408,14 +21666,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
21408
21666
|
flushPendingText() {
|
|
21409
21667
|
if (!this.pendingText) return;
|
|
21410
21668
|
const elapsed = formatElapsed4(this.startedAt);
|
|
21411
|
-
this.
|
|
21669
|
+
if (this.format === "json") {
|
|
21670
|
+
this.stream.write(
|
|
21671
|
+
`${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
|
|
21672
|
+
`
|
|
21673
|
+
);
|
|
21674
|
+
} else {
|
|
21675
|
+
this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
|
|
21412
21676
|
`);
|
|
21677
|
+
}
|
|
21413
21678
|
this.pendingText = "";
|
|
21414
21679
|
}
|
|
21415
21680
|
async close() {
|
|
21416
|
-
|
|
21417
|
-
this.flushPendingText();
|
|
21418
|
-
}
|
|
21681
|
+
this.flushPendingText();
|
|
21419
21682
|
await new Promise((resolve2, reject) => {
|
|
21420
21683
|
this.stream.once("error", reject);
|
|
21421
21684
|
this.stream.end(() => resolve2());
|
|
@@ -21488,15 +21751,17 @@ var CopilotCliProvider = class {
|
|
|
21488
21751
|
}
|
|
21489
21752
|
if (update.status === "completed" || update.status === "failed") {
|
|
21490
21753
|
const toolName = update.title ?? update.kind ?? "unknown";
|
|
21491
|
-
completedToolCalls.push(
|
|
21492
|
-
|
|
21493
|
-
|
|
21494
|
-
|
|
21495
|
-
|
|
21496
|
-
|
|
21497
|
-
|
|
21498
|
-
|
|
21499
|
-
|
|
21754
|
+
completedToolCalls.push(
|
|
21755
|
+
normalizeToolCall("copilot-cli", {
|
|
21756
|
+
tool: toolName,
|
|
21757
|
+
input: update.rawInput,
|
|
21758
|
+
output: update.rawOutput,
|
|
21759
|
+
id: callId,
|
|
21760
|
+
startTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
21761
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
21762
|
+
durationMs: 0
|
|
21763
|
+
})
|
|
21764
|
+
);
|
|
21500
21765
|
request.streamCallbacks?.onToolCallEnd?.(
|
|
21501
21766
|
toolName,
|
|
21502
21767
|
update.rawInput,
|
|
@@ -21513,15 +21778,17 @@ var CopilotCliProvider = class {
|
|
|
21513
21778
|
if (inProgress) {
|
|
21514
21779
|
toolCallsInProgress.delete(callId);
|
|
21515
21780
|
const duration = Date.now() - inProgress.startMs;
|
|
21516
|
-
completedToolCalls.push(
|
|
21517
|
-
|
|
21518
|
-
|
|
21519
|
-
|
|
21520
|
-
|
|
21521
|
-
|
|
21522
|
-
|
|
21523
|
-
|
|
21524
|
-
|
|
21781
|
+
completedToolCalls.push(
|
|
21782
|
+
normalizeToolCall("copilot-cli", {
|
|
21783
|
+
tool: inProgress.tool,
|
|
21784
|
+
input: inProgress.input,
|
|
21785
|
+
output: update.rawOutput,
|
|
21786
|
+
id: inProgress.id,
|
|
21787
|
+
startTime: inProgress.startTime,
|
|
21788
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
21789
|
+
durationMs: duration
|
|
21790
|
+
})
|
|
21791
|
+
);
|
|
21525
21792
|
request.streamCallbacks?.onToolCallEnd?.(
|
|
21526
21793
|
inProgress.tool,
|
|
21527
21794
|
inProgress.input,
|
|
@@ -21858,11 +22125,13 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
21858
22125
|
}
|
|
21859
22126
|
case "assistant.message": {
|
|
21860
22127
|
const toolRequests = data.toolRequests;
|
|
21861
|
-
const toolCalls = (toolRequests ?? []).map(
|
|
21862
|
-
|
|
21863
|
-
|
|
21864
|
-
|
|
21865
|
-
|
|
22128
|
+
const toolCalls = (toolRequests ?? []).map(
|
|
22129
|
+
(req) => normalizeToolCall("copilot-log", {
|
|
22130
|
+
tool: String(req.name ?? req.toolName ?? ""),
|
|
22131
|
+
input: req.arguments,
|
|
22132
|
+
id: req.toolCallId ? String(req.toolCallId) : void 0
|
|
22133
|
+
})
|
|
22134
|
+
);
|
|
21866
22135
|
messages.push({
|
|
21867
22136
|
role: "assistant",
|
|
21868
22137
|
content: data.content != null ? String(data.content) : void 0,
|
|
@@ -21902,12 +22171,12 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
21902
22171
|
messages.push({
|
|
21903
22172
|
role: "assistant",
|
|
21904
22173
|
toolCalls: [
|
|
21905
|
-
{
|
|
22174
|
+
normalizeToolCall("copilot-log", {
|
|
21906
22175
|
tool: started.toolName,
|
|
21907
22176
|
input: started.input,
|
|
21908
22177
|
output: data.result,
|
|
21909
22178
|
id: toolCallId
|
|
21910
|
-
}
|
|
22179
|
+
})
|
|
21911
22180
|
]
|
|
21912
22181
|
});
|
|
21913
22182
|
}
|
|
@@ -22235,15 +22504,17 @@ var CopilotSdkProvider = class {
|
|
|
22235
22504
|
if (inProgress) {
|
|
22236
22505
|
toolCallsInProgress.delete(callId);
|
|
22237
22506
|
const endMs = Date.now();
|
|
22238
|
-
completedToolCalls.push(
|
|
22239
|
-
|
|
22240
|
-
|
|
22241
|
-
|
|
22242
|
-
|
|
22243
|
-
|
|
22244
|
-
|
|
22245
|
-
|
|
22246
|
-
|
|
22507
|
+
completedToolCalls.push(
|
|
22508
|
+
normalizeToolCall("copilot-sdk", {
|
|
22509
|
+
tool: inProgress.tool,
|
|
22510
|
+
input: inProgress.input,
|
|
22511
|
+
output: data?.output ?? data?.result,
|
|
22512
|
+
id: inProgress.id,
|
|
22513
|
+
startTime: inProgress.startTime,
|
|
22514
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
22515
|
+
durationMs: endMs - inProgress.startMs
|
|
22516
|
+
})
|
|
22517
|
+
);
|
|
22247
22518
|
}
|
|
22248
22519
|
}
|
|
22249
22520
|
if (eventType === "assistant.message") {
|
|
@@ -23204,12 +23475,14 @@ function extractToolCallsFromEvents(events) {
|
|
|
23204
23475
|
}
|
|
23205
23476
|
const toolCalls = [];
|
|
23206
23477
|
for (const [id, { tool: tool2, input }] of starts) {
|
|
23207
|
-
toolCalls.push(
|
|
23208
|
-
|
|
23209
|
-
|
|
23210
|
-
|
|
23211
|
-
|
|
23212
|
-
|
|
23478
|
+
toolCalls.push(
|
|
23479
|
+
normalizeToolCall("pi-cli", {
|
|
23480
|
+
tool: tool2,
|
|
23481
|
+
input,
|
|
23482
|
+
id: id.startsWith("anon-") ? void 0 : id,
|
|
23483
|
+
output: results.get(id)
|
|
23484
|
+
})
|
|
23485
|
+
);
|
|
23213
23486
|
}
|
|
23214
23487
|
return toolCalls;
|
|
23215
23488
|
}
|
|
@@ -23331,17 +23604,21 @@ function extractToolCalls3(content) {
|
|
|
23331
23604
|
if (!part || typeof part !== "object") continue;
|
|
23332
23605
|
const p = part;
|
|
23333
23606
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
23334
|
-
toolCalls.push(
|
|
23335
|
-
|
|
23336
|
-
|
|
23337
|
-
|
|
23338
|
-
|
|
23607
|
+
toolCalls.push(
|
|
23608
|
+
normalizeToolCall("pi-cli", {
|
|
23609
|
+
tool: p.name,
|
|
23610
|
+
input: p.input,
|
|
23611
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
23612
|
+
})
|
|
23613
|
+
);
|
|
23339
23614
|
} else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
|
|
23340
|
-
toolCalls.push(
|
|
23341
|
-
|
|
23342
|
-
|
|
23343
|
-
|
|
23344
|
-
|
|
23615
|
+
toolCalls.push(
|
|
23616
|
+
normalizeToolCall("pi-cli", {
|
|
23617
|
+
tool: p.name,
|
|
23618
|
+
input: p.arguments ?? p.input,
|
|
23619
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
23620
|
+
})
|
|
23621
|
+
);
|
|
23345
23622
|
} else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
|
|
23346
23623
|
const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
|
|
23347
23624
|
if (existing) {
|
|
@@ -28287,100 +28564,35 @@ var LatencyEvaluator = class {
|
|
|
28287
28564
|
};
|
|
28288
28565
|
}
|
|
28289
28566
|
};
|
|
28290
|
-
var CLAUDE_MATCHER = {
|
|
28291
|
-
skillTools: ["Skill"],
|
|
28292
|
-
skillInputField: "skill",
|
|
28293
|
-
readTools: ["Read"],
|
|
28294
|
-
readInputField: "file_path"
|
|
28295
|
-
};
|
|
28296
|
-
var COPILOT_MATCHER = {
|
|
28297
|
-
skillTools: ["Skill", "skill"],
|
|
28298
|
-
skillInputField: "skill",
|
|
28299
|
-
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
28300
|
-
readInputField: "file_path",
|
|
28301
|
-
skillToolPrefixes: ["Using skill: "],
|
|
28302
|
-
readToolPrefixes: ["Viewing "],
|
|
28303
|
-
readInputFields: ["file_path", "path"]
|
|
28304
|
-
};
|
|
28305
|
-
var PI_CODING_AGENT_MATCHER = {
|
|
28306
|
-
skillTools: [],
|
|
28307
|
-
skillInputField: "skill",
|
|
28308
|
-
readTools: ["read"],
|
|
28309
|
-
readInputField: "path",
|
|
28310
|
-
readInputFields: ["path", "file_path", "filePath"]
|
|
28311
|
-
};
|
|
28312
|
-
var CODEX_MATCHER = {
|
|
28313
|
-
skillTools: [],
|
|
28314
|
-
skillInputField: "skill",
|
|
28315
|
-
readTools: ["command_execution"],
|
|
28316
|
-
readInputField: "command",
|
|
28317
|
-
skillToolPrefixes: ["mcp:"],
|
|
28318
|
-
readToolPrefixes: ["mcp:"],
|
|
28319
|
-
readInputFields: ["command", "path", "file_path", "filePath"]
|
|
28320
|
-
};
|
|
28321
|
-
var PROVIDER_TOOL_SEMANTICS = {
|
|
28322
|
-
claude: CLAUDE_MATCHER,
|
|
28323
|
-
"claude-cli": CLAUDE_MATCHER,
|
|
28324
|
-
"claude-sdk": CLAUDE_MATCHER,
|
|
28325
|
-
codex: CODEX_MATCHER,
|
|
28326
|
-
"pi-coding-agent": PI_CODING_AGENT_MATCHER,
|
|
28327
|
-
"pi-cli": PI_CODING_AGENT_MATCHER,
|
|
28328
|
-
"copilot-cli": COPILOT_MATCHER,
|
|
28329
|
-
"copilot-log": COPILOT_MATCHER,
|
|
28330
|
-
"copilot-sdk": COPILOT_MATCHER,
|
|
28331
|
-
vscode: COPILOT_MATCHER,
|
|
28332
|
-
"vscode-insiders": COPILOT_MATCHER
|
|
28333
|
-
};
|
|
28334
28567
|
var SkillTriggerEvaluator = class {
|
|
28335
28568
|
kind = "skill-trigger";
|
|
28336
28569
|
config;
|
|
28337
28570
|
constructor(config) {
|
|
28338
28571
|
this.config = config;
|
|
28339
28572
|
}
|
|
28340
|
-
resolveMatcher(providerKind) {
|
|
28341
|
-
if (providerKind) {
|
|
28342
|
-
const match = PROVIDER_TOOL_SEMANTICS[providerKind];
|
|
28343
|
-
if (match) return match;
|
|
28344
|
-
}
|
|
28345
|
-
return CLAUDE_MATCHER;
|
|
28346
|
-
}
|
|
28347
28573
|
evaluate(context2) {
|
|
28348
28574
|
const skillName = this.config.skill;
|
|
28349
28575
|
const shouldTrigger = this.config.should_trigger !== false;
|
|
28350
|
-
const providerKind = context2.provider?.kind;
|
|
28351
|
-
const matcher = this.resolveMatcher(providerKind);
|
|
28352
28576
|
const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
|
|
28353
28577
|
let triggered = false;
|
|
28354
28578
|
let evidence = "";
|
|
28355
28579
|
for (const toolCall of allToolCalls) {
|
|
28356
28580
|
const toolName = toolCall.tool ?? "";
|
|
28357
28581
|
const input = toolCall.input ?? {};
|
|
28358
|
-
if (
|
|
28359
|
-
const skillArg = String(input
|
|
28582
|
+
if (toolName === "Skill") {
|
|
28583
|
+
const skillArg = String(input.skill ?? "");
|
|
28360
28584
|
if (skillArg.includes(skillName)) {
|
|
28361
28585
|
triggered = true;
|
|
28362
|
-
evidence = `Skill tool invoked with
|
|
28586
|
+
evidence = `Skill tool invoked with skill="${skillArg}"`;
|
|
28363
28587
|
break;
|
|
28364
28588
|
}
|
|
28365
|
-
} else if (
|
|
28366
|
-
|
|
28367
|
-
|
|
28368
|
-
triggered = true;
|
|
28369
|
-
evidence = `Skill tool invoked via tool name "${toolName}"`;
|
|
28370
|
-
break;
|
|
28371
|
-
} else if (matcher.readTools.includes(toolName)) {
|
|
28372
|
-
const filePath = this.readPathFromInput(input, matcher);
|
|
28373
|
-
if (filePath.includes(skillName)) {
|
|
28589
|
+
} else if (toolName === "Read") {
|
|
28590
|
+
const filePath = String(input.file_path ?? "");
|
|
28591
|
+
if (filePath.includes(`skills/${skillName}/`)) {
|
|
28374
28592
|
triggered = true;
|
|
28375
28593
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
28376
28594
|
break;
|
|
28377
28595
|
}
|
|
28378
|
-
} else if (matcher.readToolPrefixes?.some(
|
|
28379
|
-
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
|
|
28380
|
-
)) {
|
|
28381
|
-
triggered = true;
|
|
28382
|
-
evidence = `Read tool loaded skill file via tool name "${toolName}"`;
|
|
28383
|
-
break;
|
|
28384
28596
|
}
|
|
28385
28597
|
if (!triggered && toolCall.output != null) {
|
|
28386
28598
|
const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
|
|
@@ -28417,16 +28629,6 @@ var SkillTriggerEvaluator = class {
|
|
|
28417
28629
|
expectedAspectCount: 1
|
|
28418
28630
|
};
|
|
28419
28631
|
}
|
|
28420
|
-
readPathFromInput(input, matcher) {
|
|
28421
|
-
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
28422
|
-
for (const field of fields) {
|
|
28423
|
-
const value = input[field];
|
|
28424
|
-
if (value !== void 0 && value !== null) {
|
|
28425
|
-
return String(value);
|
|
28426
|
-
}
|
|
28427
|
-
}
|
|
28428
|
-
return "";
|
|
28429
|
-
}
|
|
28430
28632
|
};
|
|
28431
28633
|
function assembleLlmGraderPrompt(input) {
|
|
28432
28634
|
const {
|
|
@@ -30664,6 +30866,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
30664
30866
|
}
|
|
30665
30867
|
return result.stdout;
|
|
30666
30868
|
}
|
|
30869
|
+
// Promise-returning form of execFile3, used by resetWorkspaceRoot to run git.
// NOTE(review): promisify7/execFile3 are presumably the bundler's aliases for
// util.promisify and child_process.execFile — confirm against the import block.
var execFileAsync3 = promisify7(execFile3);
|
|
30870
|
+
// Timeout passed to each workspace git invocation: 3e5 ms = 300,000 ms (5 minutes).
var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
|
|
30667
30871
|
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
30668
30872
|
return score >= threshold ? "ok" : "quality_failure";
|
|
30669
30873
|
}
|
|
@@ -30701,6 +30905,35 @@ function hasHookCommand(hook) {
|
|
|
30701
30905
|
function hooksEnabled(workspace) {
|
|
30702
30906
|
return workspace?.hooks?.enabled !== false;
|
|
30703
30907
|
}
|
|
30908
|
+
/**
 * Build the environment handed to git subprocesses that operate on a workspace.
 *
 * The result is a fresh object (process.env itself is never modified):
 *  - every inherited variable is copied, except those whose name starts with
 *    "GIT_" (GIT_SSH_COMMAND is let through the filter but is overwritten below);
 *  - GIT_TERMINAL_PROMPT, GIT_ASKPASS and GIT_SSH_COMMAND are then pinned to
 *    fixed values so that git and ssh run non-interactively.
 *
 * @returns {Record<string, string>} environment for the child git process
 */
function workspaceGitEnv() {
  const inherited = Object.entries(process.env).filter(
    ([name]) => !name.startsWith("GIT_") || name === "GIT_SSH_COMMAND"
  );
  return Object.assign(Object.fromEntries(inherited), {
    GIT_TERMINAL_PROMPT: "0",
    GIT_ASKPASS: "",
    GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
  });
}
|
|
30922
|
+
/**
 * Reset a git-backed workspace directory to a known state.
 *
 * Does nothing (and resolves to false) when `workspacePath` has no ".git"
 * entry. Otherwise runs, in order and in `workspacePath`:
 *   1. `git reset --hard <baselineRef | HEAD>`
 *   2. `git clean -fdx` when `resetMode` is "strict", else `git clean -fd`
 * Both commands share the same options: the workspace git environment,
 * the WORKSPACE_GIT_TIMEOUT_MS timeout and a 50 MiB output buffer.
 *
 * @param {string} workspacePath directory expected to contain the ".git" entry
 * @param {string} resetMode "strict" selects the `-fdx` clean flags; any other value selects `-fd`
 * @param {string} [baselineRef] ref to hard-reset to; defaults to "HEAD" when null/undefined
 * @returns {Promise<boolean>} true when the reset ran, false when there was no ".git"
 * @throws rejects with the underlying exec error if either git command fails
 */
async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
  const hasGitDir = existsSync5(path45.join(workspacePath, ".git"));
  if (!hasGitDir) {
    return false;
  }
  const execOptions = {
    cwd: workspacePath,
    timeout: WORKSPACE_GIT_TIMEOUT_MS,
    env: workspaceGitEnv(),
    maxBuffer: 50 * 1024 * 1024
  };
  const gitSteps = [
    ["reset", "--hard", baselineRef ?? "HEAD"],
    ["clean", resetMode === "strict" ? "-fdx" : "-fd"]
  ];
  // Sequential on purpose: the clean must only run after the reset succeeded.
  for (const gitArgs of gitSteps) {
    await execFileAsync3("git", gitArgs, execOptions);
  }
  return true;
}
|
|
30704
30937
|
function getWorkspaceTemplate(target) {
|
|
30705
30938
|
const config = target.config;
|
|
30706
30939
|
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
@@ -31962,6 +32195,37 @@ async function runEvalCase(options) {
|
|
|
31962
32195
|
}
|
|
31963
32196
|
}
|
|
31964
32197
|
}
|
|
32198
|
+
let beforeEachNeedsFreshBaseline = false;
|
|
32199
|
+
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
|
|
32200
|
+
try {
|
|
32201
|
+
if (repoManager && evalCase.workspace.repos?.length) {
|
|
32202
|
+
await repoManager.reset(
|
|
32203
|
+
evalCase.workspace.repos,
|
|
32204
|
+
workspacePath,
|
|
32205
|
+
evalCase.workspace.hooks.before_each.reset
|
|
32206
|
+
);
|
|
32207
|
+
} else {
|
|
32208
|
+
await resetWorkspaceRoot(
|
|
32209
|
+
workspacePath,
|
|
32210
|
+
evalCase.workspace.hooks.before_each.reset,
|
|
32211
|
+
sharedBaselineCommit
|
|
32212
|
+
);
|
|
32213
|
+
}
|
|
32214
|
+
} catch (error) {
|
|
32215
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
32216
|
+
return buildErrorResult(
|
|
32217
|
+
evalCase,
|
|
32218
|
+
target.name,
|
|
32219
|
+
nowFn(),
|
|
32220
|
+
new Error(`before_each reset failed: ${message}`),
|
|
32221
|
+
promptInputs,
|
|
32222
|
+
provider,
|
|
32223
|
+
"setup",
|
|
32224
|
+
"script_error",
|
|
32225
|
+
verbose
|
|
32226
|
+
);
|
|
32227
|
+
}
|
|
32228
|
+
}
|
|
31965
32229
|
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
|
|
31966
32230
|
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
|
|
31967
32231
|
const beforeEachHook = caseBeforeEachHook;
|
|
@@ -31978,6 +32242,7 @@ async function runEvalCase(options) {
|
|
|
31978
32242
|
toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
|
|
31979
32243
|
scriptContext
|
|
31980
32244
|
);
|
|
32245
|
+
beforeEachNeedsFreshBaseline = true;
|
|
31981
32246
|
} catch (error) {
|
|
31982
32247
|
const message = error instanceof Error ? error.message : String(error);
|
|
31983
32248
|
return buildErrorResult(
|
|
@@ -31993,7 +32258,7 @@ async function runEvalCase(options) {
|
|
|
31993
32258
|
);
|
|
31994
32259
|
}
|
|
31995
32260
|
}
|
|
31996
|
-
let baselineCommit = sharedBaselineCommit;
|
|
32261
|
+
let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
|
|
31997
32262
|
if (!baselineCommit && workspacePath) {
|
|
31998
32263
|
try {
|
|
31999
32264
|
baselineCommit = await initializeBaseline(workspacePath);
|
|
@@ -32004,6 +32269,35 @@ async function runEvalCase(options) {
|
|
|
32004
32269
|
}
|
|
32005
32270
|
}
|
|
32006
32271
|
}
|
|
32272
|
+
if (evalCase.mode === "conversation" && evalCase.turns?.length) {
|
|
32273
|
+
const conversationResult = await runConversationMode({
|
|
32274
|
+
evalCase,
|
|
32275
|
+
provider,
|
|
32276
|
+
target,
|
|
32277
|
+
evaluators,
|
|
32278
|
+
typeRegistry,
|
|
32279
|
+
graderProvider,
|
|
32280
|
+
promptInputs,
|
|
32281
|
+
nowFn,
|
|
32282
|
+
signal,
|
|
32283
|
+
workspacePath,
|
|
32284
|
+
caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
|
|
32285
|
+
agentTimeoutMs,
|
|
32286
|
+
streamCallbacks: options.streamCallbacks,
|
|
32287
|
+
verbose,
|
|
32288
|
+
threshold: evalCase.threshold ?? caseThreshold,
|
|
32289
|
+
targetResolver,
|
|
32290
|
+
availableTargets
|
|
32291
|
+
});
|
|
32292
|
+
if (workspacePath && !isSharedWorkspace) {
|
|
32293
|
+
const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
|
|
32294
|
+
if (!shouldRetain) {
|
|
32295
|
+
await cleanupWorkspace(workspacePath).catch(() => {
|
|
32296
|
+
});
|
|
32297
|
+
}
|
|
32298
|
+
}
|
|
32299
|
+
return conversationResult;
|
|
32300
|
+
}
|
|
32007
32301
|
const caseStartMs = Date.now();
|
|
32008
32302
|
const attemptBudget = (maxRetries ?? 0) + 1;
|
|
32009
32303
|
let attempt = 0;
|
|
@@ -32118,13 +32412,21 @@ async function runEvalCase(options) {
|
|
|
32118
32412
|
${providerFileChanges}` : providerFileChanges;
|
|
32119
32413
|
}
|
|
32120
32414
|
const providerError = extractProviderError(providerResponse);
|
|
32121
|
-
if (caseHooksEnabled &&
|
|
32415
|
+
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
|
|
32122
32416
|
try {
|
|
32123
|
-
|
|
32124
|
-
|
|
32125
|
-
|
|
32126
|
-
|
|
32127
|
-
|
|
32417
|
+
if (repoManager && evalCase.workspace.repos?.length) {
|
|
32418
|
+
await repoManager.reset(
|
|
32419
|
+
evalCase.workspace.repos,
|
|
32420
|
+
workspacePath,
|
|
32421
|
+
evalCase.workspace.hooks.after_each.reset
|
|
32422
|
+
);
|
|
32423
|
+
} else {
|
|
32424
|
+
await resetWorkspaceRoot(
|
|
32425
|
+
workspacePath,
|
|
32426
|
+
evalCase.workspace.hooks.after_each.reset,
|
|
32427
|
+
baselineCommit
|
|
32428
|
+
);
|
|
32429
|
+
}
|
|
32128
32430
|
} catch {
|
|
32129
32431
|
}
|
|
32130
32432
|
}
|
|
@@ -32740,6 +33042,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
|
32740
33042
|
"llm-grader": llmGrader
|
|
32741
33043
|
};
|
|
32742
33044
|
}
|
|
33045
|
+
/**
 * Run a multi-turn ("conversation" mode) eval case and grade it.
 *
 * Flow:
 *  1. Seed the running history with `evalCase.input` (non-string content is
 *     JSON-stringified).
 *  2. For each entry of `evalCase.turns`: append the user message, invoke the
 *     provider with the (optionally windowed) history, append the assistant
 *     reply, then grade the reply via evaluateCandidate when the turn declares
 *     assertions or an expected_output. Turns without either score 1 ("pass");
 *     provider errors score 0 ("fail"); once stopped, remaining turns score 0
 *     ("skip"). Stopping only happens when `on_turn_failure` is "stop".
 *  3. When the case itself has assertions, grade the full transcript once more
 *     as a "conversation" score.
 *  4. Aggregate all turn scores plus the conversation score using
 *     `evalCase.aggregation` (default "mean") and build the result record.
 *
 * @param {object} options see the destructuring below for the fields that are read
 * @returns {Promise<object>} result record with score, per-turn scores, flattened
 *   assertions, the history as output, executionStatus and total duration
 */
async function runConversationMode(options) {
  const {
    evalCase,
    provider,
    target,
    evaluators,
    typeRegistry,
    graderProvider,
    nowFn,
    signal,
    workspacePath,
    caseWorkspaceFile,
    agentTimeoutMs,
    streamCallbacks,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  } = options;
  const aggregation = evalCase.aggregation ?? "mean";
  const onTurnFailure = evalCase.on_turn_failure ?? "continue";
  const windowSize = evalCase.window_size;
  const passMark = threshold ?? DEFAULT_THRESHOLD;

  const toText = (value) => typeof value === "string" ? value : JSON.stringify(value);

  const history = evalCase.input.map((msg) => ({ role: msg.role, content: toText(msg.content) }));
  // Snapshot of what the provider / grader is allowed to see right now.
  const visibleHistory = () => windowSize ? buildWindowedHistory(history, windowSize) : [...history];

  const turnScores = [];
  const allTurnScoreValues = [];
  const recordTurn = (entry) => {
    turnScores.push(entry);
    allTurnScoreValues.push(entry.score);
  };

  // Arguments common to every evaluateCandidate call made by this function.
  const gradingShared = {
    target,
    provider,
    evaluators,
    typeRegistry,
    nowFn,
    attempt: 0,
    graderProvider,
    agentTimeoutMs,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  };

  let stopped = false;
  const caseStartMs = Date.now();

  for (const [offset, turn] of evalCase.turns.entries()) {
    const label = `turn-${offset + 1}`;
    const turnCaseId = `${evalCase.id}/${label}`;

    if (stopped) {
      recordTurn({
        name: label,
        type: "rubrics",
        score: 0,
        verdict: "skip",
        assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
      });
      continue;
    }

    const userContent = toText(turn.input);
    history.push({ role: "user", content: userContent });
    // Built outside the try so only provider failures are treated as turn failures.
    const chatPromptForProvider = visibleHistory();

    let response;
    try {
      response = await provider.invoke({
        question: userContent,
        chatPrompt: chatPromptForProvider,
        evalCaseId: turnCaseId,
        signal,
        cwd: workspacePath,
        workspaceFile: caseWorkspaceFile,
        streamCallbacks
      });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      recordTurn({
        name: label,
        type: "rubrics",
        score: 0,
        verdict: "fail",
        assertions: [{ text: `Provider error: ${message}`, passed: false }]
      });
      if (onTurnFailure === "stop") {
        stopped = true;
      }
      continue;
    }

    const assistantContent = extractLastAssistantContent(response.output);
    history.push({ role: "assistant", content: assistantContent });

    const hasChecks = Boolean(turn.assertions?.length) || Boolean(turn.expected_output);
    if (!hasChecks) {
      recordTurn({
        name: label,
        type: "rubrics",
        score: 1,
        verdict: "pass",
        assertions: []
      });
      continue;
    }

    const expected = turn.expected_output;
    const expectedOutput = expected ? [typeof expected === "string" ? { content: expected } : expected] : [];
    const graded = await evaluateCandidate({
      ...gradingShared,
      evalCase: {
        ...evalCase,
        id: turnCaseId,
        assertions: buildTurnAssertions(turn),
        input: buildTurnGraderInput(history, windowSize),
        expected_output: expectedOutput,
        // Clear conversation fields to prevent recursion
        mode: void 0,
        turns: void 0
      },
      candidate: assistantContent,
      promptInputs: {
        question: buildConversationContext(history, windowSize),
        chatPrompt: visibleHistory()
      },
      output: response.output
    });

    const turnScore = graded.score;
    recordTurn({
      name: label,
      type: "rubrics",
      score: turnScore,
      verdict: scoreToVerdict(turnScore, passMark),
      assertions: graded.assertions ? [...graded.assertions] : [],
      scores: graded.scores
    });
    if (onTurnFailure === "stop" && turnScore < passMark) {
      stopped = true;
    }
  }

  const conversationScores = [];
  if (evalCase.assertions?.length) {
    const fullTranscript = history.map((m) => `${m.role}: ${toText(m.content)}`).join("\n\n");
    const overall = await evaluateCandidate({
      ...gradingShared,
      evalCase: {
        ...evalCase,
        id: `${evalCase.id}/conversation`,
        input: history.map(({ role, content }) => ({ role, content })),
        expected_output: [],
        mode: void 0,
        turns: void 0
      },
      candidate: fullTranscript,
      promptInputs: {
        question: fullTranscript,
        chatPrompt: [...history]
      }
    });
    conversationScores.push({
      name: "conversation",
      type: "rubrics",
      score: overall.score,
      verdict: scoreToVerdict(overall.score, passMark),
      assertions: overall.assertions ? [...overall.assertions] : [],
      scores: overall.scores
    });
  }

  const finalScore = aggregateConversationScores(
    [...allTurnScoreValues, ...conversationScores.map((s) => s.score)],
    aggregation
  );
  const allResultScores = [...turnScores, ...conversationScores];
  const outputMessages = history.map(({ role, content }) => ({ role, content }));
  const flatAssertions = allResultScores.flatMap((s) => [...s.assertions]);
  const totalDurationMs = Date.now() - caseStartMs;

  return {
    timestamp: nowFn().toISOString(),
    testId: evalCase.id,
    suite: evalCase.suite,
    category: evalCase.category,
    score: finalScore,
    assertions: flatAssertions,
    target: target.name,
    output: outputMessages,
    scores: allResultScores,
    executionStatus: classifyQualityStatus(finalScore, passMark),
    input: evalCase.input.map((m) => ({
      role: m.role,
      content: toText(m.content)
    })),
    evalRun: { durationMs: totalDurationMs }
  };
}
|
|
33258
|
+
/**
 * Trim a chat history to the most recent `windowSize` exchanges.
 *
 * All messages with role "system" are kept and moved to the front; of the
 * remaining messages only the last `windowSize * 2` are kept (two messages
 * per exchange). The input array is not modified.
 *
 * A `windowSize` that is not a positive finite number (0, negative, NaN,
 * null/undefined, Infinity) means "no windowing": every non-system message is
 * kept. Previously a negative size turned `slice(-windowSize * 2)` into a
 * positive start index, which dropped the oldest messages and kept the rest —
 * the opposite of a trailing window.
 *
 * @param {Array<{role: string, content: unknown}>} history messages, oldest first
 * @param {number} windowSize number of trailing exchanges to keep
 * @returns {Array<{role: string, content: unknown}>} new array: system messages, then the window
 */
function buildWindowedHistory(history, windowSize) {
  const systemMessages = [];
  const nonSystem = [];
  for (const message of history) {
    (message.role === "system" ? systemMessages : nonSystem).push(message);
  }
  const size = Number(windowSize);
  const keepCount = Number.isFinite(size) && size > 0 ? size * 2 : 0;
  const windowed = keepCount > 0 ? nonSystem.slice(-keepCount) : nonSystem;
  return [...systemMessages, ...windowed];
}
|
|
33264
|
+
/**
 * Render a chat history as plain text, one "role: content" paragraph per
 * message, separated by blank lines.
 *
 * When `windowSize` is truthy the history is first reduced with
 * buildWindowedHistory; otherwise the full history is rendered. Non-string
 * content is JSON-stringified.
 *
 * @param {Array<{role: string, content: unknown}>} history messages, oldest first
 * @param {number} [windowSize] optional number of trailing exchanges to include
 * @returns {string} the rendered transcript ("" for an empty history)
 */
function buildConversationContext(history, windowSize) {
  let visible = history;
  if (windowSize) {
    visible = buildWindowedHistory(history, windowSize);
  }
  const paragraphs = visible.map((message) => {
    const body = message.content;
    const text = typeof body === "string" ? body : JSON.stringify(body);
    return `${message.role}: ${text}`;
  });
  return paragraphs.join("\n\n");
}
|
|
33271
|
+
/**
 * Produce the message list given to the grader for a single turn.
 *
 * When `windowSize` is truthy the history is first reduced with
 * buildWindowedHistory; otherwise the full history is used. Each message is
 * copied down to just its `role` and `content` fields.
 *
 * @param {Array<{role: string, content: unknown}>} history messages, oldest first
 * @param {number} [windowSize] optional number of trailing exchanges to include
 * @returns {Array<{role: string, content: unknown}>} fresh array of fresh message objects
 */
function buildTurnGraderInput(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  return visible.map((message) => {
    const { role, content } = message;
    return { role, content };
  });
}
|
|
33278
|
+
/**
 * Convert a turn's `assertions` into the evaluator configs used to grade it.
 *
 * String entries are treated as rubric criteria and are merged into a single
 * "llm-grader" evaluator named "turn-rubrics" (criterion ids are numbered from
 * 1 in their original relative order, each with weight 1). Non-string entries
 * are passed through untouched and follow the rubric evaluator in the result.
 *
 * @param {{assertions?: Array<string | object>}} turn the conversation turn
 * @returns {Array<object>} evaluator configs; empty when the turn has no assertions
 */
function buildTurnAssertions(turn) {
  if (!turn.assertions?.length) return [];
  const declared = [...turn.assertions];
  const criteria = declared.filter((entry) => typeof entry === "string");
  const structured = declared.filter((entry) => typeof entry !== "string");
  if (criteria.length === 0) {
    return structured;
  }
  const rubricGrader = {
    name: "turn-rubrics",
    type: "llm-grader",
    rubrics: criteria.map((outcome, position) => ({
      id: `criterion-${position + 1}`,
      outcome,
      weight: 1
    }))
  };
  return [rubricGrader, ...structured];
}
|
|
33304
|
+
/**
 * Collapse the per-turn (and conversation-level) scores into one number.
 *
 * @param {number[]} scores the individual scores
 * @param {string} aggregation "min" or "max"; any other value selects the arithmetic mean
 * @returns {number} the aggregated score; 1 when `scores` is empty
 */
function aggregateConversationScores(scores, aggregation) {
  const count = scores.length;
  if (count === 0) {
    return 1;
  }
  if (aggregation === "min") {
    return Math.min(...scores);
  }
  if (aggregation === "max") {
    return Math.max(...scores);
  }
  let total = 0;
  for (const value of scores) {
    total += value;
  }
  return total / count;
}
|
|
32743
33315
|
async function invokeProvider(provider, options) {
|
|
32744
33316
|
const {
|
|
32745
33317
|
evalCase,
|
|
@@ -33430,7 +34002,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
33430
34002
|
}
|
|
33431
34003
|
return false;
|
|
33432
34004
|
}
|
|
33433
|
-
var
|
|
34005
|
+
// Promise-returning form of execFile4; runCommand below awaits it to spawn executables.
// NOTE(review): promisify8/execFile4 are presumably the bundler's aliases for
// util.promisify and child_process.execFile — confirm against the import block.
var execFileAsync4 = promisify8(execFile4);
|
|
33434
34006
|
function sanitizeRepoSlug(repo) {
|
|
33435
34007
|
return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
|
|
33436
34008
|
}
|
|
@@ -33481,7 +34053,7 @@ function writePersistedStatus(statusFile, status) {
|
|
|
33481
34053
|
}
|
|
33482
34054
|
async function runCommand(executable, args, options) {
|
|
33483
34055
|
try {
|
|
33484
|
-
const { stdout, stderr } = await
|
|
34056
|
+
const { stdout, stderr } = await execFileAsync4(executable, [...args], {
|
|
33485
34057
|
cwd: options?.cwd,
|
|
33486
34058
|
env: process.env
|
|
33487
34059
|
});
|
|
@@ -34516,11 +35088,13 @@ function extractAssistantContent(content) {
|
|
|
34516
35088
|
break;
|
|
34517
35089
|
case "tool_use":
|
|
34518
35090
|
if (block.name) {
|
|
34519
|
-
toolCalls.push(
|
|
34520
|
-
|
|
34521
|
-
|
|
34522
|
-
|
|
34523
|
-
|
|
35091
|
+
toolCalls.push(
|
|
35092
|
+
normalizeToolCall("claude", {
|
|
35093
|
+
tool: block.name,
|
|
35094
|
+
input: block.input,
|
|
35095
|
+
id: block.id
|
|
35096
|
+
})
|
|
35097
|
+
);
|
|
34524
35098
|
}
|
|
34525
35099
|
break;
|
|
34526
35100
|
}
|
|
@@ -34610,7 +35184,11 @@ function parseCodexSession(jsonl) {
|
|
|
34610
35184
|
} else {
|
|
34611
35185
|
input = payload.arguments;
|
|
34612
35186
|
}
|
|
34613
|
-
const toolCall =
|
|
35187
|
+
const toolCall = normalizeToolCall("codex", {
|
|
35188
|
+
tool: toolName,
|
|
35189
|
+
input,
|
|
35190
|
+
id: callId
|
|
35191
|
+
});
|
|
34614
35192
|
const msgIdx = messages.length;
|
|
34615
35193
|
messages.push({
|
|
34616
35194
|
role: "assistant",
|
|
@@ -34634,7 +35212,11 @@ function parseCodexSession(jsonl) {
|
|
|
34634
35212
|
} else {
|
|
34635
35213
|
input = payload.arguments;
|
|
34636
35214
|
}
|
|
34637
|
-
const toolCall =
|
|
35215
|
+
const toolCall = normalizeToolCall("codex", {
|
|
35216
|
+
tool: toolName,
|
|
35217
|
+
input,
|
|
35218
|
+
id: callId
|
|
35219
|
+
});
|
|
34638
35220
|
const msgIdx = messages.length;
|
|
34639
35221
|
messages.push({
|
|
34640
35222
|
role: "assistant",
|
|
@@ -35088,4 +35670,4 @@ export {
|
|
|
35088
35670
|
TranscriptProvider,
|
|
35089
35671
|
createAgentKernel
|
|
35090
35672
|
};
|
|
35091
|
-
//# sourceMappingURL=chunk-
|
|
35673
|
+
//# sourceMappingURL=chunk-VSWJTL3T.js.map
|