agentv 4.14.0 → 4.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -305,7 +305,7 @@ var require_dist = __commonJS({
305
305
  }
306
306
  });
307
307
 
308
- // ../../packages/core/dist/chunk-A3HYVKTI.js
308
+ // ../../packages/core/dist/chunk-AOOU6PLC.js
309
309
  import { constants } from "node:fs";
310
310
  import { access, readFile } from "node:fs/promises";
311
311
  import path from "node:path";
@@ -423,7 +423,7 @@ __export(external_exports2, {
423
423
  void: () => voidType
424
424
  });
425
425
 
426
- // ../../packages/core/dist/chunk-A3HYVKTI.js
426
+ // ../../packages/core/dist/chunk-AOOU6PLC.js
427
427
  import { readFile as readFile2 } from "node:fs/promises";
428
428
  import path3 from "node:path";
429
429
  import fg from "fast-glob";
@@ -1293,6 +1293,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
1293
1293
  const logDirSource = target.log_dir ?? target.log_directory;
1294
1294
  const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
1295
1295
  const systemPromptSource = target.system_prompt;
1296
+ const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT);
1297
+ if (streamLogResult.deprecationWarning) {
1298
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
1299
+ `);
1300
+ }
1296
1301
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
1297
1302
  allowLiteral: true,
1298
1303
  optionalEnv: true
@@ -1339,6 +1344,7 @@ function resolveCodexConfig(target, env, evalFilePath) {
1339
1344
  timeoutMs,
1340
1345
  logDir,
1341
1346
  logFormat,
1347
+ streamLog: streamLogResult.streamLog,
1342
1348
  systemPrompt
1343
1349
  };
1344
1350
  }
@@ -1355,6 +1361,38 @@ function normalizeCodexLogFormat(value) {
1355
1361
  }
1356
1362
  throw new Error("codex log format must be 'summary' or 'json'");
1357
1363
  }
1364
/**
 * Resolve the stream-log mode for a provider target.
 *
 * Precedence:
 *   1. `target.stream_log` (false / "false" / "raw" / "summary").
 *   2. The legacy `target.log_format` / `target.log_output_format` keys, then
 *      `envFallback`; these yield a deprecation warning alongside the result.
 *
 * String values are trimmed and lower-cased on both paths so that
 * `stream_log: " RAW "` is accepted the same way `log_format: " JSON "` is.
 *
 * @param {object} target - Target definition; reads `name`, `stream_log`,
 *   `log_format` and `log_output_format`.
 * @param {string} [envFallback] - Legacy log-format value taken from the
 *   environment, used only when the target sets none of the keys above.
 * @returns {{streamLog: (false|"raw"|"summary"|undefined),
 *   logFormat: ("json"|"summary"|undefined), deprecationWarning?: string}}
 * @throws {Error} When `stream_log` or the legacy log format has an
 *   unsupported value.
 */
function resolveStreamLog(target, envFallback) {
  if (target.stream_log !== void 0 && target.stream_log !== null) {
    const rawValue = target.stream_log;
    // Normalize strings exactly like the legacy log_format path below does.
    const val = typeof rawValue === "string" ? rawValue.trim().toLowerCase() : rawValue;
    if (val === false || val === "false") {
      return { streamLog: false, logFormat: void 0 };
    }
    if (val === "raw") {
      // "raw" streaming corresponds to the legacy "json" log format.
      return { streamLog: "raw", logFormat: "json" };
    }
    if (val === "summary") {
      return { streamLog: "summary", logFormat: "summary" };
    }
    throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`);
  }
  const logFormatRaw = target.log_format ?? target.log_output_format ?? envFallback;
  if (logFormatRaw === void 0 || logFormatRaw === null) {
    return { streamLog: void 0, logFormat: void 0 };
  }
  if (typeof logFormatRaw !== "string") {
    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
  }
  const normalized = logFormatRaw.trim().toLowerCase();
  if (normalized !== "json" && normalized !== "summary") {
    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
  }
  const streamLogEquivalent = normalized === "json" ? "raw" : "summary";
  // NOTE(review): the warning always names 'log_format', even when the value
  // came from `log_output_format` or from `envFallback`.
  return {
    streamLog: streamLogEquivalent,
    logFormat: normalized,
    deprecationWarning: `${target.name}: 'log_format' is deprecated and will be removed in v4.16. Use 'stream_log: ${streamLogEquivalent}' instead (log_format: '${normalized}' \u2192 stream_log: '${streamLogEquivalent}').`
  };
}
1358
1396
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
1359
1397
  const cliUrlSource = target.cli_url;
1360
1398
  const cliPathSource = target.cli_path;
@@ -1366,6 +1404,11 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
1366
1404
  const logDirSource = target.log_dir ?? target.log_directory;
1367
1405
  const logFormatSource = target.log_format;
1368
1406
  const systemPromptSource = target.system_prompt;
1407
+ const streamLogResult = resolveStreamLog(target);
1408
+ if (streamLogResult.deprecationWarning) {
1409
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
1410
+ `);
1411
+ }
1369
1412
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
1370
1413
  allowLiteral: true,
1371
1414
  optionalEnv: true
@@ -1476,6 +1519,7 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
1476
1519
  timeoutMs,
1477
1520
  logDir,
1478
1521
  logFormat,
1522
+ streamLog: streamLogResult.streamLog,
1479
1523
  systemPrompt,
1480
1524
  byokType,
1481
1525
  byokBaseUrl,
@@ -1495,6 +1539,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
1495
1539
  const logDirSource = target.log_dir ?? target.log_directory;
1496
1540
  const logFormatSource = target.log_format;
1497
1541
  const systemPromptSource = target.system_prompt;
1542
+ const streamLogResult = resolveStreamLog(target);
1543
+ if (streamLogResult.deprecationWarning) {
1544
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
1545
+ `);
1546
+ }
1498
1547
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
1499
1548
  allowLiteral: true,
1500
1549
  optionalEnv: true
@@ -1546,6 +1595,7 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
1546
1595
  timeoutMs,
1547
1596
  logDir,
1548
1597
  logFormat,
1598
+ streamLog: streamLogResult.streamLog,
1549
1599
  systemPrompt
1550
1600
  };
1551
1601
  }
@@ -1568,6 +1618,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1568
1618
  const logDirSource = target.log_dir ?? target.log_directory;
1569
1619
  const logFormatSource = target.log_format;
1570
1620
  const systemPromptSource = target.system_prompt;
1621
+ const streamLogResult = resolveStreamLog(target);
1622
+ if (streamLogResult.deprecationWarning) {
1623
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
1624
+ `);
1625
+ }
1571
1626
  const subprovider = resolveOptionalString(
1572
1627
  subproviderSource,
1573
1628
  env,
@@ -1638,6 +1693,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1638
1693
  timeoutMs,
1639
1694
  logDir,
1640
1695
  logFormat,
1696
+ streamLog: streamLogResult.streamLog,
1641
1697
  systemPrompt
1642
1698
  };
1643
1699
  }
@@ -1654,6 +1710,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1654
1710
  const logDirSource = target.log_dir ?? target.log_directory;
1655
1711
  const logFormatSource = target.log_format;
1656
1712
  const systemPromptSource = target.system_prompt;
1713
+ const streamLogResult = resolveStreamLog(target);
1714
+ if (streamLogResult.deprecationWarning) {
1715
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
1716
+ `);
1717
+ }
1657
1718
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
1658
1719
  allowLiteral: true,
1659
1720
  optionalEnv: true
@@ -1724,6 +1785,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1724
1785
  timeoutMs,
1725
1786
  logDir,
1726
1787
  logFormat,
1788
+ streamLog: streamLogResult.streamLog,
1727
1789
  systemPrompt
1728
1790
  };
1729
1791
  }
@@ -1735,6 +1797,11 @@ function resolveClaudeConfig(target, env, evalFilePath) {
1735
1797
  const logDirSource = target.log_dir ?? target.log_directory;
1736
1798
  const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
1737
1799
  const systemPromptSource = target.system_prompt;
1800
+ const streamLogResult = resolveStreamLog(target);
1801
+ if (streamLogResult.deprecationWarning) {
1802
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
1803
+ `);
1804
+ }
1738
1805
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
1739
1806
  allowLiteral: true,
1740
1807
  optionalEnv: true
@@ -1778,7 +1845,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
1778
1845
  maxTurns,
1779
1846
  maxBudgetUsd,
1780
1847
  logDir,
1781
- logFormat
1848
+ logFormat,
1849
+ streamLog: streamLogResult.streamLog
1782
1850
  };
1783
1851
  }
1784
1852
  function normalizeClaudeLogFormat(value) {
@@ -14542,10 +14610,12 @@ import { randomBytes } from "node:crypto";
14542
14610
  import { createServer } from "node:http";
14543
14611
  import fs2 from "node:fs/promises";
14544
14612
  import path37 from "node:path";
14613
+ import { execFile as execFile3 } from "node:child_process";
14545
14614
  import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
14546
14615
  import { existsSync as existsSync5 } from "node:fs";
14547
14616
  import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
14548
14617
  import path45 from "node:path";
14618
+ import { promisify as promisify7 } from "node:util";
14549
14619
  import micromatch3 from "micromatch";
14550
14620
  import path38 from "node:path";
14551
14621
  import path39 from "node:path";
@@ -14573,12 +14643,12 @@ import path47 from "node:path";
14573
14643
  import { parse as parse6 } from "yaml";
14574
14644
  import { mkdir as mkdir15, readFile as readFile17, writeFile as writeFile8 } from "node:fs/promises";
14575
14645
  import path48 from "node:path";
14576
- import { execFile as execFile3 } from "node:child_process";
14646
+ import { execFile as execFile4 } from "node:child_process";
14577
14647
  import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
14578
14648
  import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat10 } from "node:fs/promises";
14579
14649
  import os3 from "node:os";
14580
14650
  import path49 from "node:path";
14581
- import { promisify as promisify7 } from "node:util";
14651
+ import { promisify as promisify8 } from "node:util";
14582
14652
  import { existsSync as existsSync8, mkdirSync as mkdirSync3, readFileSync as readFileSync4, readdirSync as readdirSync3, statSync as statSync2, writeFileSync as writeFileSync2 } from "node:fs";
14583
14653
  import path50 from "node:path";
14584
14654
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
@@ -18150,10 +18220,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
18150
18220
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
18151
18221
  const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
18152
18222
  const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
18153
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
18223
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
18154
18224
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
18155
18225
  logError3(
18156
- `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
18226
+ `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
18157
18227
  );
18158
18228
  continue;
18159
18229
  }
@@ -18230,6 +18300,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
18230
18300
  ) : void 0;
18231
18301
  const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
18232
18302
  const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
18303
+ const modeRaw = asString5(testCaseConfig.mode);
18304
+ const mode = modeRaw === "conversation" ? "conversation" : void 0;
18305
+ const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
18306
+ const aggregationRaw = asString5(testCaseConfig.aggregation);
18307
+ const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
18308
+ const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
18309
+ const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
18310
+ const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
18233
18311
  const testCase = {
18234
18312
  id,
18235
18313
  suite: suiteName,
@@ -18248,6 +18326,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
18248
18326
  metadata,
18249
18327
  targets: caseTargets,
18250
18328
  ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
18329
+ ...mode ? { mode } : {},
18330
+ ...turns && turns.length > 0 ? { turns } : {},
18331
+ ...aggregation ? { aggregation } : {},
18332
+ ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
18333
+ ...windowSize !== void 0 ? { window_size: windowSize } : {},
18251
18334
  ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
18252
18335
  ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
18253
18336
  };
@@ -18265,6 +18348,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
18265
18348
  return match;
18266
18349
  }
18267
18350
  var loadEvalCaseById = loadTestById;
18351
/**
 * Convert the raw `turns` array of a test case into turn records.
 *
 * Each result always carries `input`; `expected_output` is included only when
 * it is defined, and `assertions` only when it is a non-empty array (copied,
 * so the result does not alias the parsed config).
 *
 * Entries that are not objects (null, strings, numbers) have no fields to
 * read, so they produce `{ input: undefined }` instead of crashing on null.
 *
 * @param {unknown[]} rawTurns - Raw turn entries from the eval file.
 * @returns {Array<{input: unknown, expected_output?: unknown, assertions?: unknown[]}>}
 */
function parseTurns(rawTurns) {
  return rawTurns.map((rawTurn) => {
    const turn = rawTurn !== null && typeof rawTurn === "object" ? rawTurn : {};
    const { input, expected_output: expectedOutput } = turn;
    // String and object assertions are passed through untouched.
    const assertions = Array.isArray(turn.assertions) ? turn.assertions.slice() : void 0;
    return {
      input,
      ...(expectedOutput !== void 0 ? { expected_output: expectedOutput } : {}),
      ...(assertions && assertions.length > 0 ? { assertions } : {})
    };
  });
}
18268
18370
  function parseCommandArray(source) {
18269
18371
  if (typeof source === "string") {
18270
18372
  const parts = source.trim().split(/\s+/);
@@ -19197,6 +19299,152 @@ function subscribeToClaudeLogEntries(listener) {
19197
19299
  store.delete(listener);
19198
19300
  };
19199
19301
  }
19302
// Exact-match lookup from "<providerKind>::<nativeToolName>" to the canonical
// tool name (Skill / Read / Write / Edit / Bash). Providers that share a tool
// vocabulary are expanded from one alias list per family; insertion order is
// family by family, provider by provider.
var TOOL_NAME_MAP = /* @__PURE__ */ (() => {
  const pairs = [];
  const expandFamily = (providerKinds, aliases) => {
    for (const kind of providerKinds) {
      for (const [nativeName, canonicalName] of aliases) {
        pairs.push([`${kind}::${nativeName}`, canonicalName]);
      }
    }
  };

  // --- Claude (already canonical) ---
  expandFamily(
    ["claude", "claude-cli", "claude-sdk"],
    ["Skill", "Read", "Write", "Edit", "Bash"].map((name) => [name, name])
  );

  // --- Copilot ---
  expandFamily(
    ["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"],
    [
      ["Skill", "Skill"],
      ["skill", "Skill"],
      ["Read File", "Read"],
      ["readFile", "Read"],
      ["Read", "Read"],
      ["readTextFile", "Read"],
      ["writeTextFile", "Write"],
      ["Write File", "Write"],
      ["editFile", "Edit"],
      ["Edit File", "Edit"],
      ["runTerminalCommand", "Bash"]
    ]
  );

  // --- Codex ---
  pairs.push(["codex::command_execution", "Bash"], ["codex::file_change", "Edit"]);

  // --- Pi ---
  expandFamily(
    ["pi-coding-agent", "pi-cli"],
    [
      ["read", "Read"],
      ["bash", "Bash"]
    ]
  );

  return new Map(pairs);
})();
19384
// Prefix rules for tool names that embed their argument in the name itself.
// `extractSkillFromName` marks rules whose name suffix is the skill identifier.
var COPILOT_PREFIXES = [
  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
  { prefix: "Viewing ", canonical: "Read" }
];
var CODEX_PREFIXES = [{ prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }];

// Provider kind -> prefix rules. All Copilot-family providers share one rule list.
var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
  ...["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"].map(
    (providerKind) => [providerKind, COPILOT_PREFIXES]
  ),
  ["codex", CODEX_PREFIXES]
]);
19399
/**
 * Skill inputs need no reshaping: the input is returned as-is whether or not
 * it carries a `skill` field. Kept as a named hook so the Skill entry in
 * INPUT_NORMALIZERS has a place to grow.
 */
var normalizeSkillInput = (input) => input;

/**
 * Ensure a Read tool input exposes its path under `file_path`.
 *
 * If `file_path` already holds a value the input is returned unchanged (same
 * reference). Otherwise the first of `path` / `filePath` that holds a value is
 * copied to `file_path` on a shallow copy. A null `file_path` counts as
 * missing, so it no longer blocks the fallback to the alias fields.
 *
 * @param {object} input - Tool-call input (non-null; the caller filters
 *   null/undefined before invoking a normalizer).
 * @returns {object} The same object, or a shallow copy with `file_path` set.
 */
var normalizeReadInput = (input) => {
  const hasValue = (value) => value !== void 0 && value !== null;
  if (hasValue(input.file_path)) return input;
  for (const alias of ["path", "filePath"]) {
    if (hasValue(input[alias])) return { ...input, file_path: input[alias] };
  }
  return input;
};

// Canonical tool name -> function that reshapes that tool's input.
var INPUT_NORMALIZERS = /* @__PURE__ */ new Map([
  ["Skill", normalizeSkillInput],
  ["Read", normalizeReadInput]
]);
19413
/**
 * Map a provider-native tool call onto the canonical tool vocabulary.
 *
 * Resolution order:
 *   1. Exact "<providerKind>::<tool>" match in TOOL_NAME_MAP.
 *   2. The provider's prefix rules in TOOL_PREFIX_MAP; when a rule has
 *      `extractSkillFromName`, the non-empty name suffix is stored as
 *      `input.skill` (overriding any existing `skill` field).
 *   3. Otherwise the call is returned unchanged (same reference).
 *
 * Matched calls are copied, never mutated, and then passed through
 * applyInputNormalization for the canonical tool.
 *
 * @param {string} providerKind - Provider identifier such as "codex".
 * @param {{tool: string, input?: unknown}} tc - Tool call record.
 * @returns {object} The normalized tool call, or `tc` itself when no rule applies.
 */
function normalizeToolCall(providerKind, tc) {
  const nativeName = tc.tool;
  // A missing or non-string tool name cannot match any rule; return the call
  // untouched rather than throwing from startsWith() below.
  if (typeof nativeName !== "string") return tc;

  const canonical = TOOL_NAME_MAP.get(`${providerKind}::${nativeName}`);
  if (canonical) {
    return applyInputNormalization(canonical, { ...tc, tool: canonical });
  }

  const prefixRules = TOOL_PREFIX_MAP.get(providerKind);
  if (!prefixRules) return tc;

  for (const rule of prefixRules) {
    if (!nativeName.startsWith(rule.prefix)) continue;
    const suffix = nativeName.slice(rule.prefix.length);
    let normalizedInput = tc.input;
    if (rule.extractSkillFromName && suffix) {
      const existingInput = tc.input ?? {};
      normalizedInput = { ...existingInput, skill: suffix };
    }
    const normalized = {
      ...tc,
      tool: rule.canonical,
      input: normalizedInput
    };
    return applyInputNormalization(rule.canonical, normalized);
  }
  return tc;
}
19441
/**
 * Run the input normalizer registered for a canonical tool, if there is one.
 *
 * Returns `tc` itself when no normalizer is registered, when the call has no
 * input (undefined or null), or when the normalizer hands back the very same
 * input object; otherwise returns a shallow copy carrying the new input.
 *
 * @param {string} canonical - Canonical tool name used to look up the normalizer.
 * @param {object} tc - Tool call record whose `input` may be reshaped.
 * @returns {object} `tc` or a shallow copy with a replaced `input`.
 */
function applyInputNormalization(canonical, tc) {
  const normalize = INPUT_NORMALIZERS.get(canonical);
  if (!normalize) return tc;

  const currentInput = tc.input;
  if (currentInput === void 0 || currentInput === null) return tc;

  const nextInput = normalize(currentInput);
  if (nextInput === currentInput) return tc;
  return { ...tc, input: nextInput };
}
19200
19448
  function buildPromptDocument(request, inputFiles) {
19201
19449
  const parts = [];
19202
19450
  const inputFilesList = collectInputFiles(inputFiles);
@@ -19660,11 +19908,13 @@ function extractToolCalls(content) {
19660
19908
  }
19661
19909
  const p = part;
19662
19910
  if (p.type === "tool_use" && typeof p.name === "string") {
19663
- toolCalls.push({
19664
- tool: p.name,
19665
- input: p.input,
19666
- id: typeof p.id === "string" ? p.id : void 0
19667
- });
19911
+ toolCalls.push(
19912
+ normalizeToolCall("claude-cli", {
19913
+ tool: p.name,
19914
+ input: p.input,
19915
+ id: typeof p.id === "string" ? p.id : void 0
19916
+ })
19917
+ );
19668
19918
  }
19669
19919
  }
19670
19920
  return toolCalls;
@@ -19949,11 +20199,13 @@ function extractToolCalls2(content) {
19949
20199
  }
19950
20200
  const p = part;
19951
20201
  if (p.type === "tool_use" && typeof p.name === "string") {
19952
- toolCalls.push({
19953
- tool: p.name,
19954
- input: p.input,
19955
- id: typeof p.id === "string" ? p.id : void 0
19956
- });
20202
+ toolCalls.push(
20203
+ normalizeToolCall("claude-sdk", {
20204
+ tool: p.name,
20205
+ input: p.input,
20206
+ id: typeof p.id === "string" ? p.id : void 0
20207
+ })
20208
+ );
19957
20209
  }
19958
20210
  }
19959
20211
  return toolCalls;
@@ -20850,27 +21102,33 @@ ${basePrompt}` : basePrompt;
20850
21102
  }
20851
21103
  }
20852
21104
  if (itemType === "command_execution") {
20853
- completedToolCalls.push({
20854
- tool: "command_execution",
20855
- input: { command: item.command },
20856
- output: item.aggregated_output,
20857
- id: item.id
20858
- });
21105
+ completedToolCalls.push(
21106
+ normalizeToolCall("codex", {
21107
+ tool: "command_execution",
21108
+ input: { command: item.command },
21109
+ output: item.aggregated_output,
21110
+ id: item.id
21111
+ })
21112
+ );
20859
21113
  }
20860
21114
  if (itemType === "file_change") {
20861
- completedToolCalls.push({
20862
- tool: "file_change",
20863
- input: item.changes,
20864
- id: item.id
20865
- });
21115
+ completedToolCalls.push(
21116
+ normalizeToolCall("codex", {
21117
+ tool: "file_change",
21118
+ input: item.changes,
21119
+ id: item.id
21120
+ })
21121
+ );
20866
21122
  }
20867
21123
  if (itemType === "mcp_tool_call") {
20868
- completedToolCalls.push({
20869
- tool: `mcp:${item.server}/${item.tool}`,
20870
- input: item.arguments,
20871
- output: item.result ?? item.error,
20872
- id: item.id
20873
- });
21124
+ completedToolCalls.push(
21125
+ normalizeToolCall("codex", {
21126
+ tool: `mcp:${item.server}/${item.tool}`,
21127
+ input: item.arguments,
21128
+ output: item.result ?? item.error,
21129
+ id: item.id
21130
+ })
21131
+ );
20874
21132
  }
20875
21133
  }
20876
21134
  resolveCwd(cwdOverride) {
@@ -21380,12 +21638,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
21380
21638
  return logger;
21381
21639
  }
21382
21640
  handleEvent(eventType, data) {
21383
- if (this.format === "json") {
21384
- const elapsed2 = formatElapsed4(this.startedAt);
21385
- this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
21386
- `);
21387
- return;
21388
- }
21389
21641
  if (this.chunkExtractor) {
21390
21642
  const chunkText = this.chunkExtractor(eventType, data);
21391
21643
  if (chunkText === null) {
@@ -21398,6 +21650,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
21398
21650
  }
21399
21651
  this.flushPendingText();
21400
21652
  }
21653
+ if (this.format === "json") {
21654
+ const elapsed2 = formatElapsed4(this.startedAt);
21655
+ this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
21656
+ `);
21657
+ return;
21658
+ }
21401
21659
  const elapsed = formatElapsed4(this.startedAt);
21402
21660
  const summary = this.summarize(eventType, data);
21403
21661
  if (summary) {
@@ -21408,14 +21666,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
21408
21666
  flushPendingText() {
21409
21667
  if (!this.pendingText) return;
21410
21668
  const elapsed = formatElapsed4(this.startedAt);
21411
- this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
21669
+ if (this.format === "json") {
21670
+ this.stream.write(
21671
+ `${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
21672
+ `
21673
+ );
21674
+ } else {
21675
+ this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
21412
21676
  `);
21677
+ }
21413
21678
  this.pendingText = "";
21414
21679
  }
21415
21680
  async close() {
21416
- if (this.format !== "json") {
21417
- this.flushPendingText();
21418
- }
21681
+ this.flushPendingText();
21419
21682
  await new Promise((resolve2, reject) => {
21420
21683
  this.stream.once("error", reject);
21421
21684
  this.stream.end(() => resolve2());
@@ -21488,15 +21751,17 @@ var CopilotCliProvider = class {
21488
21751
  }
21489
21752
  if (update.status === "completed" || update.status === "failed") {
21490
21753
  const toolName = update.title ?? update.kind ?? "unknown";
21491
- completedToolCalls.push({
21492
- tool: toolName,
21493
- input: update.rawInput,
21494
- output: update.rawOutput,
21495
- id: callId,
21496
- startTime: (/* @__PURE__ */ new Date()).toISOString(),
21497
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
21498
- durationMs: 0
21499
- });
21754
+ completedToolCalls.push(
21755
+ normalizeToolCall("copilot-cli", {
21756
+ tool: toolName,
21757
+ input: update.rawInput,
21758
+ output: update.rawOutput,
21759
+ id: callId,
21760
+ startTime: (/* @__PURE__ */ new Date()).toISOString(),
21761
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
21762
+ durationMs: 0
21763
+ })
21764
+ );
21500
21765
  request.streamCallbacks?.onToolCallEnd?.(
21501
21766
  toolName,
21502
21767
  update.rawInput,
@@ -21513,15 +21778,17 @@ var CopilotCliProvider = class {
21513
21778
  if (inProgress) {
21514
21779
  toolCallsInProgress.delete(callId);
21515
21780
  const duration = Date.now() - inProgress.startMs;
21516
- completedToolCalls.push({
21517
- tool: inProgress.tool,
21518
- input: inProgress.input,
21519
- output: update.rawOutput,
21520
- id: inProgress.id,
21521
- startTime: inProgress.startTime,
21522
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
21523
- durationMs: duration
21524
- });
21781
+ completedToolCalls.push(
21782
+ normalizeToolCall("copilot-cli", {
21783
+ tool: inProgress.tool,
21784
+ input: inProgress.input,
21785
+ output: update.rawOutput,
21786
+ id: inProgress.id,
21787
+ startTime: inProgress.startTime,
21788
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
21789
+ durationMs: duration
21790
+ })
21791
+ );
21525
21792
  request.streamCallbacks?.onToolCallEnd?.(
21526
21793
  inProgress.tool,
21527
21794
  inProgress.input,
@@ -21858,11 +22125,13 @@ function parseCopilotEvents(eventsJsonl) {
21858
22125
  }
21859
22126
  case "assistant.message": {
21860
22127
  const toolRequests = data.toolRequests;
21861
- const toolCalls = (toolRequests ?? []).map((req) => ({
21862
- tool: String(req.name ?? req.toolName ?? ""),
21863
- input: req.arguments,
21864
- id: req.toolCallId ? String(req.toolCallId) : void 0
21865
- }));
22128
+ const toolCalls = (toolRequests ?? []).map(
22129
+ (req) => normalizeToolCall("copilot-log", {
22130
+ tool: String(req.name ?? req.toolName ?? ""),
22131
+ input: req.arguments,
22132
+ id: req.toolCallId ? String(req.toolCallId) : void 0
22133
+ })
22134
+ );
21866
22135
  messages.push({
21867
22136
  role: "assistant",
21868
22137
  content: data.content != null ? String(data.content) : void 0,
@@ -21902,12 +22171,12 @@ function parseCopilotEvents(eventsJsonl) {
21902
22171
  messages.push({
21903
22172
  role: "assistant",
21904
22173
  toolCalls: [
21905
- {
22174
+ normalizeToolCall("copilot-log", {
21906
22175
  tool: started.toolName,
21907
22176
  input: started.input,
21908
22177
  output: data.result,
21909
22178
  id: toolCallId
21910
- }
22179
+ })
21911
22180
  ]
21912
22181
  });
21913
22182
  }
@@ -22235,15 +22504,17 @@ var CopilotSdkProvider = class {
22235
22504
  if (inProgress) {
22236
22505
  toolCallsInProgress.delete(callId);
22237
22506
  const endMs = Date.now();
22238
- completedToolCalls.push({
22239
- tool: inProgress.tool,
22240
- input: inProgress.input,
22241
- output: data?.output ?? data?.result,
22242
- id: inProgress.id,
22243
- startTime: inProgress.startTime,
22244
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
22245
- durationMs: endMs - inProgress.startMs
22246
- });
22507
+ completedToolCalls.push(
22508
+ normalizeToolCall("copilot-sdk", {
22509
+ tool: inProgress.tool,
22510
+ input: inProgress.input,
22511
+ output: data?.output ?? data?.result,
22512
+ id: inProgress.id,
22513
+ startTime: inProgress.startTime,
22514
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
22515
+ durationMs: endMs - inProgress.startMs
22516
+ })
22517
+ );
22247
22518
  }
22248
22519
  }
22249
22520
  if (eventType === "assistant.message") {
@@ -23204,12 +23475,14 @@ function extractToolCallsFromEvents(events) {
23204
23475
  }
23205
23476
  const toolCalls = [];
23206
23477
  for (const [id, { tool: tool2, input }] of starts) {
23207
- toolCalls.push({
23208
- tool: tool2,
23209
- input,
23210
- id: id.startsWith("anon-") ? void 0 : id,
23211
- output: results.get(id)
23212
- });
23478
+ toolCalls.push(
23479
+ normalizeToolCall("pi-cli", {
23480
+ tool: tool2,
23481
+ input,
23482
+ id: id.startsWith("anon-") ? void 0 : id,
23483
+ output: results.get(id)
23484
+ })
23485
+ );
23213
23486
  }
23214
23487
  return toolCalls;
23215
23488
  }
@@ -23331,17 +23604,21 @@ function extractToolCalls3(content) {
23331
23604
  if (!part || typeof part !== "object") continue;
23332
23605
  const p = part;
23333
23606
  if (p.type === "tool_use" && typeof p.name === "string") {
23334
- toolCalls.push({
23335
- tool: p.name,
23336
- input: p.input,
23337
- id: typeof p.id === "string" ? p.id : void 0
23338
- });
23607
+ toolCalls.push(
23608
+ normalizeToolCall("pi-cli", {
23609
+ tool: p.name,
23610
+ input: p.input,
23611
+ id: typeof p.id === "string" ? p.id : void 0
23612
+ })
23613
+ );
23339
23614
  } else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
23340
- toolCalls.push({
23341
- tool: p.name,
23342
- input: p.arguments ?? p.input,
23343
- id: typeof p.id === "string" ? p.id : void 0
23344
- });
23615
+ toolCalls.push(
23616
+ normalizeToolCall("pi-cli", {
23617
+ tool: p.name,
23618
+ input: p.arguments ?? p.input,
23619
+ id: typeof p.id === "string" ? p.id : void 0
23620
+ })
23621
+ );
23345
23622
  } else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
23346
23623
  const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
23347
23624
  if (existing) {
@@ -28287,100 +28564,35 @@ var LatencyEvaluator = class {
28287
28564
  };
28288
28565
  }
28289
28566
  };
28290
- var CLAUDE_MATCHER = {
28291
- skillTools: ["Skill"],
28292
- skillInputField: "skill",
28293
- readTools: ["Read"],
28294
- readInputField: "file_path"
28295
- };
28296
- var COPILOT_MATCHER = {
28297
- skillTools: ["Skill", "skill"],
28298
- skillInputField: "skill",
28299
- readTools: ["Read File", "readFile", "Read", "readTextFile"],
28300
- readInputField: "file_path",
28301
- skillToolPrefixes: ["Using skill: "],
28302
- readToolPrefixes: ["Viewing "],
28303
- readInputFields: ["file_path", "path"]
28304
- };
28305
- var PI_CODING_AGENT_MATCHER = {
28306
- skillTools: [],
28307
- skillInputField: "skill",
28308
- readTools: ["read"],
28309
- readInputField: "path",
28310
- readInputFields: ["path", "file_path", "filePath"]
28311
- };
28312
- var CODEX_MATCHER = {
28313
- skillTools: [],
28314
- skillInputField: "skill",
28315
- readTools: ["command_execution"],
28316
- readInputField: "command",
28317
- skillToolPrefixes: ["mcp:"],
28318
- readToolPrefixes: ["mcp:"],
28319
- readInputFields: ["command", "path", "file_path", "filePath"]
28320
- };
28321
- var PROVIDER_TOOL_SEMANTICS = {
28322
- claude: CLAUDE_MATCHER,
28323
- "claude-cli": CLAUDE_MATCHER,
28324
- "claude-sdk": CLAUDE_MATCHER,
28325
- codex: CODEX_MATCHER,
28326
- "pi-coding-agent": PI_CODING_AGENT_MATCHER,
28327
- "pi-cli": PI_CODING_AGENT_MATCHER,
28328
- "copilot-cli": COPILOT_MATCHER,
28329
- "copilot-log": COPILOT_MATCHER,
28330
- "copilot-sdk": COPILOT_MATCHER,
28331
- vscode: COPILOT_MATCHER,
28332
- "vscode-insiders": COPILOT_MATCHER
28333
- };
28334
28567
  var SkillTriggerEvaluator = class {
28335
28568
  kind = "skill-trigger";
28336
28569
  config;
28337
28570
  constructor(config) {
28338
28571
  this.config = config;
28339
28572
  }
28340
- resolveMatcher(providerKind) {
28341
- if (providerKind) {
28342
- const match = PROVIDER_TOOL_SEMANTICS[providerKind];
28343
- if (match) return match;
28344
- }
28345
- return CLAUDE_MATCHER;
28346
- }
28347
28573
  evaluate(context2) {
28348
28574
  const skillName = this.config.skill;
28349
28575
  const shouldTrigger = this.config.should_trigger !== false;
28350
- const providerKind = context2.provider?.kind;
28351
- const matcher = this.resolveMatcher(providerKind);
28352
28576
  const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
28353
28577
  let triggered = false;
28354
28578
  let evidence = "";
28355
28579
  for (const toolCall of allToolCalls) {
28356
28580
  const toolName = toolCall.tool ?? "";
28357
28581
  const input = toolCall.input ?? {};
28358
- if (matcher.skillTools.includes(toolName)) {
28359
- const skillArg = String(input[matcher.skillInputField] ?? "");
28582
+ if (toolName === "Skill") {
28583
+ const skillArg = String(input.skill ?? "");
28360
28584
  if (skillArg.includes(skillName)) {
28361
28585
  triggered = true;
28362
- evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
28586
+ evidence = `Skill tool invoked with skill="${skillArg}"`;
28363
28587
  break;
28364
28588
  }
28365
- } else if (matcher.skillToolPrefixes?.some(
28366
- (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
28367
- )) {
28368
- triggered = true;
28369
- evidence = `Skill tool invoked via tool name "${toolName}"`;
28370
- break;
28371
- } else if (matcher.readTools.includes(toolName)) {
28372
- const filePath = this.readPathFromInput(input, matcher);
28373
- if (filePath.includes(skillName)) {
28589
+ } else if (toolName === "Read") {
28590
+ const filePath = String(input.file_path ?? "");
28591
+ if (filePath.includes(`skills/${skillName}/`)) {
28374
28592
  triggered = true;
28375
28593
  evidence = `Read tool loaded skill file: ${filePath}`;
28376
28594
  break;
28377
28595
  }
28378
- } else if (matcher.readToolPrefixes?.some(
28379
- (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
28380
- )) {
28381
- triggered = true;
28382
- evidence = `Read tool loaded skill file via tool name "${toolName}"`;
28383
- break;
28384
28596
  }
28385
28597
  if (!triggered && toolCall.output != null) {
28386
28598
  const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
@@ -28417,16 +28629,6 @@ var SkillTriggerEvaluator = class {
28417
28629
  expectedAspectCount: 1
28418
28630
  };
28419
28631
  }
28420
- readPathFromInput(input, matcher) {
28421
- const fields = matcher.readInputFields ?? [matcher.readInputField];
28422
- for (const field of fields) {
28423
- const value = input[field];
28424
- if (value !== void 0 && value !== null) {
28425
- return String(value);
28426
- }
28427
- }
28428
- return "";
28429
- }
28430
28632
  };
28431
28633
  function assembleLlmGraderPrompt(input) {
28432
28634
  const {
@@ -30664,6 +30866,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
30664
30866
  }
30665
30867
  return result.stdout;
30666
30868
  }
30869
+ var execFileAsync3 = promisify7(execFile3);
30870
+ var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
30667
30871
  function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
30668
30872
  return score >= threshold ? "ok" : "quality_failure";
30669
30873
  }
@@ -30701,6 +30905,35 @@ function hasHookCommand(hook) {
30701
30905
  function hooksEnabled(workspace) {
30702
30906
  return workspace?.hooks?.enabled !== false;
30703
30907
  }
30908
+ function workspaceGitEnv() {
30909
+ const env = { ...process.env };
30910
+ for (const key of Object.keys(env)) {
30911
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
30912
+ delete env[key];
30913
+ }
30914
+ }
30915
+ return {
30916
+ ...env,
30917
+ GIT_TERMINAL_PROMPT: "0",
30918
+ GIT_ASKPASS: "",
30919
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
30920
+ };
30921
+ }
30922
+ async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
30923
+ if (!existsSync5(path45.join(workspacePath, ".git"))) {
30924
+ return false;
30925
+ }
30926
+ const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
30927
+ const opts = {
30928
+ cwd: workspacePath,
30929
+ timeout: WORKSPACE_GIT_TIMEOUT_MS,
30930
+ env: workspaceGitEnv(),
30931
+ maxBuffer: 50 * 1024 * 1024
30932
+ };
30933
+ await execFileAsync3("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
30934
+ await execFileAsync3("git", ["clean", cleanFlag], opts);
30935
+ return true;
30936
+ }
30704
30937
  function getWorkspaceTemplate(target) {
30705
30938
  const config = target.config;
30706
30939
  if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
@@ -31962,6 +32195,37 @@ async function runEvalCase(options) {
31962
32195
  }
31963
32196
  }
31964
32197
  }
32198
+ let beforeEachNeedsFreshBaseline = false;
32199
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
32200
+ try {
32201
+ if (repoManager && evalCase.workspace.repos?.length) {
32202
+ await repoManager.reset(
32203
+ evalCase.workspace.repos,
32204
+ workspacePath,
32205
+ evalCase.workspace.hooks.before_each.reset
32206
+ );
32207
+ } else {
32208
+ await resetWorkspaceRoot(
32209
+ workspacePath,
32210
+ evalCase.workspace.hooks.before_each.reset,
32211
+ sharedBaselineCommit
32212
+ );
32213
+ }
32214
+ } catch (error) {
32215
+ const message = error instanceof Error ? error.message : String(error);
32216
+ return buildErrorResult(
32217
+ evalCase,
32218
+ target.name,
32219
+ nowFn(),
32220
+ new Error(`before_each reset failed: ${message}`),
32221
+ promptInputs,
32222
+ provider,
32223
+ "setup",
32224
+ "script_error",
32225
+ verbose
32226
+ );
32227
+ }
32228
+ }
31965
32229
  const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
31966
32230
  if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
31967
32231
  const beforeEachHook = caseBeforeEachHook;
@@ -31978,6 +32242,7 @@ async function runEvalCase(options) {
31978
32242
  toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
31979
32243
  scriptContext
31980
32244
  );
32245
+ beforeEachNeedsFreshBaseline = true;
31981
32246
  } catch (error) {
31982
32247
  const message = error instanceof Error ? error.message : String(error);
31983
32248
  return buildErrorResult(
@@ -31993,7 +32258,7 @@ async function runEvalCase(options) {
31993
32258
  );
31994
32259
  }
31995
32260
  }
31996
- let baselineCommit = sharedBaselineCommit;
32261
+ let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
31997
32262
  if (!baselineCommit && workspacePath) {
31998
32263
  try {
31999
32264
  baselineCommit = await initializeBaseline(workspacePath);
@@ -32004,6 +32269,35 @@ async function runEvalCase(options) {
32004
32269
  }
32005
32270
  }
32006
32271
  }
32272
+ if (evalCase.mode === "conversation" && evalCase.turns?.length) {
32273
+ const conversationResult = await runConversationMode({
32274
+ evalCase,
32275
+ provider,
32276
+ target,
32277
+ evaluators,
32278
+ typeRegistry,
32279
+ graderProvider,
32280
+ promptInputs,
32281
+ nowFn,
32282
+ signal,
32283
+ workspacePath,
32284
+ caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
32285
+ agentTimeoutMs,
32286
+ streamCallbacks: options.streamCallbacks,
32287
+ verbose,
32288
+ threshold: evalCase.threshold ?? caseThreshold,
32289
+ targetResolver,
32290
+ availableTargets
32291
+ });
32292
+ if (workspacePath && !isSharedWorkspace) {
32293
+ const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
32294
+ if (!shouldRetain) {
32295
+ await cleanupWorkspace(workspacePath).catch(() => {
32296
+ });
32297
+ }
32298
+ }
32299
+ return conversationResult;
32300
+ }
32007
32301
  const caseStartMs = Date.now();
32008
32302
  const attemptBudget = (maxRetries ?? 0) + 1;
32009
32303
  let attempt = 0;
@@ -32118,13 +32412,21 @@ async function runEvalCase(options) {
32118
32412
  ${providerFileChanges}` : providerFileChanges;
32119
32413
  }
32120
32414
  const providerError = extractProviderError(providerResponse);
32121
- if (caseHooksEnabled && repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
32415
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
32122
32416
  try {
32123
- await repoManager.reset(
32124
- evalCase.workspace.repos,
32125
- workspacePath,
32126
- evalCase.workspace.hooks.after_each.reset
32127
- );
32417
+ if (repoManager && evalCase.workspace.repos?.length) {
32418
+ await repoManager.reset(
32419
+ evalCase.workspace.repos,
32420
+ workspacePath,
32421
+ evalCase.workspace.hooks.after_each.reset
32422
+ );
32423
+ } else {
32424
+ await resetWorkspaceRoot(
32425
+ workspacePath,
32426
+ evalCase.workspace.hooks.after_each.reset,
32427
+ baselineCommit
32428
+ );
32429
+ }
32128
32430
  } catch {
32129
32431
  }
32130
32432
  }
@@ -32740,6 +33042,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
32740
33042
  "llm-grader": llmGrader
32741
33043
  };
32742
33044
  }
33045
+ async function runConversationMode(options) {
33046
+ const {
33047
+ evalCase,
33048
+ provider,
33049
+ target,
33050
+ evaluators,
33051
+ typeRegistry,
33052
+ graderProvider,
33053
+ promptInputs,
33054
+ nowFn,
33055
+ signal,
33056
+ workspacePath,
33057
+ caseWorkspaceFile,
33058
+ agentTimeoutMs,
33059
+ streamCallbacks,
33060
+ verbose,
33061
+ threshold,
33062
+ targetResolver,
33063
+ availableTargets
33064
+ } = options;
33065
+ const turns = evalCase.turns;
33066
+ const aggregation = evalCase.aggregation ?? "mean";
33067
+ const onTurnFailure = evalCase.on_turn_failure ?? "continue";
33068
+ const windowSize = evalCase.window_size;
33069
+ const history = [];
33070
+ for (const msg of evalCase.input) {
33071
+ const content = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
33072
+ history.push({ role: msg.role, content });
33073
+ }
33074
+ const turnScores = [];
33075
+ const allTurnScoreValues = [];
33076
+ let stopped = false;
33077
+ const caseStartMs = Date.now();
33078
+ for (let i = 0; i < turns.length; i++) {
33079
+ const turn = turns[i];
33080
+ const turnIndex = i + 1;
33081
+ if (stopped) {
33082
+ turnScores.push({
33083
+ name: `turn-${turnIndex}`,
33084
+ type: "rubrics",
33085
+ score: 0,
33086
+ verdict: "skip",
33087
+ assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
33088
+ });
33089
+ allTurnScoreValues.push(0);
33090
+ continue;
33091
+ }
33092
+ const userContent = typeof turn.input === "string" ? turn.input : JSON.stringify(turn.input);
33093
+ history.push({ role: "user", content: userContent });
33094
+ const chatPromptForProvider = windowSize ? buildWindowedHistory(history, windowSize) : [...history];
33095
+ let response;
33096
+ try {
33097
+ response = await provider.invoke({
33098
+ question: userContent,
33099
+ chatPrompt: chatPromptForProvider,
33100
+ evalCaseId: `${evalCase.id}/turn-${turnIndex}`,
33101
+ signal,
33102
+ cwd: workspacePath,
33103
+ workspaceFile: caseWorkspaceFile,
33104
+ streamCallbacks
33105
+ });
33106
+ } catch (error) {
33107
+ const message = error instanceof Error ? error.message : String(error);
33108
+ turnScores.push({
33109
+ name: `turn-${turnIndex}`,
33110
+ type: "rubrics",
33111
+ score: 0,
33112
+ verdict: "fail",
33113
+ assertions: [{ text: `Provider error: ${message}`, passed: false }]
33114
+ });
33115
+ allTurnScoreValues.push(0);
33116
+ if (onTurnFailure === "stop") stopped = true;
33117
+ continue;
33118
+ }
33119
+ const assistantContent = extractLastAssistantContent(response.output);
33120
+ history.push({ role: "assistant", content: assistantContent });
33121
+ if (!turn.assertions?.length && !turn.expected_output) {
33122
+ turnScores.push({
33123
+ name: `turn-${turnIndex}`,
33124
+ type: "rubrics",
33125
+ score: 1,
33126
+ verdict: "pass",
33127
+ assertions: []
33128
+ });
33129
+ allTurnScoreValues.push(1);
33130
+ continue;
33131
+ }
33132
+ const turnAssertions = buildTurnAssertions(turn);
33133
+ const turnEvalCase = {
33134
+ ...evalCase,
33135
+ id: `${evalCase.id}/turn-${turnIndex}`,
33136
+ assertions: turnAssertions,
33137
+ input: buildTurnGraderInput(history, windowSize),
33138
+ expected_output: turn.expected_output ? [
33139
+ typeof turn.expected_output === "string" ? { content: turn.expected_output } : turn.expected_output
33140
+ ] : [],
33141
+ // Clear conversation fields to prevent recursion
33142
+ mode: void 0,
33143
+ turns: void 0
33144
+ };
33145
+ const turnResult = await evaluateCandidate({
33146
+ evalCase: turnEvalCase,
33147
+ candidate: assistantContent,
33148
+ target,
33149
+ provider,
33150
+ evaluators,
33151
+ typeRegistry,
33152
+ promptInputs: {
33153
+ question: buildConversationContext(history, windowSize),
33154
+ chatPrompt: windowSize ? buildWindowedHistory(history, windowSize) : [...history]
33155
+ },
33156
+ nowFn,
33157
+ attempt: 0,
33158
+ graderProvider,
33159
+ agentTimeoutMs,
33160
+ output: response.output,
33161
+ verbose,
33162
+ threshold,
33163
+ targetResolver,
33164
+ availableTargets
33165
+ });
33166
+ const turnScore = turnResult.score;
33167
+ allTurnScoreValues.push(turnScore);
33168
+ turnScores.push({
33169
+ name: `turn-${turnIndex}`,
33170
+ type: "rubrics",
33171
+ score: turnScore,
33172
+ verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD),
33173
+ assertions: turnResult.assertions ? [...turnResult.assertions] : [],
33174
+ scores: turnResult.scores
33175
+ });
33176
+ if (onTurnFailure === "stop" && turnScore < (threshold ?? DEFAULT_THRESHOLD)) {
33177
+ stopped = true;
33178
+ }
33179
+ }
33180
+ let conversationScores = [];
33181
+ if (evalCase.assertions?.length) {
33182
+ const conversationEvalCase = {
33183
+ ...evalCase,
33184
+ id: `${evalCase.id}/conversation`,
33185
+ input: history.map((m) => ({
33186
+ role: m.role,
33187
+ content: m.content
33188
+ })),
33189
+ expected_output: [],
33190
+ mode: void 0,
33191
+ turns: void 0
33192
+ };
33193
+ const fullTranscript = history.map((m) => {
33194
+ const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
33195
+ return `${m.role}: ${content}`;
33196
+ }).join("\n\n");
33197
+ const conversationResult = await evaluateCandidate({
33198
+ evalCase: conversationEvalCase,
33199
+ candidate: fullTranscript,
33200
+ target,
33201
+ provider,
33202
+ evaluators,
33203
+ typeRegistry,
33204
+ promptInputs: {
33205
+ question: fullTranscript,
33206
+ chatPrompt: [...history]
33207
+ },
33208
+ nowFn,
33209
+ attempt: 0,
33210
+ graderProvider,
33211
+ agentTimeoutMs,
33212
+ verbose,
33213
+ threshold,
33214
+ targetResolver,
33215
+ availableTargets
33216
+ });
33217
+ conversationScores = [
33218
+ {
33219
+ name: "conversation",
33220
+ type: "rubrics",
33221
+ score: conversationResult.score,
33222
+ verdict: scoreToVerdict(
33223
+ conversationResult.score,
33224
+ threshold ?? DEFAULT_THRESHOLD
33225
+ ),
33226
+ assertions: conversationResult.assertions ? [...conversationResult.assertions] : [],
33227
+ scores: conversationResult.scores
33228
+ }
33229
+ ];
33230
+ }
33231
+ const allScoreValues = [...allTurnScoreValues, ...conversationScores.map((s) => s.score)];
33232
+ const finalScore = aggregateConversationScores(allScoreValues, aggregation);
33233
+ const allResultScores = [...turnScores, ...conversationScores];
33234
+ const outputMessages = history.map((m) => ({
33235
+ role: m.role,
33236
+ content: m.content
33237
+ }));
33238
+ const flatAssertions = allResultScores.flatMap((s) => [...s.assertions]);
33239
+ const totalDurationMs = Date.now() - caseStartMs;
33240
+ return {
33241
+ timestamp: nowFn().toISOString(),
33242
+ testId: evalCase.id,
33243
+ suite: evalCase.suite,
33244
+ category: evalCase.category,
33245
+ score: finalScore,
33246
+ assertions: flatAssertions,
33247
+ target: target.name,
33248
+ output: outputMessages,
33249
+ scores: allResultScores,
33250
+ executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD),
33251
+ input: evalCase.input.map((m) => ({
33252
+ role: m.role,
33253
+ content: typeof m.content === "string" ? m.content : JSON.stringify(m.content)
33254
+ })),
33255
+ evalRun: { durationMs: totalDurationMs }
33256
+ };
33257
+ }
33258
+ function buildWindowedHistory(history, windowSize) {
33259
+ const systemMessages = history.filter((m) => m.role === "system");
33260
+ const nonSystem = history.filter((m) => m.role !== "system");
33261
+ const windowed = nonSystem.slice(-windowSize * 2);
33262
+ return [...systemMessages, ...windowed];
33263
+ }
33264
+ function buildConversationContext(history, windowSize) {
33265
+ const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history;
33266
+ return msgs.map((m) => {
33267
+ const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
33268
+ return `${m.role}: ${content}`;
33269
+ }).join("\n\n");
33270
+ }
33271
+ function buildTurnGraderInput(history, windowSize) {
33272
+ const msgs = windowSize ? buildWindowedHistory(history, windowSize) : history;
33273
+ return msgs.map((m) => ({
33274
+ role: m.role,
33275
+ content: m.content
33276
+ }));
33277
+ }
33278
+ function buildTurnAssertions(turn) {
33279
+ if (!turn.assertions?.length) return [];
33280
+ const stringCriteria = [];
33281
+ const structured = [];
33282
+ for (const a of turn.assertions) {
33283
+ if (typeof a === "string") {
33284
+ stringCriteria.push(a);
33285
+ } else {
33286
+ structured.push(a);
33287
+ }
33288
+ }
33289
+ const result = [];
33290
+ if (stringCriteria.length > 0) {
33291
+ result.push({
33292
+ name: "turn-rubrics",
33293
+ type: "llm-grader",
33294
+ rubrics: stringCriteria.map((text2, idx) => ({
33295
+ id: `criterion-${idx + 1}`,
33296
+ outcome: text2,
33297
+ weight: 1
33298
+ }))
33299
+ });
33300
+ }
33301
+ result.push(...structured);
33302
+ return result;
33303
+ }
33304
+ function aggregateConversationScores(scores, aggregation) {
33305
+ if (scores.length === 0) return 1;
33306
+ switch (aggregation) {
33307
+ case "min":
33308
+ return Math.min(...scores);
33309
+ case "max":
33310
+ return Math.max(...scores);
33311
+ default:
33312
+ return scores.reduce((sum, s) => sum + s, 0) / scores.length;
33313
+ }
33314
+ }
32743
33315
  async function invokeProvider(provider, options) {
32744
33316
  const {
32745
33317
  evalCase,
@@ -33430,7 +34002,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
33430
34002
  }
33431
34003
  return false;
33432
34004
  }
33433
- var execFileAsync3 = promisify7(execFile3);
34005
+ var execFileAsync4 = promisify8(execFile4);
33434
34006
  function sanitizeRepoSlug(repo) {
33435
34007
  return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
33436
34008
  }
@@ -33481,7 +34053,7 @@ function writePersistedStatus(statusFile, status) {
33481
34053
  }
33482
34054
  async function runCommand(executable, args, options) {
33483
34055
  try {
33484
- const { stdout, stderr } = await execFileAsync3(executable, [...args], {
34056
+ const { stdout, stderr } = await execFileAsync4(executable, [...args], {
33485
34057
  cwd: options?.cwd,
33486
34058
  env: process.env
33487
34059
  });
@@ -34516,11 +35088,13 @@ function extractAssistantContent(content) {
34516
35088
  break;
34517
35089
  case "tool_use":
34518
35090
  if (block.name) {
34519
- toolCalls.push({
34520
- tool: block.name,
34521
- input: block.input,
34522
- id: block.id
34523
- });
35091
+ toolCalls.push(
35092
+ normalizeToolCall("claude", {
35093
+ tool: block.name,
35094
+ input: block.input,
35095
+ id: block.id
35096
+ })
35097
+ );
34524
35098
  }
34525
35099
  break;
34526
35100
  }
@@ -34610,7 +35184,11 @@ function parseCodexSession(jsonl) {
34610
35184
  } else {
34611
35185
  input = payload.arguments;
34612
35186
  }
34613
- const toolCall = { tool: toolName, input, id: callId };
35187
+ const toolCall = normalizeToolCall("codex", {
35188
+ tool: toolName,
35189
+ input,
35190
+ id: callId
35191
+ });
34614
35192
  const msgIdx = messages.length;
34615
35193
  messages.push({
34616
35194
  role: "assistant",
@@ -34634,7 +35212,11 @@ function parseCodexSession(jsonl) {
34634
35212
  } else {
34635
35213
  input = payload.arguments;
34636
35214
  }
34637
- const toolCall = { tool: toolName, input, id: callId };
35215
+ const toolCall = normalizeToolCall("codex", {
35216
+ tool: toolName,
35217
+ input,
35218
+ id: callId
35219
+ });
34638
35220
  const msgIdx = messages.length;
34639
35221
  messages.push({
34640
35222
  role: "assistant",
@@ -35088,4 +35670,4 @@ export {
35088
35670
  TranscriptProvider,
35089
35671
  createAgentKernel
35090
35672
  };
35091
- //# sourceMappingURL=chunk-HP3ZUKYB.js.map
35673
+ //# sourceMappingURL=chunk-VSWJTL3T.js.map