@agentv/core 4.14.0-next.1 → 4.15.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -25,7 +25,7 @@ import {
25
25
  resolveDelegatedTargetDefinition,
26
26
  resolveFileReference,
27
27
  resolveTargetDefinition
28
- } from "./chunk-A3HYVKTI.js";
28
+ } from "./chunk-AOOU6PLC.js";
29
29
  import {
30
30
  execFileWithStdin,
31
31
  execShellWithStdin
@@ -3673,10 +3673,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3673
3673
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
3674
3674
  const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
3675
3675
  const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
3676
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
3676
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
3677
3677
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
3678
3678
  logError3(
3679
- `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
3679
+ `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
3680
3680
  );
3681
3681
  continue;
3682
3682
  }
@@ -3753,6 +3753,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3753
3753
  ) : void 0;
3754
3754
  const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
3755
3755
  const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
3756
+ const modeRaw = asString5(testCaseConfig.mode);
3757
+ const mode = modeRaw === "conversation" ? "conversation" : void 0;
3758
+ const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
3759
+ const aggregationRaw = asString5(testCaseConfig.aggregation);
3760
+ const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
3761
+ const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
3762
+ const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
3763
+ const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
3756
3764
  const testCase = {
3757
3765
  id,
3758
3766
  suite: suiteName,
@@ -3771,6 +3779,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3771
3779
  metadata,
3772
3780
  targets: caseTargets,
3773
3781
  ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
3782
+ ...mode ? { mode } : {},
3783
+ ...turns && turns.length > 0 ? { turns } : {},
3784
+ ...aggregation ? { aggregation } : {},
3785
+ ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
3786
+ ...windowSize !== void 0 ? { window_size: windowSize } : {},
3774
3787
  ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
3775
3788
  ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
3776
3789
  };
@@ -3788,6 +3801,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
3788
3801
  return match;
3789
3802
  }
3790
3803
  var loadEvalCaseById = loadTestById;
/**
 * Normalizes the raw `turns` array of a test case into turn records.
 *
 * Each entry is reduced to its `input`, plus `expected_output` when that
 * field is defined and `assertions` when it is a non-empty array (the array
 * is shallow-copied; its items are passed through untouched).
 *
 * Entries are mapped one-to-one so positions in the result line up with the
 * positions in the source array. A `null`/`undefined` entry yields a turn
 * whose `input` is `undefined` instead of throwing a TypeError.
 *
 * @param {unknown[]} rawTurns - Raw `turns` value; the caller has already
 *   checked that it is an array, but not what its items are.
 * @returns {Array<{ input: unknown, expected_output?: unknown, assertions?: unknown[] }>}
 */
function parseTurns(rawTurns) {
  return rawTurns.map((rawTurn) => {
    // Guard against null/undefined items: property access on them would throw.
    const turn = rawTurn ?? {};
    const { input, expected_output: expectedOutput, assertions } = turn;
    const hasAssertions = Array.isArray(assertions) && assertions.length > 0;
    return {
      input,
      ...(expectedOutput !== void 0 ? { expected_output: expectedOutput } : {}),
      ...(hasAssertions ? { assertions: [...assertions] } : {})
    };
  });
}
3791
3823
  function parseCommandArray(source) {
3792
3824
  if (typeof source === "string") {
3793
3825
  const parts = source.trim().split(/\s+/);
@@ -4745,6 +4777,154 @@ function subscribeToClaudeLogEntries(listener) {
4745
4777
  };
4746
4778
  }
4747
4779
 
// src/evaluation/providers/normalize-tool-call.ts
//
// Translates provider-native tool calls into one canonical vocabulary
// ("Skill", "Read", "Write", "Edit", "Bash") so that code consuming tool
// calls does not need per-provider matching rules.

// Provider kinds that share a tool-name table.
var CLAUDE_PROVIDER_KINDS = ["claude", "claude-cli", "claude-sdk"];
var COPILOT_PROVIDER_KINDS = [
  "copilot-cli",
  "copilot-sdk",
  "copilot-log",
  "vscode",
  "vscode-insiders"
];
var PI_PROVIDER_KINDS = ["pi-coding-agent", "pi-cli"];

// [nativeName, canonicalName] pairs per provider family.
var CLAUDE_TOOL_NAMES = [
  // Claude names are already canonical.
  ["Skill", "Skill"],
  ["Read", "Read"],
  ["Write", "Write"],
  ["Edit", "Edit"],
  ["Bash", "Bash"]
];
var COPILOT_TOOL_NAMES = [
  ["Skill", "Skill"],
  ["skill", "Skill"],
  ["Read File", "Read"],
  ["readFile", "Read"],
  ["Read", "Read"],
  ["readTextFile", "Read"],
  ["writeTextFile", "Write"],
  ["Write File", "Write"],
  ["editFile", "Edit"],
  ["Edit File", "Edit"],
  ["runTerminalCommand", "Bash"]
];
var CODEX_TOOL_NAMES = [
  ["command_execution", "Bash"],
  ["file_change", "Edit"]
];
var PI_TOOL_NAMES = [
  ["read", "Read"],
  ["bash", "Bash"]
];

/**
 * Builds `["<providerKind>::<nativeName>", canonical]` map entries for every
 * combination of provider kind and tool-name pair.
 *
 * @param {string[]} providerKinds
 * @param {Array<[string, string]>} toolNames
 * @returns {Array<[string, string]>}
 */
function expandToolNames(providerKinds, toolNames) {
  return providerKinds.flatMap(
    (kind) => toolNames.map(([nativeName, canonical]) => [`${kind}::${nativeName}`, canonical])
  );
}

// Exact-match lookup: "<providerKind>::<nativeName>" -> canonical tool name.
var TOOL_NAME_MAP = new Map([
  ...expandToolNames(CLAUDE_PROVIDER_KINDS, CLAUDE_TOOL_NAMES),
  ...expandToolNames(COPILOT_PROVIDER_KINDS, COPILOT_TOOL_NAMES),
  ...expandToolNames(["codex"], CODEX_TOOL_NAMES),
  ...expandToolNames(PI_PROVIDER_KINDS, PI_TOOL_NAMES)
]);

// Prefix rules apply when the native tool name embeds an argument
// (e.g. "Using skill: <name>"). The text after the prefix is copied into the
// input: `extractSkillFromName` writes it to `skill` (overriding any existing
// value), `extractPathFromName` writes it to `file_path` only when the input
// carries no path field of its own.
var COPILOT_PREFIXES = [
  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
  { prefix: "Viewing ", canonical: "Read", extractPathFromName: true }
];
var CODEX_PREFIXES = [
  { prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }
];
var TOOL_PREFIX_MAP = new Map([
  ...COPILOT_PROVIDER_KINDS.map((kind) => [kind, COPILOT_PREFIXES]),
  ["codex", CODEX_PREFIXES]
]);

// Skill inputs need no field renaming; kept as an explicit identity so the
// canonical "Skill" tool has a registered normalizer.
var normalizeSkillInput = (input) => input;

// Ensures a Read input exposes its path as `file_path`, copying it from the
// `path` or `filePath` aliases when needed. The original alias is kept.
var normalizeReadInput = (input) => {
  if (input.file_path !== void 0) return input;
  if (input.path !== void 0) return { ...input, file_path: input.path };
  if (input.filePath !== void 0) return { ...input, file_path: input.filePath };
  return input;
};

// Canonical tool name -> input normalizer.
var INPUT_NORMALIZERS = new Map([
  ["Skill", normalizeSkillInput],
  ["Read", normalizeReadInput]
]);

/**
 * Returns true when the input already carries a path under any of the field
 * names understood by `normalizeReadInput`.
 *
 * @param {object} input
 * @returns {boolean}
 */
function hasReadPath(input) {
  return input.file_path !== void 0 || input.path !== void 0 || input.filePath !== void 0;
}

/**
 * Maps a provider-native tool call onto the canonical tool vocabulary.
 *
 * Resolution order: exact `"<providerKind>::<tool>"` match first, then the
 * provider's prefix rules in declaration order. A tool call that matches
 * neither (or whose `tool` is not a string) is returned unchanged, as the
 * same object. A matched call is returned as a copy; the argument is never
 * mutated.
 *
 * @param {string} providerKind - Provider identifier, e.g. "copilot-cli".
 * @param {{ tool: string, input?: unknown }} tc - Tool call; any additional
 *   fields are carried over as-is.
 * @returns {object} The normalized tool call.
 */
function normalizeToolCall(providerKind, tc) {
  const nativeName = tc.tool;
  // A non-string name cannot match the table or a prefix; calling
  // `startsWith` on it would throw.
  if (typeof nativeName !== "string") return tc;

  const canonical = TOOL_NAME_MAP.get(`${providerKind}::${nativeName}`);
  if (canonical) {
    return applyInputNormalization(canonical, { ...tc, tool: canonical });
  }

  const rule = TOOL_PREFIX_MAP.get(providerKind)?.find(
    (candidate) => nativeName.startsWith(candidate.prefix)
  );
  if (!rule) return tc;

  const suffix = nativeName.slice(rule.prefix.length);
  let normalizedInput = tc.input;
  if (suffix) {
    const existingInput = tc.input ?? {};
    if (rule.extractSkillFromName) {
      normalizedInput = { ...existingInput, skill: suffix };
    } else if (rule.extractPathFromName && !hasReadPath(existingInput)) {
      // Without this the path embedded in e.g. "Viewing <path>" is lost once
      // the tool name is rewritten to "Read".
      normalizedInput = { ...existingInput, file_path: suffix };
    }
  }
  return applyInputNormalization(rule.canonical, {
    ...tc,
    tool: rule.canonical,
    input: normalizedInput
  });
}

/**
 * Runs the input normalizer registered for `canonical`, if any.
 *
 * @param {string} canonical - Canonical tool name.
 * @param {{ input?: unknown }} tc - Tool call whose `tool` is already canonical.
 * @returns {object} `tc` itself when there is no normalizer, no input, or the
 *   normalizer left the input untouched; otherwise a copy with the new input.
 */
function applyInputNormalization(canonical, tc) {
  const normalizer = INPUT_NORMALIZERS.get(canonical);
  if (!normalizer || tc.input === void 0 || tc.input === null) return tc;
  const input = tc.input;
  const normalized = normalizer(input);
  return normalized === input ? tc : { ...tc, input: normalized };
}
+
4748
4928
  // src/evaluation/providers/preread.ts
4749
4929
  import path10 from "node:path";
4750
4930
  function buildPromptDocument(request, inputFiles) {
@@ -5212,11 +5392,13 @@ function extractToolCalls(content) {
5212
5392
  }
5213
5393
  const p = part;
5214
5394
  if (p.type === "tool_use" && typeof p.name === "string") {
5215
- toolCalls.push({
5216
- tool: p.name,
5217
- input: p.input,
5218
- id: typeof p.id === "string" ? p.id : void 0
5219
- });
5395
+ toolCalls.push(
5396
+ normalizeToolCall("claude-cli", {
5397
+ tool: p.name,
5398
+ input: p.input,
5399
+ id: typeof p.id === "string" ? p.id : void 0
5400
+ })
5401
+ );
5220
5402
  }
5221
5403
  }
5222
5404
  return toolCalls;
@@ -5507,11 +5689,13 @@ function extractToolCalls2(content) {
5507
5689
  }
5508
5690
  const p = part;
5509
5691
  if (p.type === "tool_use" && typeof p.name === "string") {
5510
- toolCalls.push({
5511
- tool: p.name,
5512
- input: p.input,
5513
- id: typeof p.id === "string" ? p.id : void 0
5514
- });
5692
+ toolCalls.push(
5693
+ normalizeToolCall("claude-sdk", {
5694
+ tool: p.name,
5695
+ input: p.input,
5696
+ id: typeof p.id === "string" ? p.id : void 0
5697
+ })
5698
+ );
5515
5699
  }
5516
5700
  }
5517
5701
  return toolCalls;
@@ -6426,27 +6610,33 @@ ${basePrompt}` : basePrompt;
6426
6610
  }
6427
6611
  }
6428
6612
  if (itemType === "command_execution") {
6429
- completedToolCalls.push({
6430
- tool: "command_execution",
6431
- input: { command: item.command },
6432
- output: item.aggregated_output,
6433
- id: item.id
6434
- });
6613
+ completedToolCalls.push(
6614
+ normalizeToolCall("codex", {
6615
+ tool: "command_execution",
6616
+ input: { command: item.command },
6617
+ output: item.aggregated_output,
6618
+ id: item.id
6619
+ })
6620
+ );
6435
6621
  }
6436
6622
  if (itemType === "file_change") {
6437
- completedToolCalls.push({
6438
- tool: "file_change",
6439
- input: item.changes,
6440
- id: item.id
6441
- });
6623
+ completedToolCalls.push(
6624
+ normalizeToolCall("codex", {
6625
+ tool: "file_change",
6626
+ input: item.changes,
6627
+ id: item.id
6628
+ })
6629
+ );
6442
6630
  }
6443
6631
  if (itemType === "mcp_tool_call") {
6444
- completedToolCalls.push({
6445
- tool: `mcp:${item.server}/${item.tool}`,
6446
- input: item.arguments,
6447
- output: item.result ?? item.error,
6448
- id: item.id
6449
- });
6632
+ completedToolCalls.push(
6633
+ normalizeToolCall("codex", {
6634
+ tool: `mcp:${item.server}/${item.tool}`,
6635
+ input: item.arguments,
6636
+ output: item.result ?? item.error,
6637
+ id: item.id
6638
+ })
6639
+ );
6450
6640
  }
6451
6641
  }
6452
6642
  resolveCwd(cwdOverride) {
@@ -6981,12 +7171,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
6981
7171
  return logger;
6982
7172
  }
6983
7173
  handleEvent(eventType, data) {
6984
- if (this.format === "json") {
6985
- const elapsed2 = formatElapsed4(this.startedAt);
6986
- this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
6987
- `);
6988
- return;
6989
- }
6990
7174
  if (this.chunkExtractor) {
6991
7175
  const chunkText = this.chunkExtractor(eventType, data);
6992
7176
  if (chunkText === null) {
@@ -6999,6 +7183,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
6999
7183
  }
7000
7184
  this.flushPendingText();
7001
7185
  }
7186
+ if (this.format === "json") {
7187
+ const elapsed2 = formatElapsed4(this.startedAt);
7188
+ this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
7189
+ `);
7190
+ return;
7191
+ }
7002
7192
  const elapsed = formatElapsed4(this.startedAt);
7003
7193
  const summary = this.summarize(eventType, data);
7004
7194
  if (summary) {
@@ -7009,14 +7199,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
7009
7199
  flushPendingText() {
7010
7200
  if (!this.pendingText) return;
7011
7201
  const elapsed = formatElapsed4(this.startedAt);
7012
- this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
7202
+ if (this.format === "json") {
7203
+ this.stream.write(
7204
+ `${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
7205
+ `
7206
+ );
7207
+ } else {
7208
+ this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
7013
7209
  `);
7210
+ }
7014
7211
  this.pendingText = "";
7015
7212
  }
7016
7213
  async close() {
7017
- if (this.format !== "json") {
7018
- this.flushPendingText();
7019
- }
7214
+ this.flushPendingText();
7020
7215
  await new Promise((resolve, reject) => {
7021
7216
  this.stream.once("error", reject);
7022
7217
  this.stream.end(() => resolve());
@@ -7091,15 +7286,17 @@ var CopilotCliProvider = class {
7091
7286
  }
7092
7287
  if (update.status === "completed" || update.status === "failed") {
7093
7288
  const toolName = update.title ?? update.kind ?? "unknown";
7094
- completedToolCalls.push({
7095
- tool: toolName,
7096
- input: update.rawInput,
7097
- output: update.rawOutput,
7098
- id: callId,
7099
- startTime: (/* @__PURE__ */ new Date()).toISOString(),
7100
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
7101
- durationMs: 0
7102
- });
7289
+ completedToolCalls.push(
7290
+ normalizeToolCall("copilot-cli", {
7291
+ tool: toolName,
7292
+ input: update.rawInput,
7293
+ output: update.rawOutput,
7294
+ id: callId,
7295
+ startTime: (/* @__PURE__ */ new Date()).toISOString(),
7296
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
7297
+ durationMs: 0
7298
+ })
7299
+ );
7103
7300
  request.streamCallbacks?.onToolCallEnd?.(
7104
7301
  toolName,
7105
7302
  update.rawInput,
@@ -7116,15 +7313,17 @@ var CopilotCliProvider = class {
7116
7313
  if (inProgress) {
7117
7314
  toolCallsInProgress.delete(callId);
7118
7315
  const duration = Date.now() - inProgress.startMs;
7119
- completedToolCalls.push({
7120
- tool: inProgress.tool,
7121
- input: inProgress.input,
7122
- output: update.rawOutput,
7123
- id: inProgress.id,
7124
- startTime: inProgress.startTime,
7125
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
7126
- durationMs: duration
7127
- });
7316
+ completedToolCalls.push(
7317
+ normalizeToolCall("copilot-cli", {
7318
+ tool: inProgress.tool,
7319
+ input: inProgress.input,
7320
+ output: update.rawOutput,
7321
+ id: inProgress.id,
7322
+ startTime: inProgress.startTime,
7323
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
7324
+ durationMs: duration
7325
+ })
7326
+ );
7128
7327
  request.streamCallbacks?.onToolCallEnd?.(
7129
7328
  inProgress.tool,
7130
7329
  inProgress.input,
@@ -7468,11 +7667,13 @@ function parseCopilotEvents(eventsJsonl) {
7468
7667
  }
7469
7668
  case "assistant.message": {
7470
7669
  const toolRequests = data.toolRequests;
7471
- const toolCalls = (toolRequests ?? []).map((req) => ({
7472
- tool: String(req.name ?? req.toolName ?? ""),
7473
- input: req.arguments,
7474
- id: req.toolCallId ? String(req.toolCallId) : void 0
7475
- }));
7670
+ const toolCalls = (toolRequests ?? []).map(
7671
+ (req) => normalizeToolCall("copilot-log", {
7672
+ tool: String(req.name ?? req.toolName ?? ""),
7673
+ input: req.arguments,
7674
+ id: req.toolCallId ? String(req.toolCallId) : void 0
7675
+ })
7676
+ );
7476
7677
  messages.push({
7477
7678
  role: "assistant",
7478
7679
  content: data.content != null ? String(data.content) : void 0,
@@ -7512,12 +7713,12 @@ function parseCopilotEvents(eventsJsonl) {
7512
7713
  messages.push({
7513
7714
  role: "assistant",
7514
7715
  toolCalls: [
7515
- {
7716
+ normalizeToolCall("copilot-log", {
7516
7717
  tool: started.toolName,
7517
7718
  input: started.input,
7518
7719
  output: data.result,
7519
7720
  id: toolCallId
7520
- }
7721
+ })
7521
7722
  ]
7522
7723
  });
7523
7724
  }
@@ -7863,15 +8064,17 @@ var CopilotSdkProvider = class {
7863
8064
  if (inProgress) {
7864
8065
  toolCallsInProgress.delete(callId);
7865
8066
  const endMs = Date.now();
7866
- completedToolCalls.push({
7867
- tool: inProgress.tool,
7868
- input: inProgress.input,
7869
- output: data?.output ?? data?.result,
7870
- id: inProgress.id,
7871
- startTime: inProgress.startTime,
7872
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
7873
- durationMs: endMs - inProgress.startMs
7874
- });
8067
+ completedToolCalls.push(
8068
+ normalizeToolCall("copilot-sdk", {
8069
+ tool: inProgress.tool,
8070
+ input: inProgress.input,
8071
+ output: data?.output ?? data?.result,
8072
+ id: inProgress.id,
8073
+ startTime: inProgress.startTime,
8074
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
8075
+ durationMs: endMs - inProgress.startMs
8076
+ })
8077
+ );
7875
8078
  }
7876
8079
  }
7877
8080
  if (eventType === "assistant.message") {
@@ -8850,12 +9053,14 @@ function extractToolCallsFromEvents(events) {
8850
9053
  }
8851
9054
  const toolCalls = [];
8852
9055
  for (const [id, { tool: tool2, input }] of starts) {
8853
- toolCalls.push({
8854
- tool: tool2,
8855
- input,
8856
- id: id.startsWith("anon-") ? void 0 : id,
8857
- output: results.get(id)
8858
- });
9056
+ toolCalls.push(
9057
+ normalizeToolCall("pi-cli", {
9058
+ tool: tool2,
9059
+ input,
9060
+ id: id.startsWith("anon-") ? void 0 : id,
9061
+ output: results.get(id)
9062
+ })
9063
+ );
8859
9064
  }
8860
9065
  return toolCalls;
8861
9066
  }
@@ -8977,17 +9182,21 @@ function extractToolCalls3(content) {
8977
9182
  if (!part || typeof part !== "object") continue;
8978
9183
  const p = part;
8979
9184
  if (p.type === "tool_use" && typeof p.name === "string") {
8980
- toolCalls.push({
8981
- tool: p.name,
8982
- input: p.input,
8983
- id: typeof p.id === "string" ? p.id : void 0
8984
- });
9185
+ toolCalls.push(
9186
+ normalizeToolCall("pi-cli", {
9187
+ tool: p.name,
9188
+ input: p.input,
9189
+ id: typeof p.id === "string" ? p.id : void 0
9190
+ })
9191
+ );
8985
9192
  } else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
8986
- toolCalls.push({
8987
- tool: p.name,
8988
- input: p.arguments ?? p.input,
8989
- id: typeof p.id === "string" ? p.id : void 0
8990
- });
9193
+ toolCalls.push(
9194
+ normalizeToolCall("pi-cli", {
9195
+ tool: p.name,
9196
+ input: p.arguments ?? p.input,
9197
+ id: typeof p.id === "string" ? p.id : void 0
9198
+ })
9199
+ );
8991
9200
  } else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
8992
9201
  const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
8993
9202
  if (existing) {
@@ -14066,100 +14275,35 @@ var LatencyEvaluator = class {
14066
14275
  };
14067
14276
 
14068
14277
  // src/evaluation/evaluators/skill-trigger.ts
14069
- var CLAUDE_MATCHER = {
14070
- skillTools: ["Skill"],
14071
- skillInputField: "skill",
14072
- readTools: ["Read"],
14073
- readInputField: "file_path"
14074
- };
14075
- var COPILOT_MATCHER = {
14076
- skillTools: ["Skill", "skill"],
14077
- skillInputField: "skill",
14078
- readTools: ["Read File", "readFile", "Read", "readTextFile"],
14079
- readInputField: "file_path",
14080
- skillToolPrefixes: ["Using skill: "],
14081
- readToolPrefixes: ["Viewing "],
14082
- readInputFields: ["file_path", "path"]
14083
- };
14084
- var PI_CODING_AGENT_MATCHER = {
14085
- skillTools: [],
14086
- skillInputField: "skill",
14087
- readTools: ["read"],
14088
- readInputField: "path",
14089
- readInputFields: ["path", "file_path", "filePath"]
14090
- };
14091
- var CODEX_MATCHER = {
14092
- skillTools: [],
14093
- skillInputField: "skill",
14094
- readTools: ["command_execution"],
14095
- readInputField: "command",
14096
- skillToolPrefixes: ["mcp:"],
14097
- readToolPrefixes: ["mcp:"],
14098
- readInputFields: ["command", "path", "file_path", "filePath"]
14099
- };
14100
- var PROVIDER_TOOL_SEMANTICS = {
14101
- claude: CLAUDE_MATCHER,
14102
- "claude-cli": CLAUDE_MATCHER,
14103
- "claude-sdk": CLAUDE_MATCHER,
14104
- codex: CODEX_MATCHER,
14105
- "pi-coding-agent": PI_CODING_AGENT_MATCHER,
14106
- "pi-cli": PI_CODING_AGENT_MATCHER,
14107
- "copilot-cli": COPILOT_MATCHER,
14108
- "copilot-log": COPILOT_MATCHER,
14109
- "copilot-sdk": COPILOT_MATCHER,
14110
- vscode: COPILOT_MATCHER,
14111
- "vscode-insiders": COPILOT_MATCHER
14112
- };
14113
14278
  var SkillTriggerEvaluator = class {
14114
14279
  kind = "skill-trigger";
14115
14280
  config;
14116
14281
  constructor(config) {
14117
14282
  this.config = config;
14118
14283
  }
14119
- resolveMatcher(providerKind) {
14120
- if (providerKind) {
14121
- const match = PROVIDER_TOOL_SEMANTICS[providerKind];
14122
- if (match) return match;
14123
- }
14124
- return CLAUDE_MATCHER;
14125
- }
14126
14284
  evaluate(context) {
14127
14285
  const skillName = this.config.skill;
14128
14286
  const shouldTrigger = this.config.should_trigger !== false;
14129
- const providerKind = context.provider?.kind;
14130
- const matcher = this.resolveMatcher(providerKind);
14131
14287
  const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
14132
14288
  let triggered = false;
14133
14289
  let evidence = "";
14134
14290
  for (const toolCall of allToolCalls) {
14135
14291
  const toolName = toolCall.tool ?? "";
14136
14292
  const input = toolCall.input ?? {};
14137
- if (matcher.skillTools.includes(toolName)) {
14138
- const skillArg = String(input[matcher.skillInputField] ?? "");
14293
+ if (toolName === "Skill") {
14294
+ const skillArg = String(input.skill ?? "");
14139
14295
  if (skillArg.includes(skillName)) {
14140
14296
  triggered = true;
14141
- evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
14297
+ evidence = `Skill tool invoked with skill="${skillArg}"`;
14142
14298
  break;
14143
14299
  }
14144
- } else if (matcher.skillToolPrefixes?.some(
14145
- (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
14146
- )) {
14147
- triggered = true;
14148
- evidence = `Skill tool invoked via tool name "${toolName}"`;
14149
- break;
14150
- } else if (matcher.readTools.includes(toolName)) {
14151
- const filePath = this.readPathFromInput(input, matcher);
14152
- if (filePath.includes(skillName)) {
14300
+ } else if (toolName === "Read") {
14301
+ const filePath = String(input.file_path ?? "");
14302
+ if (filePath.includes(`skills/${skillName}/`)) {
14153
14303
  triggered = true;
14154
14304
  evidence = `Read tool loaded skill file: ${filePath}`;
14155
14305
  break;
14156
14306
  }
14157
- } else if (matcher.readToolPrefixes?.some(
14158
- (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
14159
- )) {
14160
- triggered = true;
14161
- evidence = `Read tool loaded skill file via tool name "${toolName}"`;
14162
- break;
14163
14307
  }
14164
14308
  if (!triggered && toolCall.output != null) {
14165
14309
  const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
@@ -14196,16 +14340,6 @@ var SkillTriggerEvaluator = class {
14196
14340
  expectedAspectCount: 1
14197
14341
  };
14198
14342
  }
14199
- readPathFromInput(input, matcher) {
14200
- const fields = matcher.readInputFields ?? [matcher.readInputField];
14201
- for (const field of fields) {
14202
- const value = input[field];
14203
- if (value !== void 0 && value !== null) {
14204
- return String(value);
14205
- }
14206
- }
14207
- return "";
14208
- }
14209
14343
  };
14210
14344
 
14211
14345
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -15050,10 +15184,12 @@ function runEqualsAssertion(output, value) {
15050
15184
  }
15051
15185
 
15052
15186
  // src/evaluation/orchestrator.ts
15187
+ import { execFile as execFile3 } from "node:child_process";
15053
15188
  import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
15054
15189
  import { existsSync as existsSync5 } from "node:fs";
15055
15190
  import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
15056
15191
  import path45 from "node:path";
15192
+ import { promisify as promisify7 } from "node:util";
15057
15193
  import micromatch3 from "micromatch";
15058
15194
 
15059
15195
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -16507,6 +16643,8 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
16507
16643
  }
16508
16644
 
16509
16645
  // src/evaluation/orchestrator.ts
16646
+ var execFileAsync3 = promisify7(execFile3);
16647
+ var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
16510
16648
  function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
16511
16649
  return score >= threshold ? "ok" : "quality_failure";
16512
16650
  }
@@ -16544,6 +16682,35 @@ function hasHookCommand(hook) {
16544
16682
  function hooksEnabled(workspace) {
16545
16683
  return workspace?.hooks?.enabled !== false;
16546
16684
  }
/**
 * Builds the environment used for git commands run against a workspace.
 *
 * Starts from a copy of `process.env` with every `GIT_*` variable removed,
 * then pins three values: `GIT_TERMINAL_PROMPT="0"`, `GIT_ASKPASS=""` and
 * `GIT_SSH_COMMAND="ssh -o BatchMode=yes"`. The caller's own
 * `GIT_SSH_COMMAND`, if any, is therefore replaced as well.
 * `process.env` itself is not modified.
 *
 * @returns {Record<string, string | undefined>} Environment for child git processes.
 */
function workspaceGitEnv() {
  const inherited = Object.fromEntries(
    Object.entries(process.env).filter(([key]) => !key.startsWith("GIT_"))
  );
  return {
    ...inherited,
    GIT_TERMINAL_PROMPT: "0",
    GIT_ASKPASS: "",
    GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
  };
}
/**
 * Restores a workspace directory to a git ref and removes untracked files.
 *
 * Does nothing and resolves to `false` when `<workspacePath>/.git` does not
 * exist. Otherwise runs `git reset --hard <ref>` followed by `git clean`,
 * both inside `workspacePath` with the environment from `workspaceGitEnv()`,
 * and resolves to `true`.
 *
 * @param {string} workspacePath - Directory expected to contain `.git`.
 * @param {string} resetMode - `"strict"` cleans with `-fdx`; any other value
 *   cleans with `-fd`.
 * @param {string} [baselineRef] - Ref to reset to; `HEAD` when null/undefined.
 * @returns {Promise<boolean>} Whether a reset was performed.
 *   Rejects if either git command fails or exceeds the timeout.
 */
async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
  const gitDir = path45.join(workspacePath, ".git");
  if (!existsSync5(gitDir)) {
    return false;
  }

  const execOptions = {
    cwd: workspacePath,
    timeout: WORKSPACE_GIT_TIMEOUT_MS,
    env: workspaceGitEnv(),
    maxBuffer: 50 * 1024 * 1024
  };
  const runGit = (...args) => execFileAsync3("git", args, execOptions);

  // Order matters: restore tracked files first, then drop untracked ones.
  await runGit("reset", "--hard", baselineRef ?? "HEAD");
  await runGit("clean", resetMode === "strict" ? "-fdx" : "-fd");
  return true;
}
16547
16714
  function getWorkspaceTemplate(target) {
16548
16715
  const config = target.config;
16549
16716
  if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
@@ -17805,6 +17972,37 @@ async function runEvalCase(options) {
17805
17972
  }
17806
17973
  }
17807
17974
  }
17975
+ let beforeEachNeedsFreshBaseline = false;
17976
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
17977
+ try {
17978
+ if (repoManager && evalCase.workspace.repos?.length) {
17979
+ await repoManager.reset(
17980
+ evalCase.workspace.repos,
17981
+ workspacePath,
17982
+ evalCase.workspace.hooks.before_each.reset
17983
+ );
17984
+ } else {
17985
+ await resetWorkspaceRoot(
17986
+ workspacePath,
17987
+ evalCase.workspace.hooks.before_each.reset,
17988
+ sharedBaselineCommit
17989
+ );
17990
+ }
17991
+ } catch (error) {
17992
+ const message = error instanceof Error ? error.message : String(error);
17993
+ return buildErrorResult(
17994
+ evalCase,
17995
+ target.name,
17996
+ nowFn(),
17997
+ new Error(`before_each reset failed: ${message}`),
17998
+ promptInputs,
17999
+ provider,
18000
+ "setup",
18001
+ "script_error",
18002
+ verbose
18003
+ );
18004
+ }
18005
+ }
17808
18006
  const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
17809
18007
  if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
17810
18008
  const beforeEachHook = caseBeforeEachHook;
@@ -17821,6 +18019,7 @@ async function runEvalCase(options) {
17821
18019
  toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
17822
18020
  scriptContext
17823
18021
  );
18022
+ beforeEachNeedsFreshBaseline = true;
17824
18023
  } catch (error) {
17825
18024
  const message = error instanceof Error ? error.message : String(error);
17826
18025
  return buildErrorResult(
@@ -17836,7 +18035,7 @@ async function runEvalCase(options) {
17836
18035
  );
17837
18036
  }
17838
18037
  }
17839
- let baselineCommit = sharedBaselineCommit;
18038
+ let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
17840
18039
  if (!baselineCommit && workspacePath) {
17841
18040
  try {
17842
18041
  baselineCommit = await initializeBaseline(workspacePath);
@@ -17847,6 +18046,35 @@ async function runEvalCase(options) {
17847
18046
  }
17848
18047
  }
17849
18048
  }
18049
+ if (evalCase.mode === "conversation" && evalCase.turns?.length) {
18050
+ const conversationResult = await runConversationMode({
18051
+ evalCase,
18052
+ provider,
18053
+ target,
18054
+ evaluators,
18055
+ typeRegistry,
18056
+ graderProvider,
18057
+ promptInputs,
18058
+ nowFn,
18059
+ signal,
18060
+ workspacePath,
18061
+ caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
18062
+ agentTimeoutMs,
18063
+ streamCallbacks: options.streamCallbacks,
18064
+ verbose,
18065
+ threshold: evalCase.threshold ?? caseThreshold,
18066
+ targetResolver,
18067
+ availableTargets
18068
+ });
18069
+ if (workspacePath && !isSharedWorkspace) {
18070
+ const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
18071
+ if (!shouldRetain) {
18072
+ await cleanupWorkspace(workspacePath).catch(() => {
18073
+ });
18074
+ }
18075
+ }
18076
+ return conversationResult;
18077
+ }
17850
18078
  const caseStartMs = Date.now();
17851
18079
  const attemptBudget = (maxRetries ?? 0) + 1;
17852
18080
  let attempt = 0;
@@ -17961,13 +18189,21 @@ async function runEvalCase(options) {
17961
18189
  ${providerFileChanges}` : providerFileChanges;
17962
18190
  }
17963
18191
  const providerError = extractProviderError(providerResponse);
17964
- if (caseHooksEnabled && repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
18192
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
17965
18193
  try {
17966
- await repoManager.reset(
17967
- evalCase.workspace.repos,
17968
- workspacePath,
17969
- evalCase.workspace.hooks.after_each.reset
17970
- );
18194
+ if (repoManager && evalCase.workspace.repos?.length) {
18195
+ await repoManager.reset(
18196
+ evalCase.workspace.repos,
18197
+ workspacePath,
18198
+ evalCase.workspace.hooks.after_each.reset
18199
+ );
18200
+ } else {
18201
+ await resetWorkspaceRoot(
18202
+ workspacePath,
18203
+ evalCase.workspace.hooks.after_each.reset,
18204
+ baselineCommit
18205
+ );
18206
+ }
17971
18207
  } catch {
17972
18208
  }
17973
18209
  }
@@ -18583,6 +18819,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
18583
18819
  "llm-grader": llmGrader
18584
18820
  };
18585
18821
  }
18822
/**
 * Runs an eval case in conversation mode.
 *
 * Each entry of `evalCase.turns` is sent to the provider in order and the
 * reply is appended to a running history seeded from `evalCase.input`. Turns
 * that declare `assertions` or `expected_output` are graded through
 * `evaluateCandidate`; when `evalCase.assertions` is non-empty, one extra
 * "conversation" grade is computed over the full transcript.
 *
 * Scoring rules:
 * - a turn with nothing to grade scores 1 ("pass");
 * - a provider error scores 0 ("fail");
 * - with `on_turn_failure: "stop"`, every turn after a failing one is recorded
 *   as a 0-score "skip" and the provider is not called for it;
 * - the final score is `aggregateConversationScores` over all turn scores plus
 *   the conversation score, using `evalCase.aggregation` (default "mean").
 *
 * @param {object} options - Execution context: `evalCase`, `provider`,
 *   `target`, grading dependencies (`evaluators`, `typeRegistry`,
 *   `graderProvider`, `targetResolver`, `availableTargets`), `nowFn`, `signal`,
 *   `workspacePath`, `caseWorkspaceFile`, `agentTimeoutMs`, `streamCallbacks`,
 *   `verbose` and `threshold`. Extra keys are ignored.
 * @returns {Promise<object>} Result record with `score`, per-turn `scores`,
 *   flattened `assertions`, the full message history as `output`, and
 *   `evalRun.durationMs`.
 */
async function runConversationMode(options) {
  const {
    evalCase,
    provider,
    target,
    evaluators,
    typeRegistry,
    graderProvider,
    nowFn,
    signal,
    workspacePath,
    caseWorkspaceFile,
    agentTimeoutMs,
    streamCallbacks,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  } = options;
  const turns = evalCase.turns;
  const aggregation = evalCase.aggregation ?? "mean";
  const stopOnFailure = (evalCase.on_turn_failure ?? "continue") === "stop";
  const windowSize = evalCase.window_size;
  // Verdicts and the stop rule all compare against the same effective threshold.
  const passThreshold = threshold ?? DEFAULT_THRESHOLD;

  const toText = (content) => typeof content === "string" ? content : JSON.stringify(content);
  const history = evalCase.input.map((msg) => ({ role: msg.role, content: toText(msg.content) }));
  // Detached copies so graders and the result record never alias `history`.
  const copyHistory = () => history.map(({ role, content }) => ({ role, content }));
  const promptWindow = () => windowSize ? buildWindowedHistory(history, windowSize) : [...history];

  const scoreEntry = (name, score, verdict, assertions) => ({
    name,
    type: "rubrics",
    score,
    verdict,
    assertions
  });
  // Only graded entries carry a `scores` key; synthetic ones (skip / provider
  // error / nothing to grade) deliberately omit it.
  const gradedEntry = (name, result) => ({
    ...scoreEntry(
      name,
      result.score,
      scoreToVerdict(result.score, passThreshold),
      result.assertions ? [...result.assertions] : []
    ),
    scores: result.scores
  });
  // Options common to the per-turn and conversation-level grading calls.
  // `threshold` is forwarded as received (possibly undefined).
  const sharedGradingOptions = {
    target,
    provider,
    evaluators,
    typeRegistry,
    nowFn,
    attempt: 0,
    graderProvider,
    agentTimeoutMs,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  };

  const turnScores = [];
  let stopped = false;
  const caseStartMs = Date.now();

  for (const [index, turn] of turns.entries()) {
    const turnName = `turn-${index + 1}`;
    if (stopped) {
      turnScores.push(
        scoreEntry(turnName, 0, "skip", [
          { text: "Skipped due to previous turn failure", passed: false }
        ])
      );
      continue;
    }

    const userContent = toText(turn.input);
    history.push({ role: "user", content: userContent });
    const chatPromptForProvider = promptWindow();

    let response;
    try {
      response = await provider.invoke({
        question: userContent,
        chatPrompt: chatPromptForProvider,
        evalCaseId: `${evalCase.id}/${turnName}`,
        signal,
        cwd: workspacePath,
        workspaceFile: caseWorkspaceFile,
        streamCallbacks
      });
    } catch (error) {
      // NOTE(review): the user message pushed above stays in `history` without
      // an assistant reply, so with on_turn_failure "continue" the next turn
      // sends two consecutive user messages — confirm this is intended.
      const message = error instanceof Error ? error.message : String(error);
      turnScores.push(
        scoreEntry(turnName, 0, "fail", [{ text: `Provider error: ${message}`, passed: false }])
      );
      if (stopOnFailure) stopped = true;
      continue;
    }

    const assistantContent = extractLastAssistantContent(response.output);
    history.push({ role: "assistant", content: assistantContent });

    // Nothing to grade: the turn only advances the conversation.
    if (!turn.assertions?.length && !turn.expected_output) {
      turnScores.push(scoreEntry(turnName, 1, "pass", []));
      continue;
    }

    const expectedOutput = turn.expected_output ? [
      typeof turn.expected_output === "string" ? { content: turn.expected_output } : turn.expected_output
    ] : [];
    const turnResult = await evaluateCandidate({
      ...sharedGradingOptions,
      evalCase: {
        ...evalCase,
        id: `${evalCase.id}/${turnName}`,
        assertions: buildTurnAssertions(turn),
        input: buildTurnGraderInput(history, windowSize),
        expected_output: expectedOutput,
        // Clear conversation fields to prevent recursion
        mode: void 0,
        turns: void 0
      },
      candidate: assistantContent,
      promptInputs: {
        question: buildConversationContext(history, windowSize),
        chatPrompt: promptWindow()
      },
      output: response.output
    });
    turnScores.push(gradedEntry(turnName, turnResult));
    if (stopOnFailure && turnResult.score < passThreshold) {
      stopped = true;
    }
  }

  const conversationScores = [];
  if (evalCase.assertions?.length) {
    // Conversation-level grading always sees the unwindowed transcript.
    const fullTranscript = buildConversationContext(history);
    const conversationResult = await evaluateCandidate({
      ...sharedGradingOptions,
      evalCase: {
        ...evalCase,
        id: `${evalCase.id}/conversation`,
        input: copyHistory(),
        expected_output: [],
        mode: void 0,
        turns: void 0
      },
      candidate: fullTranscript,
      promptInputs: {
        question: fullTranscript,
        chatPrompt: [...history]
      }
    });
    conversationScores.push(gradedEntry("conversation", conversationResult));
  }

  const allResultScores = [...turnScores, ...conversationScores];
  const finalScore = aggregateConversationScores(
    allResultScores.map((entry) => entry.score),
    aggregation
  );
  const outputMessages = copyHistory();
  const flatAssertions = allResultScores.flatMap((entry) => [...entry.assertions]);
  const totalDurationMs = Date.now() - caseStartMs;
  return {
    timestamp: nowFn().toISOString(),
    testId: evalCase.id,
    suite: evalCase.suite,
    category: evalCase.category,
    score: finalScore,
    assertions: flatAssertions,
    target: target.name,
    output: outputMessages,
    scores: allResultScores,
    executionStatus: classifyQualityStatus(finalScore, passThreshold),
    input: evalCase.input.map((m) => ({
      role: m.role,
      content: toText(m.content)
    })),
    evalRun: { durationMs: totalDurationMs }
  };
}
19035
/**
 * Trims a message history to its most recent part while always keeping every
 * system message.
 *
 * System messages are moved to the front of the result; the non-system
 * messages are cut down to the last `windowSize * 2` entries (a user/assistant
 * pair per window unit).
 *
 * A `windowSize` that is not greater than zero (0, negative, NaN, undefined)
 * disables trimming. Previously a negative size turned into a positive
 * `slice` start and silently dropped the OLDEST messages by count instead of
 * keeping the newest ones.
 *
 * @param {Array<{role: string, content: *}>} history - Messages, oldest first. Not mutated.
 * @param {number} windowSize - Number of recent exchanges to keep.
 * @returns {Array<{role: string, content: *}>} New array: system messages, then the retained tail.
 */
function buildWindowedHistory(history, windowSize) {
  const systemMessages = [];
  const conversational = [];
  for (const message of history) {
    (message.role === "system" ? systemMessages : conversational).push(message);
  }
  // `!(x > 0)` also catches NaN and undefined, which a plain `x <= 0` would miss.
  if (!(windowSize > 0)) {
    return [...systemMessages, ...conversational];
  }
  return [...systemMessages, ...conversational.slice(-windowSize * 2)];
}
19041
/**
 * Renders a message history as a plain-text transcript.
 *
 * Each message becomes `"<role>: <content>"`; non-string content is
 * JSON-encoded. Entries are separated by a blank line. When `windowSize` is
 * truthy the history is first reduced with `buildWindowedHistory`.
 *
 * @param {Array<{role: string, content: *}>} history - Messages, oldest first.
 * @param {number} [windowSize] - Optional window passed to `buildWindowedHistory`.
 * @returns {string} The transcript ("" for an empty history).
 */
function buildConversationContext(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  const lines = visible.map(({ role, content }) => {
    const text = typeof content === "string" ? content : JSON.stringify(content);
    return `${role}: ${text}`;
  });
  return lines.join("\n\n");
}
19048
/**
 * Builds the `input` message list handed to the grader for a single turn.
 *
 * Returns fresh `{ role, content }` objects (any other message fields are
 * dropped). When `windowSize` is truthy the history is first reduced with
 * `buildWindowedHistory`.
 *
 * @param {Array<{role: string, content: *}>} history - Messages, oldest first.
 * @param {number} [windowSize] - Optional window passed to `buildWindowedHistory`.
 * @returns {Array<{role: string, content: *}>} Copied messages.
 */
function buildTurnGraderInput(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  return visible.map(({ role, content }) => ({ role, content }));
}
19055
/**
 * Converts a turn's `assertions` into grader configurations.
 *
 * String entries are treated as rubric criteria and folded into a single
 * "turn-rubrics" `llm-grader` entry (ids `criterion-1`, `criterion-2`, … in
 * declaration order, weight 1 each). Non-string entries are passed through
 * untouched and placed after the rubric entry.
 *
 * @param {{assertions?: Array<string|object>}} turn - Turn definition.
 * @returns {Array<object>} Grader configs; empty when the turn declares none.
 */
function buildTurnAssertions(turn) {
  if (!turn.assertions?.length) return [];
  const declared = Array.from(turn.assertions);
  const criteria = declared.filter((entry) => typeof entry === "string");
  const passthrough = declared.filter((entry) => typeof entry !== "string");
  if (criteria.length === 0) {
    return passthrough;
  }
  const rubrics = criteria.map((outcome, position) => ({
    id: `criterion-${position + 1}`,
    outcome,
    weight: 1
  }));
  return [{ name: "turn-rubrics", type: "llm-grader", rubrics }, ...passthrough];
}
19081
/**
 * Collapses per-turn (and conversation-level) scores into one number.
 *
 * @param {number[]} scores - Scores to combine.
 * @param {string} aggregation - "min" or "max"; any other value means the
 *   arithmetic mean.
 * @returns {number} The aggregate, or 1 when there are no scores.
 */
function aggregateConversationScores(scores, aggregation) {
  if (scores.length === 0) {
    return 1;
  }
  if (aggregation === "min") {
    return Math.min(...scores);
  }
  if (aggregation === "max") {
    return Math.max(...scores);
  }
  const total = scores.reduce((running, value) => running + value, 0);
  return total / scores.length;
}
18586
19092
  async function invokeProvider(provider, options) {
18587
19093
  const {
18588
19094
  evalCase,
@@ -19299,13 +19805,13 @@ function shouldSkipCacheForTemperature(targetConfig) {
19299
19805
  }
19300
19806
 
19301
19807
  // src/evaluation/results-repo.ts
19302
- import { execFile as execFile3 } from "node:child_process";
19808
+ import { execFile as execFile4 } from "node:child_process";
19303
19809
  import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
19304
19810
  import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat10 } from "node:fs/promises";
19305
19811
  import os3 from "node:os";
19306
19812
  import path49 from "node:path";
19307
- import { promisify as promisify7 } from "node:util";
19308
- var execFileAsync3 = promisify7(execFile3);
19813
+ import { promisify as promisify8 } from "node:util";
19814
+ var execFileAsync4 = promisify8(execFile4);
19309
19815
  function sanitizeRepoSlug(repo) {
19310
19816
  return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
19311
19817
  }
@@ -19356,7 +19862,7 @@ function writePersistedStatus(statusFile, status) {
19356
19862
  }
19357
19863
  async function runCommand(executable, args, options) {
19358
19864
  try {
19359
- const { stdout, stderr } = await execFileAsync3(executable, [...args], {
19865
+ const { stdout, stderr } = await execFileAsync4(executable, [...args], {
19360
19866
  cwd: options?.cwd,
19361
19867
  env: process.env
19362
19868
  });
@@ -20404,11 +20910,13 @@ function extractAssistantContent(content) {
20404
20910
  break;
20405
20911
  case "tool_use":
20406
20912
  if (block.name) {
20407
- toolCalls.push({
20408
- tool: block.name,
20409
- input: block.input,
20410
- id: block.id
20411
- });
20913
+ toolCalls.push(
20914
+ normalizeToolCall("claude", {
20915
+ tool: block.name,
20916
+ input: block.input,
20917
+ id: block.id
20918
+ })
20919
+ );
20412
20920
  }
20413
20921
  break;
20414
20922
  }
@@ -20500,7 +21008,11 @@ function parseCodexSession(jsonl) {
20500
21008
  } else {
20501
21009
  input = payload.arguments;
20502
21010
  }
20503
- const toolCall = { tool: toolName, input, id: callId };
21011
+ const toolCall = normalizeToolCall("codex", {
21012
+ tool: toolName,
21013
+ input,
21014
+ id: callId
21015
+ });
20504
21016
  const msgIdx = messages.length;
20505
21017
  messages.push({
20506
21018
  role: "assistant",
@@ -20524,7 +21036,11 @@ function parseCodexSession(jsonl) {
20524
21036
  } else {
20525
21037
  input = payload.arguments;
20526
21038
  }
20527
- const toolCall = { tool: toolName, input, id: callId };
21039
+ const toolCall = normalizeToolCall("codex", {
21040
+ tool: toolName,
21041
+ input,
21042
+ id: callId
21043
+ });
20528
21044
  const msgIdx = messages.length;
20529
21045
  messages.push({
20530
21046
  role: "assistant",